In [1]:
from utilities.data_loader import load_modeling_data, load_testing_data, prepare_kaggle_submission
from utilities.text_cleaner import advanced_data_cleaning


In [41]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [5]:
train_data, train_labels = load_modeling_data()

In [6]:
le = LabelEncoder()
train_labels['target'] = le.fit_transform(train_labels['target'])

In [8]:
train_data['text'] =train_data['text'].apply(advanced_data_cleaning)

In [13]:
vectorizer = TfidfVectorizer()
train_data = vectorizer.fit_transform(train_data['text'])

In [14]:
X_train, X_val, y_train, y_val = train_test_split(train_data, train_labels, test_size = 0.2, random_state=8)

In [17]:
smote = SMOTE(random_state=8, n_jobs=-1)
X_SMOTE, y_SMOTE = smote.fit_resample(X_train, y_train['target'])

In [18]:
nb = MultinomialNB(alpha=3.0)
nb.fit(X_SMOTE, y_SMOTE)

In [22]:
y_pred = nb.predict(X_val)

In [23]:
print(accuracy_score(y_val,y_pred))
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))

0.7681301516353063
[[86493   360 16794]
 [    3     4     4]
 [30469   614 73324]]
              precision    recall  f1-score   support

           0       0.74      0.83      0.78    103647
           1       0.00      0.36      0.01        11
           2       0.81      0.70      0.75    104407

    accuracy                           0.77    208065
   macro avg       0.52      0.63      0.52    208065
weighted avg       0.78      0.77      0.77    208065



In [24]:
nb2 = MultinomialNB(alpha=3.0)
nb2.fit(X_train, y_train['target'])

In [26]:
y_pred = nb2.predict(X_val)

In [27]:
print(accuracy_score(y_val,y_pred))
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))

0.7712445629971403
[[86680     0 16967]
 [    3     0     8]
 [30618     0 73789]]
              precision    recall  f1-score   support

           0       0.74      0.84      0.78    103647
           1       0.00      0.00      0.00        11
           2       0.81      0.71      0.76    104407

    accuracy                           0.77    208065
   macro avg       0.52      0.51      0.51    208065
weighted avg       0.78      0.77      0.77    208065



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [28]:
nb3 = ComplementNB(alpha=3.0)

In [31]:
nb3.fit(X_train, y_train['target'])

In [32]:
y_pred = nb3.predict(X_val)

In [33]:
print(accuracy_score(y_val,y_pred))
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))

0.7599932713334775
[[85889  1429 16329]
 [    3     0     8]
 [29911  2257 72239]]
              precision    recall  f1-score   support

           0       0.74      0.83      0.78    103647
           1       0.00      0.00      0.00        11
           2       0.82      0.69      0.75    104407

    accuracy                           0.76    208065
   macro avg       0.52      0.51      0.51    208065
weighted avg       0.78      0.76      0.77    208065



In [34]:
nb4 = ComplementNB(alpha=3.0)

In [38]:
nb4.fit(X_SMOTE, y_SMOTE)
y_pred = nb4.predict(X_val)

In [39]:
print(accuracy_score(y_val,y_pred))
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))

0.7494821329872876
[[85160  2189 16298]
 [    1     8     2]
 [29816  3818 70773]]
              precision    recall  f1-score   support

           0       0.74      0.82      0.78    103647
           1       0.00      0.73      0.00        11
           2       0.81      0.68      0.74    104407

    accuracy                           0.75    208065
   macro avg       0.52      0.74      0.51    208065
weighted avg       0.78      0.75      0.76    208065



In [44]:
logreg = LogisticRegression(C=0.5, max_iter=300, solver='saga',
                   tol=0.00022294400779122961)

In [46]:
logreg.fit(X_train, y_train['target'])

In [47]:
y_pred = logreg.predict(X_val)

In [48]:
print(accuracy_score(y_val,y_pred))
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))

0.8006824790329945
[[81337     0 22310]
 [    2     0     9]
 [19150     0 85257]]
              precision    recall  f1-score   support

           0       0.81      0.78      0.80    103647
           1       0.00      0.00      0.00        11
           2       0.79      0.82      0.80    104407

    accuracy                           0.80    208065
   macro avg       0.53      0.53      0.53    208065
weighted avg       0.80      0.80      0.80    208065



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [49]:
logreg2 = LogisticRegression(C=0.5, max_iter=300, solver='saga',
                   tol=0.00022294400779122961)

In [50]:
logreg2.fit(X_SMOTE, y_SMOTE)

In [53]:
y_pred = logreg2.predict(X_val)

In [54]:
print(accuracy_score(y_val,y_pred))
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))

0.8003316271357509
[[81310    11 22326]
 [    2     1     8]
 [19158    39 85210]]
              precision    recall  f1-score   support

           0       0.81      0.78      0.80    103647
           1       0.02      0.09      0.03        11
           2       0.79      0.82      0.80    104407

    accuracy                           0.80    208065
   macro avg       0.54      0.56      0.54    208065
weighted avg       0.80      0.80      0.80    208065

