In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import skopt
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report


In [2]:
from utilities.data_loader import load_modeling_data, load_testing_data
from utilities.text_cleaner import basic_data_cleaning, advanced_data_cleaning

In [3]:
# Load the labelled modeling data through the project helper.
# NOTE(review): later cells read train_data['text'] and train_labels['target'], so both
# are presumably DataFrames with those columns — confirm in utilities.data_loader.
train_data, train_labels = load_modeling_data()

In [4]:
# Load the unlabelled test features; in this notebook they are only used by the
# Kaggle-submission cells (via test_data['text']).
test_data = load_testing_data()

In [5]:
# Encode the string sentiment labels as integer class ids.
dict_vals = {'negative': 0, 'neutral': 1, 'positive': 2}

# The column is overwritten in place, so guard on dtype: re-running this cell is then a
# no-op instead of raising KeyError on labels that were already encoded.
if not pd.api.types.is_integer_dtype(train_labels['target']):
    # Index the dict (rather than .map(dict_vals)) so an unexpected label still fails
    # loudly with KeyError instead of silently becoming NaN.
    train_labels['target'] = train_labels['target'].apply(lambda label: dict_vals[label])

In [6]:
# Display the label frame to confirm the 'target' column now holds the integer codes.
train_labels

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
0,2
1,0
2,2
3,2
4,0
...,...
1040318,2
1040319,0
1040320,0
1040321,2


# Experiment 1: Building a Baseline

In [7]:
# Hold out 20% of the labelled rows for validation; the fixed seed keeps the split
# reproducible across runs.
x_train, x_validation, y_train, y_validation = train_test_split(
    train_data,
    train_labels,
    test_size=0.2,
    random_state=10,
)

In [8]:
# Sanity check: (rows, columns) of the training split.
print(x_train.shape)

(832258, 1)


In [9]:
# Sanity check: (rows, columns) of the validation split.
print(x_validation.shape)

(208065, 1)


In [10]:
# Bag-of-words vectorizer with scikit-learn's default settings (no arguments overridden).
bow = CountVectorizer()

In [11]:
# Learn the vocabulary from the training split only, then reuse that fitted vocabulary
# for validation so the validation text never influences the feature space.
x_train_bow = bow.fit_transform(x_train['text'])
x_validation_bow = bow.transform(x_validation['text'])

In [12]:
# Shape of the training document-term matrix: (documents, vocabulary size).
print(x_train_bow.shape)

(832258, 439807)


In [13]:
# Baseline classifier: Multinomial Naive Bayes with default hyperparameters.
nb_clf_1 = MultinomialNB()

In [14]:
# Fit on the raw bag-of-words counts; .values hands the labels over as a NumPy array.
nb_clf_1.fit(x_train_bow, y_train['target'].values)

In [15]:
# Predict class ids for the held-out validation rows.
y_pred_validation = nb_clf_1.predict(x_validation_bow)

In [16]:
# Overall validation accuracy of the baseline model.
print('Accuracy score: ', accuracy_score(y_validation['target'].values, y_pred_validation))

Accuracy score:  0.7795448537716579


In [17]:
# Per-class precision/recall/F1 on the validation split.
# Class 1 (neutral) has only 16 validation rows and scores 0.00 in the recorded output; if
# the model never predicts it, its precision is 0/0. zero_division=0 (scikit-learn >= 0.22)
# states that handling explicitly: the reported numbers stay the same, but the
# UndefinedMetricWarning is no longer emitted.
print(classification_report(y_validation['target'].values, y_pred_validation, zero_division=0))

              precision    recall  f1-score   support

           0       0.76      0.82      0.79    104093
           1       0.00      0.00      0.00        16
           2       0.80      0.74      0.77    103956

    accuracy                           0.78    208065
   macro avg       0.52      0.52      0.52    208065
weighted avg       0.78      0.78      0.78    208065



In [18]:
# Retraining on the full labelled training set for the Kaggle submission

In [19]:
# Work on a copy so train_data itself stays available, unchanged, for the later experiments.
final_training_data = train_data.copy()

In [20]:
# Copy of the integer-encoded labels used to fit the final model.
final_training_labels = train_labels.copy()

In [21]:
# Copy of the test features so test_data itself is never modified.
final_test_data = test_data.copy()

In [22]:
# Fresh vectorizer fitted on the full training text (train + validation rows).
bow = CountVectorizer()
# Read the text from train_data rather than from final_training_data: the baseline applies
# no cleaning, so the text is identical, and this cell stays re-runnable. Previously the
# DataFrame held in final_training_data was replaced by the sparse matrix, so running the
# cell a second time failed on final_training_data['text'].
final_training_data = bow.fit_transform(train_data['text'])

In [23]:
# Project the test text onto the vocabulary learned from the training data.
# Reading from test_data (identical text in the baseline, which does no cleaning) keeps
# this cell re-runnable; previously final_test_data was replaced by the sparse matrix and
# a second run failed on final_test_data['text'].
final_test_data = bow.transform(test_data['text'])

In [24]:
# Fit the baseline Naive Bayes model on every labelled row (fit returns the estimator),
# then predict class ids for the test set.
nb_clf = MultinomialNB().fit(final_training_data, final_training_labels['target'])
final_test_pred = nb_clf.predict(final_test_data)

In [25]:
# Build the submission frame. Take the ids from the test set's own index instead of
# reset_index(), which always produced positional ids 0..n-1 and would mislabel rows if
# the test ids are not exactly that range. With a default RangeIndex the output is unchanged.
# NOTE(review): assumes test_data is indexed by id like train_labels — confirm in
# utilities.data_loader. 'target' holds the integer codes (0/1/2); confirm that is the
# format the competition expects.
final_test_pred = pd.DataFrame({'id': test_data.index, 'target': final_test_pred})
final_test_pred.to_csv('kaggle-submissions/nb-bow-baseline.csv', index=False)

# Experiment 2: With Basic Preprocessing

In [26]:
# Recreate the 80/20 split with the same seed as the baseline so the scores are
# comparable and the cleaning below starts from uncleaned text.
x_train, x_validation, y_train, y_validation = train_test_split(
    train_data,
    train_labels,
    test_size=0.2,
    random_state=10,
)

In [27]:
# Apply the basic cleaner to the training text.
# assign() builds a new frame instead of writing a column into the frame returned by
# train_test_split, which pandas may flag with SettingWithCopyWarning; the resulting
# 'text' column is the same.
x_train = x_train.assign(text=x_train['text'].apply(basic_data_cleaning))

In [28]:
# Apply the same basic cleaner to the validation text.
# assign() builds a new frame instead of writing a column into the frame returned by
# train_test_split, which pandas may flag with SettingWithCopyWarning; the resulting
# 'text' column is the same.
x_validation = x_validation.assign(text=x_validation['text'].apply(basic_data_cleaning))

In [29]:
# New default bag-of-words vectorizer for the basic-cleaning experiment (rebinds `bow`).
bow = CountVectorizer()

In [30]:
# Fit the vocabulary on the cleaned training text only; validation is transformed with
# that same vocabulary.
x_train_bow = bow.fit_transform(x_train['text'])
x_validation_bow = bow.transform(x_validation['text'])

In [31]:
# Shape of the document-term matrix after basic cleaning: (documents, vocabulary size).
print(x_train_bow.shape)

(832258, 490026)


In [32]:
# Separate default Multinomial Naive Bayes instance for the basic-cleaning experiment.
nb_clf_2 = MultinomialNB()

In [33]:
# Fit on the cleaned-text counts; .values hands the labels over as a NumPy array.
nb_clf_2.fit(x_train_bow, y_train['target'].values)

In [34]:
# Predict class ids for the validation rows (overwrites the baseline's predictions).
y_pred_validation = nb_clf_2.predict(x_validation_bow)

In [35]:
# Overall validation accuracy with basic cleaning.
print('Accuracy score: ', accuracy_score(y_validation['target'].values, y_pred_validation))

Accuracy score:  0.7654819407396727


In [36]:
# Per-class precision/recall/F1 on the validation split (basic cleaning).
# Class 1 (neutral) has only 16 validation rows and scores 0.00 in the recorded output; if
# the model never predicts it, its precision is 0/0. zero_division=0 (scikit-learn >= 0.22)
# states that handling explicitly: the reported numbers stay the same, but the
# UndefinedMetricWarning is no longer emitted.
print(classification_report(y_validation['target'].values, y_pred_validation, zero_division=0))

              precision    recall  f1-score   support

           0       0.75      0.80      0.77    104093
           1       0.00      0.00      0.00        16
           2       0.78      0.73      0.76    103956

    accuracy                           0.77    208065
   macro avg       0.51      0.51      0.51    208065
weighted avg       0.77      0.77      0.77    208065



In [37]:
# Retraining on the full labelled training set for the Kaggle submission

In [38]:
# Fresh copy of the uncleaned training frame; the cleaning below modifies this copy only.
final_training_data = train_data.copy()

In [39]:
# Copy of the integer-encoded labels used to fit the final model.
final_training_labels = train_labels.copy()

In [40]:
# Fresh copy of the test frame; the cleaning below modifies this copy only.
final_test_data = test_data.copy()

In [41]:
# Clean the full training text with the basic cleaner used in the validation run above.
# The column is overwritten in place, so re-running this cell cleans already-cleaned text.
final_training_data['text'] = final_training_data['text'].apply(basic_data_cleaning)

In [42]:
# Apply the identical basic cleaning to the test text so train and test features match.
final_test_data['text'] = final_test_data['text'].apply(basic_data_cleaning)

In [43]:
# Fresh vectorizer fitted on the full cleaned training text.
bow = CountVectorizer()
# NOTE: final_training_data is rebound from a DataFrame to the vectorizer's output matrix
# here, so re-running this cell requires re-running the copy and cleaning cells first.
final_training_data = bow.fit_transform(final_training_data['text'])

In [44]:
# Project the cleaned test text onto the training vocabulary.
# NOTE: final_test_data is rebound from a DataFrame to a matrix, so this cell is not
# re-runnable on its own.
final_test_data = bow.transform(final_test_data['text'])

In [45]:
# Fit Naive Bayes on every labelled row of the basic-cleaned data (fit returns the
# estimator), then predict class ids for the test set.
nb_clf = MultinomialNB().fit(final_training_data, final_training_labels['target'])
final_test_pred = nb_clf.predict(final_test_data)

In [46]:
# Build the submission frame. Take the ids from the test set's own index instead of
# reset_index(), which always produced positional ids 0..n-1 and would mislabel rows if
# the test ids are not exactly that range. With a default RangeIndex the output is unchanged.
# NOTE(review): assumes test_data is indexed by id like train_labels — confirm in
# utilities.data_loader.
# NOTE(review): the file name says "exp-1" although this section is headed "Experiment 2";
# left unchanged so existing submission files keep their names.
final_test_pred = pd.DataFrame({'id': test_data.index, 'target': final_test_pred})
final_test_pred.to_csv('kaggle-submissions/nb-bow-exp-1-basic.csv', index=False)

# Experiment 3: With Advanced Data Preprocessing

In [47]:
# Recreate the 80/20 split with the same seed as the earlier experiments so the scores
# are comparable and the cleaning below starts from uncleaned text.
x_train, x_validation, y_train, y_validation = train_test_split(
    train_data,
    train_labels,
    test_size=0.2,
    random_state=10,
)

In [48]:
# Apply the advanced cleaner to the training text.
# assign() builds a new frame instead of writing a column into the frame returned by
# train_test_split, which pandas may flag with SettingWithCopyWarning; the resulting
# 'text' column is the same.
x_train = x_train.assign(text=x_train['text'].apply(advanced_data_cleaning))

In [49]:
# Apply the same advanced cleaner to the validation text.
# assign() builds a new frame instead of writing a column into the frame returned by
# train_test_split, which pandas may flag with SettingWithCopyWarning; the resulting
# 'text' column is the same.
x_validation = x_validation.assign(text=x_validation['text'].apply(advanced_data_cleaning))

In [50]:
# New default bag-of-words vectorizer for the advanced-cleaning experiment (rebinds `bow`).
bow = CountVectorizer()

In [51]:
# Fit the vocabulary on the cleaned training text only; validation is transformed with
# that same vocabulary.
x_train_bow = bow.fit_transform(x_train['text'])
x_validation_bow = bow.transform(x_validation['text'])

In [52]:
# Shape of the document-term matrix after advanced cleaning: (documents, vocabulary size).
print(x_train_bow.shape)

(832258, 374128)


In [53]:
# Separate default Multinomial Naive Bayes instance for the advanced-cleaning experiment.
nb_clf_3 = MultinomialNB()

In [54]:
# Fit on the cleaned-text counts; .values hands the labels over as a NumPy array.
nb_clf_3.fit(x_train_bow, y_train['target'].values)

In [55]:
# Predict class ids for the validation rows (overwrites the previous experiment's predictions).
y_pred_validation = nb_clf_3.predict(x_validation_bow)

In [56]:
# Overall validation accuracy with advanced cleaning.
print('Accuracy score: ', accuracy_score(y_validation['target'].values, y_pred_validation))

Accuracy score:  0.7793766371085958


In [57]:
# Per-class precision/recall/F1 on the validation split (advanced cleaning).
# Class 1 (neutral) has only 16 validation rows and scores 0.00 in the recorded output; if
# the model never predicts it, its precision is 0/0. zero_division=0 (scikit-learn >= 0.22)
# states that handling explicitly: the reported numbers stay the same, but the
# UndefinedMetricWarning is no longer emitted.
print(classification_report(y_validation['target'].values, y_pred_validation, zero_division=0))

              precision    recall  f1-score   support

           0       0.76      0.81      0.79    104093
           1       0.00      0.00      0.00        16
           2       0.80      0.74      0.77    103956

    accuracy                           0.78    208065
   macro avg       0.52      0.52      0.52    208065
weighted avg       0.78      0.78      0.78    208065



In [58]:
# Retraining on the full labelled training set for the Kaggle submission

In [59]:
# Fresh copy of the uncleaned training frame; the cleaning below modifies this copy only.
final_training_data = train_data.copy()

In [60]:
# Copy of the integer-encoded labels used to fit the final model.
final_training_labels = train_labels.copy()

In [61]:
# Fresh copy of the test frame; the cleaning below modifies this copy only.
final_test_data = test_data.copy()

In [62]:
# Clean the full training text with the advanced cleaner used in the validation run above.
# The column is overwritten in place, so re-running this cell cleans already-cleaned text.
final_training_data['text'] = final_training_data['text'].apply(advanced_data_cleaning)

In [63]:
# Apply the identical advanced cleaning to the test text so train and test features match.
final_test_data['text'] = final_test_data['text'].apply(advanced_data_cleaning)

In [64]:
# Fresh vectorizer fitted on the full cleaned training text.
bow = CountVectorizer()
# NOTE: final_training_data is rebound from a DataFrame to the vectorizer's output matrix
# here, so re-running this cell requires re-running the copy and cleaning cells first.
final_training_data = bow.fit_transform(final_training_data['text'])

In [65]:
# Project the cleaned test text onto the training vocabulary.
# NOTE: final_test_data is rebound from a DataFrame to a matrix, so this cell is not
# re-runnable on its own.
final_test_data = bow.transform(final_test_data['text'])

In [66]:
# Fit Naive Bayes on every labelled row of the advanced-cleaned data (fit returns the
# estimator), then predict class ids for the test set.
nb_clf = MultinomialNB().fit(final_training_data, final_training_labels['target'])
final_test_pred = nb_clf.predict(final_test_data)

In [67]:
# Build the submission frame. Take the ids from the test set's own index instead of
# reset_index(), which always produced positional ids 0..n-1 and would mislabel rows if
# the test ids are not exactly that range. With a default RangeIndex the output is unchanged.
# NOTE(review): assumes test_data is indexed by id like train_labels — confirm in
# utilities.data_loader.
# NOTE(review): the file name says "exp-2" although this section is headed "Experiment 3";
# left unchanged so existing submission files keep their names.
final_test_pred = pd.DataFrame({'id': test_data.index, 'target': final_test_pred})
final_test_pred.to_csv('kaggle-submissions/nb-bow-exp-2-advanced.csv', index=False)