In [52]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import spacy
import time
from collections import Counter
from imblearn.over_sampling import RandomOverSampler


from sklearn.metrics import precision_recall_fscore_support 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_score
import xgboost as xgb


In [9]:
#HyperParameter Tuning with Crossvalidation in GridSearch 

In [10]:
df = pd.read_json('DiSmldata.json')

In [11]:
df

Unnamed: 0,hash_count,emoji_count,clean_text,label,anger,anticipation,disgust,fear,joy,sadness,surprise,trust,punc_count,tweet_len,cap_count,user_follower,user_favorite
0,1,0,new mickey flower bed going make event photo l...,2,0,0,0,0,0,0,0,0,9,193,20,1214.0,982.0
2,1,0,thought different yesterday mickey flower bed ...,1,0,1,0,0,0,0,0,0,10,115,13,638.0,195775.0
5,0,0,mickey flower bed renewed tokyo disneyland tod...,1,0,0,0,0,0,0,0,0,12,119,16,157.0,4158.0
17,1,0,mickey flower bed back tdrnow,1,0,0,0,0,0,0,0,0,10,72,12,83.0,8334.0
19,0,0,god has face changed,1,0,1,0,1,1,0,0,1,0,29,3,882.0,41436.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87503,3,0,hurtado help cypress overpower pacifica finish...,5,0,0,0,0,0,0,0,0,11,124,11,572.0,124266.0
87504,1,1,oct agricultural counsellor attended award cer...,5,1,2,0,1,4,1,4,4,9,256,24,82.0,16.0
87506,0,0,being single move,4,0,0,0,0,0,0,0,0,0,23,1,788.0,22525.0
87508,0,0,time pose hard without laughing disneyland def...,3,0,1,0,0,1,0,0,0,2,104,5,24.0,1819.0


In [12]:
my_stop_words = ENGLISH_STOP_WORDS.union(['disneyland','tokyo','disney', 'im', 'tdrnow','paris','california','amp','disneysea','got',
                                         'ºc', 'ºf', 'ºoº','𝗧𝗵𝗲','くまのプーさん', 'ディズニー', 'ディズニーシー','ディズニーハロウィーン',
                                         'ディズニーランド', 'ディズニー好きと繋がりたい', 'フェスティバルオブミスティーク', 'マルマン',
                                         'ㅋㅋㅋ', '場所', '更新', '月released', '東京ディズニーシー', '東京ディズニーランド', '東京ディズニーリゾート',
                                         '香港迪士尼樂園', 'ºº', 'hong', 'kong',"disneylandresort", "disneyland", "disneyresort",
                                          "californiaadventure",'downtowndisney','disneyanaheim','disneylandanaheim',
                                          'disneycalifornia','californiadisney','disneysea', 'disneytokyo', 'disneytokyoresort', 
                                          'tokyodisney','tokyodisneyresort', 'tokyodisneyland','東京ディズニーランド', 'ディズニーランド',
                                          '東京ディズニーシー', 'ズニーシー', 'tdr_now', 'tdr_md','tdr','dca','dl'])

In [13]:
y = df['label']
# TF-IDF
tfidf_vect = TfidfVectorizer(stop_words = my_stop_words, min_df = 0.005)
X_tfidf = tfidf_vect.fit_transform(df['clean_text'])
X_tfidf_feat = pd.concat([df[['hash_count', 'emoji_count', 'anger','anticipation','disgust','fear',
                                                       'joy', 'sadness', 'surprise', 'trust', 'punc_count',
                                                       'tweet_len','cap_count']], pd.DataFrame(X_tfidf.toarray())], axis=1)

# CountVectorizer
count_vect = CountVectorizer(stop_words = my_stop_words, min_df = 0.005)
X_count = count_vect.fit_transform(df['clean_text'])
X_count_feat = pd.concat([df[['hash_count', 'emoji_count', 'anger','anticipation','disgust','fear',
                                                       'joy', 'sadness', 'surprise', 'trust', 'punc_count',
                                                       'tweet_len','cap_count']], pd.DataFrame(X_count.toarray())], axis=1)

X_count_feat.head()

Unnamed: 0,hash_count,emoji_count,anger,anticipation,disgust,fear,joy,sadness,surprise,trust,...,206,207,208,209,210,211,212,213,214,215
0,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,1,1,0,2,0,0,...,0,0,0,0,0,0,0,0,1,0
2,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,8,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
#Scale data
from sklearn import preprocessing
X_count_scaled = preprocessing.scale(X_count_feat)
X_tfidf_scaled = preprocessing.scale(X_tfidf_feat)

In [16]:
#Balance train Data 
ros = RandomOverSampler(random_state=77)
X_ros_count, y_ros = ros.fit_resample(X_count_scaled, y)
X_ros_tfidf, y_ros = ros.fit_resample(X_tfidf_scaled, y)
print(sorted(Counter(y_ros).items()))

[(0, 29504), (1, 29504), (2, 29504), (3, 29504), (4, 29504), (5, 29504)]


In [17]:
ros = RandomOverSampler(random_state=77)
X_count_r, y_r = ros.fit_resample(X_count_feat, y)
X_tfidf_r, y_r = ros.fit_resample(X_tfidf_feat, y)
print(sorted(Counter(y_r).items()))

[(0, 29504), (1, 29504), (2, 29504), (3, 29504), (4, 29504), (5, 29504)]


In [29]:
lr = LogisticRegression(solver = 'newton-cg', multi_class = 'multinomial')
cv_lr_count = cross_val_score(lr, X_ros_count, y_ros, cv = 5)
cv_lr_tfidf = cross_val_score(lr, X_ros_tfidf, y_ros, cv = 5)

In [24]:
print(f' 5 Fold CV: {cv_lr_count} \n \n Mean:{cv_lr_count.mean()} \n \n')
print(f' 5 Fold CV: {cv_lr_tfidf} \n \n Mean:{cv_lr_tfidf.mean()})

 5 Fold CV: [0.59336076 0.6011313  0.58513313 0.59793166 0.58661867] 
 
 Mean:0.592835104559479 
 

 5 Fold CV: [0.59353217 0.59701748 0.58164781 0.59410353 0.58221917] 
 
 Mean:0.5897040338247057 
 



In [None]:
### Logistic Regression 

In [29]:
lr = LogisticRegression(multi_class = 'multinomial')

#Hyper tuning
max_iter = [100, 300, 500]
solver = ['newton-cg', 'sag', 'saga']
C = np.logspace(0, 4, 10)

# Create hyperparameter options
hyperparameters = dict(max_iter=max_iter, solver = solver, C=C)

clf = RandomizedSearchCV(lr, hyperparameters, cv=5, n_jobs=-1, 
                         n_iter = 10, random_state = 77)
%time lr_cv_fit = clf.fit(X_ros_count, y_ros)



CPU times: user 1min 16s, sys: 401 ms, total: 1min 16s
Wall time: 1h 4min 47s




In [70]:
print(lr_cv_fit.best_params_)

{'solver': 'sag', 'max_iter': 100, 'C': 3593.813663804626}


In [65]:
lr_cv_fit.best_estimator_

LogisticRegression(C=3593.813663804626, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='multinomial', n_jobs=None,
                   penalty='l2', random_state=None, solver='sag', tol=0.0001,
                   verbose=0, warm_start=False)

In [66]:
lr_cv_fit.best_score_

0.49128366775126536

In [32]:
lr = LogisticRegression(multi_class = 'multinomial')

#Hyper tuning 
max_iter = [100, 300, 500]

solver = ['newton-cg', 'sag', 'saga']
C = np.logspace(0, 4, 10)

# Create hyperparameter options
hyperparameters = dict(C = C, max_iter = max_iter, solver = solver)

clf = RandomizedSearchCV(lr, hyperparameters, cv=5, n_jobs=-1, 
                         n_iter = 10, random_state = 77)
%time lr_cv_fit_t = clf.fit(X_ros_tfidf, y_ros)



CPU times: user 12min 49s, sys: 3.4 s, total: 12min 52s
Wall time: 40min 29s


In [33]:
lr_cv_fit_t.best_params_

{'solver': 'newton-cg', 'max_iter': 100, 'C': 21.544346900318832}

In [72]:
lr_cv_fit_t.best_estimator_

LogisticRegression(C=21.544346900318832, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='multinomial', n_jobs=None,
                   penalty='l2', random_state=None, solver='newton-cg',
                   tol=0.0001, verbose=0, warm_start=False)

In [74]:
lr_cv_fit_t.best_score_

0.48837445770065074

In [None]:
### Multinomial Bayes 

In [43]:
nb_clf = MultinomialNB()

alpha = np.linspace(0.05,1,1000)

# Create hyperparameter options
hyperparameters = dict(alpha = alpha)

clf = GridSearchCV(nb_clf, hyperparameters, cv=5, n_jobs=-1)
%time nb_cv_fit = clf.fit(X_count_r, y_r)



CPU times: user 32.9 s, sys: 35.8 s, total: 1min 8s
Wall time: 13min 46s


In [44]:
nb_cv_fit.best_params_

{'alpha': 0.05}

In [75]:
nb_cv_fit.best_estimator_

MultinomialNB(alpha=0.05, class_prior=None, fit_prior=True)

In [76]:
nb_cv_fit.best_score_

0.458960366955893

In [45]:
nb_clf = MultinomialNB()

alpha = np.linspace(0.05,1,1000)

# Create hyperparameter options
hyperparameters = dict(alpha = alpha)

clf = GridSearchCV(nb_clf, hyperparameters, cv=5, n_jobs=-1)
%time nb_cv_fit_t = clf.fit(X_tfidf_r, y_r)



CPU times: user 32 s, sys: 41.6 s, total: 1min 13s
Wall time: 12min 6s


In [46]:
nb_cv_fit_t.best_params_

{'alpha': 0.08993993993993994}

In [79]:
nb_cv_fit_t.best_estimator_

MultinomialNB(alpha=0.08993993993993994, class_prior=None, fit_prior=True)

In [80]:
nb_cv_fit_t.best_score_

0.46316883586406365

In [47]:
#Random Forest Classifier Countvectorizer
rf_clf = RandomForestClassifier()

hyperparameters = {
    'n_estimators': range(50,1000,100), 
    'max_depth': [5,500,50],
    'max_features': ['auto', 'sqrt', 'log2']
}

clf = RandomizedSearchCV(rf_clf, hyperparameters, cv=5, n_jobs=-1, 
                         n_iter = 10, random_state = 77)
%time rf_cv_fit = clf.fit(X_count_r, y_r)



CPU times: user 5min 47s, sys: 2.41 s, total: 5min 49s
Wall time: 2h 12min 1s


In [50]:
rf_cv_fit.best_params_

{'n_estimators': 250, 'max_features': 'log2', 'max_depth': 500}

In [81]:
rf_cv_fit.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=500, max_features='log2', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=250,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [82]:
rf_cv_fit.best_score_

0.876547812725958

In [48]:
#Random Forest Classifier TFIDF
rf_clf = RandomForestClassifier()

hyperparameters = {
    'n_estimators': range(50,1000,100), 
    'max_depth': [5,500,50],
    'max_features': ['auto', 'sqrt', 'log2']
}

clf = RandomizedSearchCV(rf_clf, hyperparameters, cv=5, n_jobs=-1, 
                         n_iter = 10, random_state = 77)
%time rf_cv_fit_t = clf.fit(X_tfidf_r, y_r)



CPU times: user 6min 22s, sys: 2.59 s, total: 6min 24s
Wall time: 2h 21min 35s


In [49]:
rf_cv_fit_t.best_params_

{'n_estimators': 250, 'max_features': 'log2', 'max_depth': 500}

In [83]:
rf_cv_fit_t.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=500, max_features='log2', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=250,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [84]:
rf_cv_fit_t.best_score_

0.8776324114244396

In [None]:
#XGBooostClassifier 

In [58]:
#Instantiate our model 
xg_clf = xgb.XGBClassifier()

hyperparameters = {
    'n_estimators': [50, 100, 150, 200, 250], 
    'max_depth': [5, 7, 11, 15],
    'learning_rate': [0.1,0.3,0.5,0.7,0.9,1],
    'alpha': [5,10,15,20]
}

clf = RandomizedSearchCV(xg_clf, hyperparameters, cv=5, n_jobs=-1, 
                         n_iter = 10, random_state = 77)
%time xg_cv_fit = clf.fit(X_count_r, y_r)



CPU times: user 2h 2min 27s, sys: 2.51 s, total: 2h 2min 30s
Wall time: 6h 6min 18s


In [59]:
xg_cv_fit.best_params_

{'n_estimators': 250, 'max_depth': 15, 'learning_rate': 0.7, 'alpha': 20}

In [85]:
xg_cv_fit.best_estimator_

XGBClassifier(alpha=20, base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.7, max_delta_step=0, max_depth=15,
              min_child_weight=1, missing=None, n_estimators=250, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [86]:
xg_cv_fit.best_score_

0.8602223427331888

In [60]:
#Instantiate our model 
xg_clf = xgb.XGBClassifier()

hyperparameters = {
    'n_estimators': [50, 100, 150, 200, 250], 
    'max_depth': [5, 7, 11, 15],
    'learning_rate': [0.1,0.3,0.5,0.7,0.9,1],
    'alpha': [5,10,15,20]
}

clf = RandomizedSearchCV(xg_clf, hyperparameters, cv=5, n_jobs=-1, 
                         n_iter = 10, random_state = 77)
%time xg_cv_fit_t = clf.fit(X_tfidf_r, y_r)



CPU times: user 2h 3min 7s, sys: 2.48 s, total: 2h 3min 10s
Wall time: 5h 58min 59s


In [62]:
xg_cv_fit_t.best_params_

{'n_estimators': 250, 'max_depth': 15, 'learning_rate': 0.7, 'alpha': 20}

In [87]:
xg_cv_fit_t.best_estimator_

XGBClassifier(alpha=20, base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.7, max_delta_step=0, max_depth=15,
              min_child_weight=1, missing=None, n_estimators=250, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [88]:
xg_cv_fit_t.best_score_

0.8545169016630514

In [None]:
#K Nearest Neighbor 

In [54]:
#KNN CountVectorizer
knn = KNeighborsClassifier()

hyperparameters = {
    'n_neighbors': range(5,50,10), 
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}

clf = RandomizedSearchCV(knn, hyperparameters, cv=5, n_jobs=-1, 
                         n_iter = 10, random_state = 77)
%time knn_cv_fit = clf.fit(X_count_r, y_r)



CPU times: user 476 ms, sys: 790 ms, total: 1.27 s
Wall time: 23min 17s


In [55]:
knn_cv_fit.best_params_

{'weights': 'distance', 'n_neighbors': 15, 'algorithm': 'brute'}

In [89]:
knn_cv_fit.best_estimator_

KNeighborsClassifier(algorithm='brute', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=15, p=2,
                     weights='distance')

In [90]:
knn_cv_fit.best_score_

0.7214276030368764

In [56]:
#KNN TFIDF 
knn = KNeighborsClassifier()

hyperparameters = {
    'n_neighbors': range(5,50,10), 
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}

clf = RandomizedSearchCV(knn, hyperparameters, cv=5, n_jobs=-1, 
                         n_iter = 10, random_state = 77)
%time knn_cv_fit_t = clf.fit(X_tfidf_r, y_r)



CPU times: user 628 ms, sys: 995 ms, total: 1.62 s
Wall time: 21min 13s


In [57]:
knn_cv_fit_t.best_params_

{'weights': 'distance', 'n_neighbors': 15, 'algorithm': 'brute'}

In [91]:
knn_cv_fit_t.best_estimator_

KNeighborsClassifier(algorithm='brute', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=15, p=2,
                     weights='distance')

In [92]:
knn_cv_fit_t.best_score_

0.7181455621836587

In [None]:
#Choosing the Best Model 

In [98]:
#Split into train and test 
X_train, X_test, y_train, y_test = train_test_split(df[['hash_count', 'emoji_count','clean_text',
                                                        'anger','anticipation','disgust','fear',
                                                       'joy', 'sadness', 'surprise', 'trust', 'punc_count',
                                                       'tweet_len','cap_count']], df['label'],test_size =0.20, random_state = 77)

In [99]:
#Count vectorizer Document term matrix

count_vecto = CountVectorizer(stop_words = my_stop_words, min_df = 0.005)
count_vecto_fit = count_vecto.fit(X_train['clean_text'])

count_train = count_vecto_fit.transform(X_train['clean_text'])
count_test = count_vecto_fit.transform(X_test['clean_text'])

X_train_vect = pd.concat([X_train[['hash_count', 'emoji_count', 'anger','anticipation','disgust','fear',
                                                       'joy', 'sadness', 'surprise', 'trust', 'punc_count',
                                                       'tweet_len','cap_count']].reset_index(drop=True), 
           pd.DataFrame(count_train.toarray())], axis=1)
X_test_vect = pd.concat([X_test[['hash_count', 'emoji_count', 'anger','anticipation','disgust','fear',
                                                       'joy', 'sadness', 'surprise', 'trust', 'punc_count',
                                                       'tweet_len','cap_count']].reset_index(drop=True), 
           pd.DataFrame(count_test.toarray())], axis=1)

In [100]:
X_train_vect.shape

(70008, 230)

In [101]:
#Balance train Data 
ros = RandomOverSampler(random_state=77)
X_resampled_ros, y_resampled_ros = ros.fit_resample(X_train_vect.values, y_train.values)
print(sorted(Counter(y_resampled_ros).items()))

[(0, 23522), (1, 23522), (2, 23522), (3, 23522), (4, 23522), (5, 23522)]


In [102]:
#Balance test data
X_resampled_ros_test, y_resampled_ros_test = ros.fit_resample(X_test_vect.values, y_test.values)
print(sorted(Counter(y_resampled_ros_test).items()))

[(0, 5982), (1, 5982), (2, 5982), (3, 5982), (4, 5982), (5, 5982)]


In [103]:
#Xgboost 

#{'n_estimators': 250, 'max_depth': 15, 'learning_rate': 0.7, 'alpha': 20}


#Instantiate our model 
xg_clf = xgb.XGBClassifier(learning_rate = 0.7,
                max_depth = 15, alpha = 20, n_estimators = 250)


#Model Fit 
start = time.time()
xg_clf.fit(X_resampled_ros, y_resampled_ros)
end = time.time()
fit_time = (end - start)

#Model Predict 
start = time.time()
y_pred = xg_clf.predict(X_resampled_ros_test)
end = time.time()
pred_time = (end - start)

#Model Scoring 
precision, recall, fscore, support = precision_recall_fscore_support(y_resampled_ros_test, y_pred, average='macro')
print('Fit time: {} / Predict time: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(fit_time, 3), round(pred_time, 3), round(precision, 3), round(recall, 3), round((y_pred==y_resampled_ros_test).sum()/len(y_pred), 3)))

Fit time: 5855.502 / Predict time: 7.546 ---- Precision: 0.618 / Recall: 0.59 / Accuracy: 0.59


In [104]:
xgb_fea_imp = pd.DataFrame(xg_clf.get_booster().get_fscore().items(),columns=['feature','importance']).sort_values('importance', ascending=False)
xgb_fea_imp[0:20]

Unnamed: 0,feature,importance
10,f11,86119
6,f12,46782
4,f10,37863
3,f0,12931
15,f1,12777
22,f3,8906
67,f9,7148
20,f6,5669
0,f5,5457
1,f7,4581


In [105]:
#Random Forest

#{'n_estimators': 250, 'max_features': 'log2', 'max_depth': 500}


#Instantiate our model 
rf = RandomForestClassifier(n_estimators = 250, max_depth = 500, max_features = 'log2',
                            n_jobs=-1)

#Model Fit 
start = time.time()
rf.fit(X_resampled_ros, y_resampled_ros)
end = time.time()
fit_time = (end - start)

#Model Predict 
start = time.time()
y_pred = rf.predict(X_resampled_ros_test)
end = time.time()
pred_time = (end - start)

#Model Scoring 
precision, recall, fscore, support = precision_recall_fscore_support(y_resampled_ros_test, y_pred, average='macro')
print('Fit time: {} / Predict time: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(fit_time, 3), round(pred_time, 3), round(precision, 3), round(recall, 3), round((y_pred==y_resampled_ros_test).sum()/len(y_pred), 3)))

Fit time: 27.188 / Predict time: 0.732 ---- Precision: 0.63 / Recall: 0.596 / Accuracy: 0.596


In [106]:
importances = rf.feature_importances_
(sorted(zip(importances, X_train_vect.columns), reverse=True))[0:20]

[(0.12870380391849415, 'tweet_len'),
 (0.07867563171545214, 'cap_count'),
 (0.07063985959616721, 'punc_count'),
 (0.05441362132758469, 'hash_count'),
 (0.028060034877217623, 'sadness'),
 (0.026234639341441757, 'emoji_count'),
 (0.024265398872350682, 'joy'),
 (0.024125873865076173, 'fear'),
 (0.01916960208726278, 'anticipation'),
 (0.01807146593302367, 'anger'),
 (0.01756454389351802, 'trust'),
 (0.015189076780071465, 'disgust'),
 (0.012374833514527287, 'surprise'),
 (0.011428051480100393, 29),
 (0.010688031921792449, 90),
 (0.0073147790903301075, 197),
 (0.00715028248076272, 31),
 (0.005981077915899386, 69),
 (0.005931986299466755, 181),
 (0.00583769231558105, 64)]

In [107]:
#Multinomial Bayes

#Instantiate our model 
nb_clf = MultinomialNB(alpha = 0.05)

#Train our Model 
start = time.time()
nb_clf.fit(X_resampled_ros, y_resampled_ros)
end = time.time()
fit_time = (end - start)

#Model Predict 
start = time.time()
y_pred = nb_clf.predict(X_resampled_ros_test)
end = time.time()
pred_time = (end - start)

#Model Scoring 
precision, recall, fscore, support = precision_recall_fscore_support(y_resampled_ros_test, y_pred, average='macro')
print('Fit time: {} / Predict time: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(fit_time, 3), round(pred_time, 3), round(precision, 3), 
    round(recall, 3), round((y_pred==y_resampled_ros_test).sum()/len(y_pred), 3)))

Fit time: 0.111 / Predict time: 0.018 ---- Precision: 0.518 / Recall: 0.514 / Accuracy: 0.514


In [None]:
#Logitstic Regression

In [109]:
#Scale data
from sklearn import preprocessing
X_train_scaled = preprocessing.scale(X_resampled_ros)
X_test_scaled = preprocessing.scale(X_resampled_ros_test)

In [110]:
#Logistic Regression 

#Instantiate our model
lr = LogisticRegression(max_iter = 100, solver = 'sag', multi_class = 'multinomial', 
                        C = 3593.81)

#Train our Model
start = time.time()
lr.fit(X_train_scaled, y_resampled_ros)
end = time.time()
fit_time = (end - start)

#Model Predict 
start = time.time()
y_pred = lr.predict(X_test_scaled)
end = time.time()
pred_time = (end - start)

#Model Scoring 
precision, recall, fscore, support = precision_recall_fscore_support(y_resampled_ros_test, y_pred, average='macro')
print('Fit time: {} / Predict time: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(fit_time, 3), round(pred_time, 3), round(precision, 3), round(recall, 3), 
    round((y_pred==y_resampled_ros_test).sum()/len(y_pred), 3)))

Fit time: 63.143 / Predict time: 0.006 ---- Precision: 0.601 / Recall: 0.586 / Accuracy: 0.586




In [111]:
#KNN
from sklearn.neighbors import KNeighborsClassifier

#Instantiate our model 
knn = KNeighborsClassifier(n_neighbors = 15, weights = 'distance', algorithm = 'brute')

#Model Fit 
start = time.time()
knn.fit(X_resampled_ros, y_resampled_ros)
end = time.time()
fit_time = (end - start)

#Model Predict 
start = time.time()
y_pred = knn.predict(X_resampled_ros_test)
end = time.time()
pred_time = (end - start)

#Model Scoring 
precision, recall, fscore, support = precision_recall_fscore_support(y_resampled_ros_test, y_pred, average='macro')
print('Fit time: {} / Predict time: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(fit_time, 3), round(pred_time, 3), round(precision, 3), round(recall, 3), round((y_pred==y_resampled_ros_test).sum()/len(y_pred), 3)))

Fit time: 0.007 / Predict time: 91.01 ---- Precision: 0.375 / Recall: 0.372 / Accuracy: 0.372


In [112]:
#TFIDF vectorizer Document term matrix

tfidf_vecto = TfidfVectorizer(stop_words = my_stop_words, min_df = 0.005)
tfidf_vecto_fit = tfidf_vecto.fit(X_train['clean_text'])

tfidf_train = tfidf_vecto_fit.transform(X_train['clean_text'])
tfidf_test = tfidf_vecto_fit.transform(X_test['clean_text'])

X_train_t_vect = pd.concat([X_train[['hash_count', 'emoji_count', 'anger','anticipation','disgust','fear',
                                                       'joy', 'sadness', 'surprise', 'trust', 'punc_count',
                                                       'tweet_len','cap_count']].reset_index(drop=True), 
           pd.DataFrame(tfidf_train.toarray())], axis=1)
X_test_t_vect = pd.concat([X_test[['hash_count', 'emoji_count', 'anger','anticipation','disgust','fear',
                                                       'joy', 'sadness', 'surprise', 'trust', 'punc_count',
                                                       'tweet_len','cap_count']].reset_index(drop=True), 
           pd.DataFrame(tfidf_test.toarray())], axis=1)

In [113]:
#Balance train Data 
ros = RandomOverSampler(random_state=77)
X_tfidf_ros, y_tfidf_ros = ros.fit_resample(X_train_t_vect.values, y_train.values)
print(sorted(Counter(y_tfidf_ros).items()))

[(0, 23522), (1, 23522), (2, 23522), (3, 23522), (4, 23522), (5, 23522)]


In [114]:
#Balance test data
X_tfidf_ros_test, y_tfidf_ros_test = ros.fit_resample(X_test_vect.values, y_test.values)
print(sorted(Counter(y_tfidf_ros_test).items()))

[(0, 5982), (1, 5982), (2, 5982), (3, 5982), (4, 5982), (5, 5982)]


In [115]:
#Xgboost 

#{'n_estimators': 250, 'max_depth': 15, 'learning_rate': 0.7, 'alpha': 20}


#Instantiate our model 
xg_clf = xgb.XGBClassifier(learning_rate = 0.7,
                max_depth = 15, alpha = 20, n_estimators = 250)


#Model Fit 
start = time.time()
xg_clf.fit(X_tfidf_ros, y_tfidf_ros)
end = time.time()
fit_time = (end - start)

#Model Predict 
start = time.time()
y_pred = xg_clf.predict(X_tfidf_ros_test)
end = time.time()
pred_time = (end - start)

#Model Scoring 
precision, recall, fscore, support = precision_recall_fscore_support(y_tfidf_ros_test, y_pred, average='macro')
print('Fit time: {} / Predict time: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(fit_time, 3), round(pred_time, 3), round(precision, 3), round(recall, 3), round((y_pred==y_tfidf_ros_test).sum()/len(y_pred), 3)))

Fit time: 5972.105 / Predict time: 6.773 ---- Precision: 0.573 / Recall: 0.531 / Accuracy: 0.531


In [None]:
xgb_fea_imp = pd.DataFrame(xg_clf.get_booster().get_fscore().items(),columns=['feature','importance']).sort_values('importance', ascending=False)
xgb_fea_imp[0:20]

In [116]:
#Random Forest

#{'n_estimators': 250, 'max_features': 'log2', 'max_depth': 500}


#Instantiate our model 
rf = RandomForestClassifier(n_estimators = 250, max_depth = 500, max_features = 'log2',
                            n_jobs=-1)

#Model Fit 
start = time.time()
rf.fit(X_tfidf_ros, y_tfidf_ros)
end = time.time()
fit_time = (end - start)

#Model Predict 
start = time.time()
y_pred = rf.predict(X_tfidf_ros_test)
end = time.time()
pred_time = (end - start)

#Model Scoring 
precision, recall, fscore, support = precision_recall_fscore_support(y_tfidf_ros_test, y_pred, average='macro')
print('Fit time: {} / Predict time: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(fit_time, 3), round(pred_time, 3), round(precision, 3), round(recall, 3), round((y_pred==y_tfidf_ros_test).sum()/len(y_pred), 3)))

Fit time: 28.714 / Predict time: 0.732 ---- Precision: 0.622 / Recall: 0.587 / Accuracy: 0.587


In [117]:
#Multinomial Bayes

#Instantiate our model 
nb_clf = MultinomialNB(alpha = 0.0899399)

#Train our Model 
start = time.time()
nb_clf.fit(X_tfidf_ros, y_tfidf_ros)
end = time.time()
fit_time = (end - start)

#Model Predict 
start = time.time()
y_pred = nb_clf.predict(X_tfidf_ros_test)
end = time.time()
pred_time = (end - start)

#Model Scoring 
precision, recall, fscore, support = precision_recall_fscore_support(y_tfidf_ros_test, y_pred, average='macro')
print('Fit time: {} / Predict time: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(fit_time, 3), round(pred_time, 3), round(precision, 3), 
    round(recall, 3), round((y_pred==y_tfidf_ros_test).sum()/len(y_pred), 3)))

Fit time: 0.069 / Predict time: 0.019 ---- Precision: 0.521 / Recall: 0.518 / Accuracy: 0.518


In [118]:
#KNN
from sklearn.neighbors import KNeighborsClassifier

#Instantiate our model 
knn = KNeighborsClassifier(n_neighbors = 15, weights = 'distance', algorithm = 'brute')

#Model Fit 
start = time.time()
knn.fit(X_tfidf_ros, y_tfidf_ros)
end = time.time()
fit_time = (end - start)

#Model Predict 
start = time.time()
y_pred = knn.predict(X_tfidf_ros_test)
end = time.time()
pred_time = (end - start)

#Model Scoring 
precision, recall, fscore, support = precision_recall_fscore_support(y_tfidf_ros_test, y_pred, average='macro')
print('Fit time: {} / Predict time: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(fit_time, 3), round(pred_time, 3), round(precision, 3), round(recall, 3), round((y_pred==y_tfidf_ros_test).sum()/len(y_pred), 3)))

Fit time: 0.036 / Predict time: 90.02 ---- Precision: 0.387 / Recall: 0.386 / Accuracy: 0.386


In [119]:
#Scale data
from sklearn import preprocessing
X_train_scaled_t = preprocessing.scale(X_tfidf_ros)
X_test_scaled_t = preprocessing.scale(X_tfidf_ros_test)

In [120]:
#Logistic Regression 

#Instantiate our model
lr = LogisticRegression(max_iter = 100, solver = 'newton-cg', multi_class = 'multinomial', 
                        C = 21.5443469)

#Train our Model
start = time.time()
lr.fit(X_train_scaled_t, y_tfidf_ros)
end = time.time()
fit_time = (end - start)

#Model Predict 
start = time.time()
y_pred = lr.predict(X_test_scaled_t)
end = time.time()
pred_time = (end - start)

#Model Scoring 
precision, recall, fscore, support = precision_recall_fscore_support(y_tfidf_ros_test, y_pred, average='macro')
print('Fit time: {} / Predict time: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(fit_time, 3), round(pred_time, 3), round(precision, 3), round(recall, 3), 
    round((y_pred==y_tfidf_ros_test).sum()/len(y_pred), 3)))

Fit time: 61.095 / Predict time: 0.008 ---- Precision: 0.598 / Recall: 0.581 / Accuracy: 0.581


In [None]:
#Countvectorizer document matrix with Random Forest Classifier produced the best model. We will test it out with Naive Bayes and Logistic Regression 