In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import spacy
import time
from collections import Counter
from imblearn.over_sampling import RandomOverSampler


from sklearn.metrics import precision_recall_fscore_support 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_score
import xgboost as xgb


In [2]:
#HyperParameter Tuning with Crossvalidation in GridSearch 

In [3]:
df = pd.read_json('DiSmldata.json')

In [4]:
df

Unnamed: 0,hash_count,emoji_count,clean_text,label,anger,anticipation,disgust,fear,joy,sadness,surprise,trust,punc_count,tweet_len,cap_count,user_follower,user_favorite
0,1,0,new mickey flower bed going make event photo l...,2,0,0,0,0,0,0,0,0,9,193,20,1214.0,982.0
2,1,0,thought different yesterday mickey flower bed ...,1,0,1,0,0,0,0,0,0,10,115,13,638.0,195775.0
5,0,0,mickey flower bed renewed tokyo disneyland tod...,1,0,0,0,0,0,0,0,0,12,119,16,157.0,4158.0
17,1,0,mickey flower bed back tdrnow,1,0,0,0,0,0,0,0,0,10,72,12,83.0,8334.0
19,0,0,god has face changed,1,0,1,0,1,1,0,0,1,0,29,3,882.0,41436.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87503,3,0,hurtado help cypress overpower pacifica finish...,5,0,0,0,0,0,0,0,0,11,124,11,572.0,124266.0
87504,1,1,oct agricultural counsellor attended award cer...,5,1,2,0,1,4,1,4,4,9,256,24,82.0,16.0
87506,0,0,being single move,4,0,0,0,0,0,0,0,0,0,23,1,788.0,22525.0
87508,0,0,time pose hard without laughing disneyland def...,3,0,1,0,0,1,0,0,0,2,104,5,24.0,1819.0


In [5]:
my_stop_words = ENGLISH_STOP_WORDS.union(['disneyland','tokyo','disney', 'im', 'tdrnow','paris','california','amp','disneysea','got',
                                         'ºc', 'ºf', 'ºoº','𝗧𝗵𝗲','くまのプーさん', 'ディズニー', 'ディズニーシー','ディズニーハロウィーン',
                                         'ディズニーランド', 'ディズニー好きと繋がりたい', 'フェスティバルオブミスティーク', 'マルマン',
                                         'ㅋㅋㅋ', '場所', '更新', '月released', '東京ディズニーシー', '東京ディズニーランド', '東京ディズニーリゾート',
                                         '香港迪士尼樂園', 'ºº', 'hong', 'kong',"disneylandresort", "disneyland", "disneyresort",
                                          "californiaadventure",'downtowndisney','disneyanaheim','disneylandanaheim',
                                          'disneycalifornia','californiadisney','disneysea', 'disneytokyo', 'disneytokyoresort', 
                                          'tokyodisney','tokyodisneyresort', 'tokyodisneyland','東京ディズニーランド', 'ディズニーランド',
                                          '東京ディズニーシー', 'ズニーシー', 'tdr_now', 'tdr_md','tdr','dca','dl'])

In [6]:
y = df['label']
# TF-IDF
tfidf_vect = TfidfVectorizer(stop_words = my_stop_words, max_features = 1000)
X_tfidf = tfidf_vect.fit_transform(df['clean_text'])
X_tfidf_feat = pd.concat([df[['hash_count', 'emoji_count', 'anger','anticipation','disgust','fear',
                                                       'joy', 'sadness', 'surprise', 'trust', 'punc_count',
                                                       'tweet_len','cap_count']], pd.DataFrame(X_tfidf.toarray())], axis=1)

# CountVectorizer
count_vect = CountVectorizer(stop_words = my_stop_words, max_features = 1000)
X_count = count_vect.fit_transform(df['clean_text'])
X_count_feat = pd.concat([df[['hash_count', 'emoji_count', 'anger','anticipation','disgust','fear',
                                                       'joy', 'sadness', 'surprise', 'trust', 'punc_count',
                                                       'tweet_len','cap_count']], pd.DataFrame(X_count.toarray())], axis=1)

X_count_feat.head()

Unnamed: 0,hash_count,emoji_count,anger,anticipation,disgust,fear,joy,sadness,surprise,trust,...,990,991,992,993,994,995,996,997,998,999
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,1,1,0,2,0,0,...,0,1,0,0,0,0,0,0,0,0
2,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,8,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
#Scale data
from sklearn import preprocessing
X_count_scaled = preprocessing.scale(X_count_feat)
X_tfidf_scaled = preprocessing.scale(X_tfidf_feat)

In [8]:
#Balance train Data 
ros = RandomOverSampler(random_state=77)
X_ros_count, y_ros = ros.fit_resample(X_count_scaled, y)
X_ros_tfidf, y_ros = ros.fit_resample(X_tfidf_scaled, y)
print(sorted(Counter(y_ros).items()))

[(0, 29504), (1, 29504), (2, 29504), (3, 29504), (4, 29504), (5, 29504)]


In [9]:
ros = RandomOverSampler(random_state=77)
X_count_r, y_r = ros.fit_resample(X_count_feat, y)
X_tfidf_r, y_r = ros.fit_resample(X_tfidf_feat, y)
print(sorted(Counter(y_r).items()))

[(0, 29504), (1, 29504), (2, 29504), (3, 29504), (4, 29504), (5, 29504)]


In [None]:
### Logistic Regression 

In [14]:
lr = LogisticRegression(multi_class = 'multinomial')

#Hyper tuning
max_iter = [20, 50, 100, 300, 500]
solver = ['newton-cg', 'sag', 'saga']
C = [0.001, 0.01, 0.05, 0.1, 1, 3, 6, 10]

# Create hyperparameter options
hyperparameters = dict(max_iter=max_iter, solver = solver, C=C)

clf = RandomizedSearchCV(lr, hyperparameters, cv=5, n_jobs=-1, 
                         n_iter = 10, random_state = 77)
%time lr_cv_fit = clf.fit(X_ros_count, y_ros)



CPU times: user 55min 12s, sys: 8.03 s, total: 55min 20s
Wall time: 1h 41min 42s


In [15]:
print(lr_cv_fit.best_params_)

{'solver': 'newton-cg', 'max_iter': 300, 'C': 1}


In [16]:
lr_cv_fit.best_estimator_

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=300,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=None, solver='newton-cg', tol=0.0001, verbose=0,
                   warm_start=False)

In [17]:
lr_cv_fit.best_score_

0.6684912980805369

In [18]:
lr = LogisticRegression(multi_class = 'multinomial')

#Hyper tuning 
max_iter = [20, 50, 100, 300, 500]
solver = ['newton-cg', 'sag', 'saga']
C = [0.001, 0.01, 0.05, 0.1, 1, 3, 6, 10]

# Create hyperparameter options
hyperparameters = dict(C = C, max_iter = max_iter, solver = solver)

clf = RandomizedSearchCV(lr, hyperparameters, cv=5, n_jobs=-1, 
                         n_iter = 10, random_state = 77)
%time lr_cv_fit_t = clf.fit(X_ros_tfidf, y_ros)



CPU times: user 18min 24s, sys: 2.73 s, total: 18min 27s
Wall time: 1h 58min 10s




In [19]:
lr_cv_fit_t.best_params_

{'solver': 'saga', 'max_iter': 300, 'C': 0.05}

In [20]:
lr_cv_fit_t.best_estimator_

LogisticRegression(C=0.05, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=300,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=None, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False)

In [21]:
lr_cv_fit_t.best_score_

0.6641642036144183

In [22]:
### Multinomial Bayes 

In [23]:
nb_clf = MultinomialNB()

alpha = np.linspace(0.05,1,1000)

# Create hyperparameter options
hyperparameters = dict(alpha = alpha)

clf = GridSearchCV(nb_clf, hyperparameters, cv=5, n_jobs=-1)
%time nb_cv_fit = clf.fit(X_count_r, y_r)



CPU times: user 1min 23s, sys: 13.6 s, total: 1min 37s
Wall time: 3h 59min 2s


In [24]:
nb_cv_fit.best_params_

{'alpha': 0.05}

In [25]:
nb_cv_fit.best_estimator_

MultinomialNB(alpha=0.05, class_prior=None, fit_prior=True)

In [26]:
nb_cv_fit.best_score_

0.5839095336145423

In [27]:
nb_clf = MultinomialNB()

alpha = np.linspace(0.05,1,1000)

# Create hyperparameter options
hyperparameters = dict(alpha = alpha)

clf = GridSearchCV(nb_clf, hyperparameters, cv=5, n_jobs=-1)
%time nb_cv_fit_t = clf.fit(X_tfidf_r, y_r)

CPU times: user 29.4 s, sys: 7.94 s, total: 37.4 s
Wall time: 49min 8s


In [28]:
nb_cv_fit_t.best_params_

{'alpha': 0.054754754754754754}

In [29]:
nb_cv_fit_t.best_estimator_

MultinomialNB(alpha=0.054754754754754754, class_prior=None, fit_prior=True)

In [30]:
nb_cv_fit_t.best_score_

0.6006586855067381

In [31]:
#Random Forest Classifier Countvectorizer
rf_clf = RandomForestClassifier()

hyperparameters = {
    'n_estimators': range(50,1000,100), 
    'max_depth': [5,50,100, 200, 300, 400, 500],
    'max_features': ['auto', 'sqrt', 'log2']
}

clf = RandomizedSearchCV(rf_clf, hyperparameters, cv=5, n_jobs=-1, 
                         n_iter = 10, random_state = 77)
%time rf_cv_fit = clf.fit(X_count_r, y_r)



CPU times: user 19min 10s, sys: 6.68 s, total: 19min 17s
Wall time: 2h 42min 28s


In [32]:
rf_cv_fit.best_params_

{'n_estimators': 650, 'max_features': 'log2', 'max_depth': 500}

In [33]:
rf_cv_fit.best_estimator_

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=500, max_features='log2',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=650,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [34]:
rf_cv_fit.best_score_

0.9142885741441684

In [35]:
#Random Forest Classifier TFIDF
rf_clf = RandomForestClassifier()

hyperparameters = {
    'n_estimators': range(50,1000,100), 
    'max_depth': [5,50,100, 200, 300, 400, 500],
    'max_features': ['auto', 'sqrt', 'log2']
}

clf = RandomizedSearchCV(rf_clf, hyperparameters, cv=5, n_jobs=-1, 
                         n_iter = 10, random_state = 77)
%time rf_cv_fit_t = clf.fit(X_tfidf_r, y_r)



CPU times: user 28min 53s, sys: 7.31 s, total: 29min
Wall time: 2h 58min 22s


In [36]:
rf_cv_fit_t.best_params_

{'n_estimators': 350, 'max_features': 'sqrt', 'max_depth': 500}

In [37]:
rf_cv_fit_t.best_estimator_

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=500, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=350,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [38]:
rf_cv_fit_t.best_score_

0.9105150782707406

In [None]:
#K Nearest Neighbor 

In [39]:
#KNN CountVectorizer
knn = KNeighborsClassifier()

hyperparameters = {
    'n_neighbors': range(5,50,10), 
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}

clf = RandomizedSearchCV(knn, hyperparameters, cv=5, n_jobs=-1, 
                         n_iter = 10, random_state = 77)
%time knn_cv_fit = clf.fit(X_count_r, y_r)



CPU times: user 1.1 s, sys: 2.47 s, total: 3.57 s
Wall time: 1h 17min 18s


In [40]:
knn_cv_fit.best_params_

{'weights': 'distance', 'n_neighbors': 15, 'algorithm': 'brute'}

In [41]:
knn_cv_fit.best_estimator_

KNeighborsClassifier(algorithm='brute', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=15, p=2,
                     weights='distance')

In [42]:
knn_cv_fit.best_score_

0.7314040282553842

In [43]:
#KNN TFIDF 
knn = KNeighborsClassifier()

hyperparameters = {
    'n_neighbors': range(5,50,10), 
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}

clf = RandomizedSearchCV(knn, hyperparameters, cv=5, n_jobs=-1, 
                         n_iter = 10, random_state = 77)
%time knn_cv_fit_t = clf.fit(X_tfidf_r, y_r)



CPU times: user 3.21 s, sys: 4.29 s, total: 7.49 s
Wall time: 1h 1min 19s


In [44]:
knn_cv_fit_t.best_params_

{'weights': 'distance', 'n_neighbors': 15, 'algorithm': 'brute'}

In [45]:
knn_cv_fit_t.best_estimator_

KNeighborsClassifier(algorithm='brute', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=15, p=2,
                     weights='distance')

In [46]:
knn_cv_fit_t.best_score_

0.7205072643361081

In [None]:
#Choosing the Best Model lets test now to see which models we should further tune 

In [47]:
#Split into train and test 
X_train, X_test, y_train, y_test = train_test_split(df[['hash_count', 'emoji_count','clean_text',
                                                        'anger','anticipation','disgust','fear',
                                                       'joy', 'sadness', 'surprise', 'trust', 'punc_count',
                                                       'tweet_len','cap_count']], df['label'],test_size =0.20, random_state = 77)

In [None]:
#### Count Vectorizer ####

In [48]:
#Count vectorizer Document term matrix

count_vecto = CountVectorizer(stop_words = my_stop_words, max_features = 1000)
count_vecto_fit = count_vecto.fit(X_train['clean_text'])

count_train = count_vecto_fit.transform(X_train['clean_text'])
count_test = count_vecto_fit.transform(X_test['clean_text'])

X_train_vect = pd.concat([X_train[['hash_count', 'emoji_count', 'anger','anticipation','disgust','fear',
                                                       'joy', 'sadness', 'surprise', 'trust', 'punc_count',
                                                       'tweet_len','cap_count']].reset_index(drop=True), 
           pd.DataFrame(count_train.toarray())], axis=1)
X_test_vect = pd.concat([X_test[['hash_count', 'emoji_count', 'anger','anticipation','disgust','fear',
                                                       'joy', 'sadness', 'surprise', 'trust', 'punc_count',
                                                       'tweet_len','cap_count']].reset_index(drop=True), 
           pd.DataFrame(count_test.toarray())], axis=1)

In [49]:
X_train_vect.shape

(70008, 1013)

In [50]:
#Balance train Data 
ros = RandomOverSampler(random_state=77)
X_resampled_ros, y_resampled_ros = ros.fit_resample(X_train_vect.values, y_train.values)
print(sorted(Counter(y_resampled_ros).items()))

[(0, 23522), (1, 23522), (2, 23522), (3, 23522), (4, 23522), (5, 23522)]


In [102]:
# #Balance test data
# X_resampled_ros_test, y_resampled_ros_test = ros.fit_resample(X_test_vect.values, y_test.values)
# print(sorted(Counter(y_resampled_ros_test).items()))

[(0, 5982), (1, 5982), (2, 5982), (3, 5982), (4, 5982), (5, 5982)]


In [52]:
#Random Forest Count vectorizer 

#{'n_estimators': 650, 'max_features': 'log2', 'max_depth': 500}



#Instantiate our model 
rf = RandomForestClassifier(n_estimators = 650, max_depth = 500, max_features = 'log2',
                            n_jobs=-1)

#Model Fit 
start = time.time()
rf.fit(X_resampled_ros, y_resampled_ros)
end = time.time()
fit_time = (end - start)

#Model Predict 
start = time.time()
y_pred = rf.predict(X_test_vect)
end = time.time()
pred_time = (end - start)

#Model Scoring 
precision, recall, fscore, support = precision_recall_fscore_support(y_test, y_pred, average='macro')
print('Fit time: {} / Predict time: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(fit_time, 3), round(pred_time, 3), round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))

Fit time: 304.388 / Predict time: 2.32 ---- Precision: 0.716 / Recall: 0.656 / Accuracy: 0.738


In [53]:
importances = rf.feature_importances_
(sorted(zip(importances, X_train_vect.columns), reverse=True))[0:20]

[(0.07264998189604532, 'tweet_len'),
 (0.04641853216744782, 'cap_count'),
 (0.04272377734677494, 'punc_count'),
 (0.038893919132389765, 'hash_count'),
 (0.02146655519317955, 'sadness'),
 (0.018894953953547726, 'joy'),
 (0.017807906473101077, 'fear'),
 (0.016860613435313247, 'emoji_count'),
 (0.014306119628743563, 'anger'),
 (0.014305397515462942, 'anticipation'),
 (0.012611158225566781, 'trust'),
 (0.012546505297928749, 'disgust'),
 (0.009523934533368706, 172),
 (0.008760837448836236, 'surprise'),
 (0.008609675090072008, 470),
 (0.0050045463033912265, 547),
 (0.004917858903756468, 937),
 (0.004756201692522895, 496),
 (0.004426082061033796, 318),
 (0.004410321722824947, 182)]

In [54]:
#Multinomial Bayes

#Instantiate our model 
nb_clf = MultinomialNB(alpha = 0.05)

#Train our Model 
start = time.time()
nb_clf.fit(X_resampled_ros, y_resampled_ros)
end = time.time()
fit_time = (end - start)

#Model Predict 
start = time.time()
y_pred = nb_clf.predict(X_test_vect)
end = time.time()
pred_time = (end - start)

#Model Scoring 
precision, recall, fscore, support = precision_recall_fscore_support(y_test, y_pred, average='macro')
print('Fit time: {} / Predict time: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(fit_time, 3), round(pred_time, 3), round(precision, 3), 
    round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))

Fit time: 12.957 / Predict time: 0.075 ---- Precision: 0.541 / Recall: 0.602 / Accuracy: 0.612


In [55]:
#Logitstic Regression

In [56]:
#Scale data
from sklearn import preprocessing
X_train_scaled = preprocessing.scale(X_resampled_ros)
X_test_scaled = preprocessing.scale(X_test_vect)

In [57]:
#Logistic Regression 
#{'solver': 'newton-cg', 'max_iter': 300, 'C': 1}



#Instantiate our model
lr = LogisticRegression(max_iter = 300, solver = 'newton-cg', multi_class = 'multinomial', 
                        C = 1)

#Train our Model
start = time.time()
lr.fit(X_train_scaled, y_resampled_ros)
end = time.time()
fit_time = (end - start)

#Model Predict 
start = time.time()
y_pred = lr.predict(X_test_scaled)
end = time.time()
pred_time = (end - start)

#Model Scoring 
precision, recall, fscore, support = precision_recall_fscore_support(y_test, y_pred, average='macro')
print('Fit time: {} / Predict time: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(fit_time, 3), round(pred_time, 3), round(precision, 3), round(recall, 3), 
    round((y_pred==y_test).sum()/len(y_pred), 3)))

Fit time: 266.405 / Predict time: 0.017 ---- Precision: 0.535 / Recall: 0.617 / Accuracy: 0.533


In [58]:
#KNN
#{'weights': 'distance', 'n_neighbors': 15, 'algorithm': 'brute'}


from sklearn.neighbors import KNeighborsClassifier

#Instantiate our model 
knn = KNeighborsClassifier(n_neighbors = 15, weights = 'distance', algorithm = 'brute')

#Model Fit 
start = time.time()
knn.fit(X_resampled_ros, y_resampled_ros)
end = time.time()
fit_time = (end - start)

#Model Predict 
start = time.time()
y_pred = knn.predict(X_test_vect)
end = time.time()
pred_time = (end - start)

#Model Scoring 
precision, recall, fscore, support = precision_recall_fscore_support(y_test, y_pred, average='macro')
print('Fit time: {} / Predict time: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(fit_time, 3), round(pred_time, 3), round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))

Fit time: 0.008 / Predict time: 59.804 ---- Precision: 0.354 / Recall: 0.38 / Accuracy: 0.386


In [59]:
#### TFIDF ####

In [60]:
#TFIDF vectorizer Document term matrix

tfidf_vecto = TfidfVectorizer(stop_words = my_stop_words, max_features = 1000)
tfidf_vecto_fit = tfidf_vecto.fit(X_train['clean_text'])

tfidf_train = tfidf_vecto_fit.transform(X_train['clean_text'])
tfidf_test = tfidf_vecto_fit.transform(X_test['clean_text'])

X_train_t_vect = pd.concat([X_train[['hash_count', 'emoji_count', 'anger','anticipation','disgust','fear',
                                                       'joy', 'sadness', 'surprise', 'trust', 'punc_count',
                                                       'tweet_len','cap_count']].reset_index(drop=True), 
           pd.DataFrame(tfidf_train.toarray())], axis=1)
X_test_t_vect = pd.concat([X_test[['hash_count', 'emoji_count', 'anger','anticipation','disgust','fear',
                                                       'joy', 'sadness', 'surprise', 'trust', 'punc_count',
                                                       'tweet_len','cap_count']].reset_index(drop=True), 
           pd.DataFrame(tfidf_test.toarray())], axis=1)

In [61]:
#Balance train Data 
ros = RandomOverSampler(random_state=77)
X_tfidf_ros, y_tfidf_ros = ros.fit_resample(X_train_t_vect.values, y_train.values)
print(sorted(Counter(y_tfidf_ros).items()))

[(0, 23522), (1, 23522), (2, 23522), (3, 23522), (4, 23522), (5, 23522)]


In [62]:
# #Balance test data
# X_tfidf_ros_test, y_tfidf_ros_test = ros.fit_resample(X_test_vect.values, y_test.values)
# print(sorted(Counter(y_tfidf_ros_test).items()))

In [63]:
#Random Forest

#{'n_estimators': 350, 'max_features': 'sqrt', 'max_depth': 500}


#Instantiate our model 
rf = RandomForestClassifier(n_estimators = 350, max_depth = 500, max_features = 'sqrt',
                            n_jobs=-1)

#Model Fit 
start = time.time()
rf.fit(X_tfidf_ros, y_tfidf_ros)
end = time.time()
fit_time = (end - start)

#Model Predict 
start = time.time()
y_pred = rf.predict(X_test_t_vect)
end = time.time()
pred_time = (end - start)

#Model Scoring 
precision, recall, fscore, support = precision_recall_fscore_support(y_test, y_pred, average='macro')
print('Fit time: {} / Predict time: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(fit_time, 3), round(pred_time, 3), round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))

Fit time: 262.597 / Predict time: 1.098 ---- Precision: 0.682 / Recall: 0.644 / Accuracy: 0.715


In [64]:
#Multinomial Bayes
#{'alpha': 0.054754754754754754}



#Instantiate our model 
nb_clf = MultinomialNB(alpha = 0.054754754754754754)

#Train our Model 
start = time.time()
nb_clf.fit(X_tfidf_ros, y_tfidf_ros)
end = time.time()
fit_time = (end - start)

#Model Predict 
start = time.time()
y_pred = nb_clf.predict(X_test_t_vect)
end = time.time()
pred_time = (end - start)

#Model Scoring 
precision, recall, fscore, support = precision_recall_fscore_support(y_test, y_pred, average='macro')
print('Fit time: {} / Predict time: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(fit_time, 3), round(pred_time, 3), round(precision, 3), 
    round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))

Fit time: 0.219 / Predict time: 0.042 ---- Precision: 0.508 / Recall: 0.571 / Accuracy: 0.573


In [65]:
#KNN
from sklearn.neighbors import KNeighborsClassifier
#{'weights': 'distance', 'n_neighbors': 15, 'algorithm': 'brute'}


#Instantiate our model 
knn = KNeighborsClassifier(n_neighbors = 15, weights = 'distance', algorithm = 'brute')

#Model Fit 
start = time.time()
knn.fit(X_tfidf_ros, y_tfidf_ros)
end = time.time()
fit_time = (end - start)

#Model Predict 
start = time.time()
y_pred = knn.predict(X_test_t_vect)
end = time.time()
pred_time = (end - start)

#Model Scoring 
precision, recall, fscore, support = precision_recall_fscore_support(y_test, y_pred, average='macro')
print('Fit time: {} / Predict time: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(fit_time, 3), round(pred_time, 3), round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))

Fit time: 0.122 / Predict time: 48.819 ---- Precision: 0.346 / Recall: 0.38 / Accuracy: 0.384


In [66]:
#Scale data
from sklearn import preprocessing
X_train_scaled_t = preprocessing.scale(X_tfidf_ros)
X_test_scaled_t = preprocessing.scale(X_test_t_vect)

In [67]:
#Logistic Regression 
#{'solver': 'saga', 'max_iter': 300, 'C': 0.05}


#Instantiate our model
lr = LogisticRegression(max_iter = 300, solver = 'saga', multi_class = 'multinomial', 
                        C = 0.05)

#Train our Model
start = time.time()
lr.fit(X_train_scaled_t, y_tfidf_ros)
end = time.time()
fit_time = (end - start)

#Model Predict 
start = time.time()
y_pred = lr.predict(X_test_scaled_t)
end = time.time()
pred_time = (end - start)

#Model Scoring 
precision, recall, fscore, support = precision_recall_fscore_support(y_test, y_pred, average='macro')
print('Fit time: {} / Predict time: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(fit_time, 3), round(pred_time, 3), round(precision, 3), round(recall, 3), 
    round((y_pred==y_test).sum()/len(y_pred), 3)))

Fit time: 869.668 / Predict time: 0.013 ---- Precision: 0.545 / Recall: 0.635 / Accuracy: 0.57




In [68]:
# Choose the best model two models and then we will do max feature vs min_df, max_df vs with ngrams and without

In [None]:
#Go with countvectorizer Multinomial Naive Bayes and Random Forest 