# Natural Language Processing with Disaster Tweets (Kaggle Competition)

In [14]:
# imports

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Starter Notebook

In [15]:
# imports
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O
from sklearn import feature_extraction, linear_model, model_selection, preprocessing

In [16]:
# read in data
# Both paths are relative, so they resolve against the kernel's current
# working directory (which must contain the Data/ folder).
train_df = pd.read_csv("Data/train.csv")
test_df = pd.read_csv("Data/test.csv")

# check: print column names, dtypes and non-null counts for both frames
train_df.info()
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB


In [17]:
# check for duplicates
train_df.duplicated().sum()

0

In [18]:
# check class balance
train_df['target'].value_counts()

# the classes are a bit unbalanced (57% to 43%),
# so we may want to try some class balancing techniques

0    4342
1    3271
Name: target, dtype: int64

In [19]:
# check uniqueness of id column
train_df['id'].nunique()

7613

In [20]:
# check examples of non-disaster tweets (target = 0)
nondisaster_df = train_df[train_df['target'] == 0]

# NOTE(review): the slice starts at position 1, so the first non-disaster
# tweet is skipped and 9 tweets (positions 1-9) are printed.
for tweet in nondisaster_df['text'].values[1:10]:
    print(tweet)

I love fruits
Summer is lovely
My car is so fast
What a goooooooaaaaaal!!!!!!
this is ridiculous....
London is cool ;)
Love skiing
What a wonderful day!
LOOOOOOL


In [21]:
# check examples of disaster tweets (target = 1)
disaster_df = train_df[train_df['target'] == 1]

# NOTE(review): the slice starts at position 1, so the first disaster
# tweet is skipped and 9 tweets (positions 1-9) are printed.
for tweet in disaster_df['text'].values[1:10]:
    print(tweet)

Forest fire near La Ronge Sask. Canada
All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected
13,000 people receive #wildfires evacuation orders in California 
Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school 
#RockyFire Update => California Hwy. 20 closed in both directions due to Lake County fire - #CAfire #wildfires
#flood #disaster Heavy rain causes flash flooding of streets in Manitou, Colorado Springs areas
I'm on top of the hill and I can see a fire in the woods...
There's an emergency evacuation happening now in the building across the street
I'm afraid that the tornado is coming to our area...


In [22]:
# count the words in each tweet and turn them into word vectors
count_vectorizer = feature_extraction.text.CountVectorizer()

# check: fit the vectorizer on the first five tweets only, then print the
# shape and contents of the first tweet's row; the row has one column per
# token in the vocabulary learned from those five tweets
example_train_vectors = count_vectorizer.fit_transform(train_df['text'][0:5])
print(example_train_vectors[0].todense().shape)
print(example_train_vectors[0].todense())

(1, 54)
[[0 0 0 1 1 1 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0
  0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 0]]


In [23]:
# create all vectors
# Refits the same count_vectorizer on every training tweet, replacing the
# five-tweet vocabulary learned in the check cell.
# NOTE(review): the vocabulary is learned from all rows, so a later
# cross-validation over train_vectors uses a vocabulary that already
# includes tokens from its validation folds.
train_vectors = count_vectorizer.fit_transform(train_df['text'])

In [24]:
# baseline model: linear ridge classifier (RidgeClassifier), left at its defaults
clf = linear_model.RidgeClassifier()

In [25]:
# get f1 score of baseline model
# 3-fold cross-validation of the ridge classifier on the precomputed count
# vectors; `scores` holds one F1 value per fold and is displayed as the
# cell's output.
scores = model_selection.cross_val_score(clf, 
                                         train_vectors, 
                                         train_df['target'], 
                                         cv = 3, 
                                         scoring = 'f1')
scores

array([0.59453669, 0.5642787 , 0.64117647])

In [26]:
# Baseline again, this time evaluated on a held-out split instead of CV.
from sklearn.model_selection import train_test_split

# stratified 80/20 split of the raw tweet text; fixed seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    train_df.text,
    train_df.target,
    test_size = 0.2,
    random_state = 2022,
    stratify = train_df.target)

# count vectors + ridge classifier wrapped in one Pipeline, so the
# vectorizer is fit on X_train only; this rebinds the name `clf`
clf = Pipeline([
    ('count_vectorizer', feature_extraction.text.CountVectorizer()),
    ('ridge_classifier', linear_model.RidgeClassifier())
])

clf.fit(X_train, y_train)

# predict on the held-out 20% and print per-class precision/recall/F1
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.85      0.81       869
           1       0.77      0.68      0.72       654

    accuracy                           0.78      1523
   macro avg       0.78      0.76      0.77      1523
weighted avg       0.78      0.78      0.77      1523



In [27]:
# helper function to save scores
from sklearn.metrics import f1_score

scores_df = pd.DataFrame()

def save_scores(model_pipe, X_train, X_test, y_train, y_test, name):
    """Score a fitted binary classifier and record its F1 metrics in `scores_df`.

    For the train and the test split in turn, predicts with `model_pipe`,
    computes the per-class F1 for labels 0 and 1 plus the macro-averaged F1,
    and writes them to row `name` of the module-level `scores_df` (creating
    the row, or overwriting it if it already exists). Finally prints that row.

    Assumes the targets are encoded as the integers 0 and 1, as the
    `F1_0_*` / `F1_1_*` column names imply.

    Parameters
    ----------
    model_pipe : fitted estimator or pipeline exposing ``predict``.
    X_train, X_test : inputs handed to ``model_pipe.predict``.
    y_train, y_test : true labels for the corresponding split.
    name : row label under which the scores are stored.

    Returns
    -------
    None. The results are stored in `scores_df` and printed.
    """
    splits = (
        ('Train', X_train, y_train),
        ('Test', X_test, y_test),
    )

    for split, features, y_true in splits:
        y_hat = model_pipe.predict(features)

        # labels = [0, 1] fixes both the order and the length of the per-class
        # result, so a split that lacks one class yields 0.0 for it instead of
        # leaving its score unassigned (which raised UnboundLocalError before).
        f1_0, f1_1 = f1_score(y_true, y_hat, labels = [0, 1], average = None)

        # store scores; column order matches the original layout
        scores_df.at[name, f'F1_0_{split}'] = f1_0
        scores_df.at[name, f'F1_1_{split}'] = f1_1
        scores_df.at[name, f'F1_Avg_{split}'] = f1_score(y_true, y_hat, average = 'macro')

    # show scores for this model only (can call scores_df to see all scores)
    print(scores_df.loc[name, :])

# TF-IDF

## No class balancing, no preprocessing (KNN, MNB, RF)

In [28]:
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [29]:
# tts on unprocessed data
# (train_test_split is already imported in the notebook's first cell)
from sklearn.model_selection import train_test_split

# stratified 80/20 split of the raw tweet text with a fixed seed; rebinds
# X_train / X_test / y_train / y_test for the cells that follow
X_train, X_test, y_train, y_test = train_test_split(
    train_df.text,
    train_df.target,
    test_size = 0.2,
    random_state = 2022,
    stratify = train_df.target)

In [30]:
# knn, tfidf vectorizer
# (these four imports repeat the ones in the notebook's first cell)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# TF-IDF features feeding a k-nearest-neighbours classifier, both left at
# their default settings; rebinds `clf`
clf = Pipeline([
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('KNN', KNeighborsClassifier())
])

clf.fit(X_train, y_train)

# evaluate on the held-out split
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.76      0.86      0.81       869
           1       0.77      0.65      0.71       654

    accuracy                           0.77      1523
   macro avg       0.77      0.75      0.76      1523
weighted avg       0.77      0.77      0.76      1523



In [31]:
save_scores(clf, X_train, X_test, y_train, y_test, "knn-tfidf-unb")

F1_0_Train      0.865827
F1_1_Train      0.799016
F1_Avg_Train    0.832421
F1_0_Test       0.808234
F1_1_Test       0.705000
F1_Avg_Test     0.756617
Name: knn-tfidf-unb, dtype: float64


In [32]:
scores_df

Unnamed: 0,F1_0_Train,F1_1_Train,F1_Avg_Train,F1_0_Test,F1_1_Test,F1_Avg_Test
knn-tfidf-unb,0.865827,0.799016,0.832421,0.808234,0.705,0.756617


In [33]:
# multinomial naive bayes

from sklearn.naive_bayes import MultinomialNB

# TF-IDF features feeding a multinomial naive Bayes classifier, both left at
# their default settings; rebinds `clf`
clf = Pipeline([
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('Multi NB', MultinomialNB())
])

clf.fit(X_train, y_train)

# evaluate on the held-out split
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.76      0.93      0.84       869
           1       0.87      0.61      0.72       654

    accuracy                           0.79      1523
   macro avg       0.81      0.77      0.78      1523
weighted avg       0.81      0.79      0.78      1523



In [34]:
save_scores(clf, X_train, X_test, y_train, y_test, "mnb-tfidf-unb")

F1_0_Train      0.910580
F1_1_Train      0.859256
F1_Avg_Train    0.884918
F1_0_Test       0.835836
F1_1_Test       0.715695
F1_Avg_Test     0.775766
Name: mnb-tfidf-unb, dtype: float64


In [35]:
# random forest

from sklearn.ensemble import RandomForestClassifier

# TF-IDF features feeding a random forest; rebinds `clf`.
# random_state pins the forest's internal randomness (same seed as the
# train/test split) so re-running the cell reproduces the same scores.
clf = Pipeline([
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('Random Forest', RandomForestClassifier(random_state = 2022))
])

clf.fit(X_train, y_train)

# evaluate on the held-out split
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.74      0.92      0.82       869
           1       0.85      0.57      0.68       654

    accuracy                           0.77      1523
   macro avg       0.79      0.75      0.75      1523
weighted avg       0.79      0.77      0.76      1523



In [36]:
save_scores(clf, X_train, X_test, y_train, y_test, "rf-tfidf-unb")

F1_0_Train      0.997412
F1_1_Train      0.996554
F1_Avg_Train    0.996983
F1_0_Test       0.821904
F1_1_Test       0.681319
F1_Avg_Test     0.751611
Name: rf-tfidf-unb, dtype: float64


## No class balancing, minimal preprocessing (KNN, MNB, RF)

In [37]:
# preprocessing: removing spacy stopwords and punctuation, lemmatizing

import spacy

nlp = spacy.load("en_core_web_sm")

def preprocess(text):
    """Return `text` reduced to the space-joined lemmas of its kept tokens.

    The text is run through the module-level spaCy pipeline `nlp`. Tokens
    flagged as stop words or as punctuation are dropped; every remaining
    token contributes its lemma, in document order.
    """
    kept_lemmas = [
        tok.lemma_
        for tok in nlp(text)
        # keep only tokens that are neither stop words nor punctuation
        if not (tok.is_stop or tok.is_punct)
    ]
    return " ".join(kept_lemmas)

In [38]:
train_df['preprocessed_txt'] = train_df['text'].apply(preprocess)

In [39]:
# check
train_df.head()

Unnamed: 0,id,keyword,location,text,target,preprocessed_txt
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,deed Reason earthquake ALLAH forgive
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near La Ronge Sask Canada
2,5,,,All residents asked to 'shelter in place' are ...,1,resident ask shelter place notify officer evac...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"13,000 people receive wildfire evacuation orde..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,got send photo Ruby Alaska smoke wildfire pour...


In [40]:
# tts on processed data

# Same split settings as for the raw text (20% test, seed 2022, stratified
# on target), but the features are now the preprocessed_txt column.
# Rebinds X_train / X_test / y_train / y_test.
X_train, X_test, y_train, y_test = train_test_split(
    train_df.preprocessed_txt,
    train_df.target,
    test_size = 0.2,
    random_state = 2022,
    stratify = train_df.target)

In [41]:
# knn on preprocessed data
# Same default TF-IDF + KNN pipeline as for the raw text; only the input
# split (preprocessed text) differs. Rebinds `clf`.
clf = Pipeline([
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('KNN', KNeighborsClassifier())
])

clf.fit(X_train, y_train)

# evaluate on the held-out split
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.76      0.86      0.81       869
           1       0.77      0.63      0.70       654

    accuracy                           0.76      1523
   macro avg       0.77      0.75      0.75      1523
weighted avg       0.76      0.76      0.76      1523



In [42]:
save_scores(clf, X_train, X_test, y_train, y_test, "knn-tfidf-unb-prep")

F1_0_Train      0.865442
F1_1_Train      0.791545
F1_Avg_Train    0.828493
F1_0_Test       0.805391
F1_1_Test       0.696893
F1_Avg_Test     0.751142
Name: knn-tfidf-unb-prep, dtype: float64


In [43]:
# multinomial naive bayes on preprocessed text

# Same default TF-IDF + multinomial NB pipeline as for the raw text; only
# the input split (preprocessed text) differs. Rebinds `clf`.
clf = Pipeline([
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('Multi NB', MultinomialNB())
])

clf.fit(X_train, y_train)

# evaluate on the held-out split
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.91      0.84       869
           1       0.84      0.65      0.73       654

    accuracy                           0.80      1523
   macro avg       0.81      0.78      0.78      1523
weighted avg       0.80      0.80      0.79      1523



In [44]:
save_scores(clf, X_train, X_test, y_train, y_test, "mnb-tfidf-unb-prep")

F1_0_Train      0.926736
F1_1_Train      0.889803
F1_Avg_Train    0.908269
F1_0_Test       0.836074
F1_1_Test       0.733850
F1_Avg_Test     0.784962
Name: mnb-tfidf-unb-prep, dtype: float64


In [45]:
# random forest on preprocessed text

# TF-IDF features feeding a random forest; rebinds `clf`.
# random_state pins the forest's internal randomness (same seed as the
# train/test split) so re-running the cell reproduces the same scores.
clf = Pipeline([
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('Random Forest', RandomForestClassifier(random_state = 2022))
])

clf.fit(X_train, y_train)

# evaluate on the held-out split
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.92      0.82       869
           1       0.84      0.58      0.69       654

    accuracy                           0.77      1523
   macro avg       0.79      0.75      0.76      1523
weighted avg       0.79      0.77      0.77      1523



In [46]:
save_scores(clf, X_train, X_test, y_train, y_test, "rf-tfidf-unb-prep")

F1_0_Train      0.997555
F1_1_Train      0.996748
F1_Avg_Train    0.997151
F1_0_Test       0.822497
F1_1_Test       0.689531
F1_Avg_Test     0.756014
Name: rf-tfidf-unb-prep, dtype: float64


In [47]:
scores_df.sort_values(by = 'F1_Avg_Test', ascending = False)

Unnamed: 0,F1_0_Train,F1_1_Train,F1_Avg_Train,F1_0_Test,F1_1_Test,F1_Avg_Test
mnb-tfidf-unb-prep,0.926736,0.889803,0.908269,0.836074,0.73385,0.784962
mnb-tfidf-unb,0.91058,0.859256,0.884918,0.835836,0.715695,0.775766
knn-tfidf-unb,0.865827,0.799016,0.832421,0.808234,0.705,0.756617
rf-tfidf-unb-prep,0.997555,0.996748,0.997151,0.822497,0.689531,0.756014
rf-tfidf-unb,0.997412,0.996554,0.996983,0.821904,0.681319,0.751611
knn-tfidf-unb-prep,0.865442,0.791545,0.828493,0.805391,0.696893,0.751142


## Class balancing, minimal preprocessing (KNN, MNB, RF)

In [48]:
# class-balancing techniques to try: class_weight, undersampling, oversampling, SMOTE

In [49]:
# built-in class-balancing parameter for each model
# knn: none available
# mnb: fit_prior = False (use a uniform class prior)
# rf: class_weight = 'balanced'

In [51]:
# use each model's class-balancing parameter (where one is available)
# together with the resampling techniques

### Undersampling & class_weight

In [52]:
from imblearn.pipeline import make_pipeline as resample_pipeline
from imblearn.under_sampling import RandomUnderSampler

In [50]:
# NOTE(review): same arguments (preprocessed_txt, 20% test, seed 2022,
# stratified on target) as the earlier "tts on processed data" cell, so this
# reproduces that split rather than creating a new one.
X_train, X_test, y_train, y_test = train_test_split(
    train_df.preprocessed_txt,
    train_df.target,
    test_size = 0.2,
    random_state = 2022,
    stratify = train_df.target)

In [53]:
# undersampled knn on preprocessed data
# The imblearn pipeline applies the sampler during fit only; predict() runs
# on the data as given. random_state pins which majority-class rows are
# dropped, so the recorded scores can be reproduced.
clf = resample_pipeline(TfidfVectorizer(),
                        RandomUnderSampler(random_state = 2022),
                        KNeighborsClassifier())

clf.fit(X_train, y_train)

save_scores(clf, X_train, X_test, y_train, y_test, "knn-tfidf-under-prep")

F1_0_Train      0.850968
F1_1_Train      0.789869
F1_Avg_Train    0.820419
F1_0_Test       0.794429
F1_1_Test       0.705036
F1_Avg_Test     0.749732
Name: knn-tfidf-under-prep, dtype: float64


In [54]:
# undersampled, balanced mnb on preprocessed text

# fit_prior = False makes naive Bayes use a uniform class prior.
# random_state pins which majority-class rows the sampler drops, so the
# recorded scores can be reproduced.
clf = resample_pipeline(TfidfVectorizer(),
                        RandomUnderSampler(random_state = 2022),
                        MultinomialNB(fit_prior = False))

clf.fit(X_train, y_train)

save_scores(clf, X_train, X_test, y_train, y_test, "mnb-tfidf-under-prep")

F1_0_Train      0.917846
F1_1_Train      0.894312
F1_Avg_Train    0.906079
F1_0_Test       0.794135
F1_1_Test       0.738255
F1_Avg_Test     0.766195
Name: mnb-tfidf-under-prep, dtype: float64


In [56]:
# undersampled, balanced random forest on preprocessed text

# Both the sampler and the forest are stochastic; seeding each one makes
# the recorded scores reproducible.
clf = resample_pipeline(TfidfVectorizer(),
                        RandomUnderSampler(random_state = 2022),
                        RandomForestClassifier(class_weight = 'balanced',
                                               random_state = 2022))

clf.fit(X_train, y_train)

save_scores(clf, X_train, X_test, y_train, y_test, "rf-tfidf-under-prep")

F1_0_Train      0.982323
F1_1_Train      0.977320
F1_Avg_Train    0.979821
F1_0_Test       0.817447
F1_1_Test       0.714886
F1_Avg_Test     0.766167
Name: rf-tfidf-under-prep, dtype: float64


In [57]:
scores_df.sort_values(by = 'F1_Avg_Test', ascending = False)

Unnamed: 0,F1_0_Train,F1_1_Train,F1_Avg_Train,F1_0_Test,F1_1_Test,F1_Avg_Test
mnb-tfidf-unb-prep,0.926736,0.889803,0.908269,0.836074,0.73385,0.784962
mnb-tfidf-unb,0.91058,0.859256,0.884918,0.835836,0.715695,0.775766
mnb-tfidf-under-prep,0.917846,0.894312,0.906079,0.794135,0.738255,0.766195
rf-tfidf-under-prep,0.982323,0.97732,0.979821,0.817447,0.714886,0.766167
knn-tfidf-unb,0.865827,0.799016,0.832421,0.808234,0.705,0.756617
rf-tfidf-unb-prep,0.997555,0.996748,0.997151,0.822497,0.689531,0.756014
rf-tfidf-unb,0.997412,0.996554,0.996983,0.821904,0.681319,0.751611
knn-tfidf-unb-prep,0.865442,0.791545,0.828493,0.805391,0.696893,0.751142
knn-tfidf-under-prep,0.850968,0.789869,0.820419,0.794429,0.705036,0.749732


### Oversampling and class_weight

In [58]:
from imblearn.over_sampling import RandomOverSampler

In [59]:
# NOTE(review): same arguments (preprocessed_txt, 20% test, seed 2022,
# stratified on target) as the earlier "tts on processed data" cell, so this
# reproduces that split rather than creating a new one.
X_train, X_test, y_train, y_test = train_test_split(
    train_df.preprocessed_txt,
    train_df.target,
    test_size = 0.2,
    random_state = 2022,
    stratify = train_df.target)

In [60]:
# oversampled knn on preprocessed data
# random_state pins which minority-class rows the sampler duplicates, so
# the recorded scores can be reproduced.
clf = resample_pipeline(TfidfVectorizer(),
                        RandomOverSampler(random_state = 2022),
                        KNeighborsClassifier())

clf.fit(X_train, y_train)

save_scores(clf, X_train, X_test, y_train, y_test, "knn-tfidf-over-prep")

F1_0_Train      0.858153
F1_1_Train      0.803680
F1_Avg_Train    0.830916
F1_0_Test       0.784380
F1_1_Test       0.702111
F1_Avg_Test     0.743246
Name: knn-tfidf-over-prep, dtype: float64


In [61]:
# oversampled, balanced mnb on preprocessed text

# fit_prior = False makes naive Bayes use a uniform class prior.
# random_state pins which minority-class rows the sampler duplicates, so
# the recorded scores can be reproduced.
clf = resample_pipeline(TfidfVectorizer(),
                        RandomOverSampler(random_state = 2022),
                        MultinomialNB(fit_prior = False))

clf.fit(X_train, y_train)

save_scores(clf, X_train, X_test, y_train, y_test, "mnb-tfidf-over-prep")

F1_0_Train      0.933086
F1_1_Train      0.909757
F1_Avg_Train    0.921421
F1_0_Test       0.798149
F1_1_Test       0.735004
F1_Avg_Test     0.766577
Name: mnb-tfidf-over-prep, dtype: float64


In [62]:
# oversampled, balanced random forest on preprocessed text

# Both the sampler and the forest are stochastic; seeding each one makes
# the recorded scores reproducible.
clf = resample_pipeline(TfidfVectorizer(),
                        RandomOverSampler(random_state = 2022),
                        RandomForestClassifier(class_weight = 'balanced',
                                               random_state = 2022))

clf.fit(X_train, y_train)

save_scores(clf, X_train, X_test, y_train, y_test, "rf-tfidf-over-prep")

F1_0_Train      0.997552
F1_1_Train      0.996753
F1_Avg_Train    0.997152
F1_0_Test       0.823777
F1_1_Test       0.707424
F1_Avg_Test     0.765600
Name: rf-tfidf-over-prep, dtype: float64


In [63]:
scores_df.sort_values(by = 'F1_Avg_Test', ascending = False)

Unnamed: 0,F1_0_Train,F1_1_Train,F1_Avg_Train,F1_0_Test,F1_1_Test,F1_Avg_Test
mnb-tfidf-unb-prep,0.926736,0.889803,0.908269,0.836074,0.73385,0.784962
mnb-tfidf-unb,0.91058,0.859256,0.884918,0.835836,0.715695,0.775766
mnb-tfidf-over-prep,0.933086,0.909757,0.921421,0.798149,0.735004,0.766577
mnb-tfidf-under-prep,0.917846,0.894312,0.906079,0.794135,0.738255,0.766195
rf-tfidf-under-prep,0.982323,0.97732,0.979821,0.817447,0.714886,0.766167
rf-tfidf-over-prep,0.997552,0.996753,0.997152,0.823777,0.707424,0.7656
knn-tfidf-unb,0.865827,0.799016,0.832421,0.808234,0.705,0.756617
rf-tfidf-unb-prep,0.997555,0.996748,0.997151,0.822497,0.689531,0.756014
rf-tfidf-unb,0.997412,0.996554,0.996983,0.821904,0.681319,0.751611
knn-tfidf-unb-prep,0.865442,0.791545,0.828493,0.805391,0.696893,0.751142


### SMOTE and class_weight

In [64]:
from imblearn.over_sampling import SMOTE

In [65]:
# NOTE(review): same arguments (preprocessed_txt, 20% test, seed 2022,
# stratified on target) as the earlier "tts on processed data" cell, so this
# reproduces that split rather than creating a new one.
X_train, X_test, y_train, y_test = train_test_split(
    train_df.preprocessed_txt,
    train_df.target,
    test_size = 0.2,
    random_state = 2022,
    stratify = train_df.target)

In [66]:
# smote knn on preprocessed data
# SMOTE synthesises new minority-class samples in TF-IDF space during fit.
# random_state pins that sampling so the recorded scores can be reproduced.
clf = resample_pipeline(TfidfVectorizer(),
                        SMOTE(random_state = 2022),
                        KNeighborsClassifier())

clf.fit(X_train, y_train)

save_scores(clf, X_train, X_test, y_train, y_test, "knn-tfidf-smote-prep")

F1_0_Train      0.440872
F1_1_Train      0.667627
F1_Avg_Train    0.554249
F1_0_Test       0.409408
F1_1_Test       0.642782
F1_Avg_Test     0.526095
Name: knn-tfidf-smote-prep, dtype: float64


In [67]:
# smote, balanced mnb on preprocessed text

# fit_prior = False makes naive Bayes use a uniform class prior.
# random_state pins SMOTE's sampling so the recorded scores can be
# reproduced.
clf = resample_pipeline(TfidfVectorizer(),
                        SMOTE(random_state = 2022),
                        MultinomialNB(fit_prior = False))

clf.fit(X_train, y_train)

save_scores(clf, X_train, X_test, y_train, y_test, "mnb-tfidf-smote-prep")

F1_0_Train      0.931173
F1_1_Train      0.907799
F1_Avg_Train    0.919486
F1_0_Test       0.798382
F1_1_Test       0.734601
F1_Avg_Test     0.766492
Name: mnb-tfidf-smote-prep, dtype: float64


In [69]:
# smote, default mnb on preprocessed text

# MultinomialNB is left at its defaults here (class priors learned from the
# resampled data). random_state pins SMOTE's sampling so the recorded scores
# can be reproduced.
clf = resample_pipeline(TfidfVectorizer(),
                        SMOTE(random_state = 2022),
                        MultinomialNB())

clf.fit(X_train, y_train)

save_scores(clf, X_train, X_test, y_train, y_test, "mnb-tfidf-smote-def-prep")

F1_0_Train      0.934099
F1_1_Train      0.911985
F1_Avg_Train    0.923042
F1_0_Test       0.801390
F1_1_Test       0.739955
F1_Avg_Test     0.770672
Name: mnb-tfidf-smote-def-prep, dtype: float64


In [68]:
# smote, balanced random forest on preprocessed text

# Both SMOTE and the forest are stochastic; seeding each one makes the
# recorded scores reproducible.
clf = resample_pipeline(TfidfVectorizer(),
                        SMOTE(random_state = 2022),
                        RandomForestClassifier(class_weight = 'balanced',
                                               random_state = 2022))

clf.fit(X_train, y_train)

save_scores(clf, X_train, X_test, y_train, y_test, "rf-tfidf-smote-prep")

F1_0_Train      0.997554
F1_1_Train      0.996749
F1_Avg_Train    0.997152
F1_0_Test       0.825117
F1_1_Test       0.698838
F1_Avg_Test     0.761978
Name: rf-tfidf-smote-prep, dtype: float64


In [71]:
scores_df.sort_values(by = 'F1_Avg_Test', ascending = False).head()

Unnamed: 0,F1_0_Train,F1_1_Train,F1_Avg_Train,F1_0_Test,F1_1_Test,F1_Avg_Test
mnb-tfidf-unb-prep,0.926736,0.889803,0.908269,0.836074,0.73385,0.784962
mnb-tfidf-unb,0.91058,0.859256,0.884918,0.835836,0.715695,0.775766
mnb-tfidf-smote-def-prep,0.934099,0.911985,0.923042,0.80139,0.739955,0.770672
mnb-tfidf-over-prep,0.933086,0.909757,0.921421,0.798149,0.735004,0.766577
mnb-tfidf-smote-prep,0.931173,0.907799,0.919486,0.798382,0.734601,0.766492


In [None]:
# TODO: try categorical naive bayes (CategoricalNB) as another model?