# Natural Language Processing with Disaster Tweets (Kaggle Competition)

In [4]:
# imports

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Starter Notebook

In [5]:
# imports
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O
from sklearn import feature_extraction, linear_model, model_selection, preprocessing

In [6]:
# read in data
train_df = pd.read_csv("Data/train.csv")
test_df = pd.read_csv("Data/test.csv")

# check
train_df.info()
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB


In [5]:
# check for duplicates
train_df.duplicated().sum()

0

In [6]:
# check class balance
train_df['target'].value_counts()

# the classes are a bit unbalanced (57% to 43%),
# so we may want to try some class balancing techniques

0    4342
1    3271
Name: target, dtype: int64

In [7]:
# check uniqueness of id column
train_df['id'].nunique()

7613

In [8]:
# check examples of non-disaster tweets (target = 0)
nondisaster_df = train_df[train_df['target'] == 0]

for tweet in nondisaster_df['text'].values[1:10]:
    print(tweet)

I love fruits
Summer is lovely
My car is so fast
What a goooooooaaaaaal!!!!!!
this is ridiculous....
London is cool ;)
Love skiing
What a wonderful day!
LOOOOOOL


In [9]:
# check examples of disaster tweets (target = 1)
disaster_df = train_df[train_df['target'] == 1]

for tweet in disaster_df['text'].values[1:10]:
    print(tweet)

Forest fire near La Ronge Sask. Canada
All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected
13,000 people receive #wildfires evacuation orders in California 
Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school 
#RockyFire Update => California Hwy. 20 closed in both directions due to Lake County fire - #CAfire #wildfires
#flood #disaster Heavy rain causes flash flooding of streets in Manitou, Colorado Springs areas
I'm on top of the hill and I can see a fire in the woods...
There's an emergency evacuation happening now in the building across the street
I'm afraid that the tornado is coming to our area...


In [10]:
# count the words in each tweet and turn them into word vectors
count_vectorizer = feature_extraction.text.CountVectorizer()

# check
example_train_vectors = count_vectorizer.fit_transform(train_df['text'][0:5])
print(example_train_vectors[0].todense().shape)
print(example_train_vectors[0].todense())

(1, 54)
[[0 0 0 1 1 1 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0
  0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 0]]


In [11]:
# create all vectors
train_vectors = count_vectorizer.fit_transform(train_df['text'])

In [12]:
# baseline model: linear ridge regression
clf = linear_model.RidgeClassifier()

In [13]:
# get f1 score of baseline model
scores = model_selection.cross_val_score(clf, 
                                         train_vectors, 
                                         train_df['target'], 
                                         cv = 3, 
                                         scoring = 'f1')
scores

array([0.59453669, 0.5642787 , 0.64117647])

In [14]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    train_df.text,
    train_df.target,
    test_size = 0.2,
    random_state = 2022,
    stratify = train_df.target)

clf = Pipeline([
    ('count_vectorizer', feature_extraction.text.CountVectorizer()),
    ('ridge_classifier', linear_model.RidgeClassifier())
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.85      0.81       869
           1       0.77      0.68      0.72       654

    accuracy                           0.78      1523
   macro avg       0.78      0.76      0.77      1523
weighted avg       0.78      0.78      0.77      1523



In [18]:
# helper function to save scores
from sklearn.metrics import f1_score

scores_df = pd.DataFrame()

def save_scores(model_pipe, X_train, X_test, y_train, y_test, name):
  
    # calculate predictions
    train_pred = model_pipe.predict(X_train)
    test_pred = model_pipe.predict(X_test)
    
    # save f1 scores for each class
    f1_train_scores = f1_score(y_train, train_pred, average = None)
    for i, f1 in enumerate(f1_train_scores):
        if i == 0:
            f1_0_train = f1
        elif i == 1:
            f1_1_train = f1
            
    f1_test_scores = f1_score(y_test, test_pred, average = None)
    for i, f1 in enumerate(f1_test_scores):
        if i == 0:
            f1_0_test = f1
        elif i == 1:
            f1_1_test = f1
        
    # store scores
    scores_df.at[name, 'F1_0_Train'] = f1_0_train
    scores_df.at[name, 'F1_1_Train'] = f1_1_train
    scores_df.at[name, 'F1_Avg_Train'] = f1_score(y_train, train_pred, average = 'macro')
    scores_df.at[name, 'F1_0_Test'] = f1_0_test
    scores_df.at[name, 'F1_1_Test'] = f1_1_test
    scores_df.at[name, 'F1_Avg_Test'] = f1_score(y_test, test_pred, average = 'macro')
    
    # show scores for this model only (can call scores_df to see all scores)
    print(scores_df.loc[name, :])

# TF-IDF

## No class balancing, no preprocessing (KNN, MNB, RF)

In [16]:
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [17]:
# tts on unprocessed data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    train_df.text,
    train_df.target,
    test_size = 0.2,
    random_state = 2022,
    stratify = train_df.target)

In [18]:
# knn, tfidf vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

clf = Pipeline([
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('KNN', KNeighborsClassifier())
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.76      0.86      0.81       869
           1       0.77      0.65      0.71       654

    accuracy                           0.77      1523
   macro avg       0.77      0.75      0.76      1523
weighted avg       0.77      0.77      0.76      1523



In [19]:
save_scores(clf, X_train, X_test, y_train, y_test, "knn-tfidf-unb")

F1_0_Train      0.865827
F1_1_Train      0.799016
F1_Avg_Train    0.832421
F1_0_Test       0.808234
F1_1_Test       0.705000
F1_Avg_Test     0.756617
Name: knn-tfidf-unb, dtype: float64


In [20]:
scores_df

Unnamed: 0,F1_0_Train,F1_1_Train,F1_Avg_Train,F1_0_Test,F1_1_Test,F1_Avg_Test
knn-tfidf-unb,0.865827,0.799016,0.832421,0.808234,0.705,0.756617


In [21]:
# multinomial naive bayes

from sklearn.naive_bayes import MultinomialNB

clf = Pipeline([
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('Multi NB', MultinomialNB())
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.76      0.93      0.84       869
           1       0.87      0.61      0.72       654

    accuracy                           0.79      1523
   macro avg       0.81      0.77      0.78      1523
weighted avg       0.81      0.79      0.78      1523



In [22]:
save_scores(clf, X_train, X_test, y_train, y_test, "mnb-tfidf-unb")

F1_0_Train      0.910580
F1_1_Train      0.859256
F1_Avg_Train    0.884918
F1_0_Test       0.835836
F1_1_Test       0.715695
F1_Avg_Test     0.775766
Name: mnb-tfidf-unb, dtype: float64


In [23]:
# random forest

from sklearn.ensemble import RandomForestClassifier

clf = Pipeline([
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('Random Forest', RandomForestClassifier())
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.74      0.92      0.82       869
           1       0.84      0.56      0.67       654

    accuracy                           0.76      1523
   macro avg       0.79      0.74      0.74      1523
weighted avg       0.78      0.76      0.75      1523



In [24]:
save_scores(clf, X_train, X_test, y_train, y_test, "rf-tfidf-unb")

F1_0_Train      0.997409
F1_1_Train      0.996561
F1_Avg_Train    0.996985
F1_0_Test       0.816786
F1_1_Test       0.672161
F1_Avg_Test     0.744474
Name: rf-tfidf-unb, dtype: float64


## No class balancing, minimal preprocessing (KNN, MNB, RF)

In [25]:
# preprocessing: removing spacy stopwords and punctuation, lemmatizing

import spacy

nlp = spacy.load("en_core_web_sm")

def preprocess(text):
    doc = nlp(text)
    
    filtered_tokens = []
    
    # take out stopwords and punctuation
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        
        # convert to lemmas
        filtered_tokens.append(token.lemma_)
            
    return " ".join(filtered_tokens)

In [26]:
train_df['preprocessed_txt'] = train_df['text'].apply(preprocess)

In [27]:
# check
train_df.head()

Unnamed: 0,id,keyword,location,text,target,preprocessed_txt
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,deed Reason earthquake ALLAH forgive
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near La Ronge Sask Canada
2,5,,,All residents asked to 'shelter in place' are ...,1,resident ask shelter place notify officer evac...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"13,000 people receive wildfire evacuation orde..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,got send photo Ruby Alaska smoke wildfire pour...


In [28]:
# tts on processed data

X_train, X_test, y_train, y_test = train_test_split(
    train_df.preprocessed_txt,
    train_df.target,
    test_size = 0.2,
    random_state = 2022,
    stratify = train_df.target)

In [29]:
# knn on preprocessed data
clf = Pipeline([
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('KNN', KNeighborsClassifier())
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.76      0.86      0.81       869
           1       0.77      0.63      0.70       654

    accuracy                           0.76      1523
   macro avg       0.77      0.75      0.75      1523
weighted avg       0.76      0.76      0.76      1523



In [30]:
save_scores(clf, X_train, X_test, y_train, y_test, "knn-tfidf-unb-prep")

F1_0_Train      0.865442
F1_1_Train      0.791545
F1_Avg_Train    0.828493
F1_0_Test       0.805391
F1_1_Test       0.696893
F1_Avg_Test     0.751142
Name: knn-tfidf-unb-prep, dtype: float64


In [31]:
# multinomial naive bayes on preprocessed text

clf = Pipeline([
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('Multi NB', MultinomialNB())
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.91      0.84       869
           1       0.84      0.65      0.73       654

    accuracy                           0.80      1523
   macro avg       0.81      0.78      0.78      1523
weighted avg       0.80      0.80      0.79      1523



In [32]:
save_scores(clf, X_train, X_test, y_train, y_test, "mnb-tfidf-unb-prep")

F1_0_Train      0.926736
F1_1_Train      0.889803
F1_Avg_Train    0.908269
F1_0_Test       0.836074
F1_1_Test       0.733850
F1_Avg_Test     0.784962
Name: mnb-tfidf-unb-prep, dtype: float64


In [33]:
# random forest on preprocessed text

clf = Pipeline([
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('Random Forest', RandomForestClassifier())
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.92      0.83       869
           1       0.85      0.59      0.70       654

    accuracy                           0.78      1523
   macro avg       0.80      0.76      0.76      1523
weighted avg       0.79      0.78      0.77      1523



In [34]:
save_scores(clf, X_train, X_test, y_train, y_test, "rf-tfidf-unb-prep")

F1_0_Train      0.997554
F1_1_Train      0.996749
F1_Avg_Train    0.997152
F1_0_Test       0.826873
F1_1_Test       0.698470
F1_Avg_Test     0.762672
Name: rf-tfidf-unb-prep, dtype: float64


In [35]:
scores_df.sort_values(by = 'F1_Avg_Test', ascending = False)

Unnamed: 0,F1_0_Train,F1_1_Train,F1_Avg_Train,F1_0_Test,F1_1_Test,F1_Avg_Test
mnb-tfidf-unb-prep,0.926736,0.889803,0.908269,0.836074,0.73385,0.784962
mnb-tfidf-unb,0.91058,0.859256,0.884918,0.835836,0.715695,0.775766
rf-tfidf-unb-prep,0.997554,0.996749,0.997152,0.826873,0.69847,0.762672
knn-tfidf-unb,0.865827,0.799016,0.832421,0.808234,0.705,0.756617
knn-tfidf-unb-prep,0.865442,0.791545,0.828493,0.805391,0.696893,0.751142
rf-tfidf-unb,0.997409,0.996561,0.996985,0.816786,0.672161,0.744474


## Class balancing, minimal preprocessing (KNN, MNB, RF)

In [36]:
# class_weight, undersampling, oversampling, smote

In [37]:
# class_weight param
# knn: none
# mnb: fit_prior = False
# rf: class_weight = 'balanced'

In [38]:
# use class_weight param (if avail) WITH other
# sampling techniques

### Undersampling & class_weight

In [39]:
from imblearn.pipeline import make_pipeline as resample_pipeline
from imblearn.under_sampling import RandomUnderSampler

In [40]:
X_train, X_test, y_train, y_test = train_test_split(
    train_df.preprocessed_txt,
    train_df.target,
    test_size = 0.2,
    random_state = 2022,
    stratify = train_df.target)

In [41]:
# undersampled knn on preprocessed data
clf = resample_pipeline(TfidfVectorizer(),
                              RandomUnderSampler(),
                              KNeighborsClassifier())

clf.fit(X_train, y_train)

save_scores(clf, X_train, X_test, y_train, y_test, "knn-tfidf-under-prep")

F1_0_Train      0.848382
F1_1_Train      0.789732
F1_Avg_Train    0.819057
F1_0_Test       0.784667
F1_1_Test       0.699686
F1_Avg_Test     0.742176
Name: knn-tfidf-under-prep, dtype: float64


In [42]:
# undersampled, balanced mnb on preprocessed text

clf = resample_pipeline(TfidfVectorizer(),
                        RandomUnderSampler(),
                        MultinomialNB(fit_prior = False))

clf.fit(X_train, y_train)

save_scores(clf, X_train, X_test, y_train, y_test, "mnb-tfidf-under-prep")

F1_0_Train      0.920773
F1_1_Train      0.897189
F1_Avg_Train    0.908981
F1_0_Test       0.794393
F1_1_Test       0.736132
F1_Avg_Test     0.765262
Name: mnb-tfidf-under-prep, dtype: float64


In [43]:
# random forest on preprocessed text

clf = resample_pipeline(TfidfVectorizer(),
                        RandomUnderSampler(),
                        RandomForestClassifier(class_weight = 'balanced'))

clf.fit(X_train, y_train)

save_scores(clf, X_train, X_test, y_train, y_test, "rf-tfidf-under-prep")

F1_0_Train      0.981882
F1_1_Train      0.976762
F1_Avg_Train    0.979322
F1_0_Test       0.814895
F1_1_Test       0.712490
F1_Avg_Test     0.763692
Name: rf-tfidf-under-prep, dtype: float64


In [44]:
scores_df.sort_values(by = 'F1_Avg_Test', ascending = False)

Unnamed: 0,F1_0_Train,F1_1_Train,F1_Avg_Train,F1_0_Test,F1_1_Test,F1_Avg_Test
mnb-tfidf-unb-prep,0.926736,0.889803,0.908269,0.836074,0.73385,0.784962
mnb-tfidf-unb,0.91058,0.859256,0.884918,0.835836,0.715695,0.775766
mnb-tfidf-under-prep,0.920773,0.897189,0.908981,0.794393,0.736132,0.765262
rf-tfidf-under-prep,0.981882,0.976762,0.979322,0.814895,0.71249,0.763692
rf-tfidf-unb-prep,0.997554,0.996749,0.997152,0.826873,0.69847,0.762672
knn-tfidf-unb,0.865827,0.799016,0.832421,0.808234,0.705,0.756617
knn-tfidf-unb-prep,0.865442,0.791545,0.828493,0.805391,0.696893,0.751142
rf-tfidf-unb,0.997409,0.996561,0.996985,0.816786,0.672161,0.744474
knn-tfidf-under-prep,0.848382,0.789732,0.819057,0.784667,0.699686,0.742176


### Oversampling and class_weight

In [45]:
from imblearn.over_sampling import RandomOverSampler

In [46]:
X_train, X_test, y_train, y_test = train_test_split(
    train_df.preprocessed_txt,
    train_df.target,
    test_size = 0.2,
    random_state = 2022,
    stratify = train_df.target)

In [47]:
# oversampled knn on preprocessed data
clf = resample_pipeline(TfidfVectorizer(),
                        RandomOverSampler(),
                        KNeighborsClassifier())

clf.fit(X_train, y_train)

save_scores(clf, X_train, X_test, y_train, y_test, "knn-tfidf-over-prep")

F1_0_Train      0.857022
F1_1_Train      0.801646
F1_Avg_Train    0.829334
F1_0_Test       0.782854
F1_1_Test       0.697565
F1_Avg_Test     0.740209
Name: knn-tfidf-over-prep, dtype: float64


In [48]:
# oversampled, balanced mnb on preprocessed text

clf = resample_pipeline(TfidfVectorizer(),
                        RandomOverSampler(),
                        MultinomialNB(fit_prior = False))

clf.fit(X_train, y_train)

save_scores(clf, X_train, X_test, y_train, y_test, "mnb-tfidf-over-prep")

F1_0_Train      0.934497
F1_1_Train      0.911719
F1_Avg_Train    0.923108
F1_0_Test       0.797688
F1_1_Test       0.734043
F1_Avg_Test     0.765865
Name: mnb-tfidf-over-prep, dtype: float64


In [49]:
# oversampled random forest on preprocessed text

clf = resample_pipeline(TfidfVectorizer(),
                        RandomOverSampler(),
                        RandomForestClassifier(class_weight = 'balanced'))

clf.fit(X_train, y_train)

save_scores(clf, X_train, X_test, y_train, y_test, "rf-tfidf-over-prep")

F1_0_Train      0.997409
F1_1_Train      0.996561
F1_Avg_Train    0.996985
F1_0_Test       0.820513
F1_1_Test       0.697797
F1_Avg_Test     0.759155
Name: rf-tfidf-over-prep, dtype: float64


In [50]:
scores_df.sort_values(by = 'F1_Avg_Test', ascending = False)

Unnamed: 0,F1_0_Train,F1_1_Train,F1_Avg_Train,F1_0_Test,F1_1_Test,F1_Avg_Test
mnb-tfidf-unb-prep,0.926736,0.889803,0.908269,0.836074,0.73385,0.784962
mnb-tfidf-unb,0.91058,0.859256,0.884918,0.835836,0.715695,0.775766
mnb-tfidf-over-prep,0.934497,0.911719,0.923108,0.797688,0.734043,0.765865
mnb-tfidf-under-prep,0.920773,0.897189,0.908981,0.794393,0.736132,0.765262
rf-tfidf-under-prep,0.981882,0.976762,0.979322,0.814895,0.71249,0.763692
rf-tfidf-unb-prep,0.997554,0.996749,0.997152,0.826873,0.69847,0.762672
rf-tfidf-over-prep,0.997409,0.996561,0.996985,0.820513,0.697797,0.759155
knn-tfidf-unb,0.865827,0.799016,0.832421,0.808234,0.705,0.756617
knn-tfidf-unb-prep,0.865442,0.791545,0.828493,0.805391,0.696893,0.751142
rf-tfidf-unb,0.997409,0.996561,0.996985,0.816786,0.672161,0.744474


### SMOTE and class_weight

In [51]:
from imblearn.over_sampling import SMOTE

In [52]:
X_train, X_test, y_train, y_test = train_test_split(
    train_df.preprocessed_txt,
    train_df.target,
    test_size = 0.2,
    random_state = 2022,
    stratify = train_df.target)

In [53]:
# smote knn on preprocessed data
clf = resample_pipeline(TfidfVectorizer(),
                        SMOTE(),
                        KNeighborsClassifier())

clf.fit(X_train, y_train)

save_scores(clf, X_train, X_test, y_train, y_test, "knn-tfidf-smote-prep")

F1_0_Train      0.453258
F1_1_Train      0.669477
F1_Avg_Train    0.561367
F1_0_Test       0.422778
F1_1_Test       0.645469
F1_Avg_Test     0.534124
Name: knn-tfidf-smote-prep, dtype: float64


In [54]:
# smote, balanced mnb on preprocessed text

clf = resample_pipeline(TfidfVectorizer(),
                        SMOTE(),
                        MultinomialNB(fit_prior = False))

clf.fit(X_train, y_train)

save_scores(clf, X_train, X_test, y_train, y_test, "mnb-tfidf-smote-prep")

F1_0_Train      0.934886
F1_1_Train      0.913268
F1_Avg_Train    0.924077
F1_0_Test       0.798841
F1_1_Test       0.737320
F1_Avg_Test     0.768080
Name: mnb-tfidf-smote-prep, dtype: float64


In [55]:
# smote, default mnb on preprocessed text

clf = resample_pipeline(TfidfVectorizer(),
                        SMOTE(),
                        MultinomialNB())

clf.fit(X_train, y_train)

save_scores(clf, X_train, X_test, y_train, y_test, "mnb-tfidf-smote-def-prep")

F1_0_Train      0.931020
F1_1_Train      0.907624
F1_Avg_Train    0.919322
F1_0_Test       0.801615
F1_1_Test       0.737805
F1_Avg_Test     0.769710
Name: mnb-tfidf-smote-def-prep, dtype: float64


In [56]:
# smote random forest on preprocessed text

clf = resample_pipeline(TfidfVectorizer(),
                        SMOTE(),
                        RandomForestClassifier(class_weight = 'balanced'))

clf.fit(X_train, y_train)

save_scores(clf, X_train, X_test, y_train, y_test, "rf-tfidf-smote-prep")

F1_0_Train      0.997554
F1_1_Train      0.996750
F1_Avg_Train    0.997152
F1_0_Test       0.827550
F1_1_Test       0.701345
F1_Avg_Test     0.764448
Name: rf-tfidf-smote-prep, dtype: float64


In [57]:
scores_df.sort_values(by = 'F1_Avg_Test', ascending = False).head()

Unnamed: 0,F1_0_Train,F1_1_Train,F1_Avg_Train,F1_0_Test,F1_1_Test,F1_Avg_Test
mnb-tfidf-unb-prep,0.926736,0.889803,0.908269,0.836074,0.73385,0.784962
mnb-tfidf-unb,0.91058,0.859256,0.884918,0.835836,0.715695,0.775766
mnb-tfidf-smote-def-prep,0.93102,0.907624,0.919322,0.801615,0.737805,0.76971
mnb-tfidf-smote-prep,0.934886,0.913268,0.924077,0.798841,0.73732,0.76808
mnb-tfidf-over-prep,0.934497,0.911719,0.923108,0.797688,0.734043,0.765865


# spaCy Word Embeddings

## MNB, no class balancing

In [59]:
train_df.head()

Unnamed: 0,id,keyword,location,text,target,preprocessed_txt
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,deed Reason earthquake ALLAH forgive
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near La Ronge Sask Canada
2,5,,,All residents asked to 'shelter in place' are ...,1,resident ask shelter place notify officer evac...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"13,000 people receive wildfire evacuation orde..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,got send photo Ruby Alaska smoke wildfire pour...


In [60]:
import spacy
nlp = spacy.load("en_core_web_lg")

In [62]:
# make spacy vectors (takes awhile!)
train_df['spacy_vector'] = train_df['text'].apply(lambda x: nlp(x).vector)

In [64]:
# tts
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    train_df.spacy_vector.values,
    train_df.target,
    test_size = 0.2,
    random_state = 2022,
    stratify = train_df.target)

In [65]:
# sets are of format numpy array of numpy arrays
# need to flatten the arrays because clf is expecting
# just a 2d numpy array

import numpy as np

X_train_2d = np.stack(X_train)
X_test_2d = np.stack(X_test)

In [66]:
# scale values so there are no negative values
# MultinomialNB doesn't accept negative values
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaled_train_embed = scaler.fit_transform(X_train_2d)
scaled_test_embed = scaler.transform(X_test_2d)

In [73]:
# mnb, spacy word vectors
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB()
clf.fit(scaled_train_embed, y_train)

save_scores(clf, 
            scaled_train_embed, 
            scaled_test_embed, 
            y_train, 
            y_test, 
            "mnb-spacyvec-unb")

scores_df.sort_values(by = 'F1_Avg_Test', ascending = False)

F1_0_Train      0.707195
F1_1_Train      0.628678
F1_Avg_Train    0.667937
F1_0_Test       0.704639
F1_1_Test       0.625465
F1_Avg_Test     0.665052
Name: mnb-spacyvec-unb, dtype: float64


Unnamed: 0,F1_0_Train,F1_1_Train,F1_Avg_Train,F1_0_Test,F1_1_Test,F1_Avg_Test
mnb-tfidf-unb-prep,0.926736,0.889803,0.908269,0.836074,0.73385,0.784962
mnb-tfidf-unb,0.91058,0.859256,0.884918,0.835836,0.715695,0.775766
mnb-tfidf-smote-def-prep,0.93102,0.907624,0.919322,0.801615,0.737805,0.76971
mnb-tfidf-smote-prep,0.934886,0.913268,0.924077,0.798841,0.73732,0.76808
mnb-tfidf-over-prep,0.934497,0.911719,0.923108,0.797688,0.734043,0.765865
mnb-tfidf-under-prep,0.920773,0.897189,0.908981,0.794393,0.736132,0.765262
rf-tfidf-smote-prep,0.997554,0.99675,0.997152,0.82755,0.701345,0.764448
rf-tfidf-under-prep,0.981882,0.976762,0.979322,0.814895,0.71249,0.763692
rf-tfidf-unb-prep,0.997554,0.996749,0.997152,0.826873,0.69847,0.762672
rf-tfidf-over-prep,0.997409,0.996561,0.996985,0.820513,0.697797,0.759155


## KNN, no class balancing

In [74]:
# knn

from sklearn.neighbors import KNeighborsClassifier

clf = KNeighborsClassifier(n_neighbors = 5, metric = 'euclidean')

clf.fit(X_train_2d, y_train)

save_scores(clf, 
            X_train_2d, 
            X_test_2d, 
            y_train, 
            y_test, 
            "knn-spacyvec-unb")

scores_df.sort_values(by = 'F1_Avg_Test', ascending = False)

F1_0_Train      0.842459
F1_1_Train      0.776741
F1_Avg_Train    0.809600
F1_0_Test       0.754886
F1_1_Test       0.650199
F1_Avg_Test     0.702542
Name: knn-spacyvec-unb, dtype: float64


Unnamed: 0,F1_0_Train,F1_1_Train,F1_Avg_Train,F1_0_Test,F1_1_Test,F1_Avg_Test
mnb-tfidf-unb-prep,0.926736,0.889803,0.908269,0.836074,0.73385,0.784962
mnb-tfidf-unb,0.91058,0.859256,0.884918,0.835836,0.715695,0.775766
mnb-tfidf-smote-def-prep,0.93102,0.907624,0.919322,0.801615,0.737805,0.76971
mnb-tfidf-smote-prep,0.934886,0.913268,0.924077,0.798841,0.73732,0.76808
mnb-tfidf-over-prep,0.934497,0.911719,0.923108,0.797688,0.734043,0.765865
mnb-tfidf-under-prep,0.920773,0.897189,0.908981,0.794393,0.736132,0.765262
rf-tfidf-smote-prep,0.997554,0.99675,0.997152,0.82755,0.701345,0.764448
rf-tfidf-under-prep,0.981882,0.976762,0.979322,0.814895,0.71249,0.763692
rf-tfidf-unb-prep,0.997554,0.996749,0.997152,0.826873,0.69847,0.762672
rf-tfidf-over-prep,0.997409,0.996561,0.996985,0.820513,0.697797,0.759155


## RF, no class balancing

In [76]:
# rf

clf = RandomForestClassifier()

clf.fit(X_train_2d, y_train)

save_scores(clf, 
            X_train_2d, 
            X_test_2d, 
            y_train, 
            y_test, 
            "rf-spacyvec-unb")

scores_df.sort_values(by = 'F1_Avg_Test', ascending = False).head()

F1_0_Train      0.990831
F1_1_Train      0.987692
F1_Avg_Train    0.989262
F1_0_Test       0.801498
F1_1_Test       0.684792
F1_Avg_Test     0.743145
Name: rf-spacyvec-unb, dtype: float64


Unnamed: 0,F1_0_Train,F1_1_Train,F1_Avg_Train,F1_0_Test,F1_1_Test,F1_Avg_Test
mnb-tfidf-unb-prep,0.926736,0.889803,0.908269,0.836074,0.73385,0.784962
mnb-tfidf-unb,0.91058,0.859256,0.884918,0.835836,0.715695,0.775766
mnb-tfidf-smote-def-prep,0.93102,0.907624,0.919322,0.801615,0.737805,0.76971
mnb-tfidf-smote-prep,0.934886,0.913268,0.924077,0.798841,0.73732,0.76808
mnb-tfidf-over-prep,0.934497,0.911719,0.923108,0.797688,0.734043,0.765865


## MNB, class balancing

In [79]:
# undersampled, balanced mnb on preprocessed text

# tts
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    train_df.spacy_vector.values,
    train_df.target,
    test_size = 0.2,
    random_state = 2022,
    stratify = train_df.target)

# sets are of format numpy array of numpy arrays
# need to flatten the arrays because clf is expecting
# just a 2d numpy array

import numpy as np

X_train_2d = np.stack(X_train)
X_test_2d = np.stack(X_test)

# scale values so there are no negative values
# MultinomialNB doesn't accept negative values
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaled_train_embed = scaler.fit_transform(X_train_2d)
scaled_test_embed = scaler.transform(X_test_2d)

clf = resample_pipeline(RandomUnderSampler(),
                        MultinomialNB(fit_prior = False))
clf.fit(scaled_train_embed, y_train)

save_scores(clf, 
            scaled_train_embed, 
            scaled_test_embed, 
            y_train, 
            y_test, 
            "mnb-spacyvec-under")

scores_df.sort_values(by = 'F1_Avg_Test', ascending = False).head()

F1_0_Train      0.676200
F1_1_Train      0.653755
F1_Avg_Train    0.664977
F1_0_Test       0.666240
F1_1_Test       0.647773
F1_Avg_Test     0.657007
Name: mnb-spacyvec-under, dtype: float64


Unnamed: 0,F1_0_Train,F1_1_Train,F1_Avg_Train,F1_0_Test,F1_1_Test,F1_Avg_Test
mnb-tfidf-unb-prep,0.926736,0.889803,0.908269,0.836074,0.73385,0.784962
mnb-tfidf-unb,0.91058,0.859256,0.884918,0.835836,0.715695,0.775766
mnb-tfidf-smote-def-prep,0.93102,0.907624,0.919322,0.801615,0.737805,0.76971
mnb-tfidf-smote-prep,0.934886,0.913268,0.924077,0.798841,0.73732,0.76808
mnb-tfidf-over-prep,0.934497,0.911719,0.923108,0.797688,0.734043,0.765865


In [80]:
# oversampled, balanced mnb on preprocessed text

# tts
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    train_df.spacy_vector.values,
    train_df.target,
    test_size = 0.2,
    random_state = 2022,
    stratify = train_df.target)

# sets are of format numpy array of numpy arrays
# need to flatten the arrays because clf is expecting
# just a 2d numpy array

import numpy as np

X_train_2d = np.stack(X_train)
X_test_2d = np.stack(X_test)

# scale values so there are no negative values
# MultinomialNB doesn't accept negative values
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaled_train_embed = scaler.fit_transform(X_train_2d)
scaled_test_embed = scaler.transform(X_test_2d)

clf = resample_pipeline(RandomOverSampler(),
                        MultinomialNB(fit_prior = False))
clf.fit(scaled_train_embed, y_train)

save_scores(clf, 
            scaled_train_embed, 
            scaled_test_embed, 
            y_train, 
            y_test, 
            "mnb-spacyvec-over")

scores_df.sort_values(by = 'F1_Avg_Test', ascending = False).head()

F1_0_Train      0.676620
F1_1_Train      0.653977
F1_Avg_Train    0.665298
F1_0_Test       0.667093
F1_1_Test       0.648211
F1_Avg_Test     0.657652
Name: mnb-spacyvec-over, dtype: float64


Unnamed: 0,F1_0_Train,F1_1_Train,F1_Avg_Train,F1_0_Test,F1_1_Test,F1_Avg_Test
mnb-tfidf-unb-prep,0.926736,0.889803,0.908269,0.836074,0.73385,0.784962
mnb-tfidf-unb,0.91058,0.859256,0.884918,0.835836,0.715695,0.775766
mnb-tfidf-smote-def-prep,0.93102,0.907624,0.919322,0.801615,0.737805,0.76971
mnb-tfidf-smote-prep,0.934886,0.913268,0.924077,0.798841,0.73732,0.76808
mnb-tfidf-over-prep,0.934497,0.911719,0.923108,0.797688,0.734043,0.765865


In [81]:
# smoted, balanced mnb on preprocessed text

# tts
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    train_df.spacy_vector.values,
    train_df.target,
    test_size = 0.2,
    random_state = 2022,
    stratify = train_df.target)

# sets are of format numpy array of numpy arrays
# need to flatten the arrays because clf is expecting
# just a 2d numpy array

import numpy as np

X_train_2d = np.stack(X_train)
X_test_2d = np.stack(X_test)

# scale values so there are no negative values
# MultinomialNB doesn't accept negative values
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaled_train_embed = scaler.fit_transform(X_train_2d)
scaled_test_embed = scaler.transform(X_test_2d)

clf = resample_pipeline(SMOTE(),
                        MultinomialNB(fit_prior = False))
clf.fit(scaled_train_embed, y_train)

save_scores(clf, 
            scaled_train_embed, 
            scaled_test_embed, 
            y_train, 
            y_test, 
            "mnb-spacyvec-smote")

scores_df.sort_values(by = 'F1_Avg_Test', ascending = False).head()

F1_0_Train      0.676298
F1_1_Train      0.653290
F1_Avg_Train    0.664794
F1_0_Test       0.663677
F1_1_Test       0.646465
F1_Avg_Test     0.655071
Name: mnb-spacyvec-smote, dtype: float64


Unnamed: 0,F1_0_Train,F1_1_Train,F1_Avg_Train,F1_0_Test,F1_1_Test,F1_Avg_Test
mnb-tfidf-unb-prep,0.926736,0.889803,0.908269,0.836074,0.73385,0.784962
mnb-tfidf-unb,0.91058,0.859256,0.884918,0.835836,0.715695,0.775766
mnb-tfidf-smote-def-prep,0.93102,0.907624,0.919322,0.801615,0.737805,0.76971
mnb-tfidf-smote-prep,0.934886,0.913268,0.924077,0.798841,0.73732,0.76808
mnb-tfidf-over-prep,0.934497,0.911719,0.923108,0.797688,0.734043,0.765865


In [82]:
# smoted, unbalanced mnb on preprocessed text

# tts
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    train_df.spacy_vector.values,
    train_df.target,
    test_size = 0.2,
    random_state = 2022,
    stratify = train_df.target)

# sets are of format numpy array of numpy arrays
# need to flatten the arrays because clf is expecting
# just a 2d numpy array

import numpy as np

X_train_2d = np.stack(X_train)
X_test_2d = np.stack(X_test)

# scale values so there are no negative values
# MultinomialNB doesn't accept negative values
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaled_train_embed = scaler.fit_transform(X_train_2d)
scaled_test_embed = scaler.transform(X_test_2d)

clf = resample_pipeline(SMOTE(),
                        MultinomialNB())
clf.fit(scaled_train_embed, y_train)

save_scores(clf, 
            scaled_train_embed, 
            scaled_test_embed, 
            y_train, 
            y_test, 
            "mnb-spacyvec-smote")

scores_df.sort_values(by = 'F1_Avg_Test', ascending = False).head()

F1_0_Train      0.676284
F1_1_Train      0.652248
F1_Avg_Train    0.664266
F1_0_Test       0.667093
F1_1_Test       0.648211
F1_Avg_Test     0.657652
Name: mnb-spacyvec-smote, dtype: float64


Unnamed: 0,F1_0_Train,F1_1_Train,F1_Avg_Train,F1_0_Test,F1_1_Test,F1_Avg_Test
mnb-tfidf-unb-prep,0.926736,0.889803,0.908269,0.836074,0.73385,0.784962
mnb-tfidf-unb,0.91058,0.859256,0.884918,0.835836,0.715695,0.775766
mnb-tfidf-smote-def-prep,0.93102,0.907624,0.919322,0.801615,0.737805,0.76971
mnb-tfidf-smote-prep,0.934886,0.913268,0.924077,0.798841,0.73732,0.76808
mnb-tfidf-over-prep,0.934497,0.911719,0.923108,0.797688,0.734043,0.765865


## KNN, class balancing

In [83]:
# undersampled knn

clf = resample_pipeline(RandomUnderSampler(),
                        KNeighborsClassifier(n_neighbors = 5,
                                            metric = 'euclidean'))

clf.fit(X_train_2d, y_train)

save_scores(clf, 
            X_train_2d, 
            X_test_2d, 
            y_train, 
            y_test, 
            "knn-spacyvec-under")

F1_0_Train      0.817106
F1_1_Train      0.772040
F1_Avg_Train    0.794573
F1_0_Test       0.720903
F1_1_Test       0.654919
F1_Avg_Test     0.687911
Name: knn-spacyvec-under, dtype: float64


In [84]:
# oversampled knn

clf = resample_pipeline(RandomOverSampler(),
                        KNeighborsClassifier(n_neighbors = 5,
                                            metric = 'euclidean'))

clf.fit(X_train_2d, y_train)

save_scores(clf, 
            X_train_2d, 
            X_test_2d, 
            y_train, 
            y_test, 
            "knn-spacyvec-over")

F1_0_Train      0.828014
F1_1_Train      0.784922
F1_Avg_Train    0.806468
F1_0_Test       0.711824
F1_1_Test       0.644167
F1_Avg_Test     0.677996
Name: knn-spacyvec-over, dtype: float64


In [85]:
# smoted knn

clf = resample_pipeline(SMOTE(),
                        KNeighborsClassifier(n_neighbors = 5,
                                            metric = 'euclidean'))

clf.fit(X_train_2d, y_train)

save_scores(clf, 
            X_train_2d, 
            X_test_2d, 
            y_train, 
            y_test, 
            "knn-spacyvec-smote")

F1_0_Train      0.786374
F1_1_Train      0.772465
F1_Avg_Train    0.779420
F1_0_Test       0.654402
F1_1_Test       0.654856
F1_Avg_Test     0.654629
Name: knn-spacyvec-smote, dtype: float64


In [86]:
# undersampled default knn

clf = resample_pipeline(RandomUnderSampler(),
                        KNeighborsClassifier())

clf.fit(X_train_2d, y_train)

save_scores(clf, 
            X_train_2d, 
            X_test_2d, 
            y_train, 
            y_test, 
            "knndef-spacyvec-under")

F1_0_Train      0.812723
F1_1_Train      0.768892
F1_Avg_Train    0.790808
F1_0_Test       0.726521
F1_1_Test       0.657797
F1_Avg_Test     0.692159
Name: knndef-spacyvec-under, dtype: float64


In [87]:
# oversampled default knn

clf = resample_pipeline(RandomOverSampler(),
                        KNeighborsClassifier())

clf.fit(X_train_2d, y_train)

save_scores(clf, 
            X_train_2d, 
            X_test_2d, 
            y_train, 
            y_test, 
            "knndef-spacyvec-over")

F1_0_Train      0.826461
F1_1_Train      0.784847
F1_Avg_Train    0.805654
F1_0_Test       0.720238
F1_1_Test       0.655930
F1_Avg_Test     0.688084
Name: knndef-spacyvec-over, dtype: float64


In [88]:
# smoted default knn

clf = resample_pipeline(SMOTE(),
                        KNeighborsClassifier())

clf.fit(X_train_2d, y_train)

save_scores(clf, 
            X_train_2d, 
            X_test_2d, 
            y_train, 
            y_test, 
            "knndef-spacyvec-smote")

F1_0_Train      0.785256
F1_1_Train      0.774411
F1_Avg_Train    0.779834
F1_0_Test       0.650667
F1_1_Test       0.661061
F1_Avg_Test     0.655864
Name: knndef-spacyvec-smote, dtype: float64


## RF, class balancing

In [91]:
# rf under

clf = resample_pipeline(RandomUnderSampler(),
                        RandomForestClassifier())

clf.fit(X_train_2d, y_train)

save_scores(clf, 
            X_train_2d, 
            X_test_2d, 
            y_train, 
            y_test, 
            "rf-spacyvec-under")

F1_0_Train      0.966774
F1_1_Train      0.957977
F1_Avg_Train    0.962376
F1_0_Test       0.779236
F1_1_Test       0.700696
F1_Avg_Test     0.739966
Name: rf-spacyvec-under, dtype: float64


In [97]:
# rf under bal

clf = resample_pipeline(RandomUnderSampler(),
                        RandomForestClassifier(class_weight = 'balanced'))

clf.fit(X_train_2d, y_train)

save_scores(clf, 
            X_train_2d, 
            X_test_2d, 
            y_train, 
            y_test, 
            "rfbal-spacyvec-under")

F1_0_Train      0.963695
F1_1_Train      0.954478
F1_Avg_Train    0.959087
F1_0_Test       0.774453
F1_1_Test       0.700306
F1_Avg_Test     0.737380
Name: rfbal-spacyvec-under, dtype: float64


In [92]:
# rf over

clf = resample_pipeline(RandomOverSampler(),
                        RandomForestClassifier())

clf.fit(X_train_2d, y_train)

save_scores(clf, 
            X_train_2d, 
            X_test_2d, 
            y_train, 
            y_test, 
            "rf-spacyvec-over")

F1_0_Train      0.990085
F1_1_Train      0.986784
F1_Avg_Train    0.988434
F1_0_Test       0.797157
F1_1_Test       0.695152
F1_Avg_Test     0.746154
Name: rf-spacyvec-over, dtype: float64


In [96]:
# rf over bal

clf = resample_pipeline(RandomOverSampler(),
                        RandomForestClassifier(class_weight = 'balanced'))

clf.fit(X_train_2d, y_train)

save_scores(clf, 
            X_train_2d, 
            X_test_2d, 
            y_train, 
            y_test, 
            "rfbal-spacyvec-over")

F1_0_Train      0.990515
F1_1_Train      0.987361
F1_Avg_Train    0.988938
F1_0_Test       0.795640
F1_1_Test       0.690339
F1_Avg_Test     0.742989
Name: rfbal-spacyvec-over, dtype: float64


In [93]:
# rf smote

clf = resample_pipeline(SMOTE(),
                        RandomForestClassifier())

clf.fit(X_train_2d, y_train)

save_scores(clf, 
            X_train_2d, 
            X_test_2d, 
            y_train, 
            y_test, 
            "rf-spacyvec-smote")

F1_0_Train      0.990815
F1_1_Train      0.987721
F1_Avg_Train    0.989268
F1_0_Test       0.797357
F1_1_Test       0.700813
F1_Avg_Test     0.749085
Name: rf-spacyvec-smote, dtype: float64


In [95]:
# rf smote with balancing

clf = resample_pipeline(SMOTE(),
                        RandomForestClassifier(class_weight = 'balanced'))

clf.fit(X_train_2d, y_train)

save_scores(clf, 
            X_train_2d, 
            X_test_2d, 
            y_train, 
            y_test, 
            "rfbal-spacyvec-smote")

F1_0_Train      0.990826
F1_1_Train      0.987702
F1_Avg_Train    0.989264
F1_0_Test       0.795806
F1_1_Test       0.700162
F1_Avg_Test     0.747984
Name: rfbal-spacyvec-smote, dtype: float64


In [99]:
scores_df.sort_values(by = 'F1_Avg_Test', ascending = False).head()

Unnamed: 0,F1_0_Train,F1_1_Train,F1_Avg_Train,F1_0_Test,F1_1_Test,F1_Avg_Test
mnb-tfidf-unb-prep,0.926736,0.889803,0.908269,0.836074,0.73385,0.784962
mnb-tfidf-unb,0.91058,0.859256,0.884918,0.835836,0.715695,0.775766
mnb-tfidf-smote-def-prep,0.93102,0.907624,0.919322,0.801615,0.737805,0.76971
mnb-tfidf-smote-prep,0.934886,0.913268,0.924077,0.798841,0.73732,0.76808
mnb-tfidf-over-prep,0.934497,0.911719,0.923108,0.797688,0.734043,0.765865


# Gensim word vectors

In [2]:
import gensim.downloader as api

wv = api.load("word2vec-google-news-300")

In [3]:
import pandas as pd
df = pd.read_csv('Data/train.csv')

In [4]:
print(df.shape)
df.head()

(7613, 5)


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [None]:
# balance classes?

In [6]:
# preprocess and get gensim doc vector
import spacy

nlp = spacy.load("en_core_web_lg")

def preprocess_and_vectorize(text):
    doc = nlp(text)
    
    filtered_tokens = []
    
    for token in doc:
        if token.is_punct or token.is_stop:
            continue
        filtered_tokens.append(token.lemma_)
    
    return wv.get_mean_vector(filtered_tokens)

In [8]:
# convert text into gensim word embeddings

df['gensim_vector'] = df['text'].apply(lambda text: preprocess_and_vectorize(text))

In [9]:
df.head()

Unnamed: 0,id,keyword,location,text,target,gensim_vector
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,"[0.05016107, 0.00387215, 0.047061782, 0.028958..."
1,4,,,Forest fire near La Ronge Sask. Canada,1,"[0.03064329, 0.0030595234, 0.0369662, 0.020602..."
2,5,,,All residents asked to 'shelter in place' are ...,1,"[-0.0048536863, 0.011481234, 0.016771162, -0.0..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"[0.060398173, -0.012511074, -0.0018801317, 0.0..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,"[0.021673834, 0.0012636562, -0.031610973, 0.03..."


In [13]:
# tts
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df.gensim_vector.values,
    df.target,
    test_size = 0.2,
    random_state = 2022,
    stratify = df.target)

In [15]:
# create 2d np arrays for X train and test sets
import numpy as np
X_train_2d = np.stack(X_train)
X_test_2d = np.stack(X_test)

In [19]:
# gradient boosting classifier
from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier()

clf.fit(X_train_2d, y_train)

save_scores(clf, 
            X_train_2d, 
            X_test_2d, 
            y_train, 
            y_test, 
            "gbc-gensim")

F1_0_Train      0.891286
F1_1_Train      0.841212
F1_Avg_Train    0.866249
F1_0_Test       0.816304
F1_1_Test       0.737849
F1_Avg_Test     0.777076
Name: gbc-gensim, dtype: float64


In [20]:
scores_df.sort_values(by = "F1_Avg_Test", ascending = False).head()

Unnamed: 0,F1_0_Train,F1_1_Train,F1_Avg_Train,F1_0_Test,F1_1_Test,F1_Avg_Test
gbc-gensim,0.891286,0.841212,0.866249,0.816304,0.737849,0.777076


more gensim:
other models available:
- twitter, wiki
- glove, fasttext

consider gensim models:
- glove-twitter-200
- word2vec-google-news-300
- glove-wiki-gigaword-300

# Fast Text

In [7]:
# read in kaggle's training data
import pandas as pd
df = pd.read_csv('Data/train.csv')

In [8]:
# check shape and head
print(df.shape)
df.head()

(7613, 5)


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [9]:
# format target labels for fasttext:
# must have "__label__" before each target lable
df['target'] = "__label__" + df['target'].astype(str)

# check
df.target.unique()

array(['__label__1', '__label__0'], dtype=object)

In [10]:
# merge target and text columns so the text is on the same line as the label
# with the label first (separated by a space); this is required for 
# fasttext formatting
df['target_text'] = df['target'] + " " + df['text']

# check
df.target_text.head()

0    __label__1 Our Deeds are the Reason of this #e...
1    __label__1 Forest fire near La Ronge Sask. Canada
2    __label__1 All residents asked to 'shelter in ...
3    __label__1 13,000 people receive #wildfires ev...
4    __label__1 Just got sent this photo from Ruby ...
Name: target_text, dtype: object

In [11]:
# define preprocess function

import re

# preprocess function subs everything that's not a word character or 
# is a space character or is an apostrophe with a space, then subs all
# multiple spaces with just one space, then strips leading and lagging spaces
# and converts all letters to lowercase, then subs newline characters with a
# single space and subs all multiple spaces with just one space again
def preprocess(text):
    text = re.sub(r"[^\w\s\']", " ", text)
    text = re.sub(r" +", " ", text)
    text = text.strip().lower()
    text = re.sub(r"\\n+", " ", text)
    text = re.sub(r" +", " ", text)
    return text

# apply preprocess function to target_text column
df['target_text'] = df['target_text'].map(preprocess)

# check
df.target_text.head()

0    __label__1 our deeds are the reason of this ea...
1     __label__1 forest fire near la ronge sask canada
2    __label__1 all residents asked to 'shelter in ...
3    __label__1 13 000 people receive wildfires eva...
4    __label__1 just got sent this photo from ruby ...
Name: target_text, dtype: object

In [12]:
# make preprocessed text column (without label) to test model on
df['processed_text'] = df['text'].map(preprocess)

# check
df.processed_text.head()

0    our deeds are the reason of this earthquake ma...
1                forest fire near la ronge sask canada
2    all residents asked to 'shelter in place' are ...
3    13 000 people receive wildfires evacuation ord...
4    just got sent this photo from ruby alaska as s...
Name: processed_text, dtype: object

In [13]:
# tts

from sklearn.model_selection import train_test_split

train, test = train_test_split(df, 
                               test_size = 0.2, 
                               random_state = 2022,
                               stratify = df.target)

# check
print(train.shape)
print(test.shape)

(6090, 7)
(1523, 7)


In [14]:
# save formatted and labeled column in file for fasttext to train on

import csv

train.to_csv("disaster_train_fasttext.csv", 
             columns = ["target_text"], 
             index = False, 
             header = False,
             quoting = csv.QUOTE_NONNUMERIC)

In [2]:
!pip install fasttext

Collecting fasttext

  error: subprocess-exited-with-error
  
  python setup.py bdist_wheel did not run successfully.
  exit code: 1
  
  [75 lines of output]
  !!
  
          ********************************************************************************
          Usage of dash-separated 'description-file' will not be supported in future
          versions. Please use the underscore name 'description_file' instead.
  
          By 2023-Sep-26, you need to update your project and remove deprecated calls
          or your builds will no longer be supported.
  
          See https://setuptools.pypa.io/en/latest/userguide/declarative_config.html for details.
          ********************************************************************************
  
  !!
    opt = self.warn_dash_deprecation(opt, section)
  running bdist_wheel
  running build
  running build_py
  creating build
  creating build\lib.win-amd64-cpython-310
  creating build\lib.win-amd64-cpython-310\fasttext
  copying python\fasttext_module\fas


  Using cached fasttext-0.9.2.tar.gz (68 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py): started
  Building wheel for fasttext (setup.py): finished with status 'error'
  Running setup.py clean for fasttext
Failed to build fasttext


In [16]:
import fasttext

ModuleNotFoundError: No module named 'fasttext'

In [15]:
# train model with labels
model = fasttext.train_supervised(input = 'disaster_train_fasttext.csv')

ModuleNotFoundError: No module named 'fasttext'

In [None]:
# check model
model.predict("hello how are you")

# THIS KILLS MY KERNEL EVERY TIME

# IGNORE BELOW HERE

In [12]:
print(len(test.processed_text))

1523


In [13]:
# make file with processed text (without labels) in test set
# to test model on

test.to_csv("disaster_test_fasttext.csv", 
             columns = ["processed_text"], 
             index = False, 
             header = False,
             quoting = csv.QUOTE_NONNUMERIC)

In [14]:
print(len(test.target))

1523


In [15]:
# make file with true labels for test set to compare model predictions to

test.to_csv("disaster_test_fasttext_true_labels.csv",
           columns = ['target'],
           index = False,
           header = False,
           quoting = csv.QUOTE_NONNUMERIC)

In [16]:
# load test data
# with open('disaster_test_fasttext.csv', 'r', encoding = 'utf-8') as f:
#     test_data = f.readlines()

test_data = pd.read_csv("disaster_test_fasttext.csv", 
                        header = None,
                        quoting = csv.QUOTE_NONNUMERIC)

print(len(test_data))
    
# check
print(test_data[:10])

1523
                                                   0
0  had a nightmare and was about to jump out of b...
1  mumbai24x7 helping hand in mumbai 2 ttes take ...
2  pandemonium in aba as woman delivers baby with...
3  whereas jez will obliterate the national debt ...
4                               dust storm in riyadh
5  savings and sewing in guatemala savings and se...
6  israeli helicopters that attacked civilians in...
7  amirkingkhan you would have been annihilated s...
8    going to go drown my sorrows with sad music brb
9  'how many men would a human hew if a human cou...


In [17]:
# check
test_data[0][0]

'had a nightmare and was about to jump out of bed when i remembered my injury alas it was too late and i screamed in my bedroom'

In [18]:
# remove newline characters
# test_data = [line.strip() for line in test_data]

# check
# print(test_data[:10])

In [19]:
# check length of test data
print(len(test_data))

1523


In [20]:
# convert test data to list
test_data_list = test_data[0].tolist()

# check
test_data_list[:10]

['had a nightmare and was about to jump out of bed when i remembered my injury alas it was too late and i screamed in my bedroom',
 'mumbai24x7 helping hand in mumbai 2 ttes take charge of helpline to calm anxious relatives the ind http t co tuaryijpqu mumbai',
 'pandemonium in aba as woman delivers baby without face photos http t co acfi2rhz4n',
 'whereas jez will obliterate the national debt and give lots of new benefits by simply printing money genius https t co reffbkvg9r',
 'dust storm in riyadh',
 'savings and sewing in guatemala savings and sewing in guatemala when a natural disaster hit seamstress elvia http t co jdx9ox2kik',
 'israeli helicopters that attacked civilians in gaza just completed exercises in greece',
 'amirkingkhan you would have been annihilated so you might as well thank floydmayweather',
 'going to go drown my sorrows with sad music brb',
 "'how many men would a human hew if a human could hew men '\n\n popular tongue twister among woodchucks"]

In [21]:
true_labels = pd.read_csv("disaster_test_fasttext_true_labels.csv", 
                        header = None,
                        quoting = csv.QUOTE_NONNUMERIC)

print(len(true_labels))
    
# check
print(true_labels[:10])

1523
            0
0  __label__0
1  __label__1
2  __label__0
3  __label__1
4  __label__1
5  __label__1
6  __label__1
7  __label__0
8  __label__0
9  __label__0


In [22]:
# check length of true_labels
print(len(true_labels))

1523


In [23]:
# convert true labels to list
true_labels_list = true_labels[0].tolist()

# check
true_labels_list[:10]

['__label__0',
 '__label__1',
 '__label__0',
 '__label__1',
 '__label__1',
 '__label__1',
 '__label__1',
 '__label__0',
 '__label__0',
 '__label__0']

In [24]:
# this line of code kills the kernel

# predict labels for test data
# model.predict(test_data[0][0])
# predictions = [model.predict(line)[0] for line in test_data.iterrows()]

# check
# print(predictions[:10])

In [25]:
# check length of predictions
# print(len(predictions))

In [26]:
test_data_list[0]

'had a nightmare and was about to jump out of bed when i remembered my injury alas it was too late and i screamed in my bedroom'

In [None]:
# this is causing my kernel to crash
# https://fasttext.cc/docs/en/supervised-tutorial.html

model.predict("which baking dish is best to bake a banana bread")

In [None]:
# test_prediction = model.predict(test_data_list[0])
# test_prediction

In [None]:
# # get the f1 score

# from sklearn.metrics import f1_score

# f1 = f1_score(true_labels, predictions, average = 'macro')

In [None]:


# model.test("disaster_test_fasttext")

# # three output numbers: size of test samples, precision, recall

In [None]:
# how to get f1?
# model.test_label("disaster_test_fasttext")

In [None]:
# y_pred = model.predict("disaster_test_fasttext")

# y_pred

In [None]:
# from sklearn.metrics import classification_report

# classification_report()

# Other

categorical nb?