# Natural Language Processing with Disaster Tweets (Kaggle Competition)

# Starter Notebook

In [106]:
# imports
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O
from sklearn import feature_extraction, linear_model, model_selection, preprocessing

In [107]:
# load the competition's train and test splits
train_df, test_df = map(pd.read_csv, ("Data/train.csv", "Data/test.csv"))

# summarize columns, non-null counts and dtypes of each frame
for frame in (train_df, test_df):
    frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB


In [108]:
# check for duplicates: count rows that are identical to an earlier row in every column
# NOTE(review): the comparison includes the `id` column, so tweets whose text is repeated
# under a different id are not counted here -- consider duplicated(subset=['text']) as well
train_df.duplicated().sum()

0

In [109]:
# check class balance: number of tweets in each target class
train_df['target'].value_counts()

# the classes are somewhat imbalanced (about 57% class 0 vs. 43% class 1),
# so class-balancing techniques may be worth trying later

0    4342
1    3271
Name: target, dtype: int64

In [110]:
# check uniqueness of id column: the number of distinct ids should equal
# the number of rows reported by train_df.info()
train_df['id'].nunique()

7613

In [111]:
# look at a handful of tweets labelled as non-disasters (target = 0)
is_nondisaster = train_df['target'].eq(0)
nondisaster_df = train_df.loc[is_nondisaster]

# NOTE: the slice starts at 1, so the first matching tweet is not printed
for tweet in nondisaster_df['text'].to_numpy()[1:10]:
    print(tweet)

I love fruits
Summer is lovely
My car is so fast
What a goooooooaaaaaal!!!!!!
this is ridiculous....
London is cool ;)
Love skiing
What a wonderful day!
LOOOOOOL


In [112]:
# look at a handful of tweets labelled as disasters (target = 1)
is_disaster = train_df['target'].eq(1)
disaster_df = train_df.loc[is_disaster]

# NOTE: the slice starts at 1, so the first matching tweet is not printed
for tweet in disaster_df['text'].to_numpy()[1:10]:
    print(tweet)

Forest fire near La Ronge Sask. Canada
All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected
13,000 people receive #wildfires evacuation orders in California 
Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school 
#RockyFire Update => California Hwy. 20 closed in both directions due to Lake County fire - #CAfire #wildfires
#flood #disaster Heavy rain causes flash flooding of streets in Manitou, Colorado Springs areas
I'm on top of the hill and I can see a fire in the woods...
There's an emergency evacuation happening now in the building across the street
I'm afraid that the tornado is coming to our area...


In [113]:
# represent each tweet as a vector of per-word counts
count_vectorizer = feature_extraction.text.CountVectorizer()

# sanity check on the first five tweets: show the shape and contents
# of the dense count vector for the first tweet
example_train_vectors = count_vectorizer.fit_transform(train_df['text'][0:5])
first_tweet_counts = example_train_vectors[0].todense()
print(first_tweet_counts.shape)
print(first_tweet_counts)

(1, 54)
[[0 0 0 1 1 1 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0
  0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 0]]


In [114]:
# create all vectors: fit the same vectorizer again, this time on every training tweet
# (this replaces the fit made on the five-tweet example)
train_vectors = count_vectorizer.fit_transform(train_df['text'])

In [115]:
# baseline model: ridge classifier with default settings
clf = linear_model.RidgeClassifier()

In [116]:
# 3-fold cross-validated F1 score of the baseline model on the count vectors
scores = model_selection.cross_val_score(
    estimator=clf,
    X=train_vectors,
    y=train_df['target'],
    scoring='f1',
    cv=3,
)
scores

array([0.59453669, 0.5642787 , 0.64082434])

In [117]:
# baseline (count vectors + ridge classifier) evaluated on a stratified 80/20 hold-out split.
# Pipeline and classification_report are imported here so that this cell does not
# depend on imports made by later cells and runs on a fresh kernel.
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

X_train, X_test, y_train, y_test = train_test_split(
    train_df.text,
    train_df.target,
    test_size = 0.2,
    random_state = 2022,
    stratify = train_df.target)

# vectorizing inside the pipeline means the vocabulary is learned from X_train only
clf = Pipeline([
    ('count_vectorizer', feature_extraction.text.CountVectorizer()),
    ('ridge_classifier', linear_model.RidgeClassifier())
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.85      0.81       869
           1       0.77      0.68      0.72       654

    accuracy                           0.78      1523
   macro avg       0.78      0.76      0.77      1523
weighted avg       0.78      0.78      0.77      1523



In [118]:
# helper function to save scores
from sklearn.metrics import f1_score

scores_df = pd.DataFrame()

# labels of the binary target, in the order the F1_0_* / F1_1_* columns expect
CLASS_LABELS = [0, 1]


def _f1_summary(y_true, y_pred, split):
    """Return per-class and macro F1 for one split, keyed by scores_df column name."""
    # labels= pins the result to exactly two entries in (0, 1) order, so the
    # unpacking below cannot fail when a class is absent from y_true and y_pred
    f1_0, f1_1 = f1_score(y_true, y_pred, labels = CLASS_LABELS, average = None)
    return {
        f'F1_0_{split}': f1_0,
        f'F1_1_{split}': f1_1,
        f'F1_Avg_{split}': f1_score(y_true, y_pred, labels = CLASS_LABELS, average = 'macro'),
    }


def save_scores(model_pipe, X_train, X_test, y_train, y_test, name):
    """Score a fitted model on the train and test splits and record the results.

    Parameters
    ----------
    model_pipe : fitted estimator or pipeline exposing ``predict``
    X_train, X_test : inputs passed to ``model_pipe.predict``
    y_train, y_test : true labels for the corresponding split
    name : row label under which the scores are stored in the module-level ``scores_df``

    Side effects
    ------------
    Writes six columns (F1_0/F1_1/F1_Avg for Train and Test) into row ``name`` of
    ``scores_df``, overwriting any earlier values stored under the same name, and
    prints that row. Returns None.
    """
    # train columns first, then test columns, to keep the column order stable
    summary = {
        **_f1_summary(y_train, model_pipe.predict(X_train), 'Train'),
        **_f1_summary(y_test, model_pipe.predict(X_test), 'Test'),
    }

    # store scores
    for column, value in summary.items():
        scores_df.at[name, column] = value

    # show scores for this model only (can call scores_df to see all scores)
    print(scores_df.loc[name, :])

# TF-IDF

## No class balancing, no preprocessing (KNN, MNB, RF)

In [119]:
# show the first rows of the training data (raw tweets, before any preprocessing column is added)
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [120]:
# stratified 80/20 train/test split of the raw (unprocessed) tweet text
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    train_df['text'], train_df['target'],
    test_size=0.2, random_state=2022, stratify=train_df['target'],
)

In [121]:
# k-nearest neighbours on TF-IDF features (default settings for both steps)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

knn_steps = [
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('KNN', KNeighborsClassifier()),
]
clf = Pipeline(knn_steps)
clf.fit(X_train, y_train)

# per-class precision / recall / F1 on the held-out split
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.76      0.86      0.81       869
           1       0.78      0.65      0.71       654

    accuracy                           0.77      1523
   macro avg       0.77      0.75      0.76      1523
weighted avg       0.77      0.77      0.76      1523



In [122]:
# record train/test F1 of the KNN pipeline under the label "knn-tfidf-unb"
save_scores(
    model_pipe=clf,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    name="knn-tfidf-unb",
)

F1_0_Train      0.865308
F1_1_Train      0.797617
F1_Avg_Train    0.831462
F1_0_Test       0.808672
F1_1_Test       0.706078
F1_Avg_Test     0.757375
Name: knn-tfidf-unb, dtype: float64


In [123]:
# display every score recorded so far (one row per saved model name)
scores_df

Unnamed: 0,F1_0_Train,F1_1_Train,F1_Avg_Train,F1_0_Test,F1_1_Test,F1_Avg_Test
knn-tfidf-unb,0.865308,0.797617,0.831462,0.808672,0.706078,0.757375


In [124]:
# multinomial naive Bayes on TF-IDF features (default settings for both steps)

from sklearn.naive_bayes import MultinomialNB

mnb_steps = [
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('Multi NB', MultinomialNB()),
]
clf = Pipeline(mnb_steps)
clf.fit(X_train, y_train)

# per-class precision / recall / F1 on the held-out split
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.76      0.93      0.84       869
           1       0.87      0.61      0.72       654

    accuracy                           0.79      1523
   macro avg       0.81      0.77      0.78      1523
weighted avg       0.81      0.79      0.78      1523



In [125]:
# record train/test F1 of the naive Bayes pipeline under the label "mnb-tfidf-unb"
save_scores(
    model_pipe=clf,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    name="mnb-tfidf-unb",
)

F1_0_Train      0.910580
F1_1_Train      0.859256
F1_Avg_Train    0.884918
F1_0_Test       0.835836
F1_1_Test       0.715695
F1_Avg_Test     0.775766
Name: mnb-tfidf-unb, dtype: float64


In [126]:
# random forest on TF-IDF features

from sklearn.ensemble import RandomForestClassifier

clf = Pipeline([
    ('vectorizer_tfidf', TfidfVectorizer()),
    # fixed seed (same value as the train/test split): forest construction is
    # randomized, so without it the reported scores change on every run
    ('Random Forest', RandomForestClassifier(random_state = 2022))
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

# per-class precision / recall / F1 on the held-out split
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.74      0.93      0.82       869
           1       0.85      0.57      0.69       654

    accuracy                           0.78      1523
   macro avg       0.80      0.75      0.76      1523
weighted avg       0.79      0.78      0.77      1523



In [127]:
# record train/test F1 of the random forest pipeline under the label "rf-tfidf-unb"
save_scores(
    model_pipe=clf,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    name="rf-tfidf-unb",
)

F1_0_Train      0.997411
F1_1_Train      0.996557
F1_Avg_Train    0.996984
F1_0_Test       0.824795
F1_1_Test       0.687386
F1_Avg_Test     0.756090
Name: rf-tfidf-unb, dtype: float64


## No class balancing, minimal preprocessing (KNN, MNB, RF)

In [128]:
# preprocessing: removing spacy stopwords and punctuation, lemmatizing

import spacy

nlp = spacy.load("en_core_web_sm")

def preprocess(text):
    """Return `text` as a space-joined string of lemmas.

    The text is run through the module-level spaCy pipeline `nlp`; tokens flagged
    as stopwords or punctuation are dropped and every remaining token is replaced
    by its lemma.
    """
    kept_lemmas = [
        token.lemma_
        for token in nlp(text)
        if not (token.is_stop or token.is_punct)
    ]
    return " ".join(kept_lemmas)

In [129]:
# add a column holding the cleaned, lemmatized version of every tweet
train_df['preprocessed_txt'] = train_df['text'].map(preprocess)

In [130]:
# check: compare the original text with the new preprocessed_txt column on the first rows
train_df.head()

Unnamed: 0,id,keyword,location,text,target,preprocessed_txt
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,deed Reason earthquake ALLAH forgive
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near La Ronge Sask Canada
2,5,,,All residents asked to 'shelter in place' are ...,1,resident ask shelter place notify officer evac...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"13,000 people receive wildfire evacuation orde..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,got send photo Ruby Alaska smoke wildfire pour...


In [131]:
# stratified 80/20 train/test split of the preprocessed tweet text
# (same test_size and random_state as the split on the raw text)

X_train, X_test, y_train, y_test = train_test_split(
    train_df['preprocessed_txt'], train_df['target'],
    test_size=0.2, random_state=2022, stratify=train_df['target'],
)

In [132]:
# k-nearest neighbours on TF-IDF features of the preprocessed text
knn_steps = [
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('KNN', KNeighborsClassifier()),
]
clf = Pipeline(knn_steps)
clf.fit(X_train, y_train)

# per-class precision / recall / F1 on the held-out split
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.76      0.86      0.81       869
           1       0.77      0.63      0.70       654

    accuracy                           0.76      1523
   macro avg       0.77      0.75      0.75      1523
weighted avg       0.76      0.76      0.76      1523



In [133]:
# record train/test F1 of the KNN pipeline under the label "knn-tfidf-unb-prep"
save_scores(
    model_pipe=clf,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    name="knn-tfidf-unb-prep",
)

F1_0_Train      0.864214
F1_1_Train      0.790221
F1_Avg_Train    0.827218
F1_0_Test       0.806034
F1_1_Test       0.697479
F1_Avg_Test     0.751757
Name: knn-tfidf-unb-prep, dtype: float64


In [134]:
# multinomial naive Bayes on TF-IDF features of the preprocessed text

mnb_steps = [
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('Multi NB', MultinomialNB()),
]
clf = Pipeline(mnb_steps)
clf.fit(X_train, y_train)

# per-class precision / recall / F1 on the held-out split
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.91      0.84       869
           1       0.84      0.65      0.73       654

    accuracy                           0.80      1523
   macro avg       0.81      0.78      0.78      1523
weighted avg       0.80      0.80      0.79      1523



In [135]:
# record train/test F1 of the naive Bayes pipeline under the label "mnb-tfidf-unb-prep"
save_scores(
    model_pipe=clf,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    name="mnb-tfidf-unb-prep",
)

F1_0_Train      0.926736
F1_1_Train      0.889803
F1_Avg_Train    0.908269
F1_0_Test       0.836074
F1_1_Test       0.733850
F1_Avg_Test     0.784962
Name: mnb-tfidf-unb-prep, dtype: float64


In [136]:
# random forest on TF-IDF features of the preprocessed text

clf = Pipeline([
    ('vectorizer_tfidf', TfidfVectorizer()),
    # fixed seed (same value as the train/test split): forest construction is
    # randomized, so without it the reported scores change on every run
    ('Random Forest', RandomForestClassifier(random_state = 2022))
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

# per-class precision / recall / F1 on the held-out split
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.74      0.91      0.82       869
           1       0.83      0.58      0.68       654

    accuracy                           0.77      1523
   macro avg       0.79      0.74      0.75      1523
weighted avg       0.78      0.77      0.76      1523



In [137]:
# record train/test F1 of the random forest pipeline under the label "rf-tfidf-unb-prep"
save_scores(
    model_pipe=clf,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    name="rf-tfidf-unb-prep",
)

F1_0_Train      0.997554
F1_1_Train      0.996750
F1_Avg_Train    0.997152
F1_0_Test       0.817947
F1_1_Test       0.681120
F1_Avg_Test     0.749534
Name: rf-tfidf-unb-prep, dtype: float64


In [140]:
# rank all recorded models by macro-averaged test F1, best first
scores_df.sort_values('F1_Avg_Test', ascending=False)

Unnamed: 0,F1_0_Train,F1_1_Train,F1_Avg_Train,F1_0_Test,F1_1_Test,F1_Avg_Test
mnb-tfidf-unb-prep,0.926736,0.889803,0.908269,0.836074,0.73385,0.784962
mnb-tfidf-unb,0.91058,0.859256,0.884918,0.835836,0.715695,0.775766
knn-tfidf-unb,0.865308,0.797617,0.831462,0.808672,0.706078,0.757375
rf-tfidf-unb,0.997411,0.996557,0.996984,0.824795,0.687386,0.75609
knn-tfidf-unb-prep,0.864214,0.790221,0.827218,0.806034,0.697479,0.751757
rf-tfidf-unb-prep,0.997554,0.99675,0.997152,0.817947,0.68112,0.749534


## Class balancing