### Import libraries

In [1]:
##
## Import libraries
import pandas as pd
import numpy as np
from sklearn import linear_model, model_selection, preprocessing, metrics, svm
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier
import nltk
from plotnine import *

In [2]:
# True Positive [TP] = your prediction is 1, and the ground truth is also 1 - you predicted a positive and that's true!
# False Positive [FP] = your prediction is 1, and the ground truth is 0 - you predicted a positive, and that's false.
# False Negative [FN] = your prediction is 0, and the ground truth is 1 - you predicted a negative, and that's false.

##
## Model performance is measured by the F1 score = 2 * (precision * recall) / (precision + recall)
## 
## and precision = TP/(TP+FP) and recall = TP/(TP+FN)

### Read Data

In [6]:
# Load the three competition CSV files (training set, test set and the
# sample submission), in that order.
train, test, subm_samp = (
    pd.read_csv( csv_path )
    for csv_path in (
        '../data/nlp-getting-started/train.csv',
        '../data/nlp-getting-started/test.csv',
        '../data/nlp-getting-started/sample_submission.csv',
    )
)

In [7]:
train.head(10)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
5,8,,,#RockyFire Update => California Hwy. 20 closed...,1
6,10,,,#flood #disaster Heavy rain causes flash flood...,1
7,13,,,I'm on top of the hill and I can see a fire in...,1
8,14,,,There's an emergency evacuation happening now ...,1
9,15,,,I'm afraid that the tornado is coming to our a...,1


In [8]:
##
## Class balance of the train set: number of rows with target == 1 first,
## then number of rows with target == 0
for label in (1, 0):
    print( len( train.loc[ train['target'] == label ] ) )

3271
4342


In [9]:
# First two tweets labelled as a real disaster (target == 1)
train.loc[train["target"] == 1, "text"].values[:2]

array(['Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all',
       'Forest fire near La Ronge Sask. Canada'], dtype=object)

In [10]:
# First two tweets labelled as NOT a disaster (target == 0)
train.loc[train["target"] == 0, "text"].values[:2]

array(["What's up man?", 'I love fruits'], dtype=object)

## Feature Engineering

### Remove letter case and stopwords 

In [11]:
##
## Clean words: tokenize, remove stopwords and punctuation, all lowercase
## 
## remove times etc
##
## Use word embeddings - GloVe trained on Twitter data already downloaded
##
## Think of a way to convert word embeddings into some type of aggregate embedding, e.g. a tweet embedding
##
## Model it
##

In [12]:
###
### Function to process data
###
def textProcessing(dt, textCol):
    """Lower-case, strip punctuation, tokenize and remove stopwords from a text column.

    Parameters
    ----------
    dt : pandas.DataFrame
        Frame holding the raw text. It is left untouched; the result is a copy.
    textCol : str
        Name of the column of ``dt`` that contains the text to process.

    Returns
    -------
    pandas.DataFrame
        Copy of ``dt`` with two extra columns:

        * ``tmp_text`` - list of lower-cased, punctuation-free tokens;
        * ``new_text`` - ``tmp_text`` with English stopwords removed.

    Notes
    -----
    Uses ``stopwords`` and ``word_tokenize`` from NLTK and the ``string``
    module; they must be imported in the notebook before this function is
    called. Missing text values are treated as empty strings, which yields
    empty token lists.
    """
    # Get stopwords (a set gives O(1) membership tests in the filter below)
    stop_words = set(stopwords.words('english'))

    # Translation table that deletes every punctuation character; built once
    # instead of once per row.
    punct_table = str.maketrans('', '', string.punctuation)

    # Work on a copy so the caller's frame does not silently gain columns.
    out = dt.copy()

    # Remove cases (missing text becomes '' so the string methods do not fail)
    lowered = out[textCol].fillna('').str.lower()

    # Remove punctuation
    no_punct = lowered.str.translate(punct_table)

    # Tokenize
    out['tmp_text'] = no_punct.apply(word_tokenize)

    # Remove stopwords
    out['new_text'] = out['tmp_text'].apply(
        lambda tokens: [tok for tok in tokens if tok not in stop_words])

    # Return dataframe
    return out
    

In [13]:
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords
import string
# nltk.download('stopwords')
# nltk.download('punkt')

# Clean and tokenize the training tweets; the result carries the extra
# 'tmp_text' (all tokens) and 'new_text' (tokens without stopwords) columns.
newtrain = textProcessing( dt=train, textCol='text')
newtrain

Unnamed: 0,id,keyword,location,text,target,tmp_text,new_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,"[our, deeds, are, the, reason, of, this, earth...","[deeds, reason, earthquake, may, allah, forgiv..."
1,4,,,Forest fire near La Ronge Sask. Canada,1,"[forest, fire, near, la, ronge, sask, canada]","[forest, fire, near, la, ronge, sask, canada]"
2,5,,,All residents asked to 'shelter in place' are ...,1,"[all, residents, asked, to, shelter, in, place...","[residents, asked, shelter, place, notified, o..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"[13000, people, receive, wildfires, evacuation...","[13000, people, receive, wildfires, evacuation..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,"[just, got, sent, this, photo, from, ruby, ala...","[got, sent, photo, ruby, alaska, smoke, wildfi..."
...,...,...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1,"[two, giant, cranes, holding, a, bridge, colla...","[two, giant, cranes, holding, bridge, collapse..."
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1,"[ariaahrary, thetawniest, the, out, of, contro...","[ariaahrary, thetawniest, control, wild, fires..."
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,"[m194, 0104, utc5km, s, of, volcano, hawaii, h...","[m194, 0104, utc5km, volcano, hawaii, httptcoz..."
7611,10872,,,Police investigating after an e-bike collided ...,1,"[police, investigating, after, an, ebike, coll...","[police, investigating, ebike, collided, car, ..."


### Read GloVe vectors

In [14]:
# this chunk takes a bit to run
import os

# Location of the pre-trained GloVe Twitter vectors. The original path is kept
# as the default; set the GLOVE_PATH environment variable to point at the file
# when running on another machine.
glovePath = os.environ.get(
    'GLOVE_PATH',
    '/Users/joaquimlyrio/Downloads/glove/glove.twitter.27B.25d.txt' )

# Map each token to its embedding vector. Every line of the file is read as
# "<token> <v1> <v2> ... <vN>".
embeddings_dict = {}
# Explicit UTF-8: open()'s default encoding is platform dependent, and the
# vocabulary is not guaranteed to be ASCII-only.
with open( glovePath, 'r', encoding='utf-8' ) as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings_dict[word] = vector

In [15]:
embeddings_dict['joaquim']

array([ 0.41602 ,  0.32667 ,  0.65292 ,  0.18718 , -0.57324 , -1.1845  ,
       -1.8769  , -0.011423,  1.2516  , -1.1869  , -0.46634 , -0.57578 ,
       -0.8656  , -0.4988  ,  0.63822 , -1.4461  , -1.2926  ,  0.57836 ,
        0.39184 , -0.49741 ,  0.6698  ,  0.94942 , -1.1361  , -1.1276  ,
        0.0813  ], dtype=float32)

### Create features for each tweet based on GloVe vectors

In [16]:
# Sanity checks on the first tweet: look up the embedding of its first token
# and fetch its token list. Neither value is displayed, because they are not
# the last expression of the cell.
embeddings_dict[ newtrain['new_text'][0][0] ]
newtrain['new_text'][0]

# embeddings_dict[ newtrain['new_text'] ]

def computeGloVeEnsemble( s, embeddings ):
    """Average the embedding vectors of the words of one sentence.

    Parameters
    ----------
    s : sequence of str
        Tokens of the sentence. May be empty.
    embeddings : dict
        Mapping ``word -> 1-D numpy array``; all vectors are expected to have
        the same length.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of the tokens found in
        ``embeddings``. Tokens missing from ``embeddings`` are ignored. When
        no token is found (including an empty ``s``), a vector of NaN with the
        embedding dimension is returned, so such rows can be removed later
        with ``dropna``.
    """
    # Keep only the tokens that have an embedding
    vectors = [embeddings[w] for w in s if w in embeddings]

    if vectors:
        return np.mean(vectors, axis=0)

    # Nothing to average: return an explicit NaN vector. The dimension is read
    # from an arbitrary entry instead of relying on one specific word being
    # part of the vocabulary.
    if not embeddings:
        return np.full(0, np.nan)
    dim = np.shape(next(iter(embeddings.values())))
    return np.full(dim, np.nan)

# print(newtrain['new_text'][100])
# computeGloVeEnsemble( newtrain['new_text'][3], embeddings=embeddings_dict )

In [17]:

# Build {row label -> aggregated GloVe vector} for every tweet of the train set
tweet_tokens = newtrain['new_text']
sentEmbed = {}
for iRow in np.arange(newtrain.shape[0]):
    sentEmbed[iRow] = computeGloVeEnsemble( s=tweet_tokens[iRow],
                                            embeddings=embeddings_dict )




In [18]:
# One row per tweet, one column per embedding dimension, plus the label.
gloveAll = pd.DataFrame.from_dict(sentEmbed, orient = 'index')
gloveAll['target'] = newtrain['target']

# Drop rows containing NaN (e.g. tweets whose embedding could not be computed).
# Features and labels are filtered together, in a single pass, so they are
# guaranteed to stay aligned row by row.
gloveAll = gloveAll.dropna()

gloveTarget = gloveAll['target']
gloveTrain = gloveAll.drop(columns='target')
print(gloveTrain.shape)

gloveTrain

(7610, 25)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,-0.034642,0.113201,-0.430404,0.489391,-0.853268,-0.306259,1.077509,-0.201949,-0.086634,0.437167,...,-0.115637,0.230185,0.121293,0.576177,-0.770221,0.263890,0.306265,-0.436617,0.242523,-0.605413
1,-0.567261,-0.202308,-0.182597,-0.002299,-0.911867,0.029404,0.044439,-0.149501,0.482768,-0.156152,...,0.251632,0.237843,0.109339,0.164636,0.505321,0.013704,-0.441899,0.354123,-0.906471,0.207224
2,-0.600594,0.683745,-0.279191,-0.182460,-0.222019,-0.768375,0.441783,-1.233742,0.358456,0.232217,...,0.012621,0.572267,-0.258214,0.147398,-0.713238,0.231145,-0.565903,0.002023,-0.839046,-0.239976
3,-0.302245,0.713562,-0.572892,0.051555,-0.331429,-0.620134,0.610448,-1.426075,0.374652,0.257453,...,-0.229748,0.337154,-0.285332,0.569386,-0.700277,-0.067718,-0.516196,0.041319,-0.958743,-0.079488
4,-0.440428,0.247316,0.190015,0.146781,-0.571416,0.001253,0.602678,-0.746124,0.165429,0.126911,...,0.120295,0.539177,-0.081443,-0.047724,0.087765,0.124569,-0.118026,0.481347,-0.423808,0.124419
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7608,-0.648327,0.070490,0.380065,-0.042914,-0.387247,0.036434,0.478167,-1.198946,0.579818,0.453287,...,-0.100243,0.376008,-0.045428,0.473099,-0.313125,-0.304531,-0.226257,0.297763,-1.010939,-0.234808
7609,-0.135133,0.096568,-0.276388,-0.126443,-0.288394,0.141316,0.764192,-0.630848,0.299536,-0.072727,...,-0.019655,0.220042,0.173761,0.262831,-0.165517,0.061640,-0.072648,0.181287,-0.851537,-0.043328
7610,-1.168000,-0.450815,0.532550,-0.399675,-1.363600,-0.095002,0.954495,-0.814990,0.556580,0.270772,...,-0.002895,0.338860,0.395045,1.074420,0.028868,-0.702900,-0.445012,0.069295,-1.341100,0.638075
7611,-0.287354,0.493432,-0.094455,-0.263650,0.140191,-0.228304,0.218582,-0.772906,0.312802,-0.002916,...,0.336800,0.562909,0.074350,0.353500,-0.345130,0.084602,0.363871,0.330678,-0.892794,-0.284000


### Word Counts

In [19]:

##
## Bag of words: learn the vocabulary from the raw training tweets and build
## the document-term count matrix
count_vect = CountVectorizer()
train_counts = count_vect.fit_transform(train['text'])
train_counts.shape

(7613, 21637)

### TF-IDF

In [20]:

##
## TF-IDF: re-weight the raw counts computed above
tfidf_transformer = TfidfTransformer()
train_tfidf = tfidf_transformer.fit_transform(X=train_counts)
train_tfidf.shape

(7613, 21637)

## Models

### (1) Tf-idf + Bernoulli Naive Bayes

In [21]:
##
## Fit Bernoulli Naive-Bayes
clf_tfidf = BernoulliNB().fit(train_tfidf, train.target)
clf_tfidf

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [22]:

##
## Predict on train set (in-sample scores: accuracy is printed, F1 is displayed)
nObs = train_tfidf.shape[0]

# Predict once and reuse the result for both metrics
train_pred_tfidf = clf_tfidf.predict(train_tfidf)
print( np.mean( train_pred_tfidf == train.target ) )

# y_true is the ground truth and y_pred the model output (they were swapped)
metrics.f1_score( y_true = train.target,
                  y_pred = train_pred_tfidf )

0.8943911729935636


0.8658658658658658

In [23]:
## Bernoulli NB on the TF-IDF features: 5-fold cross-validated F1
clf1 = BernoulliNB()
scores_tfidf = cross_val_score(estimator=clf1,
                               X=train_tfidf,
                               y=train.target,
                               scoring='f1',
                               cv=5)
f'BernoulliNB: {scores_tfidf}'

'BernoulliNB: [0.63339383 0.62758051 0.68714632 0.64700781 0.76566125]'

### (2) GloVe Ensemble + Bernoulli Naive Bayes

In [24]:
## Fit Bernoulli Naive-Bayes on the GloVe features.
## NOTE: BernoulliNB binarizes its inputs (default threshold 0.0), so only the
## sign of each embedding dimension is used by this model.
clf_glove = BernoulliNB().fit(gloveTrain, gloveTarget)


##
## Predict on train set (in-sample scores: accuracy is printed, F1 is displayed)
nObs = gloveTrain.shape[0]

# Predict once and reuse the result for both metrics
train_pred_glove = clf_glove.predict(gloveTrain)
print( np.mean( train_pred_glove == gloveTarget ) )

# y_true is the ground truth and y_pred the model output (they were swapped)
metrics.f1_score( y_true = gloveTarget,
                  y_pred = train_pred_glove )

0.7336399474375821


0.7016045929633445

In [25]:
## Bernoulli NB on the GloVe features: 5-fold cross-validated F1
clf1 = BernoulliNB()
scores_glove = cross_val_score(estimator=clf1,
                               X=gloveTrain,
                               y=gloveTarget,
                               scoring='f1',
                               cv=5)
f'BernoulliNB: {scores_glove}'

'BernoulliNB: [0.68663968 0.68019594 0.69257951 0.68864469 0.73676471]'

In [26]:
# ## RandomForestClassifier
# clf2 = RandomForestClassifier()
# scores2 = cross_val_score(clf2, gloveTrain, gloveTarget, cv=5, scoring='f1')
# f'RandomForestClassifier: {scores2}'

### (3) Tf-Idf + Ridge Classifier

In [27]:
##
## Obs: this cell takes a while to run ~ 3 min or so
##

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.linear_model import RidgeClassifier

## RidgeClassifier on the TF-IDF features, alpha tuned by grid search

# Hold out 20% of the rows as an evaluation set; the remaining 80% is the
# development set on which the grid search (with its own cross-validation)
# is run.
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(train_tfidf, 
                                                          train.target, 
                                                          test_size=0.2, 
                                                          random_state=0)


# Candidate values of the regularisation strength alpha
tuned_parameter = [{'alpha':[0,.25,.5,.75,1.0]}]

# Score to optimize over
score = 'f1'

clf = GridSearchCV(
    RidgeClassifier(), tuned_parameter, scoring=score
)

clf.fit(X_train_tfidf, y_train)


GridSearchCV(cv=None, error_score=nan,
             estimator=RidgeClassifier(alpha=1.0, class_weight=None,
                                       copy_X=True, fit_intercept=True,
                                       max_iter=None, normalize=False,
                                       random_state=None, solver='auto',
                                       tol=0.001),
             iid='deprecated', n_jobs=None,
             param_grid=[{'alpha': [0, 0.25, 0.5, 0.75, 1.0]}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='f1', verbose=0)

In [28]:
print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()
print("Grid scores on development set:")
print()
# One line per candidate: mean CV score and twice its standard deviation
cv_results = clf.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
for mean, std, params in zip(means, stds, cv_results['params']):
    summary = "%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params)
    print(summary)
print()

print("Detailed classification report:")
print()
print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set.")
print()
# Score the refitted best estimator on the held-out TF-IDF rows
y_true = y_test
y_pred = clf.predict(X_test_tfidf)
print(classification_report(y_true, y_pred))
print()

Best parameters set found on development set:

{'alpha': 0.75}

Grid scores on development set:

0.409 (+/-0.161) for {'alpha': 0}
0.737 (+/-0.018) for {'alpha': 0.25}
0.745 (+/-0.025) for {'alpha': 0.5}
0.751 (+/-0.032) for {'alpha': 0.75}
0.750 (+/-0.032) for {'alpha': 1.0}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0       0.80      0.90      0.85       886
           1       0.83      0.69      0.75       637

    accuracy                           0.81      1523
   macro avg       0.81      0.79      0.80      1523
weighted avg       0.81      0.81      0.81      1523




### (4) GloVe + Ridge Classifier

In [29]:
##
## Obs: this cell takes a while to run ~ 3 min or so
##

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.linear_model import RidgeClassifier

## RidgeClassifier on the GloVe features, alpha tuned by grid search

# Hold out 20% of the rows as an evaluation set; the remaining 80% is the
# development set on which the grid search (with its own cross-validation)
# is run.
# NOTE: this rebinds y_train / y_test; from here on they hold the labels of
# the GloVe split.
X_train_glove, X_test_glove, y_train, y_test = train_test_split(gloveTrain, 
                                                          gloveTarget, 
                                                          test_size=0.2, 
                                                          random_state=0)


# Candidate values of the regularisation strength alpha
tuned_parameter = [{'alpha':[0,.25,.5,.75,1.0]}]

# Score to optimize over
score = 'f1'

clf = GridSearchCV(
    RidgeClassifier(), tuned_parameter, scoring=score
)

clf.fit(X_train_glove, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=RidgeClassifier(alpha=1.0, class_weight=None,
                                       copy_X=True, fit_intercept=True,
                                       max_iter=None, normalize=False,
                                       random_state=None, solver='auto',
                                       tol=0.001),
             iid='deprecated', n_jobs=None,
             param_grid=[{'alpha': [0, 0.25, 0.5, 0.75, 1.0]}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='f1', verbose=0)

In [32]:
print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()
print("Grid scores on development set:")
print()
# One line per candidate: mean CV score and twice its standard deviation
cv_results = clf.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
for mean, std, params in zip(means, stds, cv_results['params']):
    summary = "%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params)
    print(summary)
print()

print("Detailed classification report:")
print()
print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set.")
print()
# Score the refitted best estimator on the held-out GloVe rows
y_true = y_test
y_pred = clf.predict(X_test_glove)
print(classification_report(y_true, y_pred))
print()

Best parameters set found on development set:

{'alpha': 0}

Grid scores on development set:

0.724 (+/-0.035) for {'alpha': 0}
0.724 (+/-0.035) for {'alpha': 0.25}
0.724 (+/-0.035) for {'alpha': 0.5}
0.724 (+/-0.035) for {'alpha': 0.75}
0.724 (+/-0.035) for {'alpha': 1.0}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0       0.77      0.84      0.80       890
           1       0.75      0.64      0.69       632

    accuracy                           0.76      1522
   macro avg       0.76      0.74      0.75      1522
weighted avg       0.76      0.76      0.76      1522




### (5) GloVe + Feed-Forward Neural Net

In [78]:
# TensorFlow and tf.keras
import tensorflow as tf
from tensorflow import keras
# from tensorflow.keras import backend as K

# def recall_m(y_true, y_pred):
#     true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
#     possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
#     recall = true_positives / (possible_positives + K.epsilon())
#     return recall

# def precision_m(y_true, y_pred):
#     true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
#     predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
#     precision = true_positives / (predicted_positives + K.epsilon())
#     return precision

# def f1_m(y_true, y_pred):
#     precision = precision_m(y_true, y_pred)
#     recall = recall_m(y_true, y_pred)
#     return 2*((precision*recall)/(precision+recall+K.epsilon()))


# Fix TensorFlow's global random seed so that weight initialisation, and
# therefore the scores reported below, are repeatable across runs on the
# same setup.
tf.random.set_seed(0)

# define NN architecture: one hidden ReLU layer followed by two output logits
# (one per class)
model = keras.Sequential([
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(2)
])

# optimizer and loss function; the loss is given raw logits (from_logits=True)
# because the model has no softmax layer
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [75]:
X_train_glove.to_numpy().shape

(6088, 25)

In [84]:
model.fit(X_train_glove.to_numpy(), y_train.to_numpy(), epochs=30)

Train on 6088 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x1a52e21e50>

In [85]:
test_loss, test_acc = model.evaluate(X_test_glove.to_numpy(),  y_test.to_numpy(), verbose=2)

print('\nTest accuracy:', test_acc)

1522/1522 - 0s - loss: 0.4900 - accuracy: 0.7858

Test accuracy: 0.78580815


In [93]:
# Append a softmax layer so the trained model's logits become class
# probabilities
probability_model = tf.keras.Sequential([model, 
                                         tf.keras.layers.Softmax()])

predictions = probability_model.predict(X_test_glove.to_numpy())

print(y_test.shape)
print(predictions.shape)

def argmax(a):
    """Return the index of the largest value of ``a`` (thin wrapper around ``np.argmax``)."""
    return np.argmax(a)

# Predicted class of each row = column holding the highest probability.
# Vectorised over all rows instead of calling a Python function per row.
preds_argmax = np.argmax(predictions, axis=1)


(1522,)
(1522, 2)


0.78580814717477

In [104]:
def recall(y_true, y_pred):
    """Recall = TP / (TP + FN) for binary 0/1 labels.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels (0 or 1).
    y_pred : array-like
        Predicted labels (0 or 1), in the same order as ``y_true``.

    Returns
    -------
    float
        Fraction of the actual positives that were predicted positive;
        ``0.0`` when ``y_true`` contains no positive at all.
    """
    # Convert to arrays so lists work too and the product is always
    # element-wise by position
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    true_positives = np.sum(np.round(np.clip(y_true * y_pred, 0, 1)))
    possible_positives = np.sum(np.round(np.clip(y_true, 0, 1)))

    # Avoid 0/0 (NaN plus a runtime warning) when there is nothing to recall
    if possible_positives == 0:
        return 0.0
    return true_positives / possible_positives

def precision(y_true, y_pred):
    """Precision = TP / (TP + FP) for binary 0/1 labels.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels (0 or 1).
    y_pred : array-like
        Predicted labels (0 or 1), in the same order as ``y_true``.

    Returns
    -------
    float
        Fraction of the predicted positives that are actual positives;
        ``0.0`` when nothing was predicted positive.
    """
    # Convert to arrays so lists work too and the product is always
    # element-wise by position
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    true_positives = np.sum(np.round(np.clip(y_true * y_pred, 0, 1)))
    predicted_positives = np.sum(np.round(np.clip(y_pred, 0, 1)))

    # Avoid 0/0 (NaN plus a runtime warning) when no positive was predicted
    if predicted_positives == 0:
        return 0.0
    return true_positives / predicted_positives

# def f1(y_true, y_pred):
#     precision = precision(y_true, y_pred)
#     recall = recall(y_true, y_pred)
#     return 2*((precision*recall)/(precision+recall))


# F1 of the neural net on the held-out GloVe rows
prec = precision( y_test, preds_argmax )
reca = recall( y_test, preds_argmax )
# Harmonic mean of precision and recall; defined as 0 when both are 0 so the
# expression never divides by zero
f1   = 2*((prec*reca)/(prec+reca)) if (prec+reca) else 0.0
f1

0.7120141342756183