### Import libraries

In [1]:
##
## Import libraries
import os
import string

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn import linear_model, model_selection, preprocessing, metrics, svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.naive_bayes import BernoulliNB

In [2]:
# True Positive [TP] = your prediction is 1, and the ground truth is also 1 - you predicted a positive and that's true!
# False Positive [FP] = your prediction is 1, and the ground truth is 0 - you predicted a positive, and that's false.
# False Negative [FN] = your prediction is 0, and the ground truth is 1 - you predicted a negative, and that's false.

##
## Model performance is measured by the F1 score = 2 * (precision * recall) / (precision + recall)
##
## where precision = TP / (TP + FP) and recall = TP / (TP + FN)

### Read Data

In [2]:
# read data
# The project root defaults to the original author's location but can be
# overridden with the DISASTER_TWEETS_ROOT environment variable, so the
# notebook can run on other machines without editing this cell.
rootPath = os.environ.get(
    'DISASTER_TWEETS_ROOT',
    '/Users/joaquimlyrio/Documents/Kaggle/NLP with Disaster Tweets/'
)
dataDir = os.path.join(rootPath, 'data', 'nlp-getting-started')

train = pd.read_csv(os.path.join(dataDir, 'train.csv'))
test = pd.read_csv(os.path.join(dataDir, 'test.csv'))
subm_samp = pd.read_csv(os.path.join(dataDir, 'sample_submission.csv'))

In [3]:
# Peek at the first ten rows of the training set
train.iloc[:10]

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
5,8,,,#RockyFire Update => California Hwy. 20 closed...,1
6,10,,,#flood #disaster Heavy rain causes flash flood...,1
7,13,,,I'm on top of the hill and I can see a fire in...,1
8,14,,,There's an emergency evacuation happening now ...,1
9,15,,,I'm afraid that the tornado is coming to our a...,1


In [4]:
##
## Number of training rows per class: disaster (1) first, then non-disaster (0)
for label in (1, 0):
    print( int( (train['target'] == label).sum() ) )

3271
4342


In [6]:
# first two tweets labelled as a real disaster (target == 1)
train.loc[train["target"] == 1, "text"].values[:2]

array(['Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all',
       'Forest fire near La Ronge Sask. Canada'], dtype=object)

In [7]:
# first two tweets labelled as NOT a disaster (target == 0)
train.loc[train["target"] == 0, "text"].values[:2]

array(["What's up man?", 'I love fruits'], dtype=object)

## Feature Engineering

### Remove letter case and stopwords 

In [None]:
##
## Clean words: tokenize, remove stopwords and punctuation, all lowercase
## 
## remove times etc
##
## Use word embeddings - GloVe trained on Twitter data already downloaded
##
## Think of a way to convert word embeddings into some type of aggregate embedding, e.g. a tweet embedding
##
## Model it
##

In [4]:

# function to process data
def textProcessing(dt, textCol):
    """Lower-case, strip punctuation, tokenize and remove stopwords from a text column.

    Parameters
    ----------
    dt : pandas.DataFrame
        Frame holding the raw text. It is NOT modified; a copy is returned so
        the cell that calls this function can be re-run safely.
    textCol : str
        Name of the column in ``dt`` that contains the raw text.

    Returns
    -------
    pandas.DataFrame
        Copy of ``dt`` with two additional columns:

        * ``tmp_text`` - list of lower-cased, punctuation-free tokens
        * ``new_text`` - ``tmp_text`` with English stopwords removed

        Rows whose text is missing (not a ``str``) get empty token lists.
    """
    # Work on a copy so the caller's frame is left untouched.
    dt = dt.copy()

    # Get stopwords
    stop_words = set(stopwords.words('english'))

    # Build the punctuation-removal table once instead of once per row.
    punct_table = str.maketrans('', '', string.punctuation)

    def _tokenize(text):
        # Missing values (NaN / None) have no tokens.
        if not isinstance(text, str):
            return []
        # Remove cases, remove punctuation, then tokenize.
        return word_tokenize(text.lower().translate(punct_table))

    # Series.apply works element-wise on the text column; this avoids the
    # row-wise DataFrame.apply(axis=1) and also behaves on an empty frame.
    dt['tmp_text'] = dt[textCol].apply(_tokenize)

    # Remove stopwords
    dt['new_text'] = dt['tmp_text'].apply(
        lambda tokens: [token for token in tokens if token not in stop_words]
    )

    # Return dataframe
    return dt
    

In [5]:
# One-time NLTK resource downloads (uncomment on the first run):
# nltk.download('stopwords')
# nltk.download('punkt')

newtrain = textProcessing( dt=train, textCol='text')
# Show only the first rows instead of dumping the whole frame.
newtrain.head(10)

Unnamed: 0,id,keyword,location,text,target,tmp_text,new_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,"[our, deeds, are, the, reason, of, this, earth...","[deeds, reason, earthquake, may, allah, forgiv..."
1,4,,,Forest fire near La Ronge Sask. Canada,1,"[forest, fire, near, la, ronge, sask, canada]","[forest, fire, near, la, ronge, sask, canada]"
2,5,,,All residents asked to 'shelter in place' are ...,1,"[all, residents, asked, to, shelter, in, place...","[residents, asked, shelter, place, notified, o..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"[13000, people, receive, wildfires, evacuation...","[13000, people, receive, wildfires, evacuation..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,"[just, got, sent, this, photo, from, ruby, ala...","[got, sent, photo, ruby, alaska, smoke, wildfi..."
5,8,,,#RockyFire Update => California Hwy. 20 closed...,1,"[rockyfire, update, california, hwy, 20, close...","[rockyfire, update, california, hwy, 20, close..."
6,10,,,#flood #disaster Heavy rain causes flash flood...,1,"[flood, disaster, heavy, rain, causes, flash, ...","[flood, disaster, heavy, rain, causes, flash, ..."
7,13,,,I'm on top of the hill and I can see a fire in...,1,"[im, on, top, of, the, hill, and, i, can, see,...","[im, top, hill, see, fire, woods]"
8,14,,,There's an emergency evacuation happening now ...,1,"[theres, an, emergency, evacuation, happening,...","[theres, emergency, evacuation, happening, bui..."
9,15,,,I'm afraid that the tornado is coming to our a...,1,"[im, afraid, that, the, tornado, is, coming, t...","[im, afraid, tornado, coming, area]"


### Read GloVe vectors

In [6]:
# this chunk takes a bit to run
# The GloVe file location defaults to the original author's path but can be
# overridden with the GLOVE_TWITTER_PATH environment variable.
glovePath = os.environ.get(
    'GLOVE_TWITTER_PATH',
    '/Users/joaquimlyrio/Downloads/glove/glove.twitter.27B.25d.txt'
)
embeddings_dict = {}
# Be explicit about the encoding so loading does not depend on the platform
# default (the vocabulary contains non-ASCII tokens).
with open( glovePath, 'r', encoding='utf-8' ) as f:
    for line in f:
        # Split on the single-space delimiter only: a bare split() also splits
        # on Unicode whitespace that may appear inside a token, which would
        # shift the columns and corrupt the word/vector pairing.
        values = line.rstrip().split(' ')
        if len(values) < 2:
            # blank or malformed line: nothing to store
            continue
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings_dict[word] = vector

In [7]:
embeddings_dict['#cool']

array([-1.41149998,  0.95050001, -0.82784998, -2.13610005,  0.56384999,
       -0.38422   , -0.66835999, -1.16620004, -1.54890001, -1.4648    ,
       -1.09200001,  0.089005  ,  0.48635   , -0.39085001, -2.04520011,
        0.072904  , -0.17231999,  0.34520999,  1.39880002,  1.30509996,
        0.33195999,  1.03830004,  0.41363001, -0.58867002,  1.33420002], dtype=float32)

### Create features for each tweet based on GloVe vectors

### Word Counts

In [9]:

##
## Bag-of-words: learn the vocabulary from the raw tweets and build the
## sparse document-term count matrix in one pass
count_vect = CountVectorizer()
train_counts = count_vect.fit_transform(train['text'])
train_counts.shape

(7613, 21637)

### TF-IDF

In [10]:

##
## TF-IDF: learn the IDF weights from the count matrix, then re-weight it
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(train_counts)
train_tfidf = tfidf_transformer.transform(train_counts)
train_tfidf.shape

(7613, 21637)

## Models

### Bernoulli Naive Bayes

In [11]:
##
## Bernoulli Naive-Bayes trained on the TF-IDF features of the full train set
clf = BernoulliNB()
clf.fit(train_tfidf, train['target'])
clf

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [12]:

##
## Predict on train set
## NOTE: these are in-sample scores (the model has seen these rows), so they
## are optimistic; see the cross-validated scores for a fairer estimate.
nObs = 100000  # cap on the number of rows scored; slicing past the end keeps every row

# Predict once and reuse the result for both metrics.
y_true_train = train.target[0:nObs]
y_pred_train = clf.predict(train_tfidf[0:nObs])

# Plain accuracy
print( np.mean( y_pred_train == y_true_train ) )

# F1 score: ground truth goes in y_true and predictions in y_pred.
metrics.f1_score( y_true = y_true_train,
                  y_pred = y_pred_train )

0.8943911729935636


0.8658658658658658

In [13]:
## Bernoulli NB: 5-fold cross-validated F1 on the TF-IDF features
clf1 = BernoulliNB()
scores1 = cross_val_score(
    estimator=clf1,
    X=train_tfidf,
    y=train['target'],
    scoring='f1',
    cv=5,
)
'BernoulliNB: {}'.format(scores1)

'BernoulliNB: [0.63339383 0.62758051 0.68714632 0.64700781 0.76566125]'

In [61]:
# ## SVM
# clf2 = svm.SVC(kernel='linear', C=1)
# scores2 = cross_val_score(clf2, X_train_tfidf, train.target, cv=5, scoring='f1')
# f'SVM: {scores2}'

'SVM: [0.62643678 0.55838455 0.62658764 0.60056127 0.73311897]'

In [63]:
# ## RandomForestClassifier
# clf3 = RandomForestClassifier()
# scores3 = cross_val_score(clf3, X_train_tfidf, train.target, cv=5, scoring='f1')
# f'RandomForestClassifier: {scores3}'

'SVM: [0.53548387 0.48820513 0.52991453 0.5257732  0.68392857]'

## Ridge Classifier

In [25]:
##
## Obs: this cell takes a while to run ~ 3 min or so
##

# Hold out 20% of the labelled data for a final evaluation; the grid search
# below cross-validates on the remaining 80% only.
# NOTE: train_tfidf was fitted on all rows before this split, so the IDF
# weights have already seen the held-out tweets.
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(train_tfidf,
                                                          train.target,
                                                          test_size=0.2,
                                                          random_state=0)

# Candidate regularization strengths to evaluate by cross-validation
tuned_parameter = [{'alpha':[0,.25,.5,.75,1.0]}]

# Score to optimize over
score = 'f1'

# NOTE: `clf` is rebound here; from this point on it refers to the Ridge grid
# search, not to the BernoulliNB model fitted earlier in the notebook.
clf = GridSearchCV(
    RidgeClassifier(), tuned_parameter, scoring=score
)

clf.fit(X_train_tfidf, y_train)


GridSearchCV(cv=None, error_score=nan,
             estimator=RidgeClassifier(alpha=1.0, class_weight=None,
                                       copy_X=True, fit_intercept=True,
                                       max_iter=None, normalize=False,
                                       random_state=None, solver='auto',
                                       tol=0.001),
             iid='deprecated', n_jobs=None,
             param_grid=[{'alpha': [0, 0.25, 0.5, 0.75, 1.0]}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='f1', verbose=0)

In [27]:
# --- Best hyper-parameters -------------------------------------------------
print("Best parameters set found on development set:\n")
print(clf.best_params_)
print()

# --- Mean CV score (+/- two standard deviations) for every candidate -------
print("Grid scores on development set:\n")
cv_results = clf.cv_results_
for mean_score, std_score, candidate in zip(cv_results['mean_test_score'],
                                            cv_results['std_test_score'],
                                            cv_results['params']):
    print(f"{mean_score:.3f} (+/-{std_score * 2:.3f}) for {candidate!r}")
print()

# --- Held-out evaluation of the refitted best estimator --------------------
print("Detailed classification report:\n")
print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set.\n")
y_true = y_test
y_pred = clf.predict(X_test_tfidf)
print(classification_report(y_true, y_pred))
print()

Best parameters set found on development set:

{'alpha': 0.75}

Grid scores on development set:

0.409 (+/-0.161) for {'alpha': 0}
0.737 (+/-0.018) for {'alpha': 0.25}
0.745 (+/-0.025) for {'alpha': 0.5}
0.751 (+/-0.032) for {'alpha': 0.75}
0.750 (+/-0.032) for {'alpha': 1.0}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0       0.80      0.90      0.85       886
           1       0.83      0.69      0.75       637

    accuracy                           0.81      1523
   macro avg       0.81      0.79      0.80      1523
weighted avg       0.81      0.81      0.81      1523




In [None]:
###
### Try to do word embeddings + classification model
###
