### Import Libraries

In [1]:

import pandas as pd
import numpy as np

from sklearn import linear_model, model_selection, preprocessing, metrics, svm
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier

import nltk
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords
import string
# nltk.download('stopwords')
# nltk.download('punkt')

from plotnine import *

# tensorflow and Keras
import tensorflow as tf
from tensorflow import keras
import tensorflow_hub as hub

# module_url = "https://tfhub.dev/google/nnlm-en-dim128/2"
# embed = hub.KerasLayer(module_url)
# embeddings = embed(["A long sentence.", "single-word",
#                   "http://example.com"])
# print(embeddings.shape)  #(3,128)

### Read Data

In [2]:
# Load the train, test and sample-submission CSVs from the local data folder
train = pd.read_csv('../data/nlp-getting-started/train.csv')
test = pd.read_csv('../data/nlp-getting-started/test.csv')
subm_samp = pd.read_csv('../data/nlp-getting-started/sample_submission.csv')

# Peek at the first five training rows
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
##
## Number of training rows per class (target == 1 first, then target == 0)
for label in (1, 0):
    print( (train['target'] == label).sum() )

3271
4342


### Feature Engineering

In [4]:
###
### Function to process data
###
def textProcessing(dt, textCol):
    """Lower-case, de-punctuate, tokenize and stopword-filter a text column.

    Parameters
    ----------
    dt : pandas.DataFrame
        Input frame. It is copied; the caller's frame is not modified.
    textCol : str
        Name of the column in ``dt`` that holds the raw text.

    Returns
    -------
    pandas.DataFrame
        Copy of ``dt`` with two extra columns:
        ``tmp_text`` - list of tokens from the lower-cased text with all
        ``string.punctuation`` characters deleted;
        ``new_text`` - the same tokens minus NLTK's English stopwords.

    Notes
    -----
    Uses ``stopwords.words('english')`` and ``word_tokenize`` from NLTK;
    presumably the 'stopwords' and 'punkt' resources must already be
    downloaded (see the commented-out downloads in the import cell).
    """
    # make copy
    dt_copy = dt.copy()

    # Get stopwords
    stop_words = set(stopwords.words('english'))

    # Build the punctuation-deleting table once instead of once per row
    strip_punct = str.maketrans('', '', string.punctuation)

    # Remove cases
    lowered = dt[textCol].str.lower()

    # Remove punctuation (Series-level apply avoids building a row object
    # per record and also behaves sensibly on an empty frame)
    no_punct = lowered.apply(lambda text: text.translate(strip_punct))

    # Tokenize
    dt_copy['tmp_text'] = no_punct.apply(word_tokenize)

    # Remove stopwords
    dt_copy['new_text'] = dt_copy['tmp_text'].apply(
        lambda tokens: [tok for tok in tokens if tok not in stop_words]
    )

    # Return dataframe
    return dt_copy
    

In [5]:
##
## Load pre-trained sentence embedding - Google's Swivel (20-dim gnews)
# The URL gets its own name so it is not confused with the Keras `model`
# object that the next cell builds.
embedding_url = "https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1"
# trainable=True lets the embedding weights be updated during model.fit
hub_layer = hub.KerasLayer(embedding_url, output_shape=[20], input_shape=[],
                           dtype=tf.string, trainable=True)

In [6]:
##
## Define model's architecture:
## hub embedding -> 16-unit ReLU layer -> single sigmoid output
model = tf.keras.Sequential([
    hub_layer,
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
keras_layer (KerasLayer)     (None, 20)                400020    
_________________________________________________________________
dense (Dense)                (None, 16)                336       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 400,373
Trainable params: 400,373
Non-trainable params: 0
_________________________________________________________________


In [7]:
# Adam optimiser with binary cross-entropy loss; accuracy is tracked as a metric
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [8]:
# The raw text column is the only feature; `target` is the label
X_train_all, Y_train_all = train['text'], train['target']

# Hold out 30% of the rows for validation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_train_all,
    Y_train_all,
    test_size=0.3,
    random_state=710,
)

In [9]:
## Train Model
# 10 epochs on the training split, scoring the held-out split after each one
model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_test, y_test),
    epochs=10,
    verbose=1,
)

Train on 5329 samples, validate on 2284 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x10eaa9410>

In [10]:
# Sigmoid scores for the held-out split
predictions = model.predict(x=X_test)

# Loss followed by accuracy (the metric passed to compile) on the same split
results = model.evaluate(x=X_test, y=y_test)
print(results)

[0.5172323067321878, 0.7749562]


In [11]:
def recall(y_true, y_pred):
    """Recall (true positives / actual positives) for binary labels.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; values are clipped to [0, 1] and rounded.
    y_pred : array-like
        Predicted labels or scores with the same number of elements as
        ``y_true``; the product with ``y_true`` is clipped and rounded.

    Returns
    -------
    float
        ``true_positives / possible_positives``, or ``0.0`` when
        ``y_true`` contains no positives (instead of nan from 0/0).
    """
    # Flatten both inputs so a (n,) vector paired with a (n, 1) column is
    # compared element-wise rather than silently broadcast to (n, n).
    truth = np.ravel(np.asarray(y_true, dtype=float))
    guess = np.ravel(np.asarray(y_pred, dtype=float))

    true_positives = np.sum(np.round(np.clip(truth * guess, 0, 1)))
    possible_positives = np.sum(np.round(np.clip(truth, 0, 1)))

    # Recall is undefined without any positive in y_true; report 0.0
    if possible_positives == 0:
        return 0.0
    return true_positives / possible_positives

def precision(y_true, y_pred):
    """Precision (true positives / predicted positives) for binary labels.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels with the same number of elements as ``y_pred``.
    y_pred : array-like
        Predicted labels or scores; values are clipped to [0, 1] and rounded.

    Returns
    -------
    float
        ``true_positives / predicted_positives``, or ``0.0`` when nothing
        was predicted positive (instead of nan from 0/0).
    """
    # Flatten both inputs so a (n,) vector paired with a (n, 1) column is
    # compared element-wise rather than silently broadcast to (n, n).
    truth = np.ravel(np.asarray(y_true, dtype=float))
    guess = np.ravel(np.asarray(y_pred, dtype=float))

    true_positives = np.sum(np.round(np.clip(truth * guess, 0, 1)))
    predicted_positives = np.sum(np.round(np.clip(guess, 0, 1)))

    # Precision is undefined when no positive was predicted; report 0.0
    if predicted_positives == 0:
        return 0.0
    return true_positives / predicted_positives

# Labels as an (n, 1) column so they line up with the (n, 1) model output
y_true_col = y_test.to_numpy().reshape(-1, 1)
# Round the sigmoid scores to hard 0/1 predictions
y_pred_col = np.round(predictions)

prec = precision(y_true_col, y_pred_col)
reca = recall(y_true_col, y_pred_col)
# F1 is the harmonic mean of precision and recall
f1 = 2 * ((prec * reca) / (prec + reca))
print(prec)
print(reca)
f1

0.7281153450051493
0.7387669801462905


0.7334024896265561

### Tf-idf + Bernoulli Naive Bayes Classifier

In [None]:


##
## Count words
count_vect = CountVectorizer()
train_counts = count_vect.fit_transform(train.text)

##
## TF-IDF
# NOTE(review): BernoulliNB binarizes its input (default binarize=0.0), so
# only term presence/absence reaches the classifier and the tf-idf weights
# are effectively discarded - confirm this pairing is intended.
tfidf_transformer = TfidfTransformer()
train_tfidf = tfidf_transformer.fit_transform(train_counts)

##
## Fit Bernoulli Naive-Bayes
clf_tfidf = BernoulliNB().fit(train_tfidf, train.target)

##
## Predict on train set
# These are in-sample scores: the classifier is evaluated on the same rows
# it was fitted on, so they are optimistic compared with a held-out split.
nObs = train_tfidf.shape[0]
# Predict once and reuse the result for both accuracy and F1
train_pred = clf_tfidf.predict(train_tfidf)
print( np.mean( train_pred == train.target ) )
# Ground truth goes in y_true and the model output in y_pred
metrics.f1_score( y_true = train.target, y_pred = train_pred )