In [1]:
# import libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, accuracy_score, silhouette_score
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN, KMeans
import nltk

import warnings 
warnings.filterwarnings("ignore")

In [3]:
# read in csv

df = pd.read_csv('../data/final_hurricane_labeled.csv')

print(df.shape)
df.head()

(41101, 7)


Unnamed: 0,Date,text_clean,label,n_sentence,n_words,lat,long
0,2020-08-31 23:15:25+00:00,a customer service rep told me friday there is...,0,2,11,25.525284,-80.60692
1,2020-08-31 22:39:25+00:00,tomorrow at pm after hour with sabor havana ci...,0,2,19,27.686273,-80.934588
2,2020-08-31 22:31:09+00:00,hurricane laura wallop area with high mortgage...,0,2,13,27.701712,-75.255859
3,2020-08-31 20:25:29+00:00,i never wish bad on anyone but think we need a...,0,1,9,29.114762,-84.339632
4,2020-08-31 19:51:39+00:00,wth is pricemart so full their a hurricane idk...,0,2,5,28.506867,-89.67809


In [4]:
# check null values

df.isnull().sum()

Date           0
text_clean    25
label          0
n_sentence     0
n_words        0
lat            0
long           0
dtype: int64

In [5]:
# it could be due to non-english tweets that I did not know how to process in cleaning stage 
# will drop these rows

df.dropna(axis = 0, inplace = True)

In [6]:
# define baseline accuracy

df['label'].value_counts(normalize = True)

0    0.526658
1    0.473342
Name: label, dtype: float64

In [7]:
# define labels
my_labels = ['earthquake', 'no-earthquake']

### Final model: word2vec

In [8]:
# import gensim and create a model to vectorize the words

import gensim

file_path = '../vectors/lexvec.enwiki+newscrawl.300d.W.pos.vectors'

model = gensim.models.KeyedVectors.load_word2vec_format(file_path)

In [9]:
# import stopwords

from gensim.parsing.preprocessing import STOPWORDS

In [10]:
# customize stopwords using union

my_stop_words = STOPWORDS.union(set(['earthquake']))

In [11]:
## code taken from 
## https://github.com/susanli2016/NLP-with-Python/blob/master/Text%20Classification%20model%20selection.ipynb
## and adapted

model.init_sims(replace = True)

In [12]:
# checking the vocabulary stored in gensim's word2vec vocabulary

from itertools import islice
list(islice(model.vocab, 13030, 13050))

['bal',
 'harley',
 'proponents',
 'escalating',
 'madeleine',
 'crushing',
 'yielded',
 'understandable',
 'agnes',
 'victorious',
 'rockefeller',
 'deeds',
 'jude',
 'doomed',
 'sundays',
 'rejecting',
 'prep',
 'concession',
 'leopold',
 'dislike']

In [13]:
# create our own message in logging

import logging

In [14]:
# average word vectors

def word_averaging(model, words):
    all_words, mean = set(), []
    
    for word in words:
        
        # only check the words if not in  stopwords
        if word not in my_stop_words:
        
            # check if the word is in the array of words in tweet, if so, append
            if isinstance(word, np.ndarray):
                mean.append(word)
            
            # if the word is also found in gensim word2vec vocabulary, append unit normalized vectors
            # and add the index of the words in all_words
            elif word in model.vocab:
                mean.append(model.syn0norm[model.vocab[word].index])
                all_words.add(model.vocab[word].index)
    
    # if mean returns no unit vectors, produce warning message and 0s
    if not mean:
        logging.warning('cannot compute similarity with no input %s', words)
        return np.zeros(model.vector_size, )
    
    # calculate mean from the collected unit vectors
    mean = gensim.matutils.unitvec(np.array(mean).mean(axis = 0)).astype(np.float32)
    
    return mean

def word_averaging_list(model, text_list):
    
    # return the calculated mean from above function in a array by stacking them
    return np.vstack([word_averaging(model, tweet) for tweet in text_list])

In [15]:
# tokenize the text with word2vec

def w2v_tokenize_text(text):
    
    # create an empty list to store the tokenized words
    tokens = []
    
    # tokenize the sentence 
    for sent in nltk.sent_tokenize(text, language = 'english'):
        
        # tokenize the word
        for word in nltk.word_tokenize(sent, language = 'english'):
            
            # if length of each word contains less than 2 characters, ignore
            if len(word) < 2:
                continue
            
            # append each word tokens
            tokens.append(word)
    
    # return tokens
    return tokens

In [17]:
# train test split - there's no target variable!!!

train, test = train_test_split(df, test_size = 0.3, random_state = 42)

# then tokenize 

test_tokenized = test.apply(lambda r: w2v_tokenize_text(r['text_clean']), axis = 1).values
train_tokenized = train.apply(lambda r: w2v_tokenize_text(r['text_clean']), axis = 1).values

In [18]:
# calculate the means for us to be able to use vectors for train, and test dataset

X_train_word_average = word_averaging_list(model, train_tokenized)
X_test_word_average = word_averaging_list(model, test_tokenized)

# a lot of warnings - clear the output



In [19]:
# support vector machine

svc = SVC()
svc = svc.fit(X = X_train_word_average, y = train['label'])
y_pred = svc.predict(X_test_word_average)

In [20]:
accuracy_score(y_pred, test['label'])

0.9414915199220969

In [21]:
from sklearn.metrics import classification_report

# return f1, precision, and recall score

print(classification_report(test['label'], y_pred, target_names = my_labels))

               precision    recall  f1-score   support

   earthquake       0.92      0.97      0.95      6494
no-earthquake       0.97      0.91      0.94      5829

     accuracy                           0.94     12323
    macro avg       0.94      0.94      0.94     12323
 weighted avg       0.94      0.94      0.94     12323

