In [None]:
# import libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, accuracy_score, silhouette_score
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN, KMeans
import nltk

import warnings 
# NOTE(review): "ignore" with no category silences every warning for the rest of the
# session, including deprecation notices raised by the libraries used below
warnings.filterwarnings("ignore")

In [2]:
# load the labeled tweets into a dataframe
# NOTE(review): the path is absolute and machine-specific

df = pd.read_csv(
    filepath_or_buffer = '/Users/juhee/Desktop/GA/lsmtmp/juhee/data/final_labeled.csv'
)

# report (rows, columns), then preview the first rows
print(df.shape)
df.head()

(38325, 10)


Unnamed: 0,id,user_name,lat,long,date,text,text_clean,n_sentence,n_words,label
0,1241152458912673794,DarkDon_,40.738154,-112.101609,2020-03-20 23:59:40+00:00,TK Kirkland's Reaction To Earthquake's Ex-Wife...,tk kirkland s reaction to earthquake ex wife o...,1,17,1
1,1241152416214474753,ReallyInID,42.242918,-112.103304,2020-03-20 23:59:30+00:00,That sound you hear is NOT an earthquake. It’s...,that sound you hear is not an earthquake it s...,2,13,1
2,1241152284827856896,folsworth,40.982162,-112.10402,2020-03-20 23:58:58+00:00,he said he stocked up on ammo cause he knew Th...,he said stocked up on ammo cause knew the big ...,2,23,1
3,1241152178695405570,Colony14,42.729481,-112.107004,2020-03-20 23:58:33+00:00,"If an earthquake separates CA, WA, and OR from...",if an earthquake separate ca wa and or from t...,1,23,1
4,1241152072818364416,MonitorSismico,41.044996,-112.103864,2020-03-20 23:58:08+00:00,"#Sismo M 4 SOUTH OF BALI, #INDONESIA. 20-03-20...",sismo m south of bali indonesia utc http...,2,13,1


In [3]:
# count the missing values in every column

df.isna().sum()

id            0
user_name     0
lat           0
long          0
date          0
text          0
text_clean    0
n_sentence    0
n_words       0
label         0
dtype: int64

In [4]:
# define baseline accuracy: share of each class in 'label'; always predicting the
# larger class would reach the larger of these two proportions

df['label'].value_counts(normalize = True)

1    0.511155
0    0.488845
Name: label, dtype: float64

In [5]:
# define labels (display names used as `target_names` in the classification report)
# NOTE(review): classification_report pairs target_names with the label values in sorted
# order, so 'earthquake' is printed for label 0 and 'no-earthquake' for label 1 —
# confirm this matches how the data was labeled
my_labels = ['earthquake', 'no-earthquake']

### Final model: word2vec

In [25]:
# import gensim and load pre-trained word vectors used to vectorize the words
# (the file name suggests 300-dimensional LexVec vectors — TODO confirm)

import gensim

# NOTE(review): absolute, machine-specific path
file_path = '/Users/juhee/Desktop/GA/08-week/8.05-lesson-word-vectors/lexvec.enwiki+newscrawl.300d.W.pos.vectors'

# read the vectors from a file in word2vec format
model = gensim.models.KeyedVectors.load_word2vec_format(file_path)

In [32]:
# import stopwords

from gensim.parsing.preprocessing import STOPWORDS

In [33]:
# extend gensim's stop-word set with the word 'earthquake'

my_stop_words = STOPWORDS | {'earthquake'}

In [34]:
## code taken from
## https://github.com/susanli2016/NLP-with-Python/blob/master/Text%20Classification%20model%20selection.ipynb
## and adapted

# pre-compute the unit-normalized vectors that word_averaging reads through syn0norm;
# presumably replace = True keeps only the normalized copies — TODO confirm against the
# installed gensim version
model.init_sims(replace = True)

In [35]:
# peek at 20 entries (positions 13030 to 13049) of the loaded model's vocabulary

from itertools import islice
list(islice(model.vocab, 13030, 13050))

['bal',
 'harley',
 'proponents',
 'escalating',
 'madeleine',
 'crushing',
 'yielded',
 'understandable',
 'agnes',
 'victorious',
 'rockefeller',
 'deeds',
 'jude',
 'doomed',
 'sundays',
 'rejecting',
 'prep',
 'concession',
 'leopold',
 'dislike']

In [28]:
# logging is used to emit our own warning messages

import logging

In [41]:
# average word vectors

def word_averaging(model, words):
    """Average the word vectors of one tokenized tweet into a single unit vector.

    Parameters
    ----------
    model : gensim KeyedVectors-like object
        Must expose `vocab`, `syn0norm` and `vector_size` (gensim 3.x attribute names).
    words : iterable
        Tokens of one tweet. Items may be strings or already-computed numpy vectors.

    Returns
    -------
    numpy.ndarray
        float32 vector of length `model.vector_size`: the unit-normalized mean of the
        collected vectors, or all zeros when no token contributes a vector (a warning
        is logged in that case).
    """
    vectors = []

    for word in words:

        # ready-made vectors are used as they are; this test has to come before the
        # stop-word lookup because numpy arrays are unhashable and a set membership
        # test on them raises TypeError
        if isinstance(word, np.ndarray):
            vectors.append(word)
            continue

        # skip stop words and anything the embedding has no vector for
        if word in my_stop_words or word not in model.vocab:
            continue

        # append the unit-normalized vector of the word
        vectors.append(model.syn0norm[model.vocab[word].index])

    # nothing to average: produce a warning message and fall back to 0s
    # (float32, so the fallback has the same dtype as the regular return value)
    if not vectors:
        logging.warning('cannot compute similarity with no input %s', words)
        return np.zeros(model.vector_size, dtype = np.float32)

    # average the collected vectors and rescale the result to unit length
    return gensim.matutils.unitvec(np.array(vectors).mean(axis = 0)).astype(np.float32)

def word_averaging_list(model, text_list):
    """Vectorize every tokenized tweet and stack the results row by row.

    Returns a 2-D array holding one `word_averaging` result per entry of `text_list`,
    in the same order.
    """
    rows = []
    for tweet in text_list:
        rows.append(word_averaging(model, tweet))
    return np.vstack(rows)

In [42]:
# tokenize the text with word2vec

def w2v_tokenize_text(text):
    """Split `text` into word tokens, dropping tokens shorter than two characters.

    The text is split into sentences with nltk, each sentence is split into words, and
    the surviving tokens are returned as one flat list in reading order.
    """
    return [
        token
        for sentence in nltk.sent_tokenize(text, language = 'english')
        for token in nltk.word_tokenize(sentence, language = 'english')
        if len(token) >= 2
    ]

In [43]:
# train test split - the whole dataframe is split, so the target column ('label')
# stays inside `train` and `test` and is selected from them when fitting and scoring

train, test = train_test_split(df, test_size = 0.3, random_state = 42)

# then tokenize the raw tweet text; the tokenizer is applied to the 'text' column
# directly instead of building a row object for every tweet

test_tokenized = test['text'].apply(w2v_tokenize_text).values
train_tokenized = train['text'].apply(w2v_tokenize_text).values

In [None]:
# turn every tokenized tweet into one averaged word vector, for the train and the test dataset

X_train_word_average = word_averaging_list(model, train_tokenized)
X_test_word_average = word_averaging_list(model, test_tokenized)

# word_averaging logs a warning for every tweet that yields no vector, so this cell
# prints a lot of warnings - clear the output

In [45]:
# support vector machine: fit on the averaged train vectors, then predict the test set

svc = SVC().fit(X = X_train_word_average, y = train['label'])
y_pred = svc.predict(X = X_test_word_average)

In [47]:
# share of test tweets classified correctly; arguments follow the (y_true, y_pred) order
accuracy_score(test['label'], y_pred)

0.8667594364237259

In [48]:
from sklearn.metrics import classification_report

# print precision, recall, and f1 score for each class
# NOTE(review): target_names are matched to the label values in sorted order, so the first
# name in my_labels is printed for label 0 and the second for label 1 — confirm the mapping

print(classification_report(test['label'], y_pred, target_names = my_labels))

               precision    recall  f1-score   support

   earthquake       0.85      0.89      0.87      5618
no-earthquake       0.89      0.85      0.87      5880

     accuracy                           0.87     11498
    macro avg       0.87      0.87      0.87     11498
 weighted avg       0.87      0.87      0.87     11498



### We were able to achieve an F1 score of 0.87!