# Compare NLP Techniques: Build Model On word2vec Vectors

### Read In Cleaned Text

In [2]:
# Load the cleaned training and test sets
import gensim
import numpy as np
import pandas as pd

X_train = pd.read_csv('data/X_train.csv')
X_test = pd.read_csv('data/X_test.csv')
y_train = pd.read_csv('data/y_train.csv')
y_test = pd.read_csv('data/y_test.csv')


### Create word2vec Vectors

In [3]:
# Train a basic word2vec model
w2v_model = gensim.models.Word2Vec(X_train,
                                   size=100,
                                   window=5,
                                   min_count=2)

In [4]:
# Replace the words in each text message with the learned word vector
words = set(w2v_model.wv.index2word)
X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                         for ls in X_train['clean_text']])


X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                         for ls in X_test['clean_text']])

  X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
  X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])


In [5]:
# Average the word vectors for each sentence (and assign a vector of zeros if the model
# did not learn any of the words in the text message during training
X_train_vect_avg = []
for v in X_train_vect:
    if v.size:
        X_train_vect_avg.append(v.mean(axis=0))
    else:
        X_train_vect_avg.append(np.zeros(100, dtype=float))
        
X_test_vect_avg = []
for v in X_test_vect:
    if v.size:
        X_test_vect_avg.append(v.mean(axis=0))
    else:
        X_test_vect_avg.append(np.zeros(100, dtype=float))

In [6]:
# What does the unaveraged version look like?
X_test_vect[0]

array([[ 0.00052976,  0.00129748, -0.00435183, ..., -0.00282719,
         0.00305199, -0.0037114 ],
       [ 0.00052976,  0.00129748, -0.00435183, ..., -0.00282719,
         0.00305199, -0.0037114 ],
       [ 0.00052976,  0.00129748, -0.00435183, ..., -0.00282719,
         0.00305199, -0.0037114 ],
       ...,
       [-0.00314388, -0.00413153, -0.00382374, ..., -0.00023136,
        -0.00277039, -0.00068424],
       [ 0.00052976,  0.00129748, -0.00435183, ..., -0.00282719,
         0.00305199, -0.0037114 ],
       [ 0.00052976,  0.00129748, -0.00435183, ..., -0.00282719,
         0.00305199, -0.0037114 ]], dtype=float32)

In [7]:
# What does the averaged version look like?
X_test_vect_avg[0]

array([-1.3817525e-04,  3.1038400e-04, -4.2558154e-03, -4.1776630e-03,
       -8.8688580e-04,  2.6434367e-03,  1.6403514e-03,  2.1463572e-03,
        1.0858827e-03, -7.3269551e-04,  3.4686334e-03, -2.7430775e-03,
        3.1948858e-03, -2.5256479e-03,  2.0413992e-03, -4.5796830e-04,
        6.4931711e-04, -4.0241065e-03, -2.2872582e-03, -2.2350156e-03,
        9.3134609e-04,  4.1169468e-03, -1.2247874e-03, -1.8431377e-04,
       -4.2313961e-03,  1.2366581e-03, -5.6617917e-04,  3.1511628e-03,
        2.9806406e-03,  2.9528842e-03, -3.5275936e-03,  1.4976065e-03,
       -2.6132101e-03,  1.1233944e-03, -1.3340815e-04, -4.8620622e-03,
        3.7058496e-03, -2.9357019e-04, -1.0671747e-04, -8.0434268e-04,
       -3.6717599e-04,  3.5600124e-03, -2.6058396e-03, -1.2634781e-03,
       -2.0134782e-03,  2.2437698e-03,  1.4429702e-04,  1.5926402e-03,
        2.3008336e-03, -3.6393001e-03, -1.9044623e-03, -1.2565989e-03,
       -3.0875874e-03, -3.3891611e-04, -1.0627392e-04, -2.8948216e-03,
      

### Fit RandomForestClassifier On Top Of Word Vectors

In [8]:
# Instantiate and fit a basic Random Forest model on top of the vectors
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf_model = rf.fit(X_train_vect_avg, y_train.values.ravel())

In [9]:
# Use the trained model to make predictions on the test data
y_pred = rf_model.predict(X_test_vect_avg)

In [10]:
# Evaluate the predictions of the model on the holdout test set
from sklearn.metrics import precision_score, recall_score

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round((y_pred==y_test['label']).sum()/len(y_pred), 3)))

Precision: 0.55 / Recall: 0.217 / Accuracy: 0.869
