In [21]:
import gensim
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, accuracy_score

In [3]:
# Mount Google Drive inside the Colab runtime so files stored there can be
# read through the local filesystem.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [7]:
# Load the spam dataset from the mounted Drive. The file is decoded as
# latin-1, the three 'Unnamed' columns are discarded, and the remaining two
# columns are renamed to 'label' and 'text'.
SPAM_CSV_PATH = '/content/drive/MyDrive/NLP/spam.csv'
UNUSED_COLUMNS = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']

df = pd.read_csv(SPAM_CSV_PATH, encoding='latin-1').drop(columns=UNUSED_COLUMNS)
df.columns = ['label', 'text']
df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
# Tokenise every message with gensim's built-in simple_preprocess and store
# the resulting token list in a new 'text_clean' column.
df['text_clean'] = df['text'].apply(gensim.utils.simple_preprocess)
df.head()

Unnamed: 0,label,text,text_clean
0,ham,"Go until jurong point, crazy.. Available only ...","[go, until, jurong, point, crazy, available, o..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, wkly, comp, to, win, fa, cup..."
3,ham,U dun say so early hor... U c already then say...,"[dun, say, so, early, hor, already, then, say]"
4,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, don, think, he, goes, to, usf, he, lives..."


In [10]:
# Encode the labels, split into train/test sets, and train Word2Vec on the
# training messages only, so that no test text is seen by the embeddings.
SEED = 42              # fixed seed so the split and the embeddings are repeatable
W2V_VECTOR_SIZE = 100  # dimensionality of the learned word vectors

# Encode the label column as ham -> 1, spam -> 0.
# Guarded so that re-running this cell is a no-op: mapping a column that is
# already 1/0 through the dict would turn every value into NaN.
if not pd.api.types.is_numeric_dtype(df['label']):
    df['label'] = df['label'].map({'ham': 1, 'spam': 0})
assert df['label'].isin([0, 1]).all(), "unexpected value in 'label' column"

# Split data into train and test sets (80% / 20%).
xtrain, xtest, ytrain, ytest = train_test_split(
    df['text_clean'], df['label'], test_size=0.2, random_state=SEED
)

# NOTE: `size=` is the gensim 3.x keyword (gensim >= 4 renamed it to
# `vector_size=`); it is kept because the rest of this notebook uses the
# 3.x API (`index2word`).
# NOTE: `seed` fixes the initial weights; gensim documents that fully
# deterministic training additionally needs workers=1.
w2v = gensim.models.Word2Vec(
    xtrain, size=W2V_VECTOR_SIZE, window=5, min_count=2, seed=SEED
)

In [None]:
# Inspect the label column ('ham'/'spam' as loaded; 1/0 once the encoding
# cell has been run).
df['label']

In [None]:
# w2v model - index to key.
# Show the vocabulary size and only the first entries of the index -> word
# list rather than dumping every word into the cell output.
vocab_words = w2v.wv.index2word
print(len(vocab_words), 'words in vocabulary')
vocab_words[:20]

In [13]:
# Find the most similar words to 'king' based on word vectors from our
# trained model.
# NOTE(review): in the recorded output every neighbour scores ~0.997 and none
# looks related to 'king' — presumably the corpus is too small for the model
# to learn meaningful vectors, so treat these embeddings as weak features.
# NOTE(review): this lookup fails if 'king' did not make it into the trained
# vocabulary (training uses min_count = 2 on a random train split).
w2v.wv.most_similar('king')

[('part', 0.9974408149719238),
 ('long', 0.9974242448806763),
 ('evening', 0.9974095225334167),
 ('everyone', 0.9974071383476257),
 ('while', 0.9973952770233154),
 ('shows', 0.997390866279602),
 ('hot', 0.9973853826522827),
 ('motorola', 0.9973711967468262),
 ('hand', 0.9973613023757935),
 ('boy', 0.9973571300506592)]

In [17]:
def embed_tokens(docs, keyed_vectors, vocab):
    """Look up the word vector of every in-vocabulary token of every document.

    Args:
        docs: sized iterable of token lists (one list per message).
        keyed_vectors: mapping from word to vector (here ``w2v.wv``).
        vocab: set of words known to ``keyed_vectors``; other tokens are skipped.

    Returns:
        1-D object array with one entry per document. Each entry is the array
        of that document's word vectors, or an empty array when none of its
        tokens is in ``vocab``.
    """
    # Documents have different token counts, so the result is ragged. Filling
    # a pre-allocated object array avoids np.array() on ragged input, which
    # emits a deprecation warning on older NumPy and raises on newer releases.
    embedded = np.empty(len(docs), dtype=object)
    for pos, tokens in enumerate(docs):
        embedded[pos] = np.array([keyed_vectors[tok] for tok in tokens if tok in vocab])
    return embedded


# Set of words the model learned a vector for (set gives O(1) membership tests).
words = set(w2v.wv.index2word)
xtrain_vect = embed_tokens(xtrain, w2v.wv, words)
xtest_vect = embed_tokens(xtest, w2v.wv, words)

  
  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
# Show the shapes of the first few per-message vector arrays instead of
# dumping the whole nested array; the differing first dimension is why the
# vectors are averaged per message in the next step.
[doc_vectors.shape for doc_vectors in xtrain_vect[:5]]

In [19]:
def average_word_vectors(doc_vectors, dim):
    """Compute one sentence vector per document by averaging its word vectors.

    Args:
        doc_vectors: iterable of arrays, one per document, each holding that
            document's word vectors (an empty array if it has none).
        dim: length of the zero vector used for documents without any
            word vector.

    Returns:
        List with one 1-D vector per document.
    """
    return [
        vectors.mean(axis=0) if vectors.size else np.zeros(dim, dtype=float)
        for vectors in doc_vectors
    ]


# Take the dimensionality from the trained model so the zero-vector fallback
# always matches the word vectors instead of relying on a hard-coded 100.
embedding_dim = w2v.wv.vector_size
xtrain_vect_avg = average_word_vectors(xtrain_vect, embedding_dim)
xtest_vect_avg = average_word_vectors(xtest_vect, embedding_dim)

In [22]:
# Instantiate and fit a basic random forest model on top of the averaged
# sentence vectors, then score it on the held-out test set.
rf = RandomForestClassifier(random_state=42)  # fixed seed so the metrics are repeatable
rf_model = rf.fit(xtrain_vect_avg, ytrain.values.ravel())
ypred = rf_model.predict(xtest_vect_avg)

# precision_score / recall_score use label 1 as the positive class by default.
# With the ham -> 1, spam -> 0 encoding used in this notebook, the two numbers
# below therefore describe the ham class; pass pos_label=0 to score spam
# detection instead.
prec = precision_score(ytest, ypred)
recall = recall_score(ytest, ypred)
acc = accuracy_score(ytest, ypred)
print("Accuracy - ", acc)
print("Precision - ", prec)
print("Recall - ", recall)

Accuracy -  0.9623318385650225
Precision -  0.9656912209889001
Recall -  0.9917098445595854
