## BOW: Count Vectorizer

In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import string
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, PorterStemmer
import math
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve, auc, mean_squared_error
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.linear_model import LinearRegression, SGDRegressor, LogisticRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

In [None]:
df = pd.read_csv('/content/aditi_nlp.csv')
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
df['Message']=df['Message'].apply(lambda x: x.strip())    
df['Message']=df['Message'].apply(lambda x: x.lower())    
df['Message']=df['Message'].apply(lambda x: x.encode('ascii', 'ignore').decode())   
def remove_punct(text):
    text = re.sub("https*\S+", " ", text)
    text = re.sub("@\S+", " ", text)
    text = re.sub("#\S+", " ", text)
    text = re.sub("\'\w+", '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub(r'\w*\d+\w*', '', text)
    text = re.sub('\s{2,}', " ", text)
    return text

df['Message']=df['Message'].apply(lambda x: remove_punct(x))

In [None]:
df['Message']

0       go until jurong point crazy available only in ...
1                                ok lar joking wif u oni 
2       free entry in a wkly comp to win fa cup final ...
3            u dun say so early hor u c already then say 
4       nah i don think he goes to usf he lives around...
                              ...                        
5567    this is the time we have tried contact u u hav...
5568                   will b going to esplanade fr home 
5569    pity was in mood for that so any other suggest...
5570    the guy did some bitching but i acted like i b...
5571                            rofl its true to its name
Name: Message, Length: 5572, dtype: object

In [None]:
le = LabelEncoder()
df['Category'] = le.fit_transform(df['Category'])

In [None]:
X, y = df.Message , df['Category']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [None]:
df['Category'].value_counts()

0    4825
1     747
Name: Category, dtype: int64

##CV

In [None]:
vectorizer = CountVectorizer(max_features = 600, stop_words = 'english')
X_train_cv = vectorizer.fit_transform(X_train).toarray()
X_test_cv = vectorizer.fit_transform(X_test).toarray()

In [None]:
from sklearn.ensemble import RandomForestClassifier

dt = LogisticRegression()
dt.fit(X_train_cv, y_train)
predictions_dt = dt.predict(X_test_cv)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, recall_score, precision_score, classification_report

print(classification_report(y_test,predictions_dt))

              precision    recall  f1-score   support

           0       0.88      0.99      0.93      1207
           1       0.65      0.16      0.26       186

    accuracy                           0.88      1393
   macro avg       0.77      0.57      0.60      1393
weighted avg       0.85      0.88      0.84      1393



##TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfIdf= TfidfVectorizer(stop_words='english', max_features=3269)
X_train_tfidf = tfIdf.fit_transform(X_train)
X_test_tfidf = tfIdf.fit_transform(X_test)

In [None]:
dt_tf = DecisionTreeClassifier()
dt_tf.fit(X_train_tfidf, y_train)
predictions_dt_tfidf = dt_tf.predict(X_test_tfidf)
print(classification_report(y_test,predictions_dt_tfidf))

              precision    recall  f1-score   support

           0       0.87      0.94      0.91      1207
           1       0.21      0.10      0.13       186

    accuracy                           0.83      1393
   macro avg       0.54      0.52      0.52      1393
weighted avg       0.78      0.83      0.80      1393



## Word2Vec

In [None]:
df.head()

Unnamed: 0,Category,Message
0,0,go until jurong point crazy available only in ...
1,0,ok lar joking wif u oni
2,1,free entry in a wkly comp to win fa cup final ...
3,0,u dun say so early hor u c already then say
4,0,nah i don think he goes to usf he lives around...


In [None]:
from gensim.utils import simple_preprocess
# Tokenize the text column to get the new column 'tokenized_text'
df['tokenized_text'] = [simple_preprocess(line, deacc=True) for line in df['Message']] 
print(df['tokenized_text'].head(10))

0    [go, until, jurong, point, crazy, available, o...
1                          [ok, lar, joking, wif, oni]
2    [free, entry, in, wkly, comp, to, win, fa, cup...
3       [dun, say, so, early, hor, already, then, say]
4    [nah, don, think, he, goes, to, usf, he, lives...
5    [freemsg, hey, there, darling, it, been, week,...
6    [even, my, brother, is, not, like, to, speak, ...
7    [as, per, your, request, melle, oru, minnaminu...
8    [winner, as, valued, network, customer, you, h...
9    [had, your, mobile, months, or, more, entitled...
Name: tokenized_text, dtype: object


In [None]:

from gensim.parsing.porter import PorterStemmer
porter_stemmer = PorterStemmer()
# Get the stemmed_tokens
df['stemmed_tokens'] = [[porter_stemmer.stem(word) for word in tokens] for tokens in df['tokenized_text']]
df['stemmed_tokens'].head(10)

0    [go, until, jurong, point, crazi, avail, onli,...
1                            [ok, lar, joke, wif, oni]
2    [free, entri, in, wkly, comp, to, win, fa, cup...
3       [dun, sai, so, earli, hor, alreadi, then, sai]
4    [nah, don, think, he, goe, to, usf, he, live, ...
5    [freemsg, hei, there, darl, it, been, week, no...
6    [even, my, brother, is, not, like, to, speak, ...
7    [as, per, your, request, mell, oru, minnaminun...
8    [winner, as, valu, network, custom, you, have,...
9    [had, your, mobil, month, or, more, entitl, to...
Name: stemmed_tokens, dtype: object

In [None]:
def split_train_test(top_data_df_small, test_size=0.3, shuffle_state=True):
    X_train, X_test, Y_train, Y_test = train_test_split(df[['stemmed_tokens']], 
                                                        df['Category'], 
                                                        shuffle=shuffle_state,
                                                        test_size=test_size, 
                                                        random_state=15)
    print("Value counts for Train sentiments")
    print(Y_train.value_counts())
    print("Value counts for Test sentiments")
    print(Y_test.value_counts())
    print(type(X_train))
    print(type(Y_train))
    X_train = X_train.reset_index()
    X_test = X_test.reset_index()
    Y_train = Y_train.to_frame()
    Y_train = Y_train.reset_index()
    Y_test = Y_test.to_frame()
    Y_test = Y_test.reset_index()
    print(X_train.head())
    return X_train, X_test, Y_train, Y_test

# Call the train_test_split
X_train, X_test, Y_train, Y_test = split_train_test(df)

Value counts for Train sentiments
0    3377
1     523
Name: Category, dtype: int64
Value counts for Test sentiments
0    1448
1     224
Name: Category, dtype: int64
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
   index                                     stemmed_tokens
0    688  [dear, me, at, cherthala, in, case, come, coch...
1    800                    [gimm, few, wa, lt, minut, ago]
2   4990  [so, your, tell, me, coulda, been, your, real,...
3   5345                                     [wat, do, now]
4   3044           [your, bill, at, is, so, that, not, bad]


In [None]:
from gensim.models import Word2Vec
import time

# Skip-gram model (sg = 1)
size = 1000
window = 10
min_count = 1
workers = 3
sg = 0

# word2vec_model_file = OUTPUT_FOLDER + 'word2vec_' + str(size) + '.model'
start_time = time.time()
stemmed_tokens = pd.Series(df['stemmed_tokens']).values
# Train the Word2Vec Model
w2v_model = Word2Vec(stemmed_tokens, min_count = min_count, size = size, workers = workers, window = window, sg = sg)
print("Time taken to train word2vec model: " + str(time.time() - start_time))
# w2v_model.save(word2vec_model_file)

Time taken to train word2vec model: 3.3142993450164795


In [None]:
print("Index of the word 'action':")
# print(w2v_model.wv.vocab["action"].index)
# Total number of the words 
print(len(w2v_model.wv.vocab))
# Print the size of the word2vec vector for one word
print("Length of the vector generated for a word")
# print(len(w2v_model['action']))
# Get the mean for the vectors for an example review
print("Print the length after taking average of all word vectors in a sentence:")
# print(np.mean([w2v_model[token] for token in df['stemmed_tokens'][0]], axis=0))


Index of the word 'action':
6186
Length of the vector generated for a word
Print the length after taking average of all word vectors in a sentence:


In [None]:
word2vec_filename = 'train_review_word2vec.csv'
with open(word2vec_filename, 'w+') as word2vec_file:
    for index, row in X_train.iterrows():
        model_vector = (np.mean([w2v_model[token] for token in row['stemmed_tokens']], axis=0)).tolist()
        if index == 0:
            header = ",".join(str(ele) for ele in range(1000))
            word2vec_file.write(header)
            word2vec_file.write("\n")
        # Check if the line exists else it is vector of zeros
        if type(model_vector) is list:  
            line1 = ",".join( [str(vector_element) for vector_element in model_vector] )
        else:
            line1 = ",".join([str(0) for i in range(1000)])
        word2vec_file.write(line1)
        word2vec_file.write('\n')

  after removing the cwd from sys.path.
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [None]:
word2vec_df = pd.read_csv(word2vec_filename)
word2vec_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
0,0.068926,0.225933,0.371986,-0.036024,-0.194419,-0.16128,-0.021689,0.151048,-0.007574,0.252238,...,-0.032758,-0.028815,-0.068177,0.032769,0.030125,-0.138109,0.321199,-0.078571,0.094888,0.160082
1,0.058187,0.189866,0.312698,-0.029909,-0.164023,-0.136044,-0.018457,0.127234,-0.006427,0.212192,...,-0.027727,-0.024046,-0.057587,0.027623,0.025284,-0.117171,0.271285,-0.066413,0.080224,0.134933
2,0.075512,0.247578,0.407726,-0.039786,-0.212968,-0.17613,-0.023741,0.165809,-0.008314,0.276534,...,-0.035473,-0.031617,-0.074144,0.035375,0.032681,-0.150678,0.351488,-0.085524,0.103466,0.174866
3,0.091567,0.299684,0.494011,-0.048033,-0.257569,-0.214177,-0.028271,0.200558,-0.01017,0.334439,...,-0.043511,-0.038064,-0.089876,0.043304,0.040009,-0.182432,0.425895,-0.103984,0.124947,0.212173
4,0.096545,0.315623,0.519412,-0.04969,-0.27148,-0.225422,-0.030426,0.210386,-0.010462,0.351854,...,-0.045414,-0.040281,-0.094821,0.045383,0.041744,-0.193537,0.449081,-0.110065,0.132845,0.224251


In [None]:
word2vec_df.shape

(3900, 1000)

In [None]:
#Initialize the model
clf_decision_word2vec = DecisionTreeClassifier()

start_time = time.time()
# Fit the model
clf_decision_word2vec.fit(word2vec_df, Y_train['Category'])
print("Time taken to fit the model with word2vec vectors: " + str(time.time() - start_time))


Time taken to fit the model with word2vec vectors: 3.5202829837799072


In [None]:
from sklearn.metrics import classification_report
test_features_word2vec = []
for index, row in X_test.iterrows():
    model_vector = np.mean([w2v_model[token] for token in row['stemmed_tokens']], axis=0)
    if type(model_vector) is list:
        test_features_word2vec.append(model_vector)
    else:
        test_features_word2vec.append(np.array([0 for i in range(1000)]))
test_predictions_word2vec = clf_decision_word2vec.predict(test_features_word2vec)
print(classification_report(Y_test['Category'],test_predictions_word2vec))

  after removing the cwd from sys.path.
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


              precision    recall  f1-score   support

           0       0.87      1.00      0.93      1448
           1       0.00      0.00      0.00       224

    accuracy                           0.87      1672
   macro avg       0.43      0.50      0.46      1672
weighted avg       0.75      0.87      0.80      1672



  "X does not have valid feature names, but"
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
