# Word2Vec + ML Classifier

### Libraries

In [1]:
## Download and import libraries

import pandas as pd
import numpy as np
import nltk
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


In [2]:
## Load the pretrained English spaCy model
## Uncomment the three lines below to install spaCy, download the pretrained model, and fetch the NLTK stopword list

# !pip install spacy
# !python -m spacy download en_core_web_lg
# nltk.download('stopwords')


from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import spacy
import en_core_web_lg
# Load the en_core_web_lg pipeline once; the resulting `nlp` object is reused
# for every sentence that gets vectorised later in the notebook.
nlp=en_core_web_lg.load()

### Dataset

In [3]:
## Read train and test dataset
## Please provide the path where training and testing files exist

# Training split: fitted on by the classifiers further down.
train_df = pd.read_excel('P1_training.xlsx')
# Testing split: held out for evaluation only.
test_df = pd.read_excel('P1_testing.xlsx')
# Display the training frame as this cell's output.
train_df

Unnamed: 0,sentence,label
0,living in a concentration camp-like atmosphere...,1
1,"there's even a nod to "" the blues brothers , ""...",1
2,"park , lord , and screenwriter karey kirkpatri...",1
3,"ginger is perfect , spunky and opinionated , b...",2
4,jane horrocks delivers a lovely voice characte...,2
...,...,...
1655,"lin shae , who plays mary's neighbor magda ( a...",2
1656,steve martin took an extended vacation from al...,2
1657,much of the book spares tinseltown from mocker...,2
1658,"now , as writer and star of bowfinger , he off...",1


### Processing the dataset

In [4]:
## Clean the sentences in the training and testing data
## Lowercases the text, then removes punctuation and English stopwords


def get_tokens(sentence):
    """Split a sentence into whitespace-delimited tokens."""
    return sentence.split()

def text_preprocessing(df, stop_words=None):
    """Return a cleaned copy of ``df`` with a normalised ``sentence`` column.

    Each sentence is lowercased, stripped of every character that is neither
    a word character nor whitespace, and then filtered of stopwords.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``sentence`` column. It is not modified.
    stop_words : iterable of str, optional
        Words to drop. Defaults to the NLTK English stopword list.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` whose ``sentence`` column holds the cleaned text.
        Missing sentences become empty strings.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # A set gives O(1) membership tests; the NLTK list would be scanned per token.
    stop_words = set(stop_words)

    # Work on a copy so the caller's frame keeps its original text.
    cleaned = df.copy()
    # Missing values would otherwise reach get_tokens() as NaN and crash.
    sentences = cleaned['sentence'].fillna('').str.lower()
    # regex=True is required: recent pandas treats the pattern as a literal
    # string by default, which would leave all punctuation in place.
    sentences = sentences.str.replace(r'[^\w\s]', '', regex=True)
    cleaned['sentence'] = sentences.apply(
        lambda text: ' '.join(
            word for word in get_tokens(text) if word not in stop_words
        )
    )
    return cleaned

# Run the same cleaning routine over both splits.
train_df = text_preprocessing(train_df)
test_df = text_preprocessing(test_df)

In [5]:
train_df

Unnamed: 0,sentence,label
0,living concentration camplike atmosphere led o...,1
1,theres even nod blues brothers believe filmmak...,1
2,park lord screenwriter karey kirkpatrick reali...,1
3,ginger perfect spunky opinionated soft heart f...,2
4,jane horrocks delivers lovely voice characteri...,2
...,...,...
1655,lin shae plays marys neighbor magda also appea...,2
1656,steve martin took extended vacation facets mov...,2
1657,much book spares tinseltown mockery although r...,2
1658,writer star bowfinger offers masses plenty goo...,1


### Vector Representation

In [6]:
## Get vector representation for sentences

def vector_representation(sentence):
    """Embed a sentence as a single vector.

    Runs the module-level spaCy pipeline ``nlp`` over ``sentence`` and
    returns the ``vector`` attribute of the resulting document.
    """
    return nlp(sentence).vector

def get_vectors(df):
    """Add a ``vector_rep`` column holding one vector per sentence.

    Applies ``vector_representation`` to every entry of ``df['sentence']``,
    stores the results in ``df['vector_rep']`` (modifying ``df`` in place),
    and returns the same frame.
    """
    sentence_vectors = df['sentence'].apply(vector_representation)
    df['vector_rep'] = sentence_vectors
    return df

# Attach sentence vectors to both splits.
train_df = get_vectors(train_df)
test_df = get_vectors(test_df)

In [7]:
train_df

Unnamed: 0,sentence,label,vector_rep
0,living concentration camplike atmosphere led o...,1,"[-0.043274872, 0.13555712, 0.0031807376, -0.02..."
1,theres even nod blues brothers believe filmmak...,1,"[-0.0827837, 0.056825243, -0.09836677, 0.01558..."
2,park lord screenwriter karey kirkpatrick reali...,1,"[-0.105013736, 0.043996673, -0.100186825, 0.02..."
3,ginger perfect spunky opinionated soft heart f...,2,"[0.0011016836, 0.07213296, -0.24056295, 0.0744..."
4,jane horrocks delivers lovely voice characteri...,2,"[0.028003514, 0.043497793, -0.04778112, -0.024..."
...,...,...,...
1655,lin shae plays marys neighbor magda also appea...,2,"[-0.08198986, 0.06842123, -0.036075655, -0.046..."
1656,steve martin took extended vacation facets mov...,2,"[0.014995606, 0.08740837, 0.05923424, -0.04339..."
1657,much book spares tinseltown mockery although r...,2,"[-0.10096329, 0.1045848, 0.064322814, -0.05705..."
1658,writer star bowfinger offers masses plenty goo...,1,"[0.053507973, 0.14204131, 0.0054865074, 0.0234..."


### Reshape the datasets

In [8]:
## Reshape the vectors for ML Models

def _stack_vectors(vector_column):
    """Stack a column of equal-length 1-D vectors into one 2-D array.

    Returns an array with one row per entry of ``vector_column``. The width
    is taken from the vectors themselves rather than being hard-coded.
    """
    return np.vstack(vector_column.tolist())

# One row per sentence, one column per embedding dimension.
train_X = _stack_vectors(train_df['vector_rep'])
test_X = _stack_vectors(test_df['vector_rep'])

In [9]:
# Targets: the label column of each split.
train_Y = train_df['label']
test_Y = test_df['label']

### ML Models

In [10]:
# Random Forest Classifier
# Fit on the training vectors, then score on the held-out test vectors.
rfc=RandomForestClassifier(n_estimators=10,bootstrap=False,random_state=22)
rfc.fit(train_X,train_Y)
y_pred=rfc.predict(test_X)
# Metric functions take (y_true, y_pred); keep the same order as the report below.
print('Accuracy %s \n' % accuracy_score(test_Y, y_pred))
print ("Classification Report \n",classification_report(test_Y,y_pred))

Accuracy 0.5695461200585652 

Classification Report 
               precision    recall  f1-score   support

           0       0.20      0.11      0.14        82
           1       0.56      0.69      0.62       303
           2       0.64      0.58      0.61       298

    accuracy                           0.57       683
   macro avg       0.47      0.46      0.46       683
weighted avg       0.55      0.57      0.56       683



In [21]:
# Support Vector Classifier
# Seeded like the random forest above so repeated runs give the same scores.
svc=LinearSVC(random_state=22)
svc.fit(train_X,train_Y)
y_pred=svc.predict(test_X)
# Metric functions take (y_true, y_pred); keep the same order as the report below.
print('Accuracy %s \n' % accuracy_score(test_Y, y_pred))
print ("Classification Report \n",classification_report(test_Y,y_pred))

Accuracy 0.5929721815519766 

Classification Report 
               precision    recall  f1-score   support

           0       0.26      0.18      0.21        82
           1       0.59      0.67      0.63       303
           2       0.66      0.62      0.64       298

    accuracy                           0.59       683
   macro avg       0.50      0.49      0.50       683
weighted avg       0.58      0.59      0.59       683



In [22]:
## The linear SVC scored higher than the random forest on the test set
## (accuracy 0.59 vs 0.57), so its predictions are the ones exported.
## Update the path based on location of testing data

# Re-read the test file: `test_df` now holds cleaned sentences and vector columns.
temp_df=pd.read_excel('P1_testing.xlsx')
# Predict from the fitted SVC explicitly instead of reusing the global `y_pred`,
# which holds whichever model's predictions were computed last.
temp_df['predicted_label']=svc.predict(test_X)
temp_df=temp_df.rename(columns={"label": "golden_label"})
temp_df.to_csv("testing_output_Word2vec_Spacy.csv")