# Word2Vec + ML Classifier (Gensim)

In [17]:
## Run the following cell to download pretrained model and stopwords
## (needs network access; the model archive is fetched with wget and unpacked with gunzip)

import nltk
# Fetch the NLTK stop-word corpus
nltk.download('stopwords')
# Download the compressed pretrained GoogleNews word2vec binary (300-dimensional vectors)
!wget https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz 
# Unpack the archive; NOTE(review): the /content/ path assumes wget ran with /content as the working directory - confirm
!gunzip /content/GoogleNews-vectors-negative300.bin.gz

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
## Import libraries
import pandas as pd
import numpy as np
import nltk
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import gensim


### Dataset

In [12]:
## Read train and test dataset
## Please provide the path where training and testing files exist
## Both files are Excel workbooks; the displayed frame shows the columns
## 'Unnamed: 0', 'sentence' and 'label'.

train_df=pd.read_excel('/content/P1_training.xlsx')
test_df=pd.read_excel('/content/P1_testing.xlsx')
# Last expression of the cell: renders the training frame as the cell output
train_df

Unnamed: 0,sentence,label
0,living in a concentration camp-like atmosphere...,1
1,"there's even a nod to "" the blues brothers , ""...",1
2,"park , lord , and screenwriter karey kirkpatri...",1
3,"ginger is perfect , spunky and opinionated , b...",2
4,jane horrocks delivers a lovely voice characte...,2
...,...,...
1655,"lin shae , who plays mary's neighbor magda ( a...",2
1656,steve martin took an extended vacation from al...,2
1657,much of the book spares tinseltown from mocker...,2
1658,"now , as writer and star of bowfinger , he off...",1


### Preprocessing the dataset

In [13]:
## Clean the sentences of the training and testing data
## Lowercases the text and removes punctuation and stopwords

def get_tokens(sentence):
    """Split ``sentence`` on runs of whitespace and return the list of tokens."""
    tokens = sentence.split()
    return tokens

def text_preprocessing(df):
    """Normalise the ``sentence`` column of ``df`` for word-vector lookup.

    Steps, applied in order:
      1. lowercase the text,
      2. strip every character that is neither a word character nor whitespace,
      3. drop English stop words (NLTK list).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named ``sentence``.

    Returns
    -------
    pandas.DataFrame
        The same frame object; its ``sentence`` column is overwritten in place.
    """
    # A set gives O(1) membership tests instead of scanning the list per token.
    stop_words = set(stopwords.words('english'))
    df['sentence'] = df['sentence'].str.lower()
    # regex=True must be explicit: from pandas 2.0 the default is a literal
    # match, which would leave all punctuation untouched. The raw string also
    # avoids invalid-escape warnings for \w and \s.
    df['sentence'] = df['sentence'].str.replace(r'[^\w\s]', '', regex=True)
    df['sentence'] = df['sentence'].apply(
        lambda x: ' '.join(word for word in get_tokens(x) if word not in stop_words)
    )
    return df

# Clean both splits with the same pipeline; the returned frames are bound back
# to the original names, so after this cell 'sentence' holds the cleaned text.
train_df=text_preprocessing(train_df)
test_df=text_preprocessing(test_df)

### Train-test Sentences

In [4]:
## Turn every cleaned sentence into a list of whitespace-separated tokens

train_sentences = [text.split() for text in train_df.sentence.values]
test_sentences = [text.split() for text in test_df.sentence.values]


### Gensim Word2Vec model

In [5]:
## Load model
## Builds a Word2Vec model whose vocabulary comes from the training sentences,
## seeds it with the pretrained GoogleNews vectors, then continues training on
## the training sentences.
## NOTE(review): `size=`, `.vocab` and `lockf=` look like the gensim 3.x API -
## confirm the installed gensim version before upgrading.
from gensim.models import KeyedVectors

# Load the pretrained vectors from the binary word2vec file
trained_model = KeyedVectors.load_word2vec_format("/content/GoogleNews-vectors-negative300.bin",
                                         binary = True)
# Fresh, untrained model: 300-dimensional vectors, context window of 5,
# min_count of 3, two worker threads
model = gensim.models.Word2Vec(size = 300, window=5,
min_count = 3, workers = 2)
# Vocabulary from the training corpus
model.build_vocab(train_sentences) 
# Feed the whole pretrained vocabulary as one extra "sentence" to extend the vocabulary.
# NOTE(review): each pretrained word occurs only once here, so with min_count=3
# new words may be discarded again - verify that this call has the intended effect.
model.build_vocab([list(trained_model.vocab.keys())], update=True)
# Copy pretrained vectors for words present in both vocabularies.
# NOTE(review): the binary file is read a second time here; presumably lockf=1.0
# keeps the imported vectors trainable - confirm against the gensim docs.
model.intersect_word2vec_format('/content/GoogleNews-vectors-negative300.bin', lockf=1.0, binary=True)
# Continue training on the training sentences for 5 epochs; as the last
# expression of the cell, the value returned by train() is shown as output
model.train(train_sentences, total_examples=len(train_sentences), epochs = 5)



  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


(149062, 207745)

### Word2vec Vectors

In [6]:
## Calculate the feature vectors

def get_FeatureVectors(words,model,features):
    """Average the word vectors of the in-vocabulary tokens of one sentence.

    Parameters
    ----------
    words : iterable of str
        Tokens of a single sentence.
    model : object
        Trained embedding model exposing ``model.wv``, which must support
        ``word in model.wv`` and ``model.wv[word]`` (e.g. a gensim Word2Vec).
    features : int
        Dimensionality of the word vectors.

    Returns
    -------
    numpy.ndarray
        float32 vector of length ``features`` holding the mean of the known
        word vectors. If no token is in the vocabulary (or ``words`` is empty)
        the all-zero vector is returned instead of NaNs.
    """
    vector = np.zeros(features, dtype="float32")
    # Query the keyed vectors directly: avoids rebuilding a set of the whole
    # vocabulary on every call and the deprecated ``model[word]`` access.
    keyed_vectors = model.wv
    cnt = 0
    for word in words:
        if word in keyed_vectors:
            cnt += 1
            vector = np.add(vector, keyed_vectors[word])
    # Guard against division by zero: a sentence without any known word would
    # otherwise yield a NaN vector that the downstream classifier rejects.
    if cnt == 0:
        return vector
    return np.divide(vector, cnt)

def get_SentenceVectors(sentences,model,features):
    """Build a (number of sentences, features) float32 matrix of sentence vectors.

    Row ``i`` is the result of ``get_FeatureVectors`` for ``sentences[i]``.
    """
    matrix = np.zeros((len(sentences), features), dtype="float32")
    for row, tokens in enumerate(sentences):
        matrix[row] = get_FeatureVectors(tokens, model, features)
    return matrix
 
# One averaged 300-dimensional vector per sentence (300 matches the Word2Vec
# `size` used when the model was created); rows follow the sentence order.
trainDataVecs = get_SentenceVectors(train_sentences, model, 300)
testDataVecs = get_SentenceVectors(test_sentences, model, 300)

  


### ML Model

In [9]:
## Random Forest Classifier
## Fits a small forest (5 trees, fixed seed) on the averaged sentence vectors
## and evaluates it on the test split.

forest = RandomForestClassifier(n_estimators=5,random_state=22)

print("Fitting random forest to training data....")
forest = forest.fit(trainDataVecs, train_df["label"])
# Predicted labels for the test sentences; reused by the export cell below
result = forest.predict(testDataVecs)

# scikit-learn metrics take (y_true, y_pred); use the same argument order for
# both calls. Accuracy is symmetric, so the reported value is unchanged.
print('accuracy %s' % accuracy_score(test_df['label'], result))
print(classification_report(test_df['label'], result))

Fitting random forest to training data....
accuracy 0.4612005856515373
              precision    recall  f1-score   support

           0       0.08      0.11      0.10        82
           1       0.51      0.57      0.54       303
           2       0.56      0.45      0.50       298

    accuracy                           0.46       683
   macro avg       0.38      0.38      0.38       683
weighted avg       0.48      0.46      0.47       683



In [10]:
## Please provide the path where testing files exist
## The test file is re-read so the exported sentences are the original,
## un-preprocessed text; use the same location as the dataset-loading cell.
temp_df=pd.read_excel('/content/P1_testing.xlsx')
# Attach the classifier output next to the reference labels
temp_df['predicted_label']=result
temp_df=temp_df.rename(columns={"label": "golden_label"})
# Written to the current working directory
temp_df.to_csv("testing_output_Word2vec_Gensim.csv")