# Assignment 07 - Sentiment Analysis


* Importing necessary libraries and dataset

In [1]:
import numpy as np
import pandas as pd

import gensim
from gensim.models import Word2Vec

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

pd.set_option('display.max_colwidth',None)

In [3]:
data = pd.read_csv('/content/drive/MyDrive/Data/tweets.csv')

In [4]:
data.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone
1,2,0,Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/
2,3,0,We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu
3,4,0,I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home http://instagr.am/p/Li_5_ujS4k/
4,5,1,What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7920 entries, 0 to 7919
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      7920 non-null   int64 
 1   label   7920 non-null   int64 
 2   tweet   7920 non-null   object
dtypes: int64(2), object(1)
memory usage: 185.8+ KB


* Checking for null entries

In [6]:
data.isna().sum()

id       0
label    0
tweet    0
dtype: int64

In [7]:
data['label'].value_counts()

label
0    5894
1    2026
Name: count, dtype: int64

* Data Preprocessing

In [10]:
data['cleaned_text'] = data['tweet'].apply(lambda x:gensim.utils.simple_preprocess(x))
data.head()

Unnamed: 0,id,label,tweet,cleaned_text
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone,"[fingerprint, pregnancy, test, https, goo, gl, mfqv, android, apps, beautiful, cute, health, igers, iphoneonly, iphonesia, iphone]"
1,2,0,Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/,"[finally, transparant, silicon, case, thanks, to, my, uncle, yay, sony, xperia, sonyexperias, http, instagram, com, yget, jc, jm]"
2,3,0,We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu,"[we, love, this, would, you, go, talk, makememories, unplug, relax, iphone, smartphone, wifi, connect, http, fb, me, lsupcu]"
3,4,0,I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home http://instagr.am/p/Li_5_ujS4k/,"[wired, know, george, was, made, that, way, iphone, cute, daventry, home, http, instagr, am, li_]"
4,5,1,What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!,"[what, amazing, service, apple, won, even, talk, to, me, about, question, have, unless, pay, them, for, their, stupid, support]"


In [15]:
data.columns

Index(['id', 'label', 'tweet', 'cleaned_text'], dtype='object')

* Train test split

In [16]:
x_train,x_test,y_train,y_test=train_test_split(data['cleaned_text'],data['label'],test_size=0.3)

In [17]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(5544,)
(2376,)
(5544,)
(2376,)


* Create a Word2Vec model named w2v_model using the sentences in x_train as training data

In [26]:
w2v_model = gensim.models.Word2Vec(x_train,min_count=2)

In [27]:
words = w2v_model.wv.index_to_key
len(words)

5426

* Converting text data into numerical vectors using a pre-trained Word2Vec model (w2v_model) for each word in the provided reviews

In [28]:
x_train_vec = []
x_test_vec = []

for rvw in x_train:
  vec = []
  for word in rvw:
    if word in words:
      vec.append(w2v_model.wv['word'])

  x_train_vec.append(np.array(vec))

for rvw in x_test:
  vec = []
  for word in rvw:
    if word in words:
      vec.append(w2v_model.wv['word'])

  x_test_vec.append(np.array(vec))

* Convert the individual word vectors (which were obtained using Word2Vec) for each review into a single vector that represents the overall "meaning" of that review

In [29]:
x_train_vec_avg =[]
x_test_vec_avg = []

def cal_avg_vec(v):
  if v.size:
    return v.mean(axis = 0)
  else:
    np.zeros(100,dtype=float)

for v in x_train_vec:
  x_train_vec_avg.append(cal_avg_vec(v))

for v in x_test_vec:
  x_test_vec_avg.append(cal_avg_vec(v))

* Converting x_train_vec_avg and x_test_vec_avg into numpy arrays before training the model

In [30]:
x_train_vec_avg = np.array(x_train_vec_avg)
x_test_vec_avg = np.array(x_test_vec_avg)

* Model building and training

In [31]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100)
clf.fit(x_train_vec_avg,y_train)

* Predicting

In [32]:
y_pred = clf.predict(x_test_vec_avg)

* Accuracy

In [33]:
accuracy_score(y_test,y_pred)

0.7436868686868687