# Movie sentimental classfication

In [61]:
import pandas as pd

df = pd.read_csv('IMDB_Dataset.csv')

print(df.columns)

# comment the below lines if you want to try this in large datasets
data = {
    "No": [1, 2, 3, 4],
    "review": [
        "Movie was boring",
        "Movie actions were very good",
        "Movie was good",
        "Movie story was very bad"
    ],
    "sentiment": ["Negative", "Positive", "Positive", "Negative"]
}
# Create DataFrame
df = pd.DataFrame(data)

df 

Index(['review', 'sentiment'], dtype='object')


Unnamed: 0,No,review,sentiment
0,1,Movie was boring,Negative
1,2,Movie actions were very good,Positive
2,3,Movie was good,Positive
3,4,Movie story was very bad,Negative


Getting the vocabulary from the unique values from the text input

In [62]:
sent_tokens = df.review.map(lambda x: set(x.split(' ')))
vocab = sent_tokens.explode().unique()
print(f'Voculary: {vocab}')
print(f'Vocabulary size: {vocab.shape[0]}')

Voculary: ['Movie' 'was' 'boring' 'were' 'good' 'very' 'actions' 'bad' 'story']
Vocabulary size: 9


Assiging index values to each unique words in the vocabulary

In [63]:
word_index = {}
for index , words in enumerate(vocab):
    word_index[words] = index

word_index

{'Movie': 0,
 'was': 1,
 'boring': 2,
 'were': 3,
 'good': 4,
 'very': 5,
 'actions': 6,
 'bad': 7,
 'story': 8}

Assigning index values for each words in the sentences , therby converting text into vector representation

In [64]:
sent_indices = sent_tokens.map(lambda x: [word_index[word] for word in x])

print('Sentences string representation') 
print(sent_tokens)

print('Sentences number representation')
print(sent_indices)

Sentences string representation
0                  {Movie, was, boring}
1    {were, good, very, Movie, actions}
2                    {Movie, was, good}
3        {was, very, bad, Movie, story}
Name: review, dtype: object
Sentences number representation
0          [0, 1, 2]
1    [3, 4, 5, 0, 6]
2          [0, 1, 4]
3    [1, 5, 7, 0, 8]
Name: review, dtype: object


Converting postive and negatives to 1 and 0 respectively

In [65]:
target = df.sentiment.map(lambda x: 1 if x == 'Positive' else 0)
target

0    0
1    1
2    1
3    0
Name: sentiment, dtype: int64

# Model development

Weights Initialization

In [66]:
import numpy as np

# increase the hidden size for caputring more features
hidden_size = 10

weights_0_1 = 0.2*np.random.random((len(vocab),hidden_size)) - 0.1
weights_1_2 = 0.2*np.random.random((hidden_size,1)) - 0.1

pd.DataFrame(weights_0_1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-0.07398,0.098261,0.092965,-0.066388,-0.011961,-0.088054,-0.003089,-0.043435,-0.07192,0.081986
1,0.030255,-0.014901,-0.010139,0.051264,0.032952,-0.020704,0.053456,0.034872,-0.034435,-0.052285
2,0.067736,-0.064053,0.078827,0.077187,0.033873,0.067507,0.002935,-0.022488,-0.012487,0.075166
3,-0.069594,-0.032719,0.070568,-0.018031,0.079513,-0.076135,-0.002726,0.032917,0.047635,-0.037925
4,-0.03111,0.018861,0.016609,-0.029621,0.038772,0.08361,-0.05717,0.07969,0.067313,-0.007265
5,-0.076487,0.055454,0.03465,0.028094,-0.09307,-0.038858,-0.053667,0.091784,-0.097305,-0.048323
6,0.017615,-0.065482,-0.033781,0.001049,-0.039667,0.018366,-0.067613,0.079818,-0.05423,-0.09168
7,-0.097806,-0.019705,-0.047896,-0.059787,-0.060634,0.019132,0.038606,0.081531,-0.085805,0.001163
8,-0.038481,-0.017671,0.081217,0.053139,-0.009114,-0.029167,0.011763,-0.052071,-0.069424,0.094191


Neural network to compute the sentiments from the reviews

In [67]:
alpha, iteration = 0.1, 2

def sigmoid(x):
    return 1/(1+np.exp(-x))

def neural_network(x,y):

    global weights_0_1, weights_1_2 

    # print(f'Input: len = {len(x)} , data = {x}')

    layer_1 = sigmoid(np.sum(weights_0_1[x],axis=0))
    # print(f'Layer 1 shape: {layer_1.shape}')
    layer_2 = sigmoid(np.dot(layer_1,weights_1_2))
    
    layer_2_delta = layer_2 - y
    # print(f'weights_1_2 shape: {weights_1_2.T.shape}')

    layer_1_delta = layer_2_delta.dot(weights_1_2.T)  
    # print(f'layer_1_delta shape: {layer_1_delta.shape}')

    weights_1_2 -= np.outer(layer_1 , layer_2_delta) * alpha
    weights_0_1[x] -= layer_1_delta * alpha

    return layer_2

correct , total = 0, 0
for i in range(iteration):
    for x,y in zip(sent_indices,target):
        output = neural_network(x,y)

        # print(np.abs(output))
        correct += 1 if np.abs(output) < 0.5 else 0
        total += 1

    print(f'Iteration: {i}, Accuracy: {correct/total}')
    correct , total = 0, 0


Iteration: 0, Accuracy: 0.75
Iteration: 1, Accuracy: 0.75


Display emebeding matrix

In [68]:
pd.DataFrame(weights_0_1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-0.080064,0.092425,0.087963,-0.071892,-0.017311,-0.093082,-0.008635,-0.048848,-0.076932,0.076082
1,0.036322,-0.013895,-0.019223,0.050621,0.029311,-0.025056,0.052827,0.032721,-0.037771,-0.050471
2,0.07621,-0.060315,0.072643,0.079107,0.032985,0.065809,0.004916,-0.021887,-0.013202,0.079587
3,-0.081745,-0.039562,0.07465,-0.022892,0.077805,-0.07681,-0.007644,0.029654,0.04596,-0.045643
4,-0.052279,0.008437,0.027569,-0.036416,0.038188,0.08493,-0.06421,0.076453,0.066679,-0.019637
5,-0.082027,0.049461,0.028955,0.022602,-0.098656,-0.044184,-0.059072,0.085743,-0.102644,-0.053994
6,0.005463,-0.072325,-0.029698,-0.003813,-0.041375,0.017691,-0.07253,0.076554,-0.055906,-0.099399
7,-0.091195,-0.018855,-0.057673,-0.060417,-0.064512,0.014482,0.038118,0.078754,-0.089468,0.00321
8,-0.03187,-0.016822,0.071439,0.052509,-0.012991,-0.033818,0.011275,-0.054848,-0.073087,0.096238


Comparing similarity b/w words using Euclidian distances

In [69]:
from collections import Counter 
import math 

def similar(target='beautiful'):
    target_index = word_index[target] 
    scores = Counter() 
    for word,index in word_index.items(): 
        raw_difference = weights_0_1[index] - (weights_0_1[target_index]) 
        squared_difference = raw_difference * raw_difference 
        scores[word] = -math.sqrt(sum(squared_difference)) 
    return scores.most_common(10)

In [70]:
similar('good')

[('good', -0.0),
 ('were', -0.1994909518660217),
 ('actions', -0.21565912894940956),
 ('was', -0.2403334666017908),
 ('bad', -0.24696548797785692),
 ('boring', -0.26015675652639186),
 ('very', -0.267232047083485),
 ('story', -0.28897536349704966),
 ('Movie', -0.3099607603734129)]