<a href="https://colab.research.google.com/github/infiniteoverflow/Sentiment-Analysis-using-Neural-Networks/blob/master/Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Curating the Dataset

In [0]:
def pretty_print_review_and_label(i):
    print(labels[i] + "\t:\t" + reviews[i][:80] + "...")

g = open('/reviews.txt','r') # What we know!
reviews = list(map(lambda x:x[:-1],g.readlines()))
g.close()

g = open('/labels.txt','r') # What we WANT to know!
labels = list(map(lambda x:x[:-1].upper(),g.readlines()))
g.close()

In [2]:
len(reviews)

25000

In [3]:
reviews[1]

'story of a man who has unnatural feelings for a pig . starts out with a opening scene that is a terrific example of absurd comedy . a formal orchestra audience is turned into an insane  violent mob by the crazy chantings of it  s singers . unfortunately it stays absurd the whole time with no general narrative eventually making it just too off putting . even those from the era should be turned off . the cryptic dialogue would make shakespeare seem easy to a third grader . on a technical level it  s better than you might think with some good cinematography by future great vilmos zsigmond . future stars sally kirkland and frederic forrest can be seen briefly .  '

In [4]:
labels[1]

'NEGATIVE'

# Exploring the dataset

In [5]:
print('label\t\t: \t review\n')

pretty_print_review_and_label(2241)
pretty_print_review_and_label(1871)
pretty_print_review_and_label(6)
pretty_print_review_and_label(18)
pretty_print_review_and_label(24)

label		: 	 review

NEGATIVE	:	i bought this at tower records after seeing the info  mercial about fifteen hund...
NEGATIVE	:	noting the cast  i recently watched this movie on tcm  hoping for an under  appr...
POSITIVE	:	this is easily the most underrated film inn the brooks cannon . sure  its flawed...
POSITIVE	:	you know  robin williams  god bless him  is constantly shooting himself in the f...
POSITIVE	:	there are many illnesses born in the mind of man which have been given life in m...


# Developing a Predictive theory

We will now count the occurances of each word in both POSITIVE and NEGATIVE reviews

In [0]:
from collections import Counter

In [0]:
positive_words = Counter()
negative_words = Counter()
total_words = Counter()

In [0]:
for i in range(len(reviews)):
  if(labels[i] == 'POSITIVE'):
    for word in reviews[i].split(" "):
      positive_words[word] += 1
      total_words[word] += 1
  else:
    for word in reviews[i].split(" "):
      negative_words[word] += 1
      total_words[word] += 1

In [14]:
positive_words.most_common()[0:30]

[('', 550468),
 ('the', 173324),
 ('.', 159654),
 ('and', 89722),
 ('a', 83688),
 ('of', 76855),
 ('to', 66746),
 ('is', 57245),
 ('in', 50215),
 ('br', 49235),
 ('it', 48025),
 ('i', 40743),
 ('that', 35630),
 ('this', 35080),
 ('s', 33815),
 ('as', 26308),
 ('with', 23247),
 ('for', 22416),
 ('was', 21917),
 ('film', 20937),
 ('but', 20822),
 ('movie', 19074),
 ('his', 17227),
 ('on', 17008),
 ('you', 16681),
 ('he', 16282),
 ('are', 14807),
 ('not', 14272),
 ('t', 13720),
 ('one', 13655)]

In [15]:
negative_words.most_common()[0:30]

[('', 561462),
 ('.', 167538),
 ('the', 163389),
 ('a', 79321),
 ('and', 74385),
 ('of', 69009),
 ('to', 68974),
 ('br', 52637),
 ('is', 50083),
 ('it', 48327),
 ('i', 46880),
 ('in', 43753),
 ('this', 40920),
 ('that', 37615),
 ('s', 31546),
 ('was', 26291),
 ('movie', 24965),
 ('for', 21927),
 ('but', 21781),
 ('with', 20878),
 ('as', 20625),
 ('t', 20361),
 ('film', 19218),
 ('you', 17549),
 ('on', 17192),
 ('not', 16354),
 ('have', 15144),
 ('are', 14623),
 ('be', 14541),
 ('he', 13856)]

We are not able to derive any intuition from this data :(

Soo we use postive to negative ratios to determine the correlation between the words in the review and its corresponding label

In [0]:
import numpy as np

positive_negative_ratio = Counter()

for term,cnt in list(total_words.most_common()):
  if cnt>10:
    positive_negative_rat = positive_words[term]/float(negative_words[term]+1)
    positive_negative_ratio[term] = positive_negative_rat

for word,ratio in list(positive_negative_ratio.most_common()):
  if ratio > 1:
    positive_negative_ratio[word] = np.log(ratio)
  else:
    positive_negative_ratio[word] = -np.log(1/(ratio+0.01))

In [12]:
positive_negative_ratio.most_common()[0:30]

[('edie', 4.6913478822291435),
 ('antwone', 4.477336814478207),
 ('din', 4.406719247264253),
 ('gunga', 4.189654742026425),
 ('goldsworthy', 4.174387269895637),
 ('gypo', 4.0943445622221),
 ('yokai', 4.0943445622221),
 ('paulie', 4.07753744390572),
 ('visconti', 3.9318256327243257),
 ('flavia', 3.9318256327243257),
 ('blandings', 3.871201010907891),
 ('kells', 3.871201010907891),
 ('brashear', 3.8501476017100584),
 ('gino', 3.828641396489095),
 ('deathtrap', 3.8066624897703196),
 ('harilal', 3.713572066704308),
 ('panahi', 3.713572066704308),
 ('ossessione', 3.6635616461296463),
 ('tsui', 3.6375861597263857),
 ('caruso', 3.6375861597263857),
 ('sabu', 3.6109179126442243),
 ('ahmad', 3.6109179126442243),
 ('khouri', 3.58351893845611),
 ('dominick', 3.58351893845611),
 ('aweigh', 3.5553480614894135),
 ('mj', 3.5553480614894135),
 ('mcintire', 3.5263605246161616),
 ('kriemhild', 3.5263605246161616),
 ('blackie', 3.4965075614664802),
 ('daisies', 3.4965075614664802)]

In [13]:
list(reversed(positive_negative_ratio.most_common()))[0:30]

[('rosarios', -4.605170185988092),
 ('frewer', -4.605170185988092),
 ('manu', -4.605170185988092),
 ('borel', -4.605170185988092),
 ('swinton', -4.605170185988092),
 ('sagemiller', -4.605170185988092),
 ('summersisle', -4.605170185988092),
 ('qi', -4.605170185988092),
 ('redline', -4.605170185988092),
 ('slipstream', -4.605170185988092),
 ('bolo', -4.605170185988092),
 ('emraan', -4.605170185988092),
 ('geico', -4.605170185988092),
 ('cato', -4.605170185988092),
 ('liliom', -4.605170185988092),
 ('rajni', -4.605170185988092),
 ('mayeda', -4.605170185988092),
 ('crapfest', -4.605170185988092),
 ('tmtm', -4.605170185988092),
 ('sued', -4.605170185988092),
 ('keyes', -4.605170185988092),
 ('nichole', -4.605170185988092),
 ('straightheads', -4.605170185988092),
 ('aluminium', -4.605170185988092),
 ('groaning', -4.605170185988092),
 ('templars', -4.605170185988092),
 ('krista', -4.605170185988092),
 ('spandex', -4.605170185988092),
 ('unisols', -4.605170185988092),
 ('mache', -4.60517018598

# Creating the Input/Output Data

In [16]:
vocab = set(total_words)
vocab_size = len(vocab)
print(vocab_size)

74074


In [17]:
# Creating a row vector of size: vocab_size filled with 0

layer_0 = np.zeros((1,vocab_size))
layer_0

array([[0., 0., 0., ..., 0., 0., 0.]])

In [20]:
word2index = {}

for i,word in enumerate(vocab):
  word2index[word] = i
word2index

{'': 0,
 'sista': 1,
 'coartship': 2,
 'umbrellas': 3,
 'povich': 4,
 'audiences': 5,
 'exectioner': 6,
 'mayberry': 7,
 'lagrange': 8,
 'sober': 9,
 'ryuhei': 10,
 'witchdoctor': 11,
 'godawfully': 12,
 'swapping': 13,
 'jeopardized': 14,
 'psychiatrists': 15,
 'psyche': 16,
 'wafty': 17,
 'misdrawing': 18,
 'traffic': 19,
 'woodenhead': 20,
 'melachonic': 21,
 'slut': 22,
 'shockless': 23,
 'misused': 24,
 'defecating': 25,
 'koi': 26,
 'camion': 27,
 'immortal': 28,
 'cay': 29,
 'inder': 30,
 'alraira': 31,
 'balrog': 32,
 'oddball': 33,
 'hotbod': 34,
 'aachen': 35,
 'sweid': 36,
 'ninjitsu': 37,
 'airplay': 38,
 'alamo': 39,
 'michol': 40,
 'won': 41,
 'precisely': 42,
 'tells': 43,
 'browning': 44,
 'doofy': 45,
 'homespun': 46,
 'leatrice': 47,
 'satisying': 48,
 'freewheelers': 49,
 'pixelated': 50,
 'cortese': 51,
 'dvdcompare': 52,
 'detective': 53,
 'arsehole': 54,
 'negotiating': 55,
 'reptiles': 56,
 'stamper': 57,
 'augers': 58,
 'splaining': 59,
 'barf': 60,
 'spitied': 

In [0]:
def update_layer_0(review):
  global layer_0

  layer_0 *= 0

  for word in review.split(" "):
    layer_0[0][word2index[word]] += 1
  
update_layer_0(reviews[0])

In [25]:
layer_0

array([[18.,  0.,  0., ...,  0.,  1.,  0.]])

In [0]:
def get_target_for_label(label):
  if label == 'POSITIVE':
    return 1
  else:
    return 0

In [28]:
get_target_for_label(labels[0])

1