In [1]:
import pandas as pd
import contractions
import nltk
from nltk.corpus import stopwords
import re
from collections import defaultdict 
# Load the dataset from data.csv; the file's 'Unnamed: 0' column is used as the row index.
df = pd.read_csv(
    'data.csv',
    index_col='Unnamed: 0',
)

In [2]:
# Rows rated below 4 form the negative class; rows rated 4 or higher form the positive class.
negative = df.loc[df['sentiment'] < 4]
# Keep at most as many positive rows as there are negative rows (the leading rows, in file
# order). If there are fewer positive rows than negative ones, all positives are kept.
positive = df.loc[df['sentiment'] >= 4][:len(negative)]

In [3]:
def cleaning(data):
    """Normalise an iterable of raw review strings.

    For each string, in order:
      1. drop every non-ASCII character,
      2. delete the literal marker 'READ MORE' and expand contractions
         via ``contractions.fix``,
      3. replace every character that is not an ASCII letter with a space,
      4. lowercase the result and prefix it with a single space.

    Runs of spaces are NOT collapsed, so splitting the result on ' ' yields
    empty-string tokens (at least one, because of the leading space).

    Returns a list of cleaned strings, one per input, in input order.
    """
    results = []
    for raw_text in data:
        # Round-trip through ASCII to strip characters that cannot be encoded.
        ascii_text = raw_text.encode('ascii','ignore').decode()

        # Remove the truncation marker, then expand contractions.
        expanded = contractions.fix(ascii_text.replace('READ MORE',''))

        # Only letters survive; everything else becomes a space.
        letters_only = re.sub('[^a-zA-Z]', ' ', expanded)

        results.append(' ' + letters_only.lower())

    return results

In [4]:
# Clean both classes and pool them into one corpus (positive reviews first).
cleaned_all = cleaning(positive['review'].values)
cleaned_all.extend(cleaning(negative['review'].values))

# Vocabulary: every distinct space-separated token that occurs in the corpus.
unique = set()
for clean in cleaned_all:
    # update() grows the set in place; rebinding to union() would copy the
    # whole vocabulary once per review.
    unique.update(clean.split(' '))
# Splitting on ' ' produces empty-string tokens; drop that pseudo-word.
# discard() is a no-op when '' is absent (e.g. an empty corpus), whereas
# remove() would raise KeyError.
unique.discard('')

In [5]:
# Flatten the corpus into one list of tokens (this still contains the
# empty-string tokens produced by splitting on single spaces).
all_sentences = []
for clean in cleaned_all:
    all_sentences.extend(clean.split(' '))

# Tally every token in a single pass over the corpus, instead of calling
# list.count() -- a full scan of all_sentences -- once per vocabulary word.
token_tally = defaultdict(lambda:0)
for token in all_sentences:
    token_tally[token] += 1

# Corpus-wide occurrence count for each vocabulary word. Every word in
# `unique` gets an explicit entry, in the set's iteration order.
count_total = defaultdict(lambda:0)
for ele in unique:
    count_total[ele] += token_tally.get(ele, 0)

In [6]:
def search_count(string,element):
    """Return how many space-separated tokens of `string` equal `element`.

    The string is split on single spaces, so matching is on whole tokens
    (not substrings), and consecutive spaces produce empty-string tokens
    that are counted when `element` is ''.
    """
    return sum(1 for token in string.split(' ') if token == element)

In [7]:
# Per-class occurrence counts for every vocabulary word.
dict_positive = defaultdict(lambda:0)
dict_negative = defaultdict(lambda:0)

cleaned_positive = cleaning(positive['review'].values)
cleaned_negative = cleaning(negative['review'].values)

for class_counts, class_reviews in ((dict_positive, cleaned_positive),
                                    (dict_negative, cleaned_negative)):
    # Give every vocabulary word an explicit entry, so words never seen in
    # this class are stored as 0.
    for ele in unique:
        class_counts[ele] += 0

    # Count tokens in one pass over the class's reviews rather than
    # re-splitting and rescanning every review once per vocabulary word.
    for review in class_reviews:
        for token in review.split(' '):
            # Tokens outside the vocabulary (such as '') are not counted.
            if token in unique:
                class_counts[token] += 1

In [8]:
# Add-one smoothed score of each vocabulary word for each class:
#   (occurrences of the word in the class + 1) / (occurrences of the word overall + vocabulary size)
# NOTE(review): the denominator uses the word's corpus-wide count rather than the
# class's total token count used in textbook Laplace smoothing -- confirm this is
# intended. Because both classes share the same denominator for a given word, the
# ratio proba_positive[w] / proba_negative[w] reduces to
# (dict_positive[w] + 1) / (dict_negative[w] + 1).
proba_positive = {
    word: (dict_positive[word] + 1) / (count_total[word] + len(unique))
    for word in unique
}
proba_negative = {
    word: (dict_negative[word] + 1) / (count_total[word] + len(unique))
    for word in unique
}

In [9]:
def prediction ( sentence , pp = proba_positive , pn = proba_negative , uni = unique):
    """Score a sentence as the product of per-word positive/negative ratios.

    The sentence is normalised with ``cleaning`` and split on spaces. For each
    token found in `uni`, the ratio ``pp[token] / pn[token]`` is printed as a
    tab-separated line and multiplied into the running score.

    Parameters:
        sentence: raw text to score.
        pp: mapping word -> positive-class score.
        pn: mapping word -> negative-class score.
        uni: collection of known words; tokens outside it are ignored.

    Returns:
        The product of the ratios, or 1 if no token is in `uni`.

    Note: the default values of `pp`, `pn` and `uni` are captured when this
    ``def`` is executed; if those globals are rebuilt later, re-run this
    definition to pick up the new objects.
    """
    sentence = cleaning([sentence])[0]
    proba = 1
    for word in sentence.split(' '):
        # Honour the vocabulary passed by the caller rather than the global
        # `unique`, so a custom `uni` argument actually takes effect.
        if word in uni:
            ratio = pp[word] / pn[word]
            print(f"{word}\t{pp[word]}\t/\t{pn[word]}\t=\t{ratio}")
            proba *= ratio
    return proba

In [10]:
# Score a sample sentence: a product above 1 is reported as positive,
# exactly 1 as neutral, and anything else as negative.
pred = prediction("awesome, good")
print('positive' if pred > 1 else ('neutral' if pred == 1 else 'negative'))

awesome	0.08278016399843811	/	0.005466614603670442	=	15.142857142857142
good	0.2627568376649884	/	0.10178255544972105	=	2.5815508021390374
positive


In [11]:
# Score a sample sentence: a product above 1 is reported as positive,
# exactly 1 as neutral, and anything else as negative.
pred = prediction("bad, broken, worse")
print('positive' if pred > 1 else ('neutral' if pred == 1 else 'negative'))

bad	0.005980861244019139	/	0.06299840510366826	=	0.09493670886075949
broken	0.00021353833013025838	/	0.0025624599615631004	=	0.08333333333333334
worse	0.00021358393848782572	/	0.002349423323366083	=	0.09090909090909091
negative
