In [17]:
import nltk
from nltk.tokenize import treebank
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 
import math
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import random

In [18]:
stop_words = set(stopwords.words('english'))

In [19]:
dataset = pd.read_csv('deceptive-opinion.csv')

In [20]:
truthful_prior_log_prob = math.log((list(dataset['deceptive']).count("truthful"))/len(dataset))
deceptive_prior_log_prob = math.log((list(dataset['deceptive']).count("deceptive"))/len(dataset))

In [21]:
def train_test_split(df,test_size):
    if isinstance(test_size,float):
        test_size = round(test_size * len(df))
        
    if(test_size > len(df)):
        return 0, df

    indices = df.index.tolist()
    test_indices = random.sample(population=indices,k = test_size)

    test_df = df.loc[test_indices]
    train_df = df.drop(test_indices)
    
    return train_df, test_df
train_df,test_df = train_test_split(dataset,0.3)

In [22]:
vocab = {}
truthful_token_count = {} #how many occurances of each word for truthful class
deceptive_token_count = {}

for i in range(len(train_df)):
    review = list(train_df['text'])[i]
    words = word_tokenize(review)
    new_words = []
    for word in words:
        if word.lower() not in stop_words:
            new_words.append(word.lower())
    
    for word in new_words:
        if word not in vocab:
            vocab[word] = 1
        else:
            vocab[word] += 1
        if list(train_df['deceptive'])[i] == "truthful":
            if word not in truthful_token_count:
                truthful_token_count[word] = 1
            else:
                truthful_token_count[word] += 1
            if word not in deceptive_token_count:
                deceptive_token_count[word] = 0
        else:
            if word not in deceptive_token_count:
                deceptive_token_count[word] = 1
            else:
                deceptive_token_count[word] += 1
            if word not in truthful_token_count:
                truthful_token_count[word] = 0
new_words

['former',
 'chicagoan',
 ',',
 "'m",
 'appalled',
 'amalfi',
 'hotel',
 'chicago',
 '.',
 'first',
 ',',
 'expecting',
 'luxury',
 'hospitality',
 ',',
 'neither',
 'received',
 '.',
 "'s",
 'experience',
 'designer',
 'supposed',
 'like',
 "'personal",
 'concierge',
 ',',
 "'",
 'experience',
 'ed',
 'terrible',
 '.',
 'felt',
 'like',
 'trying',
 'pressure',
 'staying',
 'days',
 'wanted',
 '.',
 ',',
 'could',
 "n't",
 'understand',
 'saying',
 'time',
 'talking',
 'fast',
 '.',
 'finally',
 'got',
 'room',
 ',',
 'disappointed',
 'quality',
 'furniture',
 'room',
 "'s",
 'cleanliness',
 '.',
 'ask',
 'maid',
 'come',
 'give',
 'clean',
 'towels',
 'towels',
 'bathroom',
 'damp',
 '.',
 'top',
 ',',
 'bed',
 'messily',
 'done',
 ';',
 'could',
 'done',
 'better',
 'job',
 'bed',
 'home',
 '.',
 'angry',
 'point',
 ',',
 'paying',
 'lot',
 'money',
 'every',
 'night',
 'staying',
 'amalfi',
 ',',
 "n't",
 'expect',
 'greeted',
 'wet',
 'towels',
 '.',
 'needed',
 'use',
 'wi-fi',
 '

In [23]:
def laplace_smoothening(word,c):
    if(c):
        numerator = truthful_token_count[word] + 1
        denominator = sum(truthful_token_count.values()) + len(vocab)
        return math.log(numerator/denominator)
    else:
        numerator = deceptive_token_count[word] + 1
        denominator = sum(truthful_token_count.values()) + len(vocab)
        return math.log(numerator/denominator)

In [24]:
def predict(reviews):
    predicted_classes = []
    for review in reviews:
        
        class_scores = [deceptive_prior_log_prob,truthful_prior_log_prob]
        words = word_tokenize(review)
        for word in words:
            if word not in vocab:
                continue
            
            for c in range(2):
                log_w_give_c = laplace_smoothening(word,c)
                class_scores[c] += log_w_give_c
        
        predicted_classes.append(class_scores.index(max(class_scores)))  
    return predicted_classes

In [26]:
def addLabel(value):
    if(value == "truthful"):
        return 1
    else:
        return 0

test_df['label'] = test_df['deceptive'].apply(addLabel)

In [27]:
true_classes = list(test_df.label)

In [28]:
predicted_classes = predict(test_df.text)

In [29]:
count = 0
for i in range(len(true_classes)):
    if true_classes[i] == predicted_classes[i]:
        count += 1
count/len(true_classes)

0.8270833333333333

In [30]:
truePositive = trueNegative = falsePositive = falseNegative = 0
    

In [31]:
for i in range(len(true_classes)):
    if(true_classes[i] == predicted_classes[i] == 0):
        truePositive += 1
    elif true_classes[i] == predicted_classes[i] == 1:
        trueNegative += 1
    elif true_classes[i] == 1 and predicted_classes[i] == 0:
        falsePositive += 1
    elif true_classes[i] == 0 and predicted_classes[i] == 1 :
        falseNegative += 1

In [32]:
accuracy = (truePositive + trueNegative)/(truePositive + trueNegative + falsePositive + falseNegative)
precision = truePositive/(truePositive + falsePositive)
recall =truePositive/(truePositive + falseNegative) 

accuracy,precision,recall

(0.8270833333333333, 0.8877551020408163, 0.7404255319148936)