In [1]:
import os
import random
import numpy as np
import pandas as pd
import collections
import math

In [2]:
uri_train  = 'https://raw.githubusercontent.com/thiagorainmaker77/liar_dataset/master/train.tsv'

df_train = pd.read_table(uri_train,
                             names = ['id',	'label'	,'statement',	'subject',	'speaker', 	'job', 	'state',	'party',	'barely_true_c',	'false_c',	'half_true_c',	'mostly_true_c',	'pants_on_fire_c',	'venue'])

other_df = pd.DataFrame({"label": ['true', "false", "half-true", "barely-true"]})

df_linear = df_train[df_train.label.isin(other_df.label)]

df_linear_1= df_linear.reset_index()
df_linear_1['statement'] = df_linear_1['statement'].str.lower()
#df_linear_1['statement']= df_linear_1['statement'].str.split()


In [3]:
uri_test  = 'https://raw.githubusercontent.com/thiagorainmaker77/liar_dataset/master/test.tsv'

df_test = pd.read_table(uri_train,
                             names = ['id',	'label'	,'statement',	'subject',	'speaker', 	'job', 	'state',	'party',	'barely_true_c',	'false_c',	'half_true_c',	'mostly_true_c',	'pants_on_fire_c',	'venue'])

other_df_1 = pd.DataFrame({"label": ['true', "false", "half-true", "barely-true"]})

df_linear_test = df_test[df_test.label.isin(other_df_1.label)]

df_linear_test_1= df_linear_test.reset_index()
df_linear_test_1['statement'] = df_linear_test_1['statement'].str.lower()
#df_linear_1['statement']= df_linear_1['statement'].str.split()


In [7]:
def naive_bayes_alpha(train_df, test_df, alpha: int): 

    df_linear_1 = train_df
    df_linear_test_1 = test_df

    ###################################################################
    label_cnt = collections.Counter(df_linear_1.label)
    true_cnt = collections.Counter() #true_cnt is p(x|true)
    false_cnt = collections.Counter()
    half_true_cnt= collections.Counter()
    barely_true_cnt= collections.Counter()



    for idx, row in df_linear_1.iterrows():

        if row["label"]== "true":

            true_cnt.update(row["statement"].split())

        if row["label"]== "false":

            false_cnt.update(row["statement"].split())

        if row["label"]== "half-true":

            half_true_cnt.update(row["statement"].split())

        if row["label"]== "barely-true":

            barely_true_cnt.update(row["statement"].split())

    ################################################################### 
    label_probs = {key: value / sum(label_cnt.values()) for key, value in label_cnt.items()}
    true_probs = {key: (alpha + value) / (sum(true_cnt.values()) + (alpha * len(true_cnt))) for key, value in true_cnt.items()}
    false_probs = {key: (alpha + value) / (sum(false_cnt.values()) + (alpha * len(false_cnt))) for key, value in false_cnt.items()}
    half_true_probs = {key: (alpha + value) / (sum(half_true_cnt.values()) + (alpha * len(half_true_cnt))) for key, value in half_true_cnt.items()}
    barely_true_probs = {key: (alpha + value) / (sum(barely_true_cnt.values()) + (alpha * len(barely_true_cnt))) for key, value in barely_true_cnt.items()}

        ##################################################################

    to_calc =[]

    for idx, row in df_linear_test_1.iterrows(): 

        sentence = row["statement"].split() 

        true_prob = math.prod([true_probs.get(word,0) for word in sentence])*label_probs["true"]
        false_prob = math.prod([false_probs.get(word,0) for word in sentence])*label_probs["false"]
        half_true_prob= math.prod([half_true_probs.get(word,0) for word in sentence])*label_probs["half-true"]
        barely_true_prob= math.prod([barely_true_probs.get(word,0) for word in sentence])*label_probs["barely-true"]


        if true_prob > false_prob: 
            curr_probs = "true"
        elif true_prob > half_true_prob: 
            curr_probs = "true"
        elif true_prob > barely_true_prob: 
            curr_probs = "true"
        elif half_true_prob> false_prob: 
            curr_probs = "half-true"
        elif half_true_prob> true_prob: 
            curr_probs = "half-true"
        elif half_true_prob> barely_true_prob: 
            curr_probs = "half-true"
        elif barely_true_prob> half_true_prob: 
            curr_probs = "barely-true"
        elif barely_true_prob> false_prob: 
            curr_probs = "barely-true"
        elif barely_true_prob> true_prob: 
            curr_probs = "barely-true"
        else: 
            curr_probs = "false"

        #this is to calc accurracy later 
        if curr_probs == row["label"]: 
            to_calc.append(1)
        else: 
            to_calc.append(0)

    ###########################################################################

    count_correct = to_calc.count(1)
    rows_count = df_linear_test_1.shape[0]
    accuracy = count_correct/rows_count



    return(accuracy) 
    

        

In [8]:
naive_bayes_alpha(df_linear_1, df_linear_test_1, 10000)

0.8634225030246001

In [9]:
empt_list= []

for i in range(100): 

    curr_accurange= naive_bayes_alpha(df_linear_1, df_linear_test_1, i)

    curr_list= [i,curr_accurange]

    empt_list.append(curr_list)



In [10]:
empt_list

[[0, 0.8667831697808845],
 [1, 0.8661110364296276],
 [2, 0.8663798897701304],
 [3, 0.866245463099879],
 [4, 0.8658421830891249],
 [5, 0.8657077564188735],
 [6, 0.8654389030783708],
 [7, 0.8655733297486221],
 [8, 0.8654389030783708],
 [9, 0.8655733297486221],
 [10, 0.8653044764081194],
 [11, 0.865170049737868],
 [12, 0.865170049737868],
 [13, 0.8649011963973653],
 [14, 0.8650356230676166],
 [15, 0.8650356230676166],
 [16, 0.8649011963973653],
 [17, 0.8647667697271139],
 [18, 0.8647667697271139],
 [19, 0.8647667697271139],
 [20, 0.8647667697271139],
 [21, 0.8647667697271139],
 [22, 0.8647667697271139],
 [23, 0.8647667697271139],
 [24, 0.8647667697271139],
 [25, 0.8646323430568624],
 [26, 0.8646323430568624],
 [27, 0.8646323430568624],
 [28, 0.8646323430568624],
 [29, 0.8646323430568624],
 [30, 0.8646323430568624],
 [31, 0.8646323430568624],
 [32, 0.8646323430568624],
 [33, 0.8647667697271139],
 [34, 0.8647667697271139],
 [35, 0.8647667697271139],
 [36, 0.8647667697271139],
 [37, 0.864766

In [None]:
#smoothing has not helped. the accuracy keeps on dropping 

