In [1]:
import pandas as pd
import numpy as np
import re

In [3]:
# Load the dataset from politifact_clean_binarized.csv (path is relative to the
# notebook's working directory). Later cells read its 'statement' and
# 'veracity' columns.
data = pd.read_csv('politifact_clean_binarized.csv')

In [7]:
# Build the classifier's training set: one (words, is_lie) tuple per row.
# Iterating the two columns directly covers every row of `data` (no hard-coded
# row count to go stale when the CSV changes) and avoids building a fresh
# two-column DataFrame slice twice per row.
training_data = []
for statement, veracity in zip(data['statement'], data['veracity']):
    # Drop punctuation (anything that is neither a word character nor
    # whitespace), then split on whitespace to get the statement's tokens.
    words = re.sub(r'[^\w\s]', '', statement).split()
    # A veracity of 0 marks the statement as a lie; store a plain Python bool.
    training_data.append((words, bool(veracity == 0)))

In [91]:
def calculate_lie_probability(training_data, query):
    """
    Calculate the probability of an input statement being a lie.

    Uses a naive Bayes estimate: for each query word Wi that is known from the
    training data, P(Wi|class) is the fraction of that class's statements that
    contain Wi, and the per-word likelihoods are multiplied together. Query
    words that never occur in the training data are ignored; a word repeated
    in the query contributes once per repetition.

    Args:
        training_data (list of tuples): Each tuple contains a list of words and a boolean indicating 
                                        if the list is lie (True) or not lie (False).
        query (list of str): Words in the input statement to evaluate.

    Returns:
        float: Probability that the input statement is lie, or -1 when no
        verdict is possible. That happens when none of the query words occur
        in the training data, or when the known words have zero combined
        likelihood under both classes (e.g. one word seen only in lies and
        another seen only in truths).
    """
    # One pass over the corpus: count statements per class and, per class,
    # the number of statements that contain each word.
    n_lie = 0
    n_not_lie = 0
    lie_docs_with = {}
    not_lie_docs_with = {}
    for words, is_lie in training_data:
        if is_lie:
            n_lie += 1
            doc_counts = lie_docs_with
        else:
            n_not_lie += 1
            doc_counts = not_lie_docs_with
        # set() so a word repeated inside one statement counts that statement once
        for word in set(words):
            doc_counts[word] = doc_counts.get(word, 0) + 1

    # Keep only the query words that appear in at least one training statement
    known_words = [w for w in query if w in lie_docs_with or w in not_lie_docs_with]
    if not known_words:
        return -1

    # Class priors P(Lie) and P(notLie)
    p_lie = n_lie / (n_lie + n_not_lie)
    p_not_lie = 1 - p_lie

    # Calculate P(W*|Lie) and P(W*|notLie) as products of the per-word
    # fractions. A class with no statements gets likelihood 0 instead of
    # dividing by zero; its prior is 0 too, so it cannot affect the result.
    p_words_given_lie = 1
    p_words_given_not_lie = 1
    for word in known_words:
        p_words_given_lie *= lie_docs_with.get(word, 0) / n_lie if n_lie else 0.0
        p_words_given_not_lie *= (
            not_lie_docs_with.get(word, 0) / n_not_lie if n_not_lie else 0.0
        )

    # Bayes' rule. If the evidence term is zero the words contradict each
    # other under this model, so report "cannot decide" rather than raise.
    evidence = p_words_given_lie * p_lie + p_words_given_not_lie * p_not_lie
    if evidence == 0:
        return -1
    return p_words_given_lie * p_lie / evidence

In [92]:
def determine_lie(training_data,query):
    """
    Print a one-line verdict for the input statement.

    Scores `query` with calculate_lie_probability and prints "Lie!" for a
    score of at least 0.5, "Truth" for a score in [0, 0.5), and a fallback
    message for anything else (the scorer's negative "unknown" result).

    Args:
        training_data (list of tuples): (words, is_lie) pairs, passed through to the scorer.
        query (list of str): Words in the input statement to evaluate.

    Returns:
        None. The verdict is only printed.
    """
    probability = calculate_lie_probability(training_data, query)

    if probability >= 0.5:
        verdict = "Lie!"
    elif 0 <= probability < 0.5:
        verdict = "Truth"
    else:
        verdict = "Hmm, I have never seen anything like this before. I may need to upgrade my training data in the future. For now, try another sentence."

    print(verdict)

#### The cell below is only for testing the `determine_lie` function

In [93]:
# Small hand-made corpus of (words, is_lie) pairs for checking the classifier.
# NOTE: this rebinds `training_data`, replacing the list built from the CSV.
training_data = [
    (['must', 'absolutely'], True),
    (['must', 'work', 'promise', 'absolutely'], False),
    (['money', 'contract'], True),
    (['money', 'work'], False),
    (['must'], True),
    (['must', 'money', 'contract'], True),
    (['must', 'money', 'contract', 'work'], True),
    (['project', 'absolutely'], False),
    (['work', 'absolutely'], False),
    (['work', 'promise'], False),
]

# Score and classify each sample query in turn. The first query contains no
# known words, so the scorer returns -1 and the percentage prints as -100.00%.
for query in (
    ["wn", "my"],
    ["absolutely", "my"],
    ['absolutely', 'work'],
    ['must', 'money'],
    ['money', 'must'],
    ["absolutely"],
    ["contract", "money", "absolutely"],
    ["I", "will", "absolutely","sign", "this", "contract","for", "money"],
):
    prob = calculate_lie_probability(training_data, query)
    print(f"This sentence is {prob * 100:.2f}% likely to be a lie.")
    # determine_lie prints its verdict and returns None
    result = determine_lie(training_data, query)

This sentence is -100.00% likely to be a lie.
Hmm, I have never seen anything like this before. I may need to upgrade my training data in the future. For now, try another sentence.
This sentence is 25.00% likely to be a lie.
Truth
This sentence is 7.69% likely to be a lie.
Truth
This sentence is 92.31% likely to be a lie.
Lie!
This sentence is 92.31% likely to be a lie.
Lie!
This sentence is 25.00% likely to be a lie.
Truth
This sentence is 100.00% likely to be a lie.
Lie!
This sentence is 100.00% likely to be a lie.
Lie!
