### Binomial Logistic Regression with filtered data

In [35]:
import pandas as pd

# Load the whole training split into a DataFrame; the column names are
# supplied explicitly through `names`.
liar_columns = [
    "id", "truth-value", "text", "topic", "name", "job", "state",
    "politics", "count1", "count2", "count3", "count4", "count5", "context",
]
df_liar = pd.read_csv("train.tsv", sep="\t", encoding="utf8", names=liar_columns)

df_liar.head(8)

Unnamed: 0,id,truth-value,text,topic,name,job,state,politics,count1,count2,count3,count4,count5,context
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver
3,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release
4,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN
5,12465.json,true,The Chicago Bears have had more starting quart...,education,robin-vos,Wisconsin Assembly speaker,Wisconsin,republican,0.0,3.0,2.0,5.0,1.0,a an online opinion-piece
6,2342.json,barely-true,Jim Dunnam has not lived in the district he re...,candidates-biography,republican-party-texas,,Texas,republican,3.0,1.0,1.0,3.0,1.0,a press release.
7,153.json,half-true,I'm the only person on this stage who has work...,ethics,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,"a Democratic debate in Philadelphia, Pa."


In [84]:
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
import csv
import string

# Build the training vocabulary.
#
# Every statement (third column of train.tsv) is split on single spaces; each
# token has its punctuation stripped, is lower-cased and lemmatized, and is
# kept unless it is an English stop word or consists only of digits.
#
# The lemmatizer, the stop-word set and the punctuation table do not depend on
# the token, so they are created once here instead of once per word.
# NOTE(review): a token made only of punctuation becomes '' after stripping and
# still passes both filters (''.isdigit() is False), so '' can presumably end up
# in the vocabulary -- confirm whether that is intended.
lemmatizer = nltk.WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
punct_table = str.maketrans('', '', string.punctuation)

countlines = 0      # number of lines read, i.e. number of statements

allwords = []       # every retained token, in reading order, duplicates included
with open("train.tsv", encoding="utf8") as tsvfile:    # open training set
    for line in csv.reader(tsvfile, delimiter="\t"):
        countlines += 1
        for word in line[2].split(" "):                # line[2] is the statement text
            word = lemmatizer.lemmatize(word.translate(punct_table).lower())
            if word not in stop_words and not word.isdigit():
                allwords.append(word)

# Distinct words in order of first appearance; the set gives constant-time
# "already seen?" checks instead of scanning the growing list.
seen_words = set()
vocab = []
for word in allwords:
    if word not in seen_words:
        seen_words.add(word)
        vocab.append(word)

print("Number of words in trainset:", len(allwords))
print("Number of distinct words in trainset:", len(vocab))
print("Number of statements in train dataset",countlines)
print(allwords[:25])
print(vocab[:25])    #here we can see that the vocabulary only contains distinct words 


Number of words in trainset: 110580
Number of distinct words in trainset: 11057
Number of statements in train dataset 10240
['say', 'annies', 'list', 'political', 'group', 'support', 'thirdtrimester', 'abortion', 'demand', 'decline', 'coal', 'start', 'started', 'natural', 'gas', 'took', 'started', 'begin', 'president', 'george', 'w', 'bush', 'administration', 'hillary', 'clinton']
['say', 'annies', 'list', 'political', 'group', 'support', 'thirdtrimester', 'abortion', 'demand', 'decline', 'coal', 'start', 'started', 'natural', 'gas', 'took', 'begin', 'president', 'george', 'w', 'bush', 'administration', 'hillary', 'clinton', 'agrees']


In [85]:
# Read the training set again, this time keeping the tokens of each statement
# together and recording the matching truth label (second column).
# `statements[k]` and `labels[k]` describe the same row of train.tsv.
#
# Token-independent helpers are built once rather than once per word.
lemmatizer = nltk.WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
punct_table = str.maketrans('', '', string.punctuation)

statements = []
labels = []

with open("train.tsv", encoding="utf8") as tsvfile:
    for line in csv.reader(tsvfile, delimiter="\t"):
        tokens = []
        for word in line[2].split(" "):
            # remove punctuation, lower-case and lemmatize
            word = lemmatizer.lemmatize(word.translate(punct_table).lower())
            if word not in stop_words and not word.isdigit():
                tokens.append(word)
        labels.append(line[1])
        statements.append(tokens)

print("First 3 statements:", statements[:3]) 
print("First 3 labels:", labels[:3])

First 3 statements: [['say', 'annies', 'list', 'political', 'group', 'support', 'thirdtrimester', 'abortion', 'demand'], ['decline', 'coal', 'start', 'started', 'natural', 'gas', 'took', 'started', 'begin', 'president', 'george', 'w', 'bush', 'administration'], ['hillary', 'clinton', 'agrees', 'john', 'mccain', 'voting', 'give', 'george', 'bush', 'benefit', 'doubt', 'iran']]
First 3 labels: ['false', 'half-true', 'mostly-true']


Now we remove every statement whose label is not 'false' or 'true', so that the task becomes a two-class problem to which binomial logistic regression can be applied.

In [86]:
# Keep only the statements labelled 'false' or 'true'.
#
# Statements and labels are filtered together so they can never drift out of
# step, and running this cell a second time is harmless (everything that is
# left already has a binary label). The assertion fails loudly if the two
# lists were already misaligned before filtering.
assert len(statements) == len(labels), "statements and labels must be aligned"

binary_pairs = [(statement, label)
                for statement, label in zip(statements, labels)
                if label in ("false", "true")]
statements = [statement for statement, _ in binary_pairs]
labels = [label for _, label in binary_pairs]

print(statements[:3])

[['say', 'annies', 'list', 'political', 'group', 'support', 'thirdtrimester', 'abortion', 'demand'], ['health', 'care', 'reform', 'legislation', 'likely', 'mandate', 'free', 'sex', 'change', 'surgery'], ['chicago', 'bear', 'starting', 'quarterback', 'last', 'year', 'total', 'number', 'tenured', 'uw', 'faculty', 'fired', 'last', 'two', 'decade']]


In [87]:
# Number of training statements left after keeping only 'false'/'true' ones.
print(len(statements))

3671


In [88]:
# Drop the four non-binary truth labels so that only 'false' and 'true'
# remain, then report how many labels are left.
non_binary_labels = ("barely-true", "half-true", "mostly-true", "pants-fire")
labels = [kept for kept in labels if kept not in non_binary_labels]
print(len(labels))

3671


In [59]:
# Repeat of the label filter in the preceding cell (the statement is identical).
# Once the four non-binary labels are gone, running it again leaves `labels`
# unchanged, so the printed length stays the same.
labels = [label for label in labels if label not in ("barely-true", "half-true", "mostly-true", "pants-fire")]
print(len(labels))

3671


In [89]:
import numpy as np 

# Allocate the training design matrix, initially all zeros:
# one row per retained statement, one column per distinct word of the train set.
rows, columns = len(statements), len(vocab)
matrix = np.zeros(shape=(rows, columns))

In [90]:
# Create the X matrix: entry (r, c) is the number of times vocabulary word c
# occurs in statement r.
#
# A word -> column lookup replaces the scan over the whole vocabulary for every
# statement; only the words that actually occur in a statement are visited.
# This relies on `vocab` holding each word once, which is how it is built.
column_of = {word: column for column, word in enumerate(vocab)}

for row, statement in enumerate(statements):
    for word in set(statement):
        column = column_of.get(word)
        if column is not None:      # ignore words that are not in the vocabulary
            # assign (not add) so that re-running the cell gives the same matrix
            matrix[row, column] = statement.count(word)

In [91]:
# Create the y vector: the truth label of every training statement encoded as
# an integer (0 for 'false', 1 for 'true'), in the same order as `statements`.
labelsdic ={"false":0, "true":1}

size = len(statements)
# Fail loudly on misalignment instead of leaving unfilled entries behind.
assert len(labels) == size, "labels and statements must have the same length"

# A label outside labelsdic raises KeyError, which signals unfiltered data.
y_vector = [labelsdic[label] for label in labels]

In [92]:
from sklearn.linear_model import LogisticRegression
# Logistic regression classifier using the L-BFGS solver; no other
# hyper-parameter is set here, so the library defaults apply.
logreg = LogisticRegression(solver='lbfgs')

In [93]:
# Train on the word-count matrix and the 0/1 label vector built above.
logreg.fit(matrix, y_vector)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='lbfgs', tol=0.0001,
          verbose=0, warm_start=False)

We now preprocess the test dataset 

In [94]:
# Read the test set with exactly the same token pipeline as the training set:
# split on spaces, strip punctuation, lower-case, lemmatize, then drop English
# stop words and purely numeric tokens.
# `teststatements[k]` and `testlabels[k]` describe the same row of test.tsv.
#
# Token-independent helpers are built once rather than once per word.
lemmatizer = nltk.WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
punct_table = str.maketrans('', '', string.punctuation)

teststatements = []
testlabels = []

with open("test.tsv", encoding="utf8") as tsvfile:
    for line in csv.reader(tsvfile, delimiter="\t"):
        tokens = []
        for word in line[2].split(" "):
            # remove punctuation, lower-case and lemmatize
            word = lemmatizer.lemmatize(word.translate(punct_table).lower())
            if word not in stop_words and not word.isdigit():
                tokens.append(word)
        testlabels.append(line[1])
        teststatements.append(tokens)

print("First 3 test statements:", teststatements[:3]) 
print("First 3 test labels:", testlabels[:3])

First 3 test statements: [['building', 'wall', 'usmexico', 'border', 'take', 'literally', 'year'], ['wisconsin', 'pace', 'double', 'number', 'layoff', 'year'], ['say', 'john', 'mccain', 'ha', 'done', 'nothing', 'help', 'vet']]
First 3 test labels: ['true', 'false', 'false']


In [95]:
# Keep only the test statements labelled 'false' or 'true'.
#
# Statements and labels are filtered together so they stay aligned, and
# running this cell more than once does not change the result. The assertion
# fails loudly if the two lists were already misaligned before filtering.
assert len(teststatements) == len(testlabels), "test statements and labels must be aligned"

binary_test_pairs = [(teststatement, testlabel)
                     for teststatement, testlabel in zip(teststatements, testlabels)
                     if testlabel in ("false", "true")]
teststatements = [teststatement for teststatement, _ in binary_test_pairs]
testlabels = [testlabel for _, testlabel in binary_test_pairs]

print(teststatements[:3])

[['building', 'wall', 'usmexico', 'border', 'take', 'literally', 'year'], ['wisconsin', 'pace', 'double', 'number', 'layoff', 'year'], ['say', 'john', 'mccain', 'ha', 'done', 'nothing', 'help', 'vet']]


In [96]:
# Drop the four non-binary truth labels from the test labels so that only
# 'false' and 'true' remain.
non_binary_labels = ("barely-true", "half-true", "mostly-true", "pants-fire")
testlabels = [kept for kept in testlabels if kept not in non_binary_labels]

In [97]:
# The two lengths should agree: every remaining test statement needs one label.
print(len(teststatements))
print(len(testlabels))

457
457


In [98]:
import numpy as np 

# Allocate the test design matrix, initially all zeros: one row per retained
# test statement and one column per distinct word of the *train* set, so the
# columns line up with the matrix the model was fitted on.
rows, columns = len(teststatements), len(vocab)
testmatrix = np.zeros(shape=(rows, columns))

In [99]:
# Create the test X matrix: entry (r, c) is the number of times vocabulary
# word c occurs in test statement r. Words that are not part of the training
# vocabulary have no column and are skipped.
#
# A word -> column lookup replaces the scan over the whole vocabulary for every
# statement. This relies on `vocab` holding each word once.
column_of = {word: column for column, word in enumerate(vocab)}

for row, teststatement in enumerate(teststatements):
    for word in set(teststatement):
        column = column_of.get(word)
        if column is not None:
            # assign (not add) so that re-running the cell gives the same matrix
            testmatrix[row, column] = teststatement.count(word)

In [100]:
# Create the test y vector: the truth label of every test statement encoded as
# an integer (0 for 'false', 1 for 'true'), in the same order as `teststatements`.
labelsdic ={"false":0, "true":1}

size = len(teststatements)
# Fail loudly on misalignment instead of leaving unfilled entries behind.
assert len(testlabels) == size, "test labels and test statements must have the same length"

# A label outside labelsdic raises KeyError, which signals unfiltered data.
testy_vector = [labelsdic[testlabel] for testlabel in testlabels]

In [101]:
# Sanity check: number of integer labels built for the test set.
print(len(testy_vector))

457


In [102]:
from sklearn.metrics import accuracy_score

# Predict a 0/1 label for every test statement and compare with the truth.
y_hat_test = logreg.predict(testmatrix)

# First line: fraction of correct predictions; second line: their raw count.
fraction_correct = accuracy_score(testy_vector, y_hat_test)
number_correct = accuracy_score(testy_vector, y_hat_test, normalize=False)
print(fraction_correct)
print(number_correct)

0.5842450765864332
267


In [83]:
# Results from when filtering was not applied.
# NOTE(review): the code is identical to the previous evaluation cell; the
# different numbers recorded below belong to an earlier execution (In [83])
# and, presumably, to an earlier version of the preprocessing. Re-running this
# cell in the current kernel state will not reproduce them -- confirm which
# preprocessing variant they refer to.

y_hat_test = logreg.predict(testmatrix)
from sklearn.metrics import accuracy_score
# Fraction of correct predictions, then the raw number of correct predictions.
print(accuracy_score(testy_vector, y_hat_test))
print(accuracy_score(testy_vector, y_hat_test, normalize=False))

0.612691466083151
280


In [24]:
# Example: a small hand-made data set showing how statements are filtered by
# label. A statement is deleted in place whenever its label is neither 'false'
# nor 'true'; the index is stepped back after each deletion because the
# following statements shift one position to the left.

exstatements = [['say', 'the', 'annies', 'list' ],
                ['this', 'is', 'another', 'test'],
                ['this', 'one', 'as', 'well'],
                ['political', 'group', 'support', 'thirdtrimester','abortion', 'on', 'demand'], 
                ['when', 'did', 'the', 'decline', 'of', 'coal', 'start', 'it', 'started', 'when', 'natural', 'gas', 'took', 'off', 'that', 'started'],
                ['to', 'begin', 'in', 'president', 'george', 'w', 'bush', 'administration'], 
                ['hillary', 'clinton', 'agrees', 'with', 'john', 'mccain', 'by', 'voting', 'to', 'give', 'george', 'bush', 'the', 'benefit', 'of', 'the', 'doubt', 'on', 'iran']]
exlabels = ['false', 'half-true', 'true','half-true', 'mostly-true', 'true', 'half-true']

i = -1      # position in exstatements of the statement the current label belongs to
for label in exlabels:
    i += 1
    if label in ("false", "true"):
        continue            # binary label: keep the statement
    del exstatements[i]     # any other label: drop the statement ...
    i -= 1                  # ... and stay on the same position for the next label

print(exstatements) 

[['say', 'the', 'annies', 'list'], ['this', 'one', 'as', 'well'], ['to', 'begin', 'in', 'president', 'george', 'w', 'bush', 'administration']]


### Applying TFIDF 