In [17]:
import pandas as pd
import os, codecs

# create df
df_liar = pd.read_csv("train.tsv", encoding="utf8", sep="\t", names=["id", "truth-value", 
                                                                     "text", "topic", "name", "job", 
                                                                     "state", "politics", "count1", "count2", 
                                                                     "count3", "count4", "count5", "context"])

df_liar.head(3)

Unnamed: 0,id,truth-value,text,topic,name,job,state,politics,count1,count2,count3,count4,count5,context
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver


In [4]:
def classify(text):
    if text == "false":
        return 0
    elif text == "true":
        return 1 
    else: 
        return -1 

In [12]:
# add the class of truth-values
df_liar["class"] = df_liar["truth-value"].apply(classify) 

In [16]:
# reduce the dataset to only true/false sentences
df_reduced = df_liar[df_liar["class"] != -1]
print(df_reduced.shape)
df_reduced.head(3)

(3671, 15)


Unnamed: 0,id,truth-value,text,topic,name,job,state,politics,count1,count2,count3,count4,count5,context,class
0,2635.json,False,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer,0
3,1123.json,False,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release,0
5,12465.json,True,The Chicago Bears have had more starting quart...,education,robin-vos,Wisconsin Assembly speaker,Wisconsin,republican,0.0,3.0,2.0,5.0,1.0,a an online opinion-piece,1


In [25]:
# TF-IDF representation of the text
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train = count_vect.fit(df_reduced.text)
X_train = count_vect.transform(df_reduced.text)
# --> for the test you use same vectorizer, but just transform (not fit!)
X_train.shape

(3671, 7451)

In [26]:
# subtract the classes
y_train = df_reduced["class"].values

In [27]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(solver='lbfgs')

# Create an instance of Logistic Regression Classifier and fit the data.
logreg.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [28]:
df_liar_test = pd.read_csv("test.tsv", encoding="utf8", sep="\t", names=["id", "truth-value", 
                                                                     "text", "topic", "name", "job", 
                                                                     "state", "politics", "count1", "count2", 
                                                                     "count3", "count4", "count5", "context"])
df_liar_test["class"] = df_liar_test["truth-value"].apply(classify)
df_test_reduced = df_liar_test[df_liar_test["class"] != -1]

X_test = count_vect.transform(df_test_reduced.text) 
y_test = df_test_reduced["class"].values
X_test.shape

(457, 7451)

In [29]:
# we could try to evaluate the model
logreg.fit(X_train, y_train)
y_hat_test = logreg.predict(X_test)

# evaluate using accuracy: proportion of correctly predicted over total
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_hat_test))
print(accuracy_score(y_test, y_hat_test, normalize=False))

0.599562363239
274




In [54]:
print(logreg.coef_)
print(logreg.coef_.shape)

[[-0.17203638 -0.27699619  0.05011177 ..., -0.15039476  0.27433863
   0.47206197]]
(1, 7451)


In [50]:
print(count_vect.vocabulary_.keys())



In [56]:
coef_dict = dict()
for n, key in enumerate(count_vect.vocabulary_.keys()):
    coef_dict[key] = logreg.coef_[0][n] 

In [57]:
coef_dict

{'says': -0.17203637902256194,
 'the': -0.27699619286997451,
 'annies': 0.05011176823867837,
 'list': 0.14942538913492664,
 'political': 0.11716453602654241,
 'group': -0.092538451088969395,
 'supports': 0.0085350399790884539,
 'third': 0.13434952251712989,
 'trimester': 0.0084091534112239272,
 'abortions': 0.1342368027864384,
 'on': 0.36475708888748087,
 'demand': 0.213236164351612,
 'health': 0.031085675489561142,
 'care': -0.23697213910507725,
 'reform': -0.33416172430507118,
 'legislation': 0.35651077943116427,
 'is': -0.063736457380172168,
 'likely': -0.33376020791019517,
 'to': 0.38212160938752171,
 'mandate': -0.18892486828958799,
 'free': 0.0078327843571648643,
 'sex': -0.41410563411562878,
 'change': 0.25449300803140462,
 'surgeries': -0.34690828965710463,
 'chicago': 0.0078327843571648643,
 'bears': 0.1955976856648608,
 'have': 0.13434952251712989,
 'had': 0.57227443235443787,
 'more': 0.3036737991459309,
 'starting': -0.058194368338498316,
 'quarterbacks': 0.3169049405490369