In [89]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn import decomposition
from collections import OrderedDict

In [4]:
df_liar_sentiment = pd.read_csv("sentiment_train.csv", encoding="utf8", sep=",")

df_liar_sentiment.shape

(10240, 16)

In [5]:
# scoring the different truthvalues 
truthlabels ={"false":0, "barely-true":1,"half-true":2,"mostly-true":3,"true":4, "pants-fire":5}

# classification formula for political background
def classify_truth(text):
    if text not in truthlabels.keys(): 
        return -1
    else:
        return truthlabels[text]

# add this new class of politic scores
df_liar_sentiment["truth-score"] = df_liar_sentiment["truth-value"].apply(classify_truth) 

In [6]:
# let's see how many political preferences there are
politics = dict()
for line in df_liar_sentiment["politics"]:
    if line not in politics.keys():
        politics[line] = 1
    else:
        politics[line] +=1
print(politics)

{'republican': 4497, 'democrat': 3336, 'none': 1744, 'organization': 219, 'independent': 147, 'columnist': 35, 'activist': 39, 'talk-show-host': 26, 'libertarian': 40, 'newsmaker': 56, 'journalist': 38, 'labor-leader': 11, 'state-official': 20, 'business-leader': 9, 'education-official': 2, 'tea-party-member': 10, nan: 2, 'green': 3, 'liberal-party-canada': 1, 'government-body': 1, 'Moderate': 1, 'democratic-farmer-labor': 1, 'ocean-state-tea-party-action': 1, 'constitution-party': 1}


In [7]:
# we'll only focus on the 3 most occuring political preferences
politics = dict({"republican": 0, "democrat" : 1, "none" : 2})

# classification formula for political background
def classify_politics(text):
    if text not in politics.keys(): 
        return -1
    else:
        return politics[text]

# add this new class of politic scores
df_liar_sentiment["political-score"] = df_liar_sentiment["politics"].apply(classify_politics) 

In [9]:
# filtering out the statement with a truth-score or a political preference that we don't like
df_reduced_sent1 = df_liar_sentiment[df_liar_sentiment["truth-score"] != -1]
df_reduced_sent2 = df_reduced_sent1[df_reduced_sent1["political-score"] != -1]
print(df_reduced_sent2.shape)
df_reduced_sent2.head(1)

(9577, 18)


Unnamed: 0,id,truth-value,text,topic,name,job,state,politics,count1,count2,count3,count4,count5,context,pos-sentiment,neg-sentiment,truth-score,political-score
0,2635.json,False,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer,0.007972,0.012908,0,0


In [10]:
# reducing the dataframe to only the important/most interesting data
df_reduced_sent = df_reduced_sent2.drop(['id', 'count1', 'count2', 'count3', 'count4', 'count5'], axis=1)  
df_reduced_sent.head(1)

Unnamed: 0,truth-value,text,topic,name,job,state,politics,context,pos-sentiment,neg-sentiment,truth-score,political-score
0,False,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,a mailer,0.007972,0.012908,0,0


In [33]:
# create df of the test data
df_test_sent = pd.read_csv("sentiment_test.csv", encoding="utf8", sep=",", names=["id", "truth-value", 
                                                                     "text", "topic", "name", "job", 
                                                                     "state", "politics", "count1", "count2", 
                                                                     "count3", "count4", "count5", "context", 
                                                                                        "pos-sentiment", "neg-sentiment"])
df_test_sent["truth-score"] = df_test_sent["truth-value"].apply(classify_truth)
df_test_sent["political-score"] = df_test_sent["politics"].apply(classify_politics)

df_test_sent_reduced1 = df_test_sent[df_test_sent["truth-score"] != -1]
df_test_sent_reduced2 = df_test_sent_reduced1[df_test_sent_reduced1["political-score"] != -1]

In [35]:
df_test_sent_reduced = df_test_sent_reduced2.drop(['id', 'count1', 'count2', 'count3', 'count4', 'count5'], axis=1)  
df_test_sent_reduced.head(1)

Unnamed: 0,truth-value,text,topic,name,job,state,politics,context,pos-sentiment,neg-sentiment,truth-score,political-score
1,True,Building a wall on the U.S.-Mexico border will...,immigration,rick-perry,Governor,Texas,republican,Radio interview,0.0079719387755102,0.0129081632653061,4,0


## Regression using TF-IDF Vectorizer

In [63]:
tfidf_vect = TfidfVectorizer()

# creating the training vector
X_train = tfidf_vect.fit(df_reduced_sent.text)
X_train = tfidf_vect.transform(df_reduced_sent.text)
y_train = df_reduced_sent["truth-score"].values

X_train.shape

(9577, 11765)

In [64]:
lr = LogisticRegression(solver='lbfgs',multi_class='multinomial')

In [65]:
# Create an instance of Logistic Regression Classifier and fit the data.
lr.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [66]:
# transform the test data to the right format, aligning with the training data 
# (so that it has the size of the vocab of the training set)
X_test = tfidf_vect.transform(df_test_sent_reduced.text) 
y_test = df_test_sent_reduced["truth-score"].values
X_test.shape

(1191, 11765)

In [67]:
# evaluating the tfidf model
lr.fit(X_train, y_train)
y_hat_test = lr.predict(X_test)

# evaluate using accuracy: proportion of correctly predicted over total
print(accuracy_score(y_test, y_hat_test))
print(accuracy_score(y_test, y_hat_test, normalize=False))

0.251049538203
299




In [68]:
# converting train to a matrix
new_train = np.empty([9577, 11765])
array_X_train = X_train.toarray()

for n in range(9577):
    new_train[n] = array_X_train[n]

# converting test to a matrix
new_test = np.empty([1191, 11765])
array_X_test = X_test.toarray()

for n in range(1191):
    new_test[n] = array_X_test[n]

In [104]:
# 50 dimensions
train_SVD50Mat = decomposition.TruncatedSVD(n_components = 50, algorithm = "arpack").fit_transform(new_train)
test_SVD50Mat = decomposition.TruncatedSVD(n_components = 50, algorithm = "arpack").fit_transform(new_test)

In [105]:
# evaluating 50 dimensions SVD model
lr.fit(train_SVD50Mat, y_train)
y_hat_test_SVD = lr.predict(test_SVD50Mat)

# evaluate using accuracy: proportion of correctly predicted over total
print(accuracy_score(y_test, y_hat_test_SVD))
print(accuracy_score(y_test, y_hat_test_SVD, normalize=False))

0.218303946264
260


## Implementing more features: politics & sentiment

In [73]:
# we need to add 3 new columns: one for the positive sentiment score, 
# one for the negative score and one for the political score. 
X_train2 = np.empty([9577, 11768])

for n in range(9577):
    pos = df_reduced_sent["pos-sentiment"].values[n]
    neg = df_reduced_sent["neg-sentiment"].values[n]
    pol = df_reduced_sent["political-score"].values[n]
    X_train2[n] = np.append(new_train[n], [pos, neg, pol])

print(X_train2)   

[[ 0.          0.          0.         ...,  0.00797194  0.01290816  0.        ]
 [ 0.          0.          0.         ...,  0.01148072  0.0142225   1.        ]
 [ 0.          0.          0.         ...,  0.00925808  0.01196854  1.        ]
 ..., 
 [ 0.          0.          0.         ...,  0.00999878  0.01135029  0.        ]
 [ 0.          0.          0.         ...,  0.00741618  0.01049563  1.        ]
 [ 0.          0.          0.         ...,  0.01075098  0.00883072  0.        ]]


In [71]:
# we also need to do this for the test set.
X_test2 = np.empty([1191, 11768])

for n in range(1191):
    pos = df_test_sent_reduced["pos-sentiment"].values[n]
    neg = df_test_sent_reduced["neg-sentiment"].values[n]
    pol = df_test_sent_reduced["political-score"].values[n]
    X_test2[n] = np.append(new_test[n], [pos, neg, pol])

print(X_test2) 

[[ 0.          0.          0.         ...,  0.00797194  0.01290816  0.        ]
 [ 0.          0.          0.         ...,  0.01148072  0.0142225   1.        ]
 [ 0.          0.          0.         ...,  0.00925808  0.01196854  0.        ]
 ..., 
 [ 0.          0.          0.         ...,  0.00970238  0.01297619  1.        ]
 [ 0.          0.          0.         ...,  0.00965795  0.00716801  0.        ]
 [ 0.          0.          0.         ...,  0.00975765  0.01259566  1.        ]]


In [74]:
# evaluating the new model
lr.fit(X_train2, y_train)
y_hat_test = lr.predict(X_test2)

# evaluate using accuracy: proportion of correctly predicted over total
print(accuracy_score(y_test, y_hat_test))
print(accuracy_score(y_test, y_hat_test, normalize=False))

0.252728799328
301




### Feature importance

In [75]:
print(lr.coef_)
print(lr.coef_.shape) #is of size (n_classes, n_features)

[[-0.04752767 -0.41708012 -0.0371028  ..., -0.07067812 -0.27048791
  -0.15765825]
 [-0.03456917 -0.2710325   0.17343884 ...,  0.00767341  0.07877831
  -0.15604081]
 [-0.06063145  0.66533971 -0.03342733 ..., -0.02959719  0.09569888
  -0.0153626 ]
 [-0.03132313  1.26773272 -0.0475349  ...,  0.12738169  0.19015447
   0.10484853]
 [-0.01839588 -0.20426712 -0.03560098 ..., -0.03611805  0.02074004
   0.00946544]
 [ 0.1924473  -1.0406927  -0.01977283 ...,  0.00133826 -0.11488379
   0.21474769]]
(6, 11768)


In [87]:
features = []
for key in tfidf_vect.vocabulary_.keys():
    features.append(key)

features = features + ["positive-score", "negative-score", "political-score"]

In [90]:
coef_dict_false = dict()
start = 0
for feature in features: 
    coef_dict_false[feature] = lr.coef_[0][start]
    start += 1 
    
coef_dict_barely = dict()
start = 0
for feature in features: 
    coef_dict_barely[feature] = lr.coef_[1][start]
    start += 1 

coef_dict_half = dict()
start = 0
for feature in features: 
    coef_dict_half[feature] = lr.coef_[2][start]
    start += 1 
    
coef_dict_mostly = dict()
start = 0
for feature in features: 
    coef_dict_mostly[feature] = lr.coef_[3][start]
    start += 1 
    
coef_dict_true = dict()
start = 0
for feature in features: 
    coef_dict_true[feature] = lr.coef_[4][start]
    start += 1 
    
coef_dict_pantsfire = dict()
start = 0
for feature in features: 
    coef_dict_pantsfire[feature] = lr.coef_[5][start]
    start += 1 

In [91]:
# most important features determining the "false" labeled statements 
ordered_false_coefs = [(k, coef_dict_false[k]) for k in sorted(coef_dict_false, key=coef_dict_false.get, reverse=True)]
ordered_false_coefs[0:10]

[('campbell', 1.5514009372940964),
 ('reveal', 1.4204777899958099),
 ('vetoing', 1.3378204600073866),
 ('160', 1.3333355222925962),
 ('dioxide', 1.2893928312562191),
 ('authored', 1.2651959516125),
 ('karen', 1.2618004549643798),
 ('fsas', 1.2395506990394158),
 ('spill', 1.2237527964892905),
 ('bureaus', 1.223451243322266)]

In [92]:
# most important features determining the "barely true" labeled statements 
ordered_barely_coefs = [(k, coef_dict_barely[k]) for k in sorted(coef_dict_barely, key=coef_dict_barely.get, reverse=True)]
ordered_barely_coefs[0:10]

[('regularly', 1.4891080350411461),
 ('safer', 1.2653403063840096),
 ('identity', 1.2348181864638454),
 ('topics', 1.2292261122743249),
 ('offender', 1.2274607718830501),
 ('package', 1.189296131791274),
 ('wastes', 1.133781841091499),
 ('thatcher', 1.1283452001889964),
 ('lazy', 1.1278096347111295),
 ('drop', 1.1199054660037748)]

In [None]:
# etc..

In [100]:
# let's see what the importance is of the positive score for all labels. 
for line in lr.coef_:
    print(line[-3])

-0.0706781212921
0.00767340941101
-0.0295971886476
0.127381687191
-0.0361180504101
0.00133826374786


In [101]:
# let's see what the importance is of the negative score for all labels. 
for line in lr.coef_:
    print(line[-2])

-0.27048790946
0.0787783058196
0.095698883791
0.190154473183
0.0207400362779
-0.114883789612


In [102]:
# let's see what the importance is of the political score for all labels. 
for line in lr.coef_:
    print(line[-1])

-0.157658250775
-0.156040812142
-0.0153626006286
0.104848530231
0.00946543984909
0.214747693466


So, we see that the positive score is only of sort of significant value in the labeling of "mostly true" (giving coefficient = 0.127). For the negative score this is in labeling of "mostly true" too (coefficient = 0.190). And for political score this is in "mostly true" (coefficient = 0.105) and "pants on fire" (coefficient = 0.215). However, in comparison with the most important features, these scores are still not super significant, since they all score far below 1 still. 