In [103]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn import decomposition

In [39]:
df_liar_sentiment = pd.read_csv("sentiment_train.csv", encoding="utf8", sep=",")

df_liar_sentiment.shapes

(10240, 16)

In [54]:
# scoring the different truthvalues 
truthlabels ={"false":0, "barely-true":1,"half-true":2,"mostly-true":3,"true":4, "pants-fire":5}

# classification formula for political background
def classify_truth(text):
    if text not in truthlabels.keys(): 
        return -1
    else:
        return truthlabels[text]

# add this new class of politic scores
df_liar_sentiment["truth-score"] = df_liar_sentiment["truth-value"].apply(classify_truth) 

In [55]:
# let's see how many political preferences there are
politics = dict()
for line in df_liar_sentiment["politics"]:
    if line not in politics.keys():
        politics[line] = 1
    else:
        politics[line] +=1
print(politics)

{'republican': 4497, 'democrat': 3336, 'none': 1744, 'organization': 219, 'independent': 147, 'columnist': 35, 'activist': 39, 'talk-show-host': 26, 'libertarian': 40, 'newsmaker': 56, 'journalist': 38, 'labor-leader': 11, 'state-official': 20, 'business-leader': 9, 'education-official': 2, 'tea-party-member': 10, nan: 2, 'green': 3, 'liberal-party-canada': 1, 'government-body': 1, 'Moderate': 1, 'democratic-farmer-labor': 1, 'ocean-state-tea-party-action': 1, 'constitution-party': 1}


In [56]:
# we'll only focus on the 3 most occuring political preferences
politics = dict({"republican": 0, "democrat" : 1, "none" : 2})

# classification formula for political background
def classify_politics(text):
    if text not in politics.keys(): 
        return -1
    else:
        return politics[text]

# add this new class of politic scores
df_liar_sentiment["political-score"] = df_liar_sentiment["politics"].apply(classify_politics) 

In [85]:
# filtering out the statement with a truth-score or a political preference that we don't like
df_reduced_sent1 = df_liar_sentiment[df_liar_sentiment["truth-score"] != -1]
df_reduced_sent2 = df_reduced_sent1[df_reduced_sent1["political-score"] != -1]
print(df_reduced_sent.shape)
df_reduced_sent.head(1)

(9577, 18)


Unnamed: 0,id,truth-value,text,topic,name,job,state,politics,count1,count2,count3,count4,count5,context,pos-sentiment,neg-sentiment,political-score,truth-score
0,2635.json,False,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer,0.007972,0.012908,0,0


In [86]:
# reducing the dataframe to only the important/most interesting data
df_reduced_sent = df_reduced_sent2.drop(['id', 'count1', 'count2', 'count3', 'count4', 'count5'], axis=1)  
df_reduced_sent.head(1)

Unnamed: 0,truth-value,text,topic,name,job,state,politics,context,pos-sentiment,neg-sentiment,political-score,truth-score
0,False,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,a mailer,0.007972,0.012908,0,0


In [82]:
# create df of the test data
df_test_sent = pd.read_csv("sentiment_test.csv", encoding="utf8", sep=",", names=["id", "truth-value", 
                                                                     "text", "topic", "name", "job", 
                                                                     "state", "politics", "count1", "count2", 
                                                                     "count3", "count4", "count5", "context", 
                                                                                        "positive", "negative"])
df_test_sent["truth-score"] = df_test_sent["truth-value"].apply(classify_truth)
df_test_sent["political-score"] = df_test_sent["politics"].apply(classify_politics)

df_test_sent_reduced1 = df_test_sent[df_test_sent["truth-score"] != -1]
df_test_sent_reduced2 = df_test_sent_reduced1[df_test_sent_reduced1["political-score"] != -1]

In [83]:
df_test_sent_reduced = df_test_sent_reduced2.drop(['id', 'count1', 'count2', 'count3', 'count4', 'count5'], axis=1)  
df_test_sent_reduced.head(1)

Unnamed: 0,truth-value,text,topic,name,job,state,politics,context,positive,negative,truth-score,political-score
1,True,Building a wall on the U.S.-Mexico border will...,immigration,rick-perry,Governor,Texas,republican,Radio interview,0.0079719387755102,0.0129081632653061,4,0


## Regression using TF-IDF Vectorizer

In [96]:
tfidf_vect = TfidfVectorizer()

# creating the training vector
X_train = tfidf_vect.fit(df_reduced_sent.text)
X_train = tfidf_vect.transform(df_reduced_sent.text)
y_train = df_reduced_sent["truth-score"].values

X_train.shape

(9577, 11765)

In [97]:
lr = LogisticRegression(solver='lbfgs',multi_class='multinomial')

In [98]:
# Create an instance of Logistic Regression Classifier and fit the data.
lr.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [99]:
# transform the test data to the right format, aligning with the training data 
# (so that it has the size of the vocab of the training set)
X_test = tfidf_vect.transform(df_test_sent_reduced.text) 
y_test = df_test_sent_reduced["truth-score"].values
X_test.shape

(1191, 11765)

In [100]:
# evaluating the tfidf model
lr.fit(X_train, y_train)
y_hat_test = lr.predict(X_test)

# evaluate using accuracy: proportion of correctly predicted over total
print(accuracy_score(y_test, y_hat_test))
print(accuracy_score(y_test, y_hat_test, normalize=False))

0.251049538203
299




In [101]:
# converting train to a matrix
new_train = np.empty([9577, 11765])
array_X_train = X_train.toarray()

for n in range(3671):
    new_train[n] = array_X_train[n]

# converting test to a matrix
new_test = np.empty([1191, 11765])
array_X_test = X_test.toarray()

for n in range(457):
    new_test[n] = array_X_test[n]

In [104]:
# 50 dimensions
train_SVD50Mat = decomposition.TruncatedSVD(n_components = 50, algorithm = "arpack").fit_transform(new_train)
test_SVD50Mat = decomposition.TruncatedSVD(n_components = 50, algorithm = "arpack").fit_transform(new_test)

In [105]:
# evaluating 50 dimensions SVD model
lr.fit(train_SVD50Mat, y_train)
y_hat_test_SVD = lr.predict(test_SVD50Mat)

# evaluate using accuracy: proportion of correctly predicted over total
print(accuracy_score(y_test, y_hat_test_SVD))
print(accuracy_score(y_test, y_hat_test_SVD, normalize=False))

0.218303946264
260
