# Binomial regression

In this notebook, you will find a binomial regression done on a reduced part of the training set (i.e. we took out all the statements that were not labeled 'false' or 'true'). We created this notebook to get ourself acquainted with how logistic regression and feature importance works. 

In [1]:
import pandas as pd

# create df
df_liar = pd.read_csv("train.tsv", encoding="utf8", sep="\t", names=["id", "truth-value", 
                                                                     "text", "topic", "name", "job", 
                                                                     "state", "politics", "count1", "count2", 
                                                                     "count3", "count4", "count5", "context"])

df_liar.head(3)

Unnamed: 0,id,truth-value,text,topic,name,job,state,politics,count1,count2,count3,count4,count5,context
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver


In [2]:
# classification formula
def classify(text):
    if text == "false":
        return 0
    elif text == "true":
        return 1 
    else: 
        return -1 

In [3]:
# add the class of truth-values
df_liar["class"] = df_liar["truth-value"].apply(classify) 

In [4]:
# reduce the dataset to only true/false sentences
df_reduced = df_liar[df_liar["class"] != -1]
print(df_reduced.shape)
df_reduced.head(3)

(3671, 15)


Unnamed: 0,id,truth-value,text,topic,name,job,state,politics,count1,count2,count3,count4,count5,context,class
0,2635.json,False,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer,0
3,1123.json,False,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release,0
5,12465.json,True,The Chicago Bears have had more starting quart...,education,robin-vos,Wisconsin Assembly speaker,Wisconsin,republican,0.0,3.0,2.0,5.0,1.0,a an online opinion-piece,1


In [5]:
# count 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
count_vect = CountVectorizer()

# training set
X_train = count_vect.fit(df_reduced.text)
X_train = count_vect.transform(df_reduced.text)
X_train.shape


(3671, 7451)

In [6]:
# subtract the classes
y_train = df_reduced["class"].values

In [7]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(solver='lbfgs')

# Create an instance of Logistic Regression Classifier and fit the data.
logreg.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [8]:
# create df of the test data
df_liar_test = pd.read_csv("test.tsv", encoding="utf8", sep="\t", names=["id", "truth-value", 
                                                                     "text", "topic", "name", "job", 
                                                                     "state", "politics", "count1", "count2", 
                                                                     "count3", "count4", "count5", "context"])
df_liar_test["class"] = df_liar_test["truth-value"].apply(classify)
df_test_reduced = df_liar_test[df_liar_test["class"] != -1]

# transform the test data to the right format, aligning with the training data 
# (so that it has the size of the vocab of the training set)
X_test = count_vect.transform(df_test_reduced.text) 
y_test = df_test_reduced["class"].values
X_test.shape

(457, 7451)

In [9]:
# we could try to evaluate the model
logreg.fit(X_train, y_train)
y_hat_test = logreg.predict(X_test)

# evaluate using accuracy: proportion of correctly predicted over total
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_hat_test))
print(accuracy_score(y_test, y_hat_test, normalize=False))

0.599562363239
274




## Comments
> This is not super high accuracy, but it is better than the multinomal regression we experimented with in update 1 (whihch was around 0.3). However, we don't really expect a high accuracy, since our dataset is not very dense, so this is already way better than we expected! 

In [10]:
# this prints out all the coefficients (which is in fact all the vocabulary of the training set)
print(logreg.coef_)
print(logreg.coef_.shape)

[[-0.17203638 -0.27699619  0.05011177 ..., -0.15039476  0.27433863
   0.47206197]]
(1, 7451)


In [11]:
# create a dictionary of all coefficients and their weights from the logistic regression
coef_dict = dict()
for n, key in enumerate(count_vect.vocabulary_.keys()):
    coef_dict[key] = logreg.coef_[0][n] 

In [12]:
coef_dict

{'says': -0.17203637902256194,
 'the': -0.27699619286997451,
 'annies': 0.05011176823867837,
 'list': 0.14942538913492664,
 'political': 0.11716453602654241,
 'group': -0.092538451088969395,
 'supports': 0.0085350399790884539,
 'third': 0.13434952251712989,
 'trimester': 0.0084091534112239272,
 'abortions': 0.1342368027864384,
 'on': 0.36475708888748087,
 'demand': 0.213236164351612,
 'health': 0.031085675489561142,
 'care': -0.23697213910507725,
 'reform': -0.33416172430507118,
 'legislation': 0.35651077943116427,
 'is': -0.063736457380172168,
 'likely': -0.33376020791019517,
 'to': 0.38212160938752171,
 'mandate': -0.18892486828958799,
 'free': 0.0078327843571648643,
 'sex': -0.41410563411562878,
 'change': 0.25449300803140462,
 'surgeries': -0.34690828965710463,
 'chicago': 0.0078327843571648643,
 'bears': 0.1955976856648608,
 'have': 0.13434952251712989,
 'had': 0.57227443235443787,
 'more': 0.3036737991459309,
 'starting': -0.058194368338498316,
 'quarterbacks': 0.3169049405490369

In [13]:
# ordering
from collections import OrderedDict

ordered_coefs = [(k, coef_dict[k]) for k in sorted(coef_dict, key=coef_dict.get, reverse=True)]

In [14]:
# 10 highest weighted coefficients:
ordered_coefs[0:10]

[('opposes', 1.2852508564154546),
 ('scanners', 1.2302072443718222),
 ('duel', 1.223908577820521),
 ('democrats', 1.2135750192948989),
 ('dramatically', 1.1738511525095916),
 ('except', 1.1639518946957186),
 ('protects', 1.1618225704045153),
 ('predecessor', 1.1582500993003202),
 ('176', 1.1460457212623074),
 ('fiduciary', 1.133202658098218)]

In [15]:
# 10 lowest weighted coefficients:
ordered_coefs[-10:-1]

[('answer', -1.0141049421385862),
 ('arm', -1.0344518907635407),
 ('sunset', -1.0349690821953719),
 ('1994', -1.0363564779213954),
 ('steroids', -1.0501622342165082),
 ('specter', -1.0730122253388568),
 ('canal', -1.1522027240337045),
 ('alligator', -1.2245222947742149),
 ('stopping', -1.2934142876362542)]

### Comments
> As a result, the highest coefficients are the most important in determining a true statement, whereas the lowest coefficients are the most important in determining a false statement. 


> However, our training data is super thin, and therefore these words might not be the best indicators of fake/true news statements. For example, the word 'scanners' is a highly neutral word, and I doubt whether it would be used relatively often in fake news. Same thing for the number '176'...

## Dimensionality reduction

We will now perform dimensionality reduction, to check whether our model performs better when it's inputted a smaller matrix. We will compare them by their accuracy. For the dimensionality reduction, we used the Singular Value Decomposition method described in Lab 8 (Vector Semantics). 

In [71]:
# for them to fit in the SVD model, they will have to be converted to numpy matrices first
# converting X_train to a matrix
new_train = np.empty([3671, 7451])
array_X_train = X_train.toarray()

for n in range(3671):
    new_train[n] = array_X_train[n]

# converting X_test to a matrix
new_test = np.empty([457, 7451])
array_X_test = X_test.toarray()

for n in range(457):
    new_test[n] = array_X_test[n]

In [80]:
print(new_train.shape, new_test.shape)

(3671, 7451) (457, 7451)


In [110]:
# picking the different dimensions
import sklearn
from sklearn import decomposition

dimensions1 = 50
dimensions2 = 100
dimensions3 = 300
dimensions4 = 400

In [111]:
# 50 dimensions SVD
train_SVD50Mat = decomposition.TruncatedSVD(n_components = dimensions1, algorithm = "arpack").fit_transform(new_train)
test_SVD50Mat = decomposition.TruncatedSVD(n_components = dimensions1, algorithm = "arpack").fit_transform(new_test)

In [112]:
train_SVD50Mat.shape

(3671, 50)

In [113]:
# 100 dimensions SVD
train_SVD100Mat = decomposition.TruncatedSVD(n_components = dimensions2, algorithm ="arpack").fit_transform(new_train)
test_SVD100Mat = decomposition.TruncatedSVD(n_components = dimensions2, algorithm = "arpack").fit_transform(new_test)

In [114]:
# 300 dimensions SVD
train_SVD300Mat = decomposition.TruncatedSVD(n_components = dimensions3, algorithm = "arpack").fit_transform(new_train)
test_SVD300Mat = decomposition.TruncatedSVD(n_components = dimensions3, algorithm = "arpack").fit_transform(new_test)

In [115]:
# 400 dimensions SVD
train_SVD400Mat = decomposition.TruncatedSVD(n_components = dimensions4, algorithm ="arpack").fit_transform(new_train)
test_SVD400Mat = decomposition.TruncatedSVD(n_components = dimensions4, algorithm = "arpack").fit_transform(new_test)

In [116]:
# evaluating 50 dimensions SVD model
logreg.fit(train_SVD50Mat, y_train)
y_hat_test_SVD = logreg.predict(test_SVD50Mat)

# evaluate using accuracy: proportion of correctly predicted over total
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_hat_test_SVD))
print(accuracy_score(y_test, y_hat_test_SVD, normalize=False))

0.492341356674
225


In [117]:
# evaluating 100 dimensions SVD model
logreg.fit(train_SVD100Mat, y_train)
y_hat_test_SVD100 = logreg.predict(test_SVD100Mat)

# evaluate using accuracy: proportion of correctly predicted over total
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_hat_test_SVD100))
print(accuracy_score(y_test, y_hat_test_SVD100, normalize=False))

0.474835886214
217


In [118]:
# evaluating 300 dimensions SVD model
logreg.fit(train_SVD300Mat, y_train)
y_hat_test_SVD300 = logreg.predict(test_SVD300Mat)

# evaluate using accuracy: proportion of correctly predicted over total
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_hat_test_SVD300))
print(accuracy_score(y_test, y_hat_test_SVD300, normalize=False))

0.487964989059
223


In [119]:
# evaluating 400 dimensions SVD model
logreg.fit(train_SVD400Mat, y_train)
y_hat_test_SVD400 = logreg.predict(test_SVD400Mat)

# evaluate using accuracy: proportion of correctly predicted over total
from sklearn.metrics import accuracy_score

print(accuracy_score(y_test, y_hat_test_SVD400))
print(accuracy_score(y_test, y_hat_test_SVD400, normalize=False))

0.492341356674
225


## Results

Model | Accuracy 
--- | --- 
nonSVD | 0.59956
SVD50 | 0.49234
SVD100 | 0.47483
SVD300 | 0.48796
SVD400 | 0.49234

As you can see in the table above, the higher the dimension, the higher the accuracy. The model that works the best is the one in which the dimensions have not been reduced. We did not expect this, because since we're using a sparse vector, we though the regression might 


## using TF-IDF representation 

In [90]:
tfidf_vect = TfidfVectorizer()

# training set
X_train_tfidf = tfidf_vect.fit(df_reduced.text)
X_train_tfidf = tfidf_vect.transform(df_reduced.text)
X_train_tfidf.shape

(3671, 7451)

In [91]:
logreg.fit(X_train_tfidf, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [93]:
X_test_tfidf = tfidf_vect.transform(df_test_reduced.text) 
X_test_tfidf.shape

(457, 7451)

In [94]:
# we could try to evaluate the model
logreg.fit(X_train_tfidf, y_train)
y_hat_test = logreg.predict(X_test_tfidf)

# evaluate using accuracy: proportion of correctly predicted over total
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_hat_test))
print(accuracy_score(y_test, y_hat_test, normalize=False))

0.628008752735
287


## Dimensionality reduction of tfidf 

In [95]:
# converting train to a matrix
new_train_tfidf = np.empty([3671, 7451])
array_X_train_tfidf = X_train_tfidf.toarray()

for n in range(3671):
    new_train_tfidf[n] = array_X_train_tfidf[n]

# converting test to a matrix
new_test_tfidf = np.empty([457, 7451])
array_X_test_tfidf = X_test_tfidf.toarray()

for n in range(457):
    new_test_tfidf[n] = array_X_test_tfidf[n]

In [100]:
# performing SVD for dimensionality reduction
dimensions1 = 50
dimensions2 = 100
dimensions3 = 300
dimensions4 = 400

In [102]:
# 50 dimensions
train_SVD50Mat_tfidf = decomposition.TruncatedSVD(n_components = dimensions1, algorithm = "arpack").fit_transform(new_train_tfidf)
test_SVD50Mat_tfidf = decomposition.TruncatedSVD(n_components = dimensions1, algorithm = "arpack").fit_transform(new_test_tfidf)

In [103]:
# 100 dimensions
train_SVD100Mat_tfidf = decomposition.TruncatedSVD(n_components = dimensions2, algorithm = "arpack").fit_transform(new_train_tfidf)
test_SVD100Mat_tfidf = decomposition.TruncatedSVD(n_components = dimensions2, algorithm = "arpack").fit_transform(new_test_tfidf)

In [104]:
# 300 dimensions 
train_SVD300Mat_tfidf = decomposition.TruncatedSVD(n_components = dimensions3, algorithm = "arpack").fit_transform(new_train_tfidf)
test_SVD300Mat_tfidf = decomposition.TruncatedSVD(n_components = dimensions3, algorithm = "arpack").fit_transform(new_test_tfidf)

In [105]:
# 400 dimensions 
train_SVD400Mat_tfidf = decomposition.TruncatedSVD(n_components = dimensions4, algorithm = "arpack").fit_transform(new_train_tfidf)
test_SVD400Mat_tfidf = decomposition.TruncatedSVD(n_components = dimensions4, algorithm = "arpack").fit_transform(new_test_tfidf)

In [106]:
# evaluating 50 dimensions tfidf model
logreg.fit(train_SVD50Mat_tfidf, y_train)
y_hat_test = logreg.predict(test_SVD50Mat_tfidf)

# evaluate using accuracy: proportion of correctly predicted over total
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_hat_test))
print(accuracy_score(y_test, y_hat_test, normalize=False))

0.544857768053
249


In [107]:
# evaluating 100 dimensions tfidf model
logreg.fit(train_SVD100Mat_tfidf, y_train)
y_hat_test = logreg.predict(test_SVD100Mat_tfidf)

# evaluate using accuracy: proportion of correctly predicted over total
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_hat_test))
print(accuracy_score(y_test, y_hat_test, normalize=False))

0.584245076586
267


In [108]:
# evaluating 300 dimensions tfidf model
logreg.fit(train_SVD300Mat_tfidf, y_train)
y_hat_test = logreg.predict(test_SVD300Mat_tfidf)

# evaluate using accuracy: proportion of correctly predicted over total
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_hat_test))
print(accuracy_score(y_test, y_hat_test, normalize=False))

0.599562363239
274


In [109]:
# evaluating 400 dimensions tfidf model
logreg.fit(train_SVD400Mat_tfidf, y_train)
y_hat_test = logreg.predict(test_SVD400Mat_tfidf)

# evaluate using accuracy: proportion of correctly predicted over total
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_hat_test))
print(accuracy_score(y_test, y_hat_test, normalize=False))

0.599562363239
274
