In [1]:
import sys
!{sys.executable} -m pip install numpy
!{sys.executable} -m pip install pandas
!{sys.executable} -m pip install sklearn



In [1]:
import numpy as np
import pandas as pd
import scipy.stats
import sklearn
import sklearn.linear_model
import sklearn.model_selection

In [2]:
# Per-source clickbait table: one row per source with its clickbait percentage.
clickbait_percentage = pd.read_csv('percent_file.csv')

# source -> percent_clickbait lookup (a later duplicate source overrides an earlier one).
dic_source_clickbait = dict(
    zip(clickbait_percentage['source'], clickbait_percentage['percent_clickbait'])
)

# Fallback value for sources missing from the table: column total over row count.
total_clickbait = clickbait_percentage['percent_clickbait'].sum()
average_clickbait = total_clickbait / len(clickbait_percentage)

# Article-level data set used to build the stance features.
data = pd.read_csv('edata_all_no_www.csv')

In [3]:
def add_clickbait(source):
    """
    Look up the clickbait percentage recorded for a source.

    source: source host name; a leading 'www.' is dropped before the lookup,
        so 'www.example.com' and 'example.com' resolve to the same entry.

    Returns the value stored in dic_source_clickbait for the source, or
    average_clickbait when the source is not in the table or is missing.
    """
    # Empty CSV cells are read by pandas as float NaN, which has no
    # .startswith(); treat a missing source like any other unknown source.
    if not hasattr(source, 'startswith'):
        return average_clickbait
    if source.startswith('www.'):
        source = source[4:]
    return dic_source_clickbait.get(source, average_clickbait)
    
# Attach each article's source-level clickbait percentage as a new column.
data['clickbait_percentage'] = data['source'].map(add_clickbait)


def get_features(data, source_len = 724):
    """
    Build one stance vector per claim.

    data: DataFrame with the columns 'articleHeadlineStance', 'sourceCount'
        (1-based source id) and 'claimCount' (claim id), one row per article.
    source_len: length of every stance vector; sourceCount values must lie
        in 1..source_len.

    Returns a dict mapping claimCount -> np.ndarray of shape (source_len,).
    Entry sourceCount - 1 holds -1 for stance 'against', 0 for 'observing'
    and 1 for any other stance; entries never written stay 0. When several
    rows share a claim and a source, the last row wins.

    Raises IndexError when a sourceCount falls outside 1..source_len.
    """
    stance_ids = {'against': -1, 'observing': 0}  # every other stance -> 1
    dic_f = {} # claimCount -> features

    # Iterate the three columns directly instead of building a Series per
    # row with data.iloc[i].
    rows = zip(data['articleHeadlineStance'], data['sourceCount'], data['claimCount'])
    for stance, source_count, claim in rows:
        # int() so a float-typed sourceCount column can still index the array.
        source = int(source_count) - 1 # 1-index to 0-index
        if not 0 <= source < source_len:
            # Without this check sourceCount == 0 would wrap around to the
            # last slot through negative indexing.
            raise IndexError(
                'sourceCount %r is outside 1..%d' % (source_count, source_len))

        if claim not in dic_f: dic_f[claim] = np.zeros((source_len,))
        dic_f[claim][source] = stance_ids.get(stance, 1)

    return dic_f


def extract_truth_labels(data):
    """
    Collect the truth label of every claim.

    data: DataFrame with the columns 'claimCount' (claim id) and
        'claimTruth' (label), one row per article.

    Returns (claims, labels): claims is the sorted list of distinct
    claimCount values and labels[i] is the claimTruth of claims[i]. When a
    claim appears in several rows, the label of its last row is used.
    """
    claims = sorted(data.claimCount.unique().tolist())

    # One pass over the rows; later rows overwrite earlier ones, so the last
    # label per claim is kept. This replaces a claims.index() scan per row.
    truth_by_claim = dict(zip(data['claimCount'], data['claimTruth']))

    labels = [truth_by_claim[claim] for claim in claims]
    return (claims, labels)


def build_veracity_prediction_matrix():
    """
    Assemble the claim-by-source stance matrix and its truth labels from the
    module-level `data` frame.

    Returns (claims, F, veracity):
        claims   - sorted claim ids, as returned by extract_truth_labels
        F        - np.ndarray of shape (len(claims), m); row i is the stance
                   vector get_features produced for claims[i], and m is the
                   length of those vectors (0 when there are no claims)
        veracity - truth label of each claim, aligned with claims
    """
    dic_f = get_features(data)

    (claims, veracity) = extract_truth_labels(data)

    n = len(claims)
    # dict.items() is a view on Python 3 and cannot be indexed; take the
    # first feature vector through an iterator instead.
    first_vector = next(iter(dic_f.values()), None)
    m = 0 if first_vector is None else first_vector.shape[0]

    F = np.zeros((n, m))
    for i, c in enumerate(claims): F[i, :] = dic_f[c]

    return (claims, F, veracity)


In [17]:
from sklearn.linear_model import LogisticRegression

# Claim ids, claim-by-source stance matrix and truth labels for the baseline model.
claims, F, vera = build_veracity_prediction_matrix()

# Baseline classifier with default settings.
clf = LogisticRegression()

In [18]:
cross_var = 8  # number of cross-validation folds

# Score once and reuse the per-fold accuracies for both statistics instead of
# running the whole cross-validation a second time.
baseline_scores = sklearn.model_selection.cross_val_score(clf, F, vera, cv=cross_var)
print(np.mean(baseline_scores))
print(np.var(baseline_scores))

0.5864252410305042
0.009686965948098532


In [19]:
# Every assignment for a given sourceCount overwrote the previous one and was
# always computed from F, so only the last row per sourceCount affects G.
per_source = data.drop_duplicates(subset='sourceCount', keep='last')
source_columns = per_source['sourceCount'].values.astype(int) - 1  # 1-index to 0-index

# clickbait_percentage is a percentage; * 0.01 turns it into a fraction, so
# each source column of F is scaled by (1 - clickbait fraction).
keep_fraction = 1 - (per_source['clickbait_percentage'].values.astype(float) * 0.01)

G = F.copy()
G[:, source_columns] = F[:, source_columns] * keep_fraction
            

In [20]:
# Separate LogisticRegression (default settings), independent of clf; it is
# the estimator scored on the clickbait-weighted matrix G.
clf_g = sklearn.linear_model.LogisticRegression()

In [21]:
# Score once and reuse the per-fold accuracies for both statistics instead of
# running the whole cross-validation a second time.
weighted_scores = sklearn.model_selection.cross_val_score(clf_g, G, vera, cv=cross_var)
print(np.mean(weighted_scores))
print(np.var(weighted_scores))

0.6131954322743797
0.013387398431191792


Try different numbers of cross-validation folds and run a paired t-test on the per-fold accuracies of the two models.

In [86]:
# Number of folds; 5, 10 and 15 were also tried.
cross_var = 20
leave_one_out = sklearn.model_selection.LeaveOneOut()

# Per-fold accuracies of the baseline (F) and clickbait-weighted (G) models.
# Both calls use the same cv value so the folds can be compared pairwise.
original_accuracies = sklearn.model_selection.cross_val_score(clf, F, vera, cv=cross_var)
new_accuracies = sklearn.model_selection.cross_val_score(clf_g, G, vera, cv=cross_var)

# A single formatted argument prints "mean std" on both Python 2 and Python 3.
print('{} {}'.format(np.mean(original_accuracies), np.std(original_accuracies)))
print('{} {}'.format(np.mean(new_accuracies), np.std(new_accuracies)))

# Paired t-test: one-sample t-test of the per-fold differences against 0.
print(scipy.stats.ttest_1samp(new_accuracies - original_accuracies, 0))

0.5754096638655462 0.11670510696250525
0.6076663165266106 0.13459689962025628
Ttest_1sampResult(statistic=1.7345952654975938, pvalue=0.09900676051044668)
