In [1]:
import pandas as pd

# Read the per-article propagation-graph features and preview the first rows.
features_csv = '../datasets/gossipcop_graph_features.csv'
df = pd.read_csv(features_csv)
df.head()

Unnamed: 0,label,num_nodes,num_tweets,avg_num_retweet,retweet_perc,num_users,total_propagation_time,avg_num_followers,avg_num_friends,avg_time_diff,perc_post_1_hour,users_10h,id
0,fake,116,110,0.045455,0.051724,61,1525941000.0,20970.565217,1149.026087,743706.0,0.991379,56,gossipcop-1000240645
1,fake,5,3,0.333333,0.4,3,1485491000.0,158959.75,791.75,6278.0,0.2,2,gossipcop-1000908841
2,fake,3,2,0.0,0.333333,1,1495247000.0,317729.0,723.0,0.0,0.333333,1,gossipcop-1009248558
3,fake,15,10,0.4,0.333333,14,1496761000.0,26939.0,3446.928571,2765.666667,0.466667,7,gossipcop-1012123555
4,fake,30,22,0.318182,0.266667,21,1530403000.0,30835.965517,5045.862069,12419.083333,0.166667,11,gossipcop-1014383679


In [2]:
import json
# Load the predefined train / validation / test split (lists of article ids).
# A context manager is used so the file handle is closed deterministically
# instead of being left to the garbage collector.
with open("../../dataset/gossipcop_news_ids_dataset.json") as split_file:
    dataset = json.load(split_file)

train_dataset_ids = dataset['train_dataset']
val_dataset_ids = dataset['val_dataset']
test_dataset_ids = dataset['test_dataset']

# Keep only the articles that belong to one of the three splits.
df = df[df['id'].isin(train_dataset_ids + test_dataset_ids + val_dataset_ids)]

# Train and validation rows are kept together in one frame; the copy keeps
# later changes to `train_dataset` from touching `df`.
train_dataset = df[df['id'].isin(train_dataset_ids + val_dataset_ids)].copy().reset_index(drop=True)

In [32]:
from pandas_profiling import ProfileReport
# Build an exploratory profiling report for the filtered feature frame.
# The title names GossipCop because that is the dataset loaded into `df`
# (the previous title referred to Politifact).
profile = ProfileReport(df, title="GossipCop EDA")

In [None]:
# Write the profiling report to an HTML file in the working directory.
# NOTE(review): the file name says "politifact", but the features loaded in this
# notebook come from gossipcop_graph_features.csv — confirm the intended name.
profile.to_file("politifact_eda.html")

In [33]:
# GossipCop dataset statistics, broken down by label.
# Filter each class once and reuse the two sub-frames for every row of the table.
fake_rows = df[df['label'] == "fake"]
real_rows = df[df['label'] == "real"]

data = [
    ("news_articles", len(fake_rows), len(real_rows)),
    ("tweets", fake_rows['num_tweets'].sum(), real_rows['num_tweets'].sum()),
    # NOTE(review): this sums the per-article `num_users` values, so a user who
    # shows up under several articles is presumably counted more than once.
    ("unique_users", fake_rows['num_users'].sum(), real_rows['num_users'].sum()),
]

pd.DataFrame(data=data, columns=["type", 'fake', "real"]).set_index("type")

Unnamed: 0_level_0,fake,real
type,Unnamed: 1_level_1,Unnamed: 2_level_1
news_articles,4717,15534
tweets,405214,753029
unique_users,567399,799750


In [24]:
# Per-label mean of every numeric feature, formatted to three decimals.
# The numeric columns are selected explicitly because `df` still carries the
# string `id` column: pandas >= 2.0 raises TypeError when asked to average it
# instead of silently dropping it as older releases did.
numeric_cols = df.select_dtypes(include="number").columns.tolist()
df.groupby("label")[numeric_cols].mean().apply(lambda s: s.apply('{0:.3f}'.format))

Unnamed: 0_level_0,num_nodes,num_tweets,avg_num_retweet,retweet_perc,num_users,total_propagation_time,avg_num_followers,avg_num_friends,avg_time_diff,perc_post_1_hour,users_10h
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
fake,148.32,85.905,0.429,0.287,120.288,1515895820.326,69703.467,2357.907,88762.276,0.373,40.577
real,60.572,48.476,0.157,0.136,51.484,1513276686.797,34857.504,2332.385,56235.151,0.415,37.963


In [25]:
# Per-label standard deviation of every numeric feature, formatted to three decimals.
# The numeric columns are selected explicitly because `df` still carries the
# string `id` column: pandas >= 2.0 raises TypeError on it instead of silently
# dropping it as older releases did.
numeric_cols = df.select_dtypes(include="number").columns.tolist()
df.groupby("label")[numeric_cols].std().apply(lambda s: s.apply('{0:.3f}'.format))

Unnamed: 0_level_0,num_nodes,num_tweets,avg_num_retweet,retweet_perc,num_users,total_propagation_time,avg_num_followers,avg_num_friends,avg_time_diff,perc_post_1_hour,users_10h
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
fake,397.001,206.244,1.085,0.191,319.509,20749027.266,105222.301,3322.045,612542.026,0.291,114.123
real,111.949,65.902,0.83,0.176,95.881,12423087.219,138981.752,2318.121,640280.886,0.243,49.272


In [3]:
from sklearn.preprocessing import LabelEncoder

# Turn the string class labels into integer codes for the classifiers.
# The fitted encoder stays available as `le` for mapping codes back to labels.
le = LabelEncoder()
le.fit(train_dataset['label'])
train_dataset['label'] = le.transform(train_dataset['label'])

In [4]:
from sklearn.model_selection import PredefinedSplit

# Mark every row of `train_dataset` for PredefinedSplit: -1 keeps the row in the
# training portion for every split, 0 assigns it to the single validation fold.
# Membership is tested against a set so each lookup is O(1); scanning the id
# list for every row made this step quadratic.
train_id_set = set(train_dataset_ids)
test_fold = [-1 if x in train_id_set else 0 for x in train_dataset['id']]
ps = PredefinedSplit(test_fold=test_fold)

In [5]:
from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression, RidgeClassifier, PassiveAggressiveClassifier, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, HistGradientBoostingClassifier, \
    GradientBoostingClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler

# Fixed seed shared by the oversampler and every estimator that accepts one, so
# a fresh Restart & Run All reproduces the same ranking.
SEED = 42

# Candidate classifiers; the grid search swaps each one into the pipeline's
# 'estimator' step in turn.
params = {
    'estimator': [
        LogisticRegression(random_state=SEED), RidgeClassifier(random_state=SEED),
        PassiveAggressiveClassifier(random_state=SEED), SGDClassifier(random_state=SEED),
        RandomForestClassifier(random_state=SEED), ExtraTreesClassifier(random_state=SEED),
        HistGradientBoostingClassifier(random_state=SEED), GradientBoostingClassifier(random_state=SEED),
        BaggingClassifier(random_state=SEED), AdaBoostClassifier(random_state=SEED),
        KNeighborsClassifier(),  # takes no random_state
        GaussianProcessClassifier(random_state=SEED),
        GaussianNB()  # takes no random_state
    ]
}

# imblearn's Pipeline applies the sampler only while fitting, so the validation
# fold is scored on its original (non-resampled) rows.
pipeline = Pipeline([
    ('sampling', RandomOverSampler(random_state=SEED)),
    ('scaling', StandardScaler()),
    ('estimator', LogisticRegression())  # placeholder, replaced by the grid
])

# Features are every column except the target and the article identifier.
X_train, y_train = train_dataset.drop(['label', 'id'], axis=1), train_dataset['label']

# Score each candidate with weighted F1 on the split(s) defined by `ps`.
gs = GridSearchCV(pipeline, params, cv=ps, scoring="f1_weighted")
gs.fit(X_train, y_train)
results = pd.DataFrame(data=gs.cv_results_)
results.sort_values('rank_test_score')

In [31]:
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import cross_val_score

# Fixed seed so the stacked ensemble gives the same score on every run.
SEED = 42

# Stack four tree-based ensembles; no final_estimator is passed, so
# StackingClassifier falls back to its default meta-learner.
stc = StackingClassifier(estimators=[
    ('rfc', RandomForestClassifier(random_state=SEED)),
    ('abc', AdaBoostClassifier(random_state=SEED)),
    ('gbc', GradientBoostingClassifier(random_state=SEED)),
    ('etc', ExtraTreesClassifier(random_state=SEED))
])

# Weighted F1 on the split(s) defined by `ps`.
cross_val_score(stc, X_train, y_train, cv=ps, scoring="f1_weighted")

array([0.88942026, 0.92623056, 0.91001168, 0.90263114, 0.92029834,
       0.89821304, 0.89238462, 0.87610619, 0.8967102 , 0.89359273])