In [1]:
import pandas as pd

df = pd.read_csv('../datasets/gossipcop_graph_features.csv')
df.head()

Unnamed: 0,label,num_nodes,num_tweets,avg_num_retweet,retweet_perc,num_users,total_propagation_time,avg_num_followers,avg_num_friends,avg_time_diff,perc_post_1_hour,users_10h,id
0,fake,116,110,0.045455,0.051724,61,1525941000.0,20970.565217,1149.026087,743706.0,0.991379,56,gossipcop-1000240645
1,fake,5,3,0.333333,0.4,3,1485491000.0,158959.75,791.75,6278.0,0.2,2,gossipcop-1000908841
2,fake,3,2,0.0,0.333333,1,1495247000.0,317729.0,723.0,0.0,0.333333,1,gossipcop-1009248558
3,fake,15,10,0.4,0.333333,14,1496761000.0,26939.0,3446.928571,2765.666667,0.466667,7,gossipcop-1012123555
4,fake,30,22,0.318182,0.266667,21,1530403000.0,30835.965517,5045.862069,12419.083333,0.166667,11,gossipcop-1014383679


In [None]:
from pandas_profiling import ProfileReport
profile = ProfileReport(df, title="Politifact EDA")

In [None]:
profile.to_file("politifact_eda.html")

In [2]:
# Politifact dataset statistics

data = [
    ("news_articles", len(df[df['label'] == "fake"]), len(df[df['label'] == "real"])),
    ("tweets", df[df['label'] == "fake"]['num_tweets'].sum(), df[df['label'] == "real"]['num_tweets'].sum()),
    ("unique_users", df[df['label'] == "fake"]['num_users'].sum(), df[df['label'] == "real"]['num_users'].sum())
]

pd.DataFrame(data=data, columns=["type", 'fake', "real"]).set_index("type")

Unnamed: 0_level_0,fake,real
type,Unnamed: 1_level_1,Unnamed: 2_level_1
news_articles,5059,15648
tweets,451452,759356
unique_users,624147,805647


In [3]:
df.groupby("label").mean().apply(lambda s: s.apply('{0:.3f}'.format))

Unnamed: 0_level_0,num_nodes,num_tweets,avg_num_retweet,retweet_perc,num_users,total_propagation_time,avg_num_followers,avg_num_friends,avg_time_diff,perc_post_1_hour,users_10h
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
fake,154.351,89.237,0.427,0.288,123.374,1515784986.334,70546.463,2339.371,86132.982,0.373,41.781
real,60.615,48.527,0.157,0.136,51.486,1513359948.575,34896.594,2329.693,55941.102,0.415,37.991


In [4]:
df.groupby("label").std().apply(lambda s: s.apply('{0:.3f}'.format))

Unnamed: 0_level_0,num_nodes,num_tweets,avg_num_retweet,retweet_perc,num_users,total_propagation_time,avg_num_followers,avg_num_friends,avg_time_diff,perc_post_1_hour,users_10h
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
fake,409.015,212.531,1.07,0.19,323.564,20532042.796,103568.428,3279.721,594777.349,0.289,115.867
real,112.205,66.347,0.83,0.176,96.085,12430112.648,138713.122,2313.495,637970.696,0.243,49.698


In [14]:
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler()
X_res, y_res = rus.fit_resample(df.drop(['label', 'id'], axis=1), df['label'])

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['label'] = le.fit_transform(df['label'])

X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.33)

In [16]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, RidgeClassifier, PassiveAggressiveClassifier, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, HistGradientBoostingClassifier, \
    GradientBoostingClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler

params = {
    'estimator': [
        LogisticRegression(), RidgeClassifier(), PassiveAggressiveClassifier(), SGDClassifier(), LinearSVC(max_iter=20000),
        RandomForestClassifier(), ExtraTreesClassifier(), 
        HistGradientBoostingClassifier(), GradientBoostingClassifier(), 
        BaggingClassifier(), AdaBoostClassifier(),
        KNeighborsClassifier(), 
        GaussianProcessClassifier(),
        GaussianNB()
    ]
}

pipeline = Pipeline([
    ('scaling', StandardScaler()),
    ('estimator', LogisticRegression())
])

gs = GridSearchCV(pipeline, params, cv=10, scoring="f1_weighted")
gs.fit(X_train, y_train)
results = pd.DataFrame(data=gs.cv_results_)
results.sort_values('rank_test_score')



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_estimator,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
5,0.911486,0.010029,0.021383,0.002384,RandomForestClassifier(),{'estimator': RandomForestClassifier()},0.918845,0.902641,0.909926,0.892246,0.895262,0.905592,0.898212,0.893782,0.901175,0.917233,0.903491,0.008904,1
7,0.895273,0.011534,0.011482,0.005841,HistGradientBoostingClassifier(),{'estimator': HistGradientBoostingClassifier()},0.912973,0.89528,0.912925,0.892302,0.889378,0.911497,0.890832,0.901169,0.902654,0.918758,0.902777,0.010145,2
6,0.47002,0.006494,0.025088,0.001569,ExtraTreesClassifier(),{'estimator': ExtraTreesClassifier()},0.907067,0.88492,0.909959,0.88789,0.884956,0.90413,0.88787,0.877581,0.879039,0.914328,0.893774,0.012936,3
9,0.292072,0.006766,0.004728,0.000418,BaggingClassifier(),{'estimator': BaggingClassifier()},0.901125,0.868683,0.895138,0.878849,0.88194,0.902624,0.890855,0.889375,0.887858,0.902478,0.889893,0.010557,4
8,1.283827,0.017447,0.003572,0.002666,GradientBoostingClassifier(),{'estimator': GradientBoostingClassifier()},0.88938,0.879056,0.898212,0.878971,0.881997,0.893804,0.889311,0.88934,0.870188,0.912847,0.888311,0.01129,5
12,51.512431,4.457766,0.355244,0.450416,GaussianProcessClassifier(),{'estimator': GaussianProcessClassifier()},0.874627,0.864231,0.892311,0.853979,0.865758,0.876102,0.881956,0.881997,0.862824,0.884783,0.873857,0.011281,6
11,0.018243,0.002849,0.04612,0.002873,KNeighborsClassifier(),{'estimator': KNeighborsClassifier()},0.899705,0.858376,0.876105,0.852461,0.870115,0.876053,0.868649,0.876053,0.856924,0.889191,0.872363,0.01386,7
10,0.333195,0.004837,0.011807,0.000685,AdaBoostClassifier(),{'estimator': AdaBoostClassifier()},0.883472,0.858407,0.887881,0.864302,0.867255,0.874609,0.85667,0.853957,0.845124,0.887681,0.867936,0.014215,8
3,0.028009,0.007202,0.002913,0.001115,SGDClassifier(),{'estimator': SGDClassifier()},0.688319,0.744744,0.709297,0.710195,0.75516,0.716725,0.743363,0.747189,0.688833,0.725225,0.722905,0.022963,9
0,0.037831,0.014722,0.004916,0.004715,LogisticRegression(),{'estimator': LogisticRegression()},0.725229,0.694504,0.710347,0.681086,0.716929,0.689877,0.698749,0.714489,0.687982,0.683325,0.700252,0.014676,10


In [17]:
gs.best_estimator_.score(X_test, y_test)

0.9062593590895478