In [1]:
import pandas as pd

df = pd.read_csv('../datasets/politifact_graph_features.csv')
df.head()

Unnamed: 0,label,num_nodes,num_tweets,avg_num_retweet,retweet_perc,num_users,total_propagation_time,avg_num_followers,avg_num_friends,avg_time_diff,perc_post_1_hour,users_10h,id
0,fake,124,82,0.5,0.33871,122,1454356000.0,6980.203252,5410.723577,66045.631151,0.153226,88,politifact11773
1,fake,12,9,0.222222,0.25,11,1486939000.0,2670.454545,1903.0,28906.5,0.083333,1,politifact13038
2,fake,59,40,0.45,0.322034,47,1543481000.0,3597.689655,871.87931,41604.9,0.610169,23,politifact13467
3,fake,333,219,0.515982,0.342342,316,1524245000.0,109006.966867,2361.521084,160908.689676,0.453453,207,politifact13468
4,fake,1530,712,1.147472,0.534641,1421,1506620000.0,3942.915631,3699.542184,90408.423591,0.001307,3,politifact13475


In [None]:
from pandas_profiling import ProfileReport
profile = ProfileReport(df, title="Politifact EDA")

In [6]:
profile.to_file("politifact_eda.html")

Summarize dataset: 100%|██████████| 147/147 [00:39<00:00,  3.74it/s, Completed]
Generate report structure: 100%|██████████| 1/1 [00:05<00:00,  5.35s/it]
Render HTML: 100%|██████████| 1/1 [00:05<00:00,  5.98s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 11.36it/s]


In [8]:
# Politifact dataset statistics

data = [
    ("news_articles", len(df[df['label'] == "fake"]), len(df[df['label'] == "real"])),
    ("tweets", df[df['label'] == "fake"]['num_tweets'].sum(), df[df['label'] == "real"]['num_tweets'].sum()),
    ("unique_users", df[df['label'] == "fake"]['num_users'].sum(), df[df['label'] == "real"]['num_users'].sum())
]

pd.DataFrame(data=data, columns=["type", 'fake', "real"]).set_index("type")


Unnamed: 0_level_0,fake,real
type,Unnamed: 1_level_1,Unnamed: 2_level_1
news_articles,322,263
tweets,108922,224265
unique_users,172054,493966


In [9]:
df.groupby("label").mean().apply(lambda s: s.apply('{0:.3f}'.format))

Unnamed: 0_level_0,num_nodes,num_tweets,avg_num_retweet,retweet_perc,num_users,total_propagation_time,avg_num_followers,avg_num_friends,avg_time_diff,perc_post_1_hour,users_10h
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
fake,631.748,338.267,0.848,0.362,534.329,1516524227.149,8714.431,3346.605,96954.436,0.383,157.531
real,2278.433,852.719,1.036,0.404,1878.198,1458752351.224,59140.746,2936.06,398690.436,0.472,211.951


In [10]:
df.groupby("label").std().apply(lambda s: s.apply('{0:.3f}'.format))

Unnamed: 0_level_0,num_nodes,num_tweets,avg_num_retweet,retweet_perc,num_users,total_propagation_time,avg_num_followers,avg_num_friends,avg_time_diff,perc_post_1_hour,users_10h
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
fake,3256.58,1418.47,1.297,0.218,2479.548,19975265.673,17490.121,3057.385,436228.01,0.4,391.242
real,6679.075,2298.408,1.853,0.227,5518.158,104670391.829,303544.73,4070.05,3961482.683,0.349,329.953


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['label'] = le.fit_transform(df['label'])

X_train, X_test, y_train, y_test = train_test_split(df.drop(['label', 'id'], axis=1), df['label'], test_size=0.33)

In [12]:
from sklearn.model_selection import KFold

kf = KFold(n_splits=10, shuffle=True)

In [13]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, RidgeClassifier, PassiveAggressiveClassifier, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, HistGradientBoostingClassifier, \
    GradientBoostingClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler

params = {
    'estimator': [
        LogisticRegression(), RidgeClassifier(), PassiveAggressiveClassifier(), SGDClassifier(), LinearSVC(max_iter=20000),
        RandomForestClassifier(), ExtraTreesClassifier(), 
        HistGradientBoostingClassifier(), GradientBoostingClassifier(), 
        BaggingClassifier(), AdaBoostClassifier(),
        KNeighborsClassifier(), 
        GaussianProcessClassifier(),
        GaussianNB()
    ]
}

pipeline = Pipeline([
    ('scaling', StandardScaler()),
    ('estimator', LogisticRegression())
])

gs = GridSearchCV(pipeline, params, cv=kf, scoring="f1_weighted")
gs.fit(X_train, y_train)
results = pd.DataFrame(data=gs.cv_results_)
results.sort_values('rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_estimator,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
5,0.316088,0.122771,0.025611,0.007568,RandomForestClassifier(),{'estimator': RandomForestClassifier()},0.85,0.894605,0.745304,0.872313,0.845543,0.844231,0.793244,0.820513,0.819803,0.923077,0.840863,0.047969,1
7,0.574473,0.297616,0.029625,0.062365,HistGradientBoostingClassifier(),{'estimator': HistGradientBoostingClassifier()},0.85,0.841908,0.767964,0.846154,0.897029,0.844231,0.819803,0.846154,0.689866,0.897029,0.830014,0.058174,2
8,0.21805,0.087862,0.006126,0.004957,GradientBoostingClassifier(),{'estimator': GradientBoostingClassifier()},0.85,0.894605,0.821944,0.846154,0.870773,0.788425,0.766164,0.794602,0.794602,0.870773,0.829804,0.040662,3
9,0.041306,0.017015,0.00566,0.00416,BaggingClassifier(),{'estimator': BaggingClassifier()},0.9,0.846154,0.795682,0.844231,0.844097,0.792308,0.743252,0.743252,0.819803,0.897436,0.822621,0.052267,4
10,0.126114,0.098462,0.01187,0.007396,AdaBoostClassifier(),{'estimator': AdaBoostClassifier()},0.848437,0.922104,0.821944,0.819527,0.792129,0.764359,0.819803,0.794602,0.766164,0.845543,0.819461,0.043936,5
6,0.155997,0.046673,0.018099,0.007421,ExtraTreesClassifier(),{'estimator': ExtraTreesClassifier()},0.85,0.894605,0.719088,0.794872,0.845543,0.792308,0.819803,0.793244,0.7157,0.871795,0.809696,0.056494,6
4,0.152285,0.087919,0.002808,0.003454,LinearSVC(max_iter=20000),{'estimator': LinearSVC(max_iter=20000)},0.875397,0.804501,0.719808,0.705659,0.841764,0.591492,0.815436,0.762704,0.659933,0.794872,0.757157,0.083171,7
11,0.01273,0.029139,0.011115,0.007341,KNeighborsClassifier(),{'estimator': KNeighborsClassifier()},0.773985,0.804501,0.694364,0.816724,0.760361,0.688462,0.665348,0.741555,0.740162,0.846154,0.753162,0.056095,8
0,0.171105,0.403578,0.009619,0.008241,LogisticRegression(),{'estimator': LogisticRegression()},0.80101,0.836074,0.720198,0.735531,0.784615,0.591492,0.815436,0.709972,0.659933,0.794872,0.744913,0.072926,9
12,0.288191,0.207319,0.010709,0.021739,GaussianProcessClassifier(),{'estimator': GaussianProcessClassifier()},0.771001,0.841908,0.61792,0.812692,0.760361,0.711994,0.741555,0.709972,0.659933,0.820513,0.744785,0.068045,10


In [15]:
gs.best_estimator_.score(X_test, y_test)

0.8556701030927835