In [1]:
import pandas as pd

# Read the pre-computed Politifact graph-feature table and preview its first rows.
df = pd.read_csv(filepath_or_buffer='../datasets/politifact_graph_features.csv')
df.head(n=5)

Unnamed: 0,label,num_nodes,num_tweets,avg_num_retweet,retweet_perc,num_users,total_propagation_time,avg_num_followers,avg_num_friends,avg_time_diff,perc_post_1_hour,users_10h,id
0,fake,124,82,0.5,0.33871,122,1454356000.0,6980.203252,5410.723577,66045.631151,0.153226,88,politifact11773
1,fake,12,9,0.222222,0.25,11,1486939000.0,2670.454545,1903.0,28906.5,0.083333,1,politifact13038
2,fake,59,40,0.45,0.322034,47,1543481000.0,3597.689655,871.87931,41604.9,0.610169,23,politifact13467
3,fake,333,219,0.515982,0.342342,316,1524245000.0,109006.966867,2361.521084,160908.689676,0.453453,207,politifact13468
4,fake,1530,712,1.147472,0.534641,1421,1506620000.0,3942.915631,3699.542184,90408.423591,0.001307,3,politifact13475


In [None]:
import json
# Load the pre-defined train/test id lists and the cross-validation folds.
# The context manager closes the file handle as soon as parsing is done
# (the previous `json.load(open(...))` form left it open until garbage
# collection), and the explicit encoding avoids platform-dependent decoding.
with open("../../dataset/politifact_news_ids_dataset.json", encoding="utf-8") as ids_file:
    dataset = json.load(ids_file)

train_dataset_ids = dataset['train_dataset']
kfolds = dataset['kfolds']
test_dataset_ids = dataset['test_dataset']

# Keep only the articles whose id belongs to the train or the test split.
df = df[df.id.isin(train_dataset_ids+test_dataset_ids)]

# Training rows as an independent copy with a fresh 0..n-1 index.
# NOTE(review): `isin` keeps the row order of `df`, not the order of
# `train_dataset_ids`. `kfolds` is later used as positional indices into this
# frame — confirm the fold indices were generated against this same ordering.
train_dataset  = df[df['id'].isin(train_dataset_ids)].copy().reset_index(drop=True)

In [None]:
from pandas_profiling import ProfileReport
# Build the exploratory-data-analysis profiling report for `df`.
profile = ProfileReport(df=df, title="Politifact EDA")

In [6]:
# Render the profiling report and write it as a standalone HTML file
# in the current working directory.
profile.to_file(output_file="politifact_eda.html")

Summarize dataset: 100%|██████████| 147/147 [00:39<00:00,  3.74it/s, Completed]
Generate report structure: 100%|██████████| 1/1 [00:05<00:00,  5.35s/it]
Render HTML: 100%|██████████| 1/1 [00:05<00:00,  5.98s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 11.36it/s]


In [8]:
# Politifact dataset statistics
#
# One row per statistic, one column per label ("fake" / "real"):
#   news_articles - number of rows carrying that label
#   tweets        - sum of `num_tweets` over those rows
#   unique_users  - sum of `num_users` over those rows
# NOTE(review): "unique_users" adds up per-article user counts, so a user who
# appears under several articles would be counted once per article — confirm
# this is the intended definition.

fake_rows = df[df['label'] == "fake"]
real_rows = df[df['label'] == "real"]

data = [("news_articles", len(fake_rows), len(real_rows))]
data += [
    (stat_name, fake_rows[feature].sum(), real_rows[feature].sum())
    for stat_name, feature in (("tweets", "num_tweets"), ("unique_users", "num_users"))
]

pd.DataFrame(data=data, columns=["type", 'fake', "real"]).set_index("type")


Unnamed: 0_level_0,fake,real
type,Unnamed: 1_level_1,Unnamed: 2_level_1
news_articles,322,263
tweets,108922,224265
unique_users,172054,493966


In [9]:
# Per-label mean of every numeric feature, rendered with three decimals.
# The numeric columns are selected explicitly so the string `id` column never
# reaches the aggregation: older pandas dropped it silently, while
# pandas >= 2.0 raises a TypeError when asked to average an object column.
numeric_features = df.select_dtypes(include="number").columns.tolist()
df.groupby("label")[numeric_features].mean().apply(lambda s: s.apply('{0:.3f}'.format))

Unnamed: 0_level_0,num_nodes,num_tweets,avg_num_retweet,retweet_perc,num_users,total_propagation_time,avg_num_followers,avg_num_friends,avg_time_diff,perc_post_1_hour,users_10h
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
fake,631.748,338.267,0.848,0.362,534.329,1516524227.149,8714.431,3346.605,96954.436,0.383,157.531
real,2278.433,852.719,1.036,0.404,1878.198,1458752351.224,59140.746,2936.06,398690.436,0.472,211.951


In [10]:
# Per-label standard deviation of every numeric feature, rendered with three
# decimals. The numeric columns are selected explicitly so the string `id`
# column never reaches the aggregation: older pandas dropped it silently,
# while pandas >= 2.0 raises a TypeError on object columns.
numeric_features = df.select_dtypes(include="number").columns.tolist()
df.groupby("label")[numeric_features].std().apply(lambda s: s.apply('{0:.3f}'.format))

Unnamed: 0_level_0,num_nodes,num_tweets,avg_num_retweet,retweet_perc,num_users,total_propagation_time,avg_num_followers,avg_num_friends,avg_time_diff,perc_post_1_hour,users_10h
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
fake,3256.58,1418.47,1.297,0.218,2479.548,19975265.673,17490.121,3057.385,436228.01,0.4,391.242
real,6679.075,2298.408,1.853,0.227,5518.158,104670391.829,303544.73,4070.05,3961482.683,0.349,329.953


In [12]:
from sklearn.model_selection import PredefinedSplit

# Build the fold-assignment vector expected by PredefinedSplit: entry i holds
# the index of the fold in which training row i is used for validation.
# Rows that appear in no validation list keep -1, which PredefinedSplit
# treats as "never in a test set".
test_fold = [-1] * len(train_dataset)

# Each element of `kfolds` is a (train indices, validation indices) pair;
# only the validation half is needed to fill the vector.
for fold_number, (_, validation_rows) in enumerate(kfolds):
    for row in validation_rows:
        test_fold[row] = fold_number

ps = PredefinedSplit(test_fold=test_fold)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, RidgeClassifier, PassiveAggressiveClassifier, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, HistGradientBoostingClassifier, \
    GradientBoostingClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler

# Model comparison: every candidate classifier is plugged into the same
# scaling pipeline and scored with the pre-defined folds (`ps`).

# Seed shared by every estimator that accepts `random_state`. Without it the
# stochastic models (tree ensembles, SGD-based linear models, ...) give
# different scores on each run, so the ranking below would not be reproducible.
SEED = 42

params = {
    'estimator': [
        # Linear models
        LogisticRegression(random_state=SEED),
        RidgeClassifier(random_state=SEED),
        PassiveAggressiveClassifier(random_state=SEED),
        SGDClassifier(random_state=SEED),
        LinearSVC(max_iter=20000, random_state=SEED),
        # Tree ensembles
        RandomForestClassifier(random_state=SEED),
        ExtraTreesClassifier(random_state=SEED),
        HistGradientBoostingClassifier(random_state=SEED),
        GradientBoostingClassifier(random_state=SEED),
        BaggingClassifier(random_state=SEED),
        AdaBoostClassifier(random_state=SEED),
        # KNeighborsClassifier and GaussianNB take no random_state.
        KNeighborsClassifier(),
        GaussianProcessClassifier(random_state=SEED),
        GaussianNB()
    ]
}

# The 'estimator' step is only a placeholder; the grid search replaces it with
# each candidate from `params`.
pipeline = Pipeline([
    ('scaling', StandardScaler()),
    ('estimator', LogisticRegression())
])

# Features are every column except the target (`label`) and the article `id`.
X_train, y_train = train_dataset.drop(['label', 'id'],axis=1), train_dataset['label']
gs = GridSearchCV(pipeline, params, cv=ps, scoring="f1_weighted")
gs.fit(X_train, y_train)

# One row per candidate estimator, best weighted-F1 rank first.
results = pd.DataFrame(data=gs.cv_results_)
results.sort_values('rank_test_score')