In [1]:
import pandas as pd

df = pd.read_csv('../datasets/politifact_graph_features.csv')
df.head()

Unnamed: 0,label,num_nodes,num_tweets,avg_num_retweet,retweet_perc,num_users,total_propagation_time,avg_num_followers,avg_num_friends,avg_time_diff,perc_post_1_hour,users_10h,id
0,fake,124,82,0.5,0.33871,122,1454356000.0,6980.203252,5410.723577,66045.631151,0.153226,88,politifact11773
1,fake,12,9,0.222222,0.25,11,1486939000.0,2670.454545,1903.0,28906.5,0.083333,1,politifact13038
2,fake,59,40,0.45,0.322034,47,1543481000.0,3597.689655,871.87931,41604.9,0.610169,23,politifact13467
3,fake,333,219,0.515982,0.342342,316,1524245000.0,109006.966867,2361.521084,160908.689676,0.453453,207,politifact13468
4,fake,1530,712,1.147472,0.534641,1421,1506620000.0,3942.915631,3699.542184,90408.423591,0.001307,3,politifact13475


In [None]:
from pandas_profiling import ProfileReport
profile = ProfileReport(df, title="Politifact EDA")

In [6]:
profile.to_file("politifact_eda.html")

Summarize dataset: 100%|██████████| 147/147 [00:39<00:00,  3.74it/s, Completed]
Generate report structure: 100%|██████████| 1/1 [00:05<00:00,  5.35s/it]
Render HTML: 100%|██████████| 1/1 [00:05<00:00,  5.98s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 11.36it/s]


In [2]:
# Politifact dataset statistics

data = [
    ("news_articles", len(df[df['label'] == "fake"]), len(df[df['label'] == "real"])),
    ("tweets", df[df['label'] == "fake"]['num_tweets'].sum(), df[df['label'] == "real"]['num_tweets'].sum()),
    ("unique_users", df[df['label'] == "fake"]['num_users'].sum(), df[df['label'] == "real"]['num_users'].sum())
]

pd.DataFrame(data=data, columns=["type", 'fake', "real"]).set_index("type")


Unnamed: 0_level_0,fake,real
type,Unnamed: 1_level_1,Unnamed: 2_level_1
news_articles,385,404
tweets,114151,310891
unique_users,182040,690963


In [3]:
df.groupby("label").mean().apply(lambda s: s.apply('{0:.3f}'.format))

Unnamed: 0_level_0,num_nodes,num_tweets,avg_num_retweet,retweet_perc,num_users,total_propagation_time,avg_num_followers,avg_num_friends,avg_time_diff,perc_post_1_hour,users_10h
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
fake,556.431,296.496,0.892,0.364,472.831,1515281458.203,7984.711,3258.577,105159.395,0.366,138.335
real,2071.842,769.532,0.979,0.402,1710.304,1454351859.653,50191.299,2832.341,292362.515,0.481,193.641


In [4]:
df.groupby("label").std().apply(lambda s: s.apply('{0:.3f}'.format))

Unnamed: 0_level_0,num_nodes,num_tweets,avg_num_retweet,retweet_perc,num_users,total_propagation_time,avg_num_followers,avg_num_friends,avg_time_diff,perc_post_1_hour,users_10h
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
fake,2984.767,1301.321,1.599,0.22,2273.992,19592017.396,16188.196,2958.628,433041.423,0.392,362.367
real,6032.508,2047.832,1.72,0.227,4968.101,109255603.808,255783.479,3770.064,3202275.354,0.339,313.196


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['label'] = le.fit_transform(df['label'])

X_train, X_test, y_train, y_test = train_test_split(df.drop(['label', 'id'], axis=1), df['label'], test_size=0.33)

In [6]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [13]:
from sklearn.linear_model import LogisticRegression, RidgeClassifier, PassiveAggressiveClassifier, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, HistGradientBoostingClassifier, \
    GradientBoostingClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler

params = {
    'estimator': [
        LogisticRegression(), RidgeClassifier(), PassiveAggressiveClassifier(), SGDClassifier(), LinearSVC(max_iter=20000),
        RandomForestClassifier(), ExtraTreesClassifier(), 
        HistGradientBoostingClassifier(), GradientBoostingClassifier(), 
        BaggingClassifier(), AdaBoostClassifier(),
        KNeighborsClassifier(), 
        GaussianProcessClassifier(),
        GaussianNB()
    ]
}

pipeline = Pipeline([
    ('scaling', StandardScaler()),
    ('estimator', LogisticRegression())
])

gs = GridSearchCV(pipeline, params, cv=10, scoring="f1_weighted")
gs.fit(X_train, y_train)
results = pd.DataFrame(data=gs.cv_results_)
results.sort_values('rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_estimator,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
8,0.345484,0.008636,0.005426,0.000387,GradientBoostingClassifier(),{'estimator': GradientBoostingClassifier()},0.811321,0.90464,0.886792,0.830068,0.791561,0.848733,0.829825,0.735661,0.942286,0.845238,0.842613,0.055788,1
5,0.337831,0.102772,0.024155,0.015871,RandomForestClassifier(),{'estimator': RandomForestClassifier()},0.811321,0.867357,0.962183,0.791561,0.792305,0.830068,0.772615,0.772615,0.865335,0.845238,0.83106,0.054928,2
7,0.819527,0.039292,0.012049,0.001587,HistGradientBoostingClassifier(),{'estimator': HistGradientBoostingClassifier()},0.773424,0.88655,0.924528,0.7731,0.811321,0.773585,0.848949,0.773424,0.846154,0.825308,0.823634,0.050538,3
10,0.186053,0.002869,0.019958,0.000729,AdaBoostClassifier(),{'estimator': AdaBoostClassifier()},0.811186,0.86783,0.905593,0.749713,0.735849,0.867925,0.830189,0.697898,0.884444,0.826859,0.817749,0.065705,4
9,0.068421,0.001415,0.007609,0.000468,BaggingClassifier(),{'estimator': BaggingClassifier()},0.829459,0.866496,0.88655,0.792453,0.829459,0.792453,0.771145,0.69486,0.864935,0.825308,0.815312,0.053068,5
6,0.305224,0.085187,0.031458,0.006678,ExtraTreesClassifier(),{'estimator': ExtraTreesClassifier()},0.811186,0.867357,0.88655,0.790207,0.753663,0.830189,0.771145,0.773424,0.884615,0.769231,0.813757,0.048021,6
12,0.374826,0.01744,0.00739,0.0005,GaussianProcessClassifier(),{'estimator': GaussianProcessClassifier()},0.811321,0.826724,0.90464,0.72024,0.731829,0.886792,0.789309,0.751002,0.823729,0.710575,0.795616,0.0642,7
4,0.081207,0.020676,0.005244,0.001548,LinearSVC(max_iter=20000),{'estimator': LinearSVC(max_iter=20000)},0.728877,0.84676,0.885979,0.702963,0.749713,0.792009,0.789309,0.727134,0.787755,0.766117,0.777662,0.053271,8
0,0.035257,0.012677,0.00698,0.002813,LogisticRegression(),{'estimator': LogisticRegression()},0.711207,0.84676,0.867357,0.697268,0.749713,0.810512,0.771145,0.727134,0.768889,0.710575,0.766056,0.056023,9
11,0.009549,0.00049,0.009991,0.000378,KNeighborsClassifier(),{'estimator': KNeighborsClassifier()},0.716779,0.828352,0.866496,0.72024,0.752063,0.829825,0.712695,0.653486,0.788383,0.651786,0.752011,0.070613,10


In [14]:
gs.best_estimator_.score(X_test, y_test)

0.8582375478927203