In [1]:
import pandas as pd

df = pd.read_csv('../datasets/gossipcop_graph_features.csv')
df.head()

Unnamed: 0,label,num_nodes,num_tweets,avg_num_retweet,retweet_perc,num_users,total_propagation_time,avg_num_followers,avg_num_friends,avg_time_diff,perc_post_1_hour,users_10h,id
0,fake,116,110,0.045455,0.051724,61,1525941000.0,20970.565217,1149.026087,743706.0,0.991379,56,gossipcop-1000240645
1,fake,5,3,0.333333,0.4,3,1485491000.0,158959.75,791.75,6278.0,0.2,2,gossipcop-1000908841
2,fake,3,2,0.0,0.333333,1,1495247000.0,317729.0,723.0,0.0,0.333333,1,gossipcop-1009248558
3,fake,15,10,0.4,0.333333,14,1496761000.0,26939.0,3446.928571,2765.666667,0.466667,7,gossipcop-1012123555
4,fake,30,22,0.318182,0.266667,21,1530403000.0,30835.965517,5045.862069,12419.083333,0.166667,11,gossipcop-1014383679


In [6]:
import os
p = "../../preprocessed_data/gossipcop"

for label in ['fake', 'real']:
    for news in os.listdir(os.path.join(p, label)):
        with open(os.path.join(p, label, news, f'{news}_text.txt'), 'r', encoding='utf-8') as f:
            text = f.read()
        if not text:
            df[df.label != news]



PermissionError: [Errno 13] Permission denied: '../../preprocessed_data/gossipcop\\fake\\gossipcop-1000240645'

In [None]:
from pandas_profiling import ProfileReport
profile = ProfileReport(df, title="Politifact EDA")

In [None]:
profile.to_file("politifact_eda.html")

In [2]:
# Politifact dataset statistics

data = [
    ("news_articles", len(df[df['label'] == "fake"]), len(df[df['label'] == "real"])),
    ("tweets", df[df['label'] == "fake"]['num_tweets'].sum(), df[df['label'] == "real"]['num_tweets'].sum()),
    ("unique_users", df[df['label'] == "fake"]['num_users'].sum(), df[df['label'] == "real"]['num_users'].sum())
]

pd.DataFrame(data=data, columns=["type", 'fake', "real"]).set_index("type")

Unnamed: 0_level_0,fake,real
type,Unnamed: 1_level_1,Unnamed: 2_level_1
news_articles,5059,15648
tweets,451452,759356
unique_users,624147,805647


In [3]:
df.groupby("label").mean().apply(lambda s: s.apply('{0:.3f}'.format))

Unnamed: 0_level_0,num_nodes,num_tweets,avg_num_retweet,retweet_perc,num_users,total_propagation_time,avg_num_followers,avg_num_friends,avg_time_diff,perc_post_1_hour,users_10h
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
fake,154.351,89.237,0.427,0.288,123.374,1515784986.334,70546.463,2339.371,86132.982,0.373,41.781
real,60.615,48.527,0.157,0.136,51.486,1513359948.575,34896.594,2329.693,55941.102,0.415,37.991


In [4]:
df.groupby("label").std().apply(lambda s: s.apply('{0:.3f}'.format))

Unnamed: 0_level_0,num_nodes,num_tweets,avg_num_retweet,retweet_perc,num_users,total_propagation_time,avg_num_followers,avg_num_friends,avg_time_diff,perc_post_1_hour,users_10h
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
fake,409.015,212.531,1.07,0.19,323.564,20532042.796,103568.428,3279.721,594777.349,0.289,115.867
real,112.205,66.347,0.83,0.176,96.085,12430112.648,138713.122,2313.495,637970.696,0.243,49.698


In [14]:
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler()
X_res, y_res = rus.fit_resample(df.drop(['label', 'id'], axis=1), df['label'])

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['label'] = le.fit_transform(df['label'])

X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.33)

In [18]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=10, shuffle=True)

In [19]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, RidgeClassifier, PassiveAggressiveClassifier, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, HistGradientBoostingClassifier, \
    GradientBoostingClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler

params = {
    'estimator': [
        LogisticRegression(), RidgeClassifier(), PassiveAggressiveClassifier(), SGDClassifier(), LinearSVC(max_iter=200000),
        RandomForestClassifier(), ExtraTreesClassifier(), 
        HistGradientBoostingClassifier(), GradientBoostingClassifier(), 
        BaggingClassifier(), AdaBoostClassifier(),
        KNeighborsClassifier(), 
        GaussianProcessClassifier(),
        GaussianNB()
    ]
}

pipeline = Pipeline([
    ('scaling', StandardScaler()),
    ('estimator', LogisticRegression())
])

gs = GridSearchCV(pipeline, params, cv=kf, scoring="f1_weighted")
gs.fit(X_train, y_train)
results = pd.DataFrame(data=gs.cv_results_)
results.sort_values('rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_estimator,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
5,2.141217,0.017821,0.052119,0.008312,RandomForestClassifier(),{'estimator': RandomForestClassifier()},0.905595,0.896559,0.912977,0.904205,0.914378,0.898272,0.896804,0.905575,0.899705,0.899502,0.903357,0.006065,1
7,1.843789,0.019087,0.017494,0.005524,HistGradientBoostingClassifier(),{'estimator': HistGradientBoostingClassifier()},0.918843,0.884891,0.90851,0.901258,0.914395,0.899677,0.901207,0.893797,0.90268,0.898083,0.902334,0.009279,2
6,1.094479,0.026645,0.058771,0.006763,ExtraTreesClassifier(),{'estimator': ExtraTreesClassifier()},0.905605,0.886415,0.892217,0.885034,0.89235,0.90411,0.88501,0.887905,0.892276,0.901001,0.893192,0.007377,3
9,0.50823,0.103816,0.009466,0.002214,BaggingClassifier(),{'estimator': BaggingClassifier()},0.898253,0.893746,0.9041,0.874641,0.893638,0.892376,0.88796,0.886359,0.902643,0.890576,0.892429,0.008089,4
8,2.432927,0.490395,0.00649,0.004552,GradientBoostingClassifier(),{'estimator': GradientBoostingClassifier()},0.90112,0.865863,0.895248,0.886495,0.908521,0.884924,0.898258,0.877574,0.884996,0.883318,0.888632,0.011783,5
12,80.153946,6.640584,0.319914,0.061494,GaussianProcessClassifier(),{'estimator': GaussianProcessClassifier()},0.87466,0.854037,0.873035,0.877677,0.882035,0.886271,0.879083,0.853979,0.883352,0.875873,0.874,0.010706,6
10,0.641228,0.125223,0.032412,0.015994,AdaBoostClassifier(),{'estimator': AdaBoostClassifier()},0.879029,0.857161,0.883445,0.868696,0.881994,0.862783,0.873185,0.868667,0.874668,0.8656,0.871523,0.008091,7
11,0.036483,0.010974,0.099742,0.067448,KNeighborsClassifier(),{'estimator': KNeighborsClassifier()},0.884875,0.845428,0.865614,0.880574,0.858444,0.867052,0.873097,0.859845,0.876139,0.870027,0.86811,0.011014,8
3,0.057145,0.016216,0.006055,0.003911,SGDClassifier(),{'estimator': SGDClassifier()},0.7419,0.684445,0.700874,0.735865,0.732435,0.747494,0.728492,0.691321,0.692841,0.729705,0.718537,0.022309,9
0,0.138857,0.013189,0.010694,0.002849,LogisticRegression(),{'estimator': LogisticRegression()},0.713977,0.675132,0.694567,0.729697,0.667741,0.717472,0.711503,0.705705,0.697077,0.70977,0.702264,0.018135,10


In [31]:
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import cross_val_score

stc = StackingClassifier(estimators=[
    ('rfc', RandomForestClassifier()), 
    ('abc', AdaBoostClassifier()), 
    ('gbc', GradientBoostingClassifier()), 
    ('etc', ExtraTreesClassifier())
])

cross_val_score(stc, X_train, y_train, cv=kf, scoring="f1_weighted")

array([0.88942026, 0.92623056, 0.91001168, 0.90263114, 0.92029834,
       0.89821304, 0.89238462, 0.87610619, 0.8967102 , 0.89359273])

In [28]:
stc.cv

KFold(n_splits=10, random_state=RandomState(MT19937) at 0x27E2E2CB740,
   shuffle=True)

In [20]:
gs.best_estimator_.score(X_test, y_test)

0.906858340820605