In [85]:
import sys

import importlib
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sqlalchemy
import gensim
import logging

from scipy import stats

import common
import util
import ml_util
importlib.reload(common)
importlib.reload(util)
importlib.reload(ml_util)

from common import create_engine
from common import display_all
from common import figsize
from common import save_df, load_df
from common import save_session, load_session

from util import show_importances
from util import split_X_y_all, split_X_y, split_data
from util import empty_features, column_feature, str_contains

from ml_util import SelectFromModelPandas

from pbar import Pbar

from pandas.plotting import register_matplotlib_converters

register_matplotlib_converters() # converters e.g. for datetime in plots
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
RANDOM_STATE = 123
np_random = np.random.RandomState(RANDOM_STATE)

In [211]:
class FakePredictor:
    
    def __init__(self, values):
        self.values = values
    
    def predict(self, X):
        return pd.to_numeric(self.values)

### Load dataset

In [3]:
df = load_df('final_data.pickle')

In [4]:
df.head()

Unnamed: 0_level_0,title,perex,body,raw_body,published_at,extracted_at,category,other_info,image_count,video_count,...,fb_popularity_ad_2,fb_popularity_ad_3,fb_popularity_ad_4,fb_popularity_ad_5,fb_popularity_ad_6,fb_popularity_ad_7,fb_popularity_ad_8,fb_popularity_ad_9,fb_popularity_ad_10,body_urls
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
431065,put communities at the center of universal hea...,if universal health care is truly meant to ref...,if universal health care is truly meant to ref...,"<p>The <a href=""https://www.who.int/news-room/...",2019-10-21 10:45:10,2019-10-21 12:13:53.281652,[First Opinion],"{'tags': ['public health', 'global health', 'H...",1,0,...,165.0,176.0,185.0,192.0,193.0,207.0,210.0,228.0,233.0,[https://www.who.int/news-room/detail/23-09-20...
431066,rapid expansion of telehealth comes with new c...,although new delivery methods will help telehe...,although new delivery methods will help telehe...,<p>It&#x2019;s a boom time for telehealth. Sta...,2019-10-21 10:40:26,2019-10-21 12:13:53.499347,[First Opinion],"{'tags': ['telehealth'], 'keywords': ['']}",1,0,...,44.0,47.0,47.0,47.0,49.0,55.0,56.0,62.0,67.0,[https://www.statnews.com/2019/10/21/telehealt...
431068,"after decades-long campaign, type 3 poliovirus...",the formal bid to eradicate all polio began in...,the formal bid to eradicate all polio began in...,"<p>After <a href=""https://www.statnews.com/201...",2019-10-21 10:30:40,2019-10-21 12:13:53.714328,[Health],"{'tags': ['public health', 'infectious disease...",1,0,...,617.0,673.0,698.0,705.0,709.0,913.0,1137.0,1197.0,1232.0,[https://www.statnews.com/2019/10/21/decades-l...
431081,"be humble, and proudly, psychologists say",humility is not the boldest of personality tra...,humility is not the boldest of personality tra...,,2019-10-21 00:00:00,2019-10-21 12:14:05.770730,,"{'tags': [], 'keywords': ['']}",1,0,...,3714.0,4217.0,5480.0,8674.0,9476.0,9867.0,10241.0,10792.0,11391.0,[]
431082,when teen drinking becomes a disorder,why are some adolescents more vulnerable than ...,why are some adolescents more vulnerable than ...,,2019-10-21 00:00:00,2019-10-21 12:14:05.836162,,"{'tags': [], 'keywords': ['']}",1,0,...,172.0,198.0,218.0,229.0,243.0,253.0,273.0,290.0,303.0,[]


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15458 entries, 431065 to 812426
Data columns (total 68 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   title                    15458 non-null  object        
 1   perex                    15458 non-null  object        
 2   body                     15458 non-null  object        
 3   raw_body                 15458 non-null  object        
 4   published_at             15458 non-null  datetime64[ns]
 5   extracted_at             15458 non-null  datetime64[ns]
 6   category                 10307 non-null  object        
 7   other_info               15458 non-null  object        
 8   image_count              15458 non-null  int64         
 9   video_count              15458 non-null  int64         
 10  author_name              15458 non-null  object        
 11  source_id                15458 non-null  int64         
 12  source_name              1

In [6]:
len(df)

15458

-----

In [7]:
pop = pd.DataFrame()
qrange = [np.round(i, 2) for i in np.arange(0, 1, 0.05)]
pop['q'] = qrange
for i in range(0, 11):
    col = f'fb_popularity_ad_{i}'
    pop[col] = [df[col].quantile(q) for q in qrange]
pop

Unnamed: 0,q,fb_popularity_ad_0,fb_popularity_ad_1,fb_popularity_ad_2,fb_popularity_ad_3,fb_popularity_ad_4,fb_popularity_ad_5,fb_popularity_ad_6,fb_popularity_ad_7,fb_popularity_ad_8,fb_popularity_ad_9,fb_popularity_ad_10
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.25,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
6,0.3,2.0,3.0,4.0,4.0,5.0,5.0,6.0,6.0,6.0,6.0,7.0
7,0.35,4.0,7.0,11.0,13.0,14.0,15.0,17.0,17.0,17.0,18.0,19.0
8,0.4,7.0,16.0,23.0,25.0,27.0,29.0,31.0,32.0,32.0,33.0,33.0
9,0.45,12.0,27.0,36.0,42.0,45.0,47.0,49.0,50.0,51.0,53.0,53.0


Rozdelenie hodnot popularity do 4 skupin

- `0 - 0.4`
- `0.4 - 0.75`
- `0.75 - 0.9`
- `0.9 - 1`

In [8]:
def add_labels(df, quantiles, column='fb_popularity_ad_10'):
    df = df.copy()
    label_str = f'{column}_label'
    
    df[label_str] = -1
    
    label = 1    
    for i in range(len(quantiles) - 1):
        low = df[column].quantile(quantiles[i])
        high = df[column].quantile(quantiles[i + 1])
        
        df.loc[(low <= df[column]) & (df[column] <= high), label_str] = int(label)
        
        label += 1
    df = df.drop(columns=[column])    
    return df.copy()

In [9]:
quantiles = [
    0,
    .4,
    .75,
    .9,
    1
]

cols = [
    'fb_ad_10_reaction_count',
    'fb_ad_10_comment_count',
    'fb_ad_10_share_count',
    'fb_popularity_ad_10'
]

for i in cols:
    print(df[i].quantile(quantiles))
    df = add_labels(df, quantiles, column=i)

0.00          0.00
0.40         10.76
0.75        386.00
0.90       2008.24
1.00    1368305.00
Name: fb_ad_10_reaction_count, dtype: float64
0.00         0.0
0.40         1.0
0.75        84.9
0.90       513.3
1.00    897945.0
Name: fb_ad_10_comment_count, dtype: float64
0.00         0.0
0.40        17.0
0.75       171.3
0.90       676.0
1.00    298199.0
Name: fb_ad_10_share_count, dtype: float64
0.00          0.0
0.40         33.0
0.75        685.0
0.90       3312.0
1.00    2564449.0
Name: fb_popularity_ad_10, dtype: float64


In [10]:
df.fb_popularity_ad_10_label.value_counts()

1    6129
2    5461
3    2322
4    1546
Name: fb_popularity_ad_10_label, dtype: int64

Pri jednotlivych zlozkach sme pri tomto rozdeleni nasli len 4 skupiny (lebo 1 == 2)

---

Jednoducha heuristika: ak je zdroj nedoveryhodny tak aj clanok je nedoveryhodny

In [11]:
df['is_fake_news_label'] = df.source_is_reliable.replace({0:1, 1:0})

---

In [12]:
label_names = list(filter(lambda x: x.endswith('_label'), df.columns))

In [13]:
for ln in label_names:
    df[ln] = pd.to_numeric(df[ln])

In [14]:
labels_df = pd.DataFrame()

In [15]:
# labely
labels_df = pd.concat([labels_df] + [df[label_name] for label_name in label_names], axis=1)

## Rozdelenie dat

In [16]:
from sklearn.model_selection import train_test_split

Vytvorenie mnoziny clankov a zdrojov na ktorych sa validuje cela metoda

In [17]:
excluded_data = df[df.source_name.isin([
    'youngwomenshealth.org',
    'vaxopedia.org',
    'emedicinehealth.com',
    
    'hsionline.com',
    'wakeup-world.com',
    'genuinehealth.com',
    'realfarmacy.com',
    'educateinspirechange.org',
    'vaxxter.com'
])].copy()

df = df[~df.source_name.isin(excluded_data.source_name.unique())]

In [18]:
excluded_data.source_name.value_counts()

emedicinehealth.com         230
realfarmacy.com             182
hsionline.com               171
vaxopedia.org                78
vaxxter.com                  38
wakeup-world.com             38
educateinspirechange.org     31
youngwomenshealth.org        17
genuinehealth.com            17
Name: source_name, dtype: int64

In [19]:
excluded_data.source_is_reliable.value_counts()

0    477
1    325
Name: source_is_reliable, dtype: int64

In [20]:
df = df.reset_index()

In [21]:
train, test = tuple(split_data(df, sizes=[3, 1], shuffle=True, np_random=np_random))

In [22]:
train.set_index('id', inplace=True)
test.set_index('id', inplace=True)

In [23]:
excluded_data = excluded_data.reset_index()
excluded_data.set_index('id', inplace=True)

In [24]:
save_df(train, 'train_data')
save_df(test, 'test_data')
save_df(excluded_data, 'excluded_data')

In [25]:
print([len(i) for i in [train,test, excluded_data]])

[10992, 3664, 802]


In [26]:
label_names

['fb_ad_10_reaction_count_label',
 'fb_ad_10_comment_count_label',
 'fb_ad_10_share_count_label',
 'fb_popularity_ad_10_label',
 'is_fake_news_label']

In [27]:
data = split_X_y_all(train, test, excluded_data, 'is_fake_news_label', label_names)

---

### d2v

In [28]:
from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

In [29]:
import spacy
importlib.reload(spacy)
nlp = spacy.load("en_core_web_sm")

def tokenize(text):
    doc = nlp(text, disable=['parser', 'tagger', 'ner'])

    return words_from_doc(doc)

def words_from_doc(doc):
    res = []
    for i in doc:
        if i.is_stop:
            continue
        if i.is_punct:
            continue

        res.append(str(i))

    return res

def tokenize_to_file(data, file):
    with open(file, 'w', encoding='utf-8') as f:
        for i in Pbar(data):
            f.write(f"{' '.join(tokenize(i))}\n")

### Trening doc2vec modelu

In [30]:
tokenize_to_file(data.train.X.body, './data/train_body_tokenized.txt')
tokenize_to_file(data.test.X.body, './data/test_body_tokenized.txt')
tokenize_to_file(data.validation.X.body, './data/validation_data_body_tokenized.txt')



In [31]:
d2v = Doc2Vec(corpus_file='./data/train_body_tokenized.txt', vector_size=300, min_count=2, epochs=25, workers=12, seed=RANDOM_STATE)

2020-05-06 18:51:22,256 : INFO : collecting all words and their counts
2020-05-06 18:51:22,257 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2020-05-06 18:51:23,474 : INFO : PROGRESS: at example #10000, processed 4182706 words (3439909/s), 110285 word types, 10000 tags
2020-05-06 18:51:23,592 : INFO : collected 116016 word types and 10992 unique tags from a corpus of 10992 examples and 4599596 words
2020-05-06 18:51:23,593 : INFO : Loading a fresh vocabulary
2020-05-06 18:51:23,774 : INFO : effective_min_count=2 retains 66229 unique words (57% of original 116016, drops 49787)
2020-05-06 18:51:23,775 : INFO : effective_min_count=2 leaves 4549809 word corpus (98% of original 4599596, drops 49787)
2020-05-06 18:51:23,997 : INFO : deleting the raw counts dictionary of 116016 items
2020-05-06 18:51:24,000 : INFO : sample=0.001 downsamples 9 most-common words
2020-05-06 18:51:24,001 : INFO : downsampling leaves estimated 4499511 word corpus (98.9% of prior 4

In [32]:
def infer_d2v(d2v_model, data_file):
    res = []
    
    with open(data_file, 'r', encoding='utf-8') as f:
        for i in Pbar(f.readlines()):
            res.append(d2v_model.infer_vector(i.split(' '), steps=20, alpha=0.025)) 
    
    return res

def infer_for_df(df, d2v_model, data_file):
    lst = infer_d2v(d2v_model, data_file)
    d2v_df = pd.DataFrame(lst, index=df.index, columns=[f'd2v_{i}' for i in range(1, 301)] )
    
    return d2v_df

### Nacitanie predpripravenych crt

In [33]:
f_sentiment = load_df('features_sentiment.pickle')
f_readability = load_df('features_readability.pickle')
f_metadata = load_df('features_metadata.pickle')
f_empath = load_df('features_empath.pickle')
f_content = load_df('features_content.pickle')
f_named_entities = load_df('features_named_entities.pickle')
f_popularity = load_df('features_popularity.pickle')
f_popularity_d0 = load_df('features_popularity_0.pickle')
f_popularity_d1 = load_df('features_popularity_1.pickle')
f_popularity_d2 = load_df('features_popularity_2.pickle')

ziskanie d2v vektorov pre train a test body

In [34]:
f_d2v_train = infer_for_df(data.train.X, d2v, './data/train_body_tokenized.txt')
save_df(f_d2v_train, 'd2v_train.pickle')



In [35]:
f_d2v_test = infer_for_df(data.test.X, d2v, './data/test_body_tokenized.txt')
save_df(f_d2v_test, 'd2v_test.pickle')



In [36]:
f_d2v_validation = infer_for_df(data.validation.X, d2v, './data/validation_data_body_tokenized.txt')



In [150]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, make_scorer

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

### Vytvorenie jednotlivych feature setov

In [39]:
data.train.features = pd.concat([
    pd.DataFrame(index=data.train.X.index),
    f_sentiment,
    f_readability,
    f_metadata,
    f_empath,
    f_content,
    f_named_entities,
    f_d2v_train,
    f_popularity_d0,
    f_popularity_d1,
    f_popularity_d2
], join='inner', axis=1)

In [40]:
data.test.features = pd.concat([
    pd.DataFrame(index=data.test.X.index),
    f_sentiment,
    f_readability,
    f_metadata,
    f_empath,
    f_content,
    f_named_entities,
    f_d2v_test,
    f_popularity_d0,
    f_popularity_d1,
    f_popularity_d2
], join='inner', axis=1)

In [41]:
data.validation.features = pd.concat([
    pd.DataFrame(index=data.validation.X.index),
    f_sentiment,
    f_readability,
    f_metadata,
    f_empath,
    f_content,
    f_named_entities,
    f_d2v_validation,
    f_popularity_d0,
    f_popularity_d1,
    f_popularity_d2
], join='inner', axis=1)

utilitne metody pre jednoduchsiu pracu

In [157]:
class Runner:

    def __init__(self, train, train_y, test, test_y, clfs=None):
        if clfs is None:
            self.classifiers = [
                XGBClassifier(n_jobs=12, seed=RANDOM_STATE),
                RandomForestClassifier(n_estimators=100, n_jobs=12, class_weight='balanced', random_state=RANDOM_STATE),
                DecisionTreeClassifier(class_weight='balanced', random_state=RANDOM_STATE)
            ]
        else:
            self.classifiers = clfs
            
        
        self.train = train
        self.train_y = train_y
        self.test = test
        self.test_y = test_y
        

    def fit_predict(self, clf):
        clf.fit(self.train, self.train_y)
        return clf.predict(self.test)
    
    def run(self):
        _pbar_conf = {
            'refresh_rate': 1,
            'pbar_width': 52,
            'length': len(self.classifiers),
            'action_names': [i.__class__.__name__ for i in self.classifiers]
        }
        
        for p in Pbar((self.fit_predict(clf) for clf in self.classifiers), **_pbar_conf):
            print(classification_report(self.test_y, p))
            print(confusion_matrix(self.test_y, p))
            print('-' * 54)
            
        return self.classifiers

In [191]:
def drop_source_features(df):
    cols = [col for col in df if col.startswith('source_')]
    return df.drop(columns=cols)

def limit_features(df, day, source=False):
    
    if source is False:
        df = drop_source_features(df)  
    
    for i in range(2, day-1, -1):
        cols = [
                f'fb_ad_{i}_reaction_count',
                f'fb_ad_{i}_share_count',
                f'fb_ad_{i}_comment_count',
                f'fb_popularity_ad_{i}'
        ]
        df = df.drop(columns=cols)
        
    return df.copy()

def popularity_features(df, day):
    df = df.drop(columns=['popularity_prediction'], errors='ignore')
    return limit_features(df, day=day, source=True)

def detection_features(df, day, pop_predictor=None):
    df = df.drop(columns=['popularity_prediction'], errors='ignore')
    if pop_predictor is not None:
        df['popularity_prediction'] = pop_predictor.predict(limit_features(df, day=day, source=True))

    return limit_features(df, day=day, source=False)

Pociatocne parametre pre random search

---

# Realizacia experimentov

#### Situacia: Moment publikovania (ziadne data o popularite)

In [213]:
# nastavenie labelu
data.train.switch_label('fb_popularity_ad_10_label')
data.test.switch_label('fb_popularity_ad_10_label')
data.validation.switch_label('fb_popularity_ad_10_label')

Predikcia popularity

In [215]:
ad0_pp_clf = Runner(train=popularity_features(pd.concat([data.train.features, data.validation.features]), day=0),
                train_y=pd.concat([data.train.y, data.validation.y]),
                test=popularity_features(data.test.features, day=0),
                test_y=data.test.y).run()

[                                                    ] (processing: XGBClassifier) -- 0 / 3              precision    recall  f1-score   support

           1       0.91      0.78      0.84      1397
           2       0.59      0.88      0.70      1312
           3       0.42      0.10      0.16       575
           4       0.51      0.49      0.50       380

    accuracy                           0.68      3664
   macro avg       0.61      0.56      0.55      3664
weighted avg       0.68      0.68      0.65      3664

[[1083  308    2    4]
 [  75 1157   29   51]
 [  20  374   57  124]
 [   8  138   47  187]]
------------------------------------------------------

           1       0.68      0.82      0.74      1397
           2       0.54      0.73      0.62      1312
           3       0.30      0.06      0.10       575
           4       0.52      0.11      0.19       380

    accuracy                           0.59      3664
   macro avg       0.51      0.43      0.41      3664


Detekcia falosnych sprav

In [216]:
data.train.switch_label('is_fake_news_label')
data.test.switch_label('is_fake_news_label')
data.validation.switch_label('is_fake_news_label')

bez pouzitia predikcie

In [217]:
ad0_fn_clf1 = Runner(train=detection_features(data.train.features, day=0),
                train_y=data.train.y,
                test=detection_features(data.test.features, day=0),
                test_y=data.test.y).run()

[                                                    ] (processing: XGBClassifier) -- 0 / 3              precision    recall  f1-score   support

           0       0.97      0.99      0.98      2184
           1       0.98      0.95      0.96      1480

    accuracy                           0.97      3664
   macro avg       0.97      0.97      0.97      3664
weighted avg       0.97      0.97      0.97      3664

[[2153   31]
 [  76 1404]]
------------------------------------------------------

           0       0.92      0.99      0.96      2184
           1       0.98      0.88      0.93      1480

    accuracy                           0.95      3664
   macro avg       0.95      0.94      0.94      3664
weighted avg       0.95      0.95      0.95      3664

[[2163   21]
 [ 178 1302]]
------------------------------------------------------

           0       0.91      0.93      0.92      2184
           1       0.89      0.87      0.88      1480

    accuracy                       

s vyuzitim predikcie

In [218]:
ad0_fn_clf2 = Runner(train=detection_features(data.train.features, day=0, pop_predictor=ad0_pp_clf[0]),
                train_y=data.train.y,
                test=detection_features(data.test.features, day=0, pop_predictor=ad0_pp_clf[0]),
                test_y=data.test.y).run()

[                                                    ] (processing: XGBClassifier) -- 0 / 3              precision    recall  f1-score   support

           0       0.97      0.98      0.98      2184
           1       0.98      0.95      0.96      1480

    accuracy                           0.97      3664
   macro avg       0.97      0.97      0.97      3664
weighted avg       0.97      0.97      0.97      3664

[[2150   34]
 [  69 1411]]
------------------------------------------------------

           0       0.93      0.99      0.96      2184
           1       0.99      0.89      0.94      1480

    accuracy                           0.95      3664
   macro avg       0.96      0.94      0.95      3664
weighted avg       0.95      0.95      0.95      3664

[[2169   15]
 [ 160 1320]]
------------------------------------------------------

           0       0.91      0.93      0.92      2184
           1       0.89      0.86      0.87      1480

    accuracy                       

validacna sada

validacia bez pouzitia predikcie

In [219]:
ad0_fn_valid_clf1 = Runner(train=detection_features(pd.concat([data.train.features, data.test.features]), day=0),
                train_y=pd.concat([data.train.y, data.test.y]),
                test=detection_features(data.validation.features, day=0),
                test_y=data.validation.y).run()

[                                                    ] (processing: XGBClassifier) -- 0 / 3              precision    recall  f1-score   support

           0       0.88      0.74      0.80       325
           1       0.84      0.93      0.88       477

    accuracy                           0.85       802
   macro avg       0.86      0.83      0.84       802
weighted avg       0.85      0.85      0.85       802

[[239  86]
 [ 33 444]]
------------------------------------------------------

           0       0.79      0.81      0.80       325
           1       0.87      0.85      0.86       477

    accuracy                           0.83       802
   macro avg       0.83      0.83      0.83       802
weighted avg       0.84      0.83      0.83       802

[[263  62]
 [ 71 406]]
------------------------------------------------------

           0       0.71      0.73      0.72       325
           1       0.81      0.79      0.80       477

    accuracy                           0.77

validacia s pouzitim predikcie

In [221]:
ad0_fn_valid_clf2 = Runner(train=detection_features(pd.concat([data.train.features, data.test.features]), day=0, pop_predictor=ad0_pp_clf[0]),
                train_y=pd.concat([data.train.y, data.test.y]),
                test=detection_features(data.validation.features, day=0, pop_predictor=ad0_pp_clf[0]),
                test_y=data.validation.y).run()

[                                                    ] (processing: XGBClassifier) -- 0 / 3              precision    recall  f1-score   support

           0       0.89      0.75      0.81       325
           1       0.85      0.94      0.89       477

    accuracy                           0.86       802
   macro avg       0.87      0.84      0.85       802
weighted avg       0.86      0.86      0.86       802

[[243  82]
 [ 29 448]]
------------------------------------------------------

           0       0.78      0.84      0.81       325
           1       0.88      0.84      0.86       477

    accuracy                           0.84       802
   macro avg       0.83      0.84      0.83       802
weighted avg       0.84      0.84      0.84       802

[[272  53]
 [ 76 401]]
------------------------------------------------------

           0       0.71      0.76      0.73       325
           1       0.83      0.79      0.81       477

    accuracy                           0.77

validacia s realnymi datami

In [227]:
ad0_fn_valid_clf3 = Runner(train=detection_features(pd.concat([data.train.features, data.test.features]), day=0,
                                                              pop_predictor=FakePredictor(pd.concat([data.train.y_all['fb_popularity_ad_10_label'], data.test.y_all['fb_popularity_ad_10_label']]))),
                train_y=pd.concat([data.train.y, data.test.y]),
                test=detection_features(data.validation.features, day=0, pop_predictor=FakePredictor(data.validation.y_all['fb_popularity_ad_10_label'])),
                test_y=data.validation.y).run()

[                                                    ] (processing: XGBClassifier) -- 0 / 3              precision    recall  f1-score   support

           0       0.90      0.74      0.81       325
           1       0.84      0.94      0.89       477

    accuracy                           0.86       802
   macro avg       0.87      0.84      0.85       802
weighted avg       0.86      0.86      0.86       802

[[241  84]
 [ 28 449]]
------------------------------------------------------

           0       0.77      0.82      0.79       325
           1       0.87      0.83      0.85       477

    accuracy                           0.83       802
   macro avg       0.82      0.83      0.82       802
weighted avg       0.83      0.83      0.83       802

[[267  58]
 [ 80 397]]
------------------------------------------------------

           0       0.70      0.74      0.72       325
           1       0.82      0.79      0.80       477

    accuracy                           0.77

#### Situacia: Moment 1 den po

In [228]:
# nastavenie labelu
data.train.switch_label('fb_popularity_ad_10_label')
data.test.switch_label('fb_popularity_ad_10_label')
data.validation.switch_label('fb_popularity_ad_10_label')

In [229]:
ad1_pp_clf = Runner(train=popularity_features(pd.concat([data.train.features, data.validation.features]), day=1),
                train_y=pd.concat([data.train.y, data.validation.y]),
                test=popularity_features(data.test.features, day=1),
                test_y=data.test.y).run()

[                                                    ] (processing: XGBClassifier) -- 0 / 3              precision    recall  f1-score   support

           1       0.88      0.95      0.91      1397
           2       0.81      0.83      0.82      1312
           3       0.76      0.65      0.70       575
           4       0.89      0.75      0.81       380

    accuracy                           0.84      3664
   macro avg       0.83      0.80      0.81      3664
weighted avg       0.84      0.84      0.84      3664

[[1324   70    0    3]
 [ 162 1092   51    7]
 [  19  156  373   27]
 [   2   28   64  286]]
------------------------------------------------------

           1       0.87      0.92      0.89      1397
           2       0.72      0.86      0.78      1312
           3       0.69      0.41      0.52       575
           4       0.91      0.67      0.77       380

    accuracy                           0.79      3664
   macro avg       0.80      0.71      0.74      3664


In [230]:
data.train.switch_label('is_fake_news_label')
data.test.switch_label('is_fake_news_label')
data.validation.switch_label('is_fake_news_label')

In [231]:
ad1_fn_clf1 = Runner(train=detection_features(data.train.features, day=1),
                train_y=data.train.y,
                test=detection_features(data.test.features, day=1),
                test_y=data.test.y).run()

[                                                    ] (processing: XGBClassifier) -- 0 / 3              precision    recall  f1-score   support

           0       0.97      0.98      0.98      2184
           1       0.98      0.96      0.97      1480

    accuracy                           0.97      3664
   macro avg       0.97      0.97      0.97      3664
weighted avg       0.97      0.97      0.97      3664

[[2149   35]
 [  65 1415]]
------------------------------------------------------

           0       0.95      0.99      0.97      2184
           1       0.98      0.92      0.95      1480

    accuracy                           0.96      3664
   macro avg       0.96      0.95      0.96      3664
weighted avg       0.96      0.96      0.96      3664

[[2160   24]
 [ 122 1358]]
------------------------------------------------------

           0       0.91      0.93      0.92      2184
           1       0.89      0.87      0.88      1480

    accuracy                       

In [232]:
ad1_fn_clf2 = Runner(train=detection_features(data.train.features, day=1, pop_predictor=ad1_pp_clf[0]),
                train_y=data.train.y,
                test=detection_features(data.test.features, day=1, pop_predictor=ad1_pp_clf[0]),
                test_y=data.test.y).run()

[                                                    ] (processing: XGBClassifier) -- 0 / 3              precision    recall  f1-score   support

           0       0.97      0.98      0.98      2184
           1       0.98      0.96      0.97      1480

    accuracy                           0.97      3664
   macro avg       0.97      0.97      0.97      3664
weighted avg       0.97      0.97      0.97      3664

[[2149   35]
 [  65 1415]]
------------------------------------------------------

           0       0.94      0.99      0.96      2184
           1       0.99      0.90      0.94      1480

    accuracy                           0.95      3664
   macro avg       0.96      0.94      0.95      3664
weighted avg       0.96      0.95      0.95      3664

[[2165   19]
 [ 150 1330]]
------------------------------------------------------

           0       0.90      0.93      0.91      2184
           1       0.89      0.85      0.87      1480

    accuracy                       

validacna mnozina

In [239]:
ad1_fn_valid_clf1 = Runner(train=detection_features(pd.concat([data.train.features, data.test.features]), day=1),
                train_y=pd.concat([data.train.y, data.test.y]),
                test=detection_features(data.validation.features, day=1),
                test_y=data.validation.y).run()

[                                                    ] (processing: XGBClassifier) -- 0 / 3              precision    recall  f1-score   support

           0       0.92      0.74      0.82       325
           1       0.85      0.95      0.90       477

    accuracy                           0.87       802
   macro avg       0.88      0.85      0.86       802
weighted avg       0.87      0.87      0.87       802

[[242  83]
 [ 22 455]]
------------------------------------------------------

           0       0.78      0.86      0.82       325
           1       0.89      0.84      0.87       477

    accuracy                           0.85       802
   macro avg       0.84      0.85      0.84       802
weighted avg       0.85      0.85      0.85       802

[[278  47]
 [ 77 400]]
------------------------------------------------------

           0       0.69      0.73      0.71       325
           1       0.81      0.78      0.79       477

    accuracy                           0.76

In [242]:
ad1_fn_valid_clf2 = Runner(train=detection_features(pd.concat([data.train.features, data.test.features]), day=1, pop_predictor=ad1_pp_clf[0]),
                train_y=pd.concat([data.train.y, data.test.y]),
                test=detection_features(data.validation.features, day=1, pop_predictor=ad1_pp_clf[0]),
                test_y=data.validation.y).run()

[                                                    ] (processing: XGBClassifier) -- 0 / 3              precision    recall  f1-score   support

           0       0.90      0.75      0.82       325
           1       0.85      0.94      0.89       477

    accuracy                           0.87       802
   macro avg       0.87      0.85      0.86       802
weighted avg       0.87      0.87      0.86       802

[[244  81]
 [ 27 450]]
------------------------------------------------------

           0       0.77      0.86      0.81       325
           1       0.89      0.82      0.86       477

    accuracy                           0.84       802
   macro avg       0.83      0.84      0.83       802
weighted avg       0.84      0.84      0.84       802

[[279  46]
 [ 85 392]]
------------------------------------------------------

           0       0.73      0.74      0.74       325
           1       0.82      0.81      0.82       477

    accuracy                           0.78

In [243]:
ad1_fn_valid_clf3 = Runner(train=detection_features(pd.concat([data.train.features, data.test.features]), day=1,
                                                              pop_predictor=FakePredictor(pd.concat([data.train.y_all['fb_popularity_ad_10_label'], data.test.y_all['fb_popularity_ad_10_label']]))),
                train_y=pd.concat([data.train.y, data.test.y]),
                test=detection_features(data.validation.features, day=1, pop_predictor=FakePredictor(data.validation.y_all['fb_popularity_ad_10_label'])),
                test_y=data.validation.y).run()

[                                                    ] (processing: XGBClassifier) -- 0 / 3              precision    recall  f1-score   support

           0       0.91      0.75      0.82       325
           1       0.85      0.95      0.90       477

    accuracy                           0.87       802
   macro avg       0.88      0.85      0.86       802
weighted avg       0.87      0.87      0.87       802

[[245  80]
 [ 25 452]]
------------------------------------------------------

           0       0.78      0.85      0.81       325
           1       0.89      0.84      0.86       477

    accuracy                           0.84       802
   macro avg       0.84      0.84      0.84       802
weighted avg       0.85      0.84      0.84       802

[[275  50]
 [ 77 400]]
------------------------------------------------------

           0       0.71      0.75      0.73       325
           1       0.82      0.79      0.81       477

    accuracy                           0.77

#### Situacia: Moment 2 dni po

In [244]:
# nastavenie labelu
data.train.switch_label('fb_popularity_ad_10_label')
data.test.switch_label('fb_popularity_ad_10_label')
data.validation.switch_label('fb_popularity_ad_10_label')

In [245]:
ad2_pp_clf = Runner(train=popularity_features(pd.concat([data.train.features, data.validation.features]), day=2),
                train_y=pd.concat([data.train.y, data.validation.y]),
                test=popularity_features(data.test.features, day=2),
                test_y=data.test.y).run()

[                                                    ] (processing: XGBClassifier) -- 0 / 3              precision    recall  f1-score   support

           1       0.92      0.98      0.95      1397
           2       0.88      0.90      0.89      1312
           3       0.84      0.77      0.80       575
           4       0.95      0.79      0.86       380

    accuracy                           0.90      3664
   macro avg       0.90      0.86      0.88      3664
weighted avg       0.90      0.90      0.90      3664

[[1369   27    1    0]
 [ 107 1176   24    5]
 [  12  110  442   11]
 [   1   16   62  301]]
------------------------------------------------------

           1       0.91      0.95      0.93      1397
           2       0.83      0.90      0.86      1312
           3       0.80      0.69      0.74       575
           4       0.96      0.74      0.84       380

    accuracy                           0.87      3664
   macro avg       0.88      0.82      0.84      3664


In [246]:
# nastavenie labelu

data.train.switch_label('is_fake_news_label')
data.test.switch_label('is_fake_news_label')
data.validation.switch_label('is_fake_news_label')

In [247]:
ad2_fn_clf1 = Runner(train=detection_features(data.train.features, day=2),
                train_y=data.train.y,
                test=detection_features(data.test.features, day=2),
                test_y=data.test.y).run()

[                                                    ] (processing: XGBClassifier) -- 0 / 3              precision    recall  f1-score   support

           0       0.97      0.98      0.98      2184
           1       0.98      0.95      0.97      1480

    accuracy                           0.97      3664
   macro avg       0.97      0.97      0.97      3664
weighted avg       0.97      0.97      0.97      3664

[[2149   35]
 [  67 1413]]
------------------------------------------------------

           0       0.94      0.99      0.97      2184
           1       0.98      0.91      0.95      1480

    accuracy                           0.96      3664
   macro avg       0.96      0.95      0.96      3664
weighted avg       0.96      0.96      0.96      3664

[[2161   23]
 [ 129 1351]]
------------------------------------------------------

           0       0.91      0.93      0.92      2184
           1       0.89      0.86      0.88      1480

    accuracy                       

In [None]:
# + predikcia popularity

In [248]:
ad2_fn_clf2 = Runner(train=detection_features(data.train.features, day=2, pop_predictor=ad2_pp_clf[0]),
                train_y=data.train.y,
                test=detection_features(data.test.features, day=2, pop_predictor=ad2_pp_clf[0]),
                test_y=data.test.y).run()

[                                                    ] (processing: XGBClassifier) -- 0 / 3              precision    recall  f1-score   support

           0       0.97      0.98      0.98      2184
           1       0.98      0.95      0.97      1480

    accuracy                           0.97      3664
   macro avg       0.97      0.97      0.97      3664
weighted avg       0.97      0.97      0.97      3664

[[2149   35]
 [  67 1413]]
------------------------------------------------------

           0       0.94      0.99      0.97      2184
           1       0.98      0.91      0.95      1480

    accuracy                           0.96      3664
   macro avg       0.96      0.95      0.96      3664
weighted avg       0.96      0.96      0.96      3664

[[2159   25]
 [ 131 1349]]
------------------------------------------------------

           0       0.91      0.93      0.92      2184
           1       0.89      0.86      0.88      1480

    accuracy                       

In [249]:
# validacna

In [250]:
ad2_fn_valid_clf1 = Runner(train=detection_features(pd.concat([data.train.features, data.test.features]), day=2),
                train_y=pd.concat([data.train.y, data.test.y]),
                test=detection_features(data.validation.features, day=2),
                test_y=data.validation.y).run()

[                                                    ] (processing: XGBClassifier) -- 0 / 3              precision    recall  f1-score   support

           0       0.91      0.75      0.82       325
           1       0.85      0.95      0.90       477

    accuracy                           0.87       802
   macro avg       0.88      0.85      0.86       802
weighted avg       0.88      0.87      0.87       802

[[244  81]
 [ 23 454]]
------------------------------------------------------

           0       0.80      0.83      0.81       325
           1       0.88      0.86      0.87       477

    accuracy                           0.85       802
   macro avg       0.84      0.84      0.84       802
weighted avg       0.85      0.85      0.85       802

[[269  56]
 [ 67 410]]
------------------------------------------------------

           0       0.75      0.74      0.74       325
           1       0.82      0.83      0.83       477

    accuracy                           0.79

In [251]:
# validacna + predikcia
ad2_fn_valid_clf2 = Runner(train=detection_features(pd.concat([data.train.features, data.test.features]), day=2,
                                                              pop_predictor=ad2_pp_clf[0]),
                train_y=pd.concat([data.train.y, data.test.y]),
                test=detection_features(data.validation.features, day=2, pop_predictor=ad2_pp_clf[0]),
                test_y=data.validation.y).run()

[                                                    ] (processing: XGBClassifier) -- 0 / 3              precision    recall  f1-score   support

           0       0.91      0.75      0.82       325
           1       0.85      0.95      0.90       477

    accuracy                           0.87       802
   macro avg       0.88      0.85      0.86       802
weighted avg       0.88      0.87      0.87       802

[[244  81]
 [ 23 454]]
------------------------------------------------------

           0       0.79      0.87      0.83       325
           1       0.90      0.84      0.87       477

    accuracy                           0.85       802
   macro avg       0.85      0.86      0.85       802
weighted avg       0.86      0.85      0.85       802

[[282  43]
 [ 74 403]]
------------------------------------------------------

           0       0.73      0.73      0.73       325
           1       0.81      0.82      0.82       477

    accuracy                           0.78

In [253]:
# validacna + real
ad2_fn_valid_clf3 = Runner(train=detection_features(pd.concat([data.train.features, data.test.features]), day=2,
                                                              pop_predictor=FakePredictor(pd.concat([data.train.y_all['fb_popularity_ad_10_label'], data.test.y_all['fb_popularity_ad_10_label']]))),
                train_y=pd.concat([data.train.y, data.test.y]),
                test=detection_features(data.validation.features, day=2, pop_predictor=FakePredictor(data.validation.y_all['fb_popularity_ad_10_label'])),
                test_y=data.validation.y).run()

[                                                    ] (processing: XGBClassifier) -- 0 / 3              precision    recall  f1-score   support

           0       0.91      0.75      0.82       325
           1       0.85      0.95      0.90       477

    accuracy                           0.87       802
   macro avg       0.88      0.85      0.86       802
weighted avg       0.87      0.87      0.87       802

[[243  82]
 [ 24 453]]
------------------------------------------------------

           0       0.77      0.86      0.82       325
           1       0.90      0.83      0.86       477

    accuracy                           0.84       802
   macro avg       0.84      0.85      0.84       802
weighted avg       0.85      0.84      0.84       802

[[281  44]
 [ 83 394]]
------------------------------------------------------

           0       0.76      0.74      0.75       325
           1       0.83      0.84      0.84       477

    accuracy                           0.80