In [244]:
import sys

import importlib
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sqlalchemy
import gensim
import logging

import common
import util
importlib.reload(common)
importlib.reload(util)

from common import create_engine
from common import display_all
from common import figsize
from common import save_df, load_df
from common import save_session, load_session


from util import show_importances
from util import split_X_y_all, split_X_y, split_data
from util import empty_features, column_feature, str_contains

from pbar import Pbar

from pandas.plotting import register_matplotlib_converters

register_matplotlib_converters() # converters e.g. for datetime in plots
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [4]:
RANDOM_STATE = 123
np_random = np.random.RandomState(RANDOM_STATE)

### Load dataset

In [196]:
df = load_df('final_data.pickle')

In [197]:
df.head()

Unnamed: 0_level_0,title,perex,body,raw_body,published_at,extracted_at,category,other_info,image_count,video_count,...,fb_popularity_ad_2,fb_popularity_ad_3,fb_popularity_ad_4,fb_popularity_ad_5,fb_popularity_ad_6,fb_popularity_ad_7,fb_popularity_ad_8,fb_popularity_ad_9,fb_popularity_ad_10,body_urls
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
431065,put communities at the center of universal hea...,if universal health care is truly meant to ref...,if universal health care is truly meant to ref...,"<p>The <a href=""https://www.who.int/news-room/...",2019-10-21 10:45:10,2019-10-21 12:13:53.281652,[First Opinion],"{'tags': ['public health', 'global health', 'H...",1,0,...,165.0,176.0,185.0,192.0,193.0,207.0,210.0,228.0,233.0,[https://www.statnews.com/2019/10/21/communiti...
431066,rapid expansion of telehealth comes with new c...,although new delivery methods will help telehe...,although new delivery methods will help telehe...,<p>It&#x2019;s a boom time for telehealth. Sta...,2019-10-21 10:40:26,2019-10-21 12:13:53.499347,[First Opinion],"{'tags': ['telehealth'], 'keywords': ['']}",1,0,...,44.0,47.0,47.0,47.0,49.0,55.0,56.0,62.0,67.0,[https://www.statnews.com/2019/10/21/telehealt...
431067,a biotech real estate firm wants a new slogan....,"alexandria real estate, the lab-focused manage...","alexandria real estate, the lab-focused manage...",<p>Embattled office-subleasing and &#x201C;<a ...,2019-10-21 10:35:01,2019-10-21 12:13:53.593596,[Biotech],"{'tags': ['legal', 'ethics', 'STAT Plus', 'bio...",1,0,...,7.0,7.0,7.0,10.0,14.0,18.0,19.0,19.0,19.0,[https://www.statnews.com/2019/10/21/wework-ch...
431068,"after decades-long campaign, type 3 poliovirus...",the formal bid to eradicate all polio began in...,the formal bid to eradicate all polio began in...,"<p>After <a href=""https://www.statnews.com/201...",2019-10-21 10:30:40,2019-10-21 12:13:53.714328,[Health],"{'tags': ['public health', 'infectious disease...",1,0,...,617.0,673.0,698.0,705.0,709.0,913.0,1137.0,1197.0,1232.0,[https://www.statnews.com/2019/10/21/decades-l...
431081,"be humble, and proudly, psychologists say",humility is not the boldest of personality tra...,humility is not the boldest of personality tra...,,2019-10-21 00:00:00,2019-10-21 12:14:05.770730,,"{'tags': [], 'keywords': ['']}",1,0,...,3714.0,4217.0,5480.0,8674.0,9476.0,9867.0,10241.0,10792.0,11391.0,[]


In [198]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18605 entries, 431065 to 812426
Data columns (total 68 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   title                    18605 non-null  object        
 1   perex                    18605 non-null  object        
 2   body                     18605 non-null  object        
 3   raw_body                 18605 non-null  object        
 4   published_at             18605 non-null  datetime64[ns]
 5   extracted_at             18605 non-null  datetime64[ns]
 6   category                 13024 non-null  object        
 7   other_info               18601 non-null  object        
 8   image_count              18605 non-null  int64         
 9   video_count              18605 non-null  int64         
 10  author_name              18605 non-null  object        
 11  source_id                18605 non-null  int64         
 12  source_name              1

-----

In [199]:
pop = pd.DataFrame()
qrange = [np.round(i, 2) for i in np.arange(0, 1, 0.05)]
pop['q'] = qrange
for i in range(0, 11):
    col = f'fb_popularity_ad_{i}'
    pop[col] = [df[col].quantile(q) for q in qrange]
pop

Unnamed: 0,q,fb_popularity_ad_0,fb_popularity_ad_1,fb_popularity_ad_2,fb_popularity_ad_3,fb_popularity_ad_4,fb_popularity_ad_5,fb_popularity_ad_6,fb_popularity_ad_7,fb_popularity_ad_8,fb_popularity_ad_9,fb_popularity_ad_10
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.25,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
6,0.3,1.0,2.0,2.0,2.0,3.0,3.0,3.0,3.0,3.0,4.0,4.0
7,0.35,2.0,4.0,6.0,7.0,8.0,8.0,9.0,9.0,10.0,10.0,11.0
8,0.4,5.0,11.0,15.0,17.0,19.0,20.0,21.0,22.0,23.0,24.0,24.0
9,0.45,9.0,21.0,28.0,32.0,34.0,35.0,37.0,38.0,39.0,40.0,41.0


Rozdelenie hodnot popularity do 4 skupin

- `0 - 0.4`
- `0.4 - 0.75`
- `0.75 - 0.9`
- `0.9 - 1`

In [200]:
def add_labels(df, quantiles, column='fb_popularity_ad_10'):
    df = df.copy()
    label_str = f'{column}_label'
    
    df[label_str] = -1
    
    label = 1    
    for i in range(len(quantiles) - 1):
        low = df[column].quantile(quantiles[i])
        high = df[column].quantile(quantiles[i + 1])
        
        df.loc[(low <= df[column]) & (df[column] <= high), label_str] = int(label)
        
        label += 1
    df = df.drop(columns=[column])    
    return df.copy()

In [201]:
quantiles = [
    0,
    .4,
    .75,
    .9,
    1
]

cols = [
    'fb_ad_10_reaction_count',
    'fb_ad_10_comment_count',
    'fb_ad_10_share_count',
    'fb_popularity_ad_10'
]

for i in cols:
    print(df[i].quantile(quantiles))
    df = add_labels(df, quantiles, column=i)

0.00          0.0
0.40          7.0
0.75        364.0
0.90       2014.0
1.00    1368305.0
Name: fb_ad_10_reaction_count, dtype: float64
0.00         0.0
0.40         0.0
0.75        80.0
0.90       511.6
1.00    897945.0
Name: fb_ad_10_comment_count, dtype: float64
0.00         0.0
0.40        12.0
0.75       165.0
0.90       695.0
1.00    298199.0
Name: fb_ad_10_share_count, dtype: float64
0.00          0.0
0.40         24.0
0.75        646.0
0.90       3337.0
1.00    2564449.0
Name: fb_popularity_ad_10, dtype: float64


In [202]:
df.fb_popularity_ad_10_label.value_counts()

1    7390
2    6559
3    2795
4    1861
Name: fb_popularity_ad_10_label, dtype: int64

Pri jednotlivych zlozkach sme pri tomto rozdeleni nasli len 4 skupiny (lebo 1 == 2)

---

Jednoducha heuristika: ak je zdroj nedoveryhodny tak aj clanok je nedoveryhodny

In [203]:
df['is_fake_news_label'] = df.source_is_reliable.replace({0:1, 1:0})

---

In [204]:
label_names = list(filter(lambda x: x.endswith('_label'), df.columns))

In [205]:
for ln in label_names:
    df[ln] = pd.to_numeric(df[ln])

In [206]:
labels_df = pd.DataFrame()

In [207]:
# labely
labels_df = pd.concat([labels_df] + [df[label_name] for label_name in label_names], axis=1)

In [208]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18605 entries, 431065 to 812426
Data columns (total 69 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   title                          18605 non-null  object        
 1   perex                          18605 non-null  object        
 2   body                           18605 non-null  object        
 3   raw_body                       18605 non-null  object        
 4   published_at                   18605 non-null  datetime64[ns]
 5   extracted_at                   18605 non-null  datetime64[ns]
 6   category                       13024 non-null  object        
 7   other_info                     18601 non-null  object        
 8   image_count                    18605 non-null  int64         
 9   video_count                    18605 non-null  int64         
 10  author_name                    18605 non-null  object        
 11  source_id

## Rozdelenie dat

In [209]:
from sklearn.model_selection import train_test_split

Vytvorenie mnoziny clankov a zdrojov na ktorych sa validuje cela metoda

In [214]:
excluded_data = df[df.source_name.isin([
    'youngwomenshealth.org',
    'vaxopedia.org',
    'emedicinehealth.com',
    
    'hsionline.com',
    'wakeup-world.com',
    'genuinehealth.com',
    'realfarmacy.com',
    'educateinspirechange.org',
    'vaxxter.com'
])].copy()

df = df[~df.source_name.isin(excluded_data.source_name.unique())]

In [215]:
df = df.reset_index()

In [217]:
train, test = tuple(split_data(df, sizes=[3, 1], shuffle=True, np_random=np_random))

In [218]:
train.set_index('id', inplace=True)
test.set_index('id', inplace=True)

In [219]:
save_df(train, 'train_data')
save_df(test, 'test_data')
save_df(test, 'excluded_data')

In [220]:
print([len(i) for i in [train,test, excluded_data]])

[13194, 4397, 1014]


In [221]:
label_names

['fb_ad_10_reaction_count_label',
 'fb_ad_10_comment_count_label',
 'fb_ad_10_share_count_label',
 'fb_popularity_ad_10_label',
 'is_fake_news_label']

In [275]:
data = split_X_y_all(train, test, excluded_data, 'is_fake_news_label', label_names)

In [276]:
data.validation.y.value_counts()

1    542
0    472
Name: is_fake_news_label, dtype: int64

---

In [223]:
from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

In [224]:
import spacy
importlib.reload(spacy)
nlp = spacy.load("en_core_web_sm")

def tokenize(text):
    doc = nlp(text, disable=['parser', 'tagger', 'ner'])

    return words_from_doc(doc)

def words_from_doc(doc):
    res = []
    for i in doc:
        if i.is_stop:
            continue
        if i.is_punct:
            continue

        res.append(str(i))

    return res

def tokenize_to_file(data, file):
    with open(file, 'w', encoding='utf-8') as f:
        for i in Pbar(data):
            f.write(f"{' '.join(tokenize(i))}\n")

### Trening doc2vec modelu

In [225]:
tokenize_to_file(data.train.X.body, './data/train_body_tokenized.txt')
tokenize_to_file(data.test.X.body, './data/test_body_tokenized.txt')
tokenize_to_file(data.validation.X.body, './data/excluded_data_body_tokenized.txt')



In [226]:
d2v = Doc2Vec(corpus_file='./data/train_body_tokenized.txt', vector_size=300, min_count=2, epochs=25, workers=12)

2020-05-05 21:15:40,856 : INFO : collecting all words and their counts
2020-05-05 21:15:40,858 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2020-05-05 21:15:41,983 : INFO : PROGRESS: at example #10000, processed 3623149 words (3220943/s), 101622 word types, 10000 tags
2020-05-05 21:15:42,351 : INFO : collected 118466 word types and 13194 unique tags from a corpus of 13194 examples and 4804337 words
2020-05-05 21:15:42,352 : INFO : Loading a fresh vocabulary
2020-05-05 21:15:42,479 : INFO : effective_min_count=2 retains 67959 unique words (57% of original 118466, drops 50507)
2020-05-05 21:15:42,479 : INFO : effective_min_count=2 leaves 4753830 word corpus (98% of original 4804337, drops 50507)
2020-05-05 21:15:42,702 : INFO : deleting the raw counts dictionary of 118466 items
2020-05-05 21:15:42,705 : INFO : sample=0.001 downsamples 9 most-common words
2020-05-05 21:15:42,706 : INFO : downsampling leaves estimated 4699932 word corpus (98.9% of prior 4

In [227]:
def infer_d2v(d2v_model, data_file):
    res = []
    
    with open(data_file, 'r', encoding='utf-8') as f:
        for i in Pbar(f.readlines()):
            res.append(d2v_model.infer_vector(i.split(' '), steps=20, alpha=0.025)) 
    
    return res

def infer_for_df(df, d2v_model, data_file):
    lst = infer_d2v(d2v_model, data_file)
    d2v_df = pd.DataFrame(lst, index=df.index, columns=[f'd2v_{i}' for i in range(1, 301)] )
    
    return d2v_df

### Nacitanie predpripravenych crt

In [228]:
f_sentiment = load_df('features_sentiment.pickle')
f_readability = load_df('features_readability.pickle')
f_metadata = load_df('features_metadata.pickle')
f_empath = load_df('features_empath.pickle')
f_content = load_df('features_content.pickle')
f_named_entities = load_df('features_named_entities.pickle')
f_popularity = load_df('features_popularity.pickle')
f_popularity_d0 = load_df('features_popularity_0.pickle')
f_popularity_d1 = load_df('features_popularity_1.pickle')
f_popularity_d2 = load_df('features_popularity_2.pickle')

ziskanie d2v vektorov pre train a test body

In [229]:
f_d2v_train = infer_for_df(data.train.X, d2v, './data/train_body_tokenized.txt')
save_df(f_d2v_train, 'd2v_train.pickle')



In [230]:
f_d2v_test = infer_for_df(data.test.X, d2v, './data/test_body_tokenized.txt')
save_df(f_d2v_test, 'd2v_test.pickle')



In [232]:
f_d2v_validation = infer_for_df(data.validation.X, d2v, './data/excluded_data_body_tokenized.txt')



In [233]:
label_names

['fb_ad_10_reaction_count_label',
 'fb_ad_10_comment_count_label',
 'fb_ad_10_share_count_label',
 'fb_popularity_ad_10_label',
 'is_fake_news_label']

In [280]:
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [281]:
class Runner:

    def __init__(self, train, train_y, test, test_y):
        self.classifiers = [
            XGBClassifier(n_jobs=12, seed=RANDOM_STATE),
            RandomForestClassifier(n_jobs=12, class_weight='balanced', random_state=RANDOM_STATE),
        ]
        
        self.train = train
        self.train_y = train_y
        self.test = test
        self.test_y = test_y
        

    def fit_predict(self, clf):
        clf.fit(self.train, self.train_y)
        return clf.predict(self.test)
    
    def run(self):
        _pbar_conf = {
            'refresh_rate': 1,
            'pbar_width': 52,
            'length': len(self.classifiers),
            'action_names': [i.__class__.__name__ for i in self.classifiers]
        }
        
        for p in Pbar((self.fit_predict(clf) for clf in self.classifiers), **_pbar_conf):
            print(classification_report(self.test_y, p))
            print('-' * 54)


In [282]:
data.train.features = pd.concat([
    pd.DataFrame(index=data.train.X.index),
    f_sentiment,
    f_readability,
    f_metadata,
    f_empath,
    f_content,
    f_named_entities,
    f_d2v_train], join='inner', axis=1)

In [283]:
data.test.features = pd.concat([
    pd.DataFrame(index=data.test.X.index),
    f_sentiment,
    f_readability,
    f_metadata,
    f_empath,
    f_content,
    f_named_entities,
    f_d2v_test], join='inner', axis=1)

In [284]:
data.validation.features = pd.concat([
    pd.DataFrame(index=data.validation.X.index),
    f_sentiment,
    f_readability,
    f_metadata,
    f_empath,
    f_content,
    f_named_entities,
    f_d2v_validation], join='inner', axis=1)

## Predikcia popularity

In [285]:
data.train.switch_label('fb_popularity_ad_10_label')
data.test.switch_label('fb_popularity_ad_10_label')
data.validation.switch_label('fb_popularity_ad_10_label')

#### V case publikovania 

In [286]:
train_data = data.train.features.copy()
test_data = data.test.features.copy()

runner = Runner(train=train_data,
                train_y=data.train.y,
                test=test_data,
                test_y=data.test.y)

runner.run()

[                                                    ] (processing: XGBClassifier) -- 0 / 2              precision    recall  f1-score   support

           1       0.89      0.83      0.86      1747
           2       0.59      0.83      0.69      1507
           3       0.42      0.14      0.21       701
           4       0.53      0.50      0.51       442

    accuracy                           0.69      4397
   macro avg       0.61      0.57      0.57      4397
weighted avg       0.68      0.69      0.66      4397

------------------------------------------------------

           1       0.75      0.85      0.80      1747
           2       0.54      0.74      0.63      1507
           3       0.36      0.10      0.16       701
           4       0.54      0.19      0.28       442

    accuracy                           0.63      4397
   macro avg       0.55      0.47      0.46      4397
weighted avg       0.60      0.63      0.58      4397

--------------------------------------

## Detekcia falosnych sprav

In [292]:
data.train.switch_label('is_fake_news_label')
data.test.switch_label('is_fake_news_label')
data.validation.switch_label('is_fake_news_label')

In [293]:
def drop_source_features(df):
    cols = [col for col in df if col.startswith('source_')]
    return df.drop(columns=cols)

In [294]:
train_data = drop_source_features(data.train.features.copy())
test_data = drop_source_features(data.test.features.copy())
validation_data = drop_source_features(data.validation.features.copy())

runner = Runner(train=train_data,
                train_y=data.train.y,
                test=validation_data,
                test_y=data.validation.y)

runner.run()

[                                                    ] (processing: XGBClassifier) -- 0 / 2              precision    recall  f1-score   support

           0       0.85      0.80      0.82       472
           1       0.83      0.88      0.86       542

    accuracy                           0.84      1014
   macro avg       0.84      0.84      0.84      1014
weighted avg       0.84      0.84      0.84      1014

------------------------------------------------------

           0       0.76      0.87      0.81       472
           1       0.87      0.76      0.81       542

    accuracy                           0.81      1014
   macro avg       0.82      0.82      0.81      1014
weighted avg       0.82      0.81      0.81      1014

------------------------------------------------------


In [296]:
display_all(pd.DataFrame((i for i in runner.classifiers[0].feature_importances_), index=train_data.columns, columns=['importance']).sort_values(by=['importance'], ascending=False))

Unnamed: 0,importance
content_pos_ADD,0.119116
content_body_word_count,0.066312
content_..._count,0.046205
content_media_count,0.040395
readability_fcgl,0.038013
content_word_over_5ch,0.027778
content_!_count,0.027062
readability_ari,0.026925
content_sentence_count,0.026705
content_pos__SP,0.024446
