In [2]:
import sys

import importlib
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sqlalchemy
import gensim
import logging
import empath

import common
import util
importlib.reload(common)
importlib.reload(util)

from common import create_engine
from common import display_all
from common import figsize
from common import save_df, load_df
from common import save_session, load_session

from util import show_importances
from util import split_X_y_all, split_X_y, split_data
from util import empty_features, column_feature, str_contains

from pbar import Pbar

from pandas.plotting import register_matplotlib_converters

register_matplotlib_converters() # converters e.g. for datetime in plots
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [3]:
RANDOM_STATE = 123
np_random = np.random.RandomState(RANDOM_STATE)

### Load dataset

In [50]:
df = load_df('final_data.pickle')

In [51]:
df.head()

Unnamed: 0_level_0,title,perex,body,raw_body,published_at,extracted_at,category,other_info,image_count,video_count,...,fb_popularity_ad_2,fb_popularity_ad_3,fb_popularity_ad_4,fb_popularity_ad_5,fb_popularity_ad_6,fb_popularity_ad_7,fb_popularity_ad_8,fb_popularity_ad_9,fb_popularity_ad_10,body_urls
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
431065,put communities at the center of universal hea...,if universal health care is truly meant to ref...,if universal health care is truly meant to ref...,"<p>The <a href=""https://www.who.int/news-room/...",2019-10-21 10:45:10,2019-10-21 12:13:53.281652,[First Opinion],"{'tags': ['public health', 'global health', 'H...",1,0,...,165.0,176.0,185.0,192.0,193.0,207.0,210.0,228.0,233.0,[https://www.statnews.com/2019/10/21/communiti...
431066,rapid expansion of telehealth comes with new c...,although new delivery methods will help telehe...,although new delivery methods will help telehe...,<p>It&#x2019;s a boom time for telehealth. Sta...,2019-10-21 10:40:26,2019-10-21 12:13:53.499347,[First Opinion],"{'tags': ['telehealth'], 'keywords': ['']}",1,0,...,44.0,47.0,47.0,47.0,49.0,55.0,56.0,62.0,67.0,[https://www.statnews.com/2019/10/21/telehealt...
431067,a biotech real estate firm wants a new slogan....,"alexandria real estate, the lab-focused manage...","alexandria real estate, the lab-focused manage...",<p>Embattled office-subleasing and &#x201C;<a ...,2019-10-21 10:35:01,2019-10-21 12:13:53.593596,[Biotech],"{'tags': ['legal', 'ethics', 'STAT Plus', 'bio...",1,0,...,7.0,7.0,7.0,10.0,14.0,18.0,19.0,19.0,19.0,[https://www.statnews.com/2019/10/21/wework-ch...
431068,"after decades-long campaign, type 3 poliovirus...",the formal bid to eradicate all polio began in...,the formal bid to eradicate all polio began in...,"<p>After <a href=""https://www.statnews.com/201...",2019-10-21 10:30:40,2019-10-21 12:13:53.714328,[Health],"{'tags': ['public health', 'infectious disease...",1,0,...,617.0,673.0,698.0,705.0,709.0,913.0,1137.0,1197.0,1232.0,[https://www.statnews.com/2019/10/21/decades-l...
431081,"be humble, and proudly, psychologists say",humility is not the boldest of personality tra...,humility is not the boldest of personality tra...,,2019-10-21 00:00:00,2019-10-21 12:14:05.770730,,"{'tags': [], 'keywords': ['']}",1,0,...,3714.0,4217.0,5480.0,8674.0,9476.0,9867.0,10241.0,10792.0,11391.0,[]


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18605 entries, 431065 to 812426
Data columns (total 68 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   title                    18605 non-null  object        
 1   perex                    18605 non-null  object        
 2   body                     18605 non-null  object        
 3   raw_body                 18605 non-null  object        
 4   published_at             18605 non-null  datetime64[ns]
 5   extracted_at             18605 non-null  datetime64[ns]
 6   category                 13024 non-null  object        
 7   other_info               18601 non-null  object        
 8   image_count              18605 non-null  int64         
 9   video_count              18605 non-null  int64         
 10  author_name              18605 non-null  object        
 11  source_id                18605 non-null  int64         
 12  source_name              1

-----

In [None]:
          \item Nepopulárne správy 0 - 0.6
            \item Bežné správy  0.6 - 0.8
            \item Populárne správy - 0.8 - 0.9
            \item Veľmi populárne správy 0.9 - 

In [52]:
pop = pd.DataFrame()
qrange = [np.round(i, 2) for i in np.arange(0, 1, 0.05)]
pop['q'] = qrange
for i in range(0, 11):
    col = f'fb_popularity_ad_{i}'
    pop[col] = [df[col].quantile(q) for q in qrange]
pop

Unnamed: 0,q,fb_popularity_ad_0,fb_popularity_ad_1,fb_popularity_ad_2,fb_popularity_ad_3,fb_popularity_ad_4,fb_popularity_ad_5,fb_popularity_ad_6,fb_popularity_ad_7,fb_popularity_ad_8,fb_popularity_ad_9,fb_popularity_ad_10
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.25,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
6,0.3,1.0,2.0,2.0,2.0,3.0,3.0,3.0,3.0,3.0,4.0,4.0
7,0.35,2.0,4.0,6.0,7.0,8.0,8.0,9.0,9.0,10.0,10.0,11.0
8,0.4,5.0,11.0,15.0,17.0,19.0,20.0,21.0,22.0,23.0,24.0,24.0
9,0.45,9.0,21.0,28.0,32.0,34.0,35.0,37.0,38.0,39.0,40.0,41.0


Rozdelenie hodnot popularity do 5 skupin

- `0 - 0.6`
- `0.6 - 0.8`
- `0.8 - 0.9`
- `0.9 - 1`

In [53]:
def add_labels(df, quantiles, column='fb_popularity_ad_10'):
    df = df.copy()
    label_str = f'{column}_label'
    
    df[label_str] = -1
    
    label = 1    
    for i in range(len(quantiles) - 1):
        low = df[column].quantile(quantiles[i])
        high = df[column].quantile(quantiles[i + 1])
        
        df.loc[(low <= df[column]) & (df[column] <= high), label_str] = int(label)
        
        label += 1
    df = df.drop(columns=[column])    
    return df.copy()

In [54]:
quantiles = [
    0,
    .55,
    .7,
    .9,
    1
]

cols = [
    'fb_ad_10_reaction_count',
    'fb_ad_10_comment_count',
    'fb_ad_10_share_count',
    'fb_popularity_ad_10'
]

for i in cols:
    print(df[i].quantile(quantiles))
    df = add_labels(df, quantiles, column=i)

0.00          0.0
0.55         47.0
0.70        233.0
0.90       2014.0
1.00    1368305.0
Name: fb_ad_10_reaction_count, dtype: float64
0.00         0.0
0.55         8.0
0.70        45.0
0.90       511.6
1.00    897945.0
Name: fb_ad_10_comment_count, dtype: float64
0.00         0.0
0.55        40.0
0.70       115.2
0.90       695.0
1.00    298199.0
Name: fb_ad_10_share_count, dtype: float64
0.00          0.0
0.55        108.0
0.70        415.0
0.90       3337.0
1.00    2564449.0
Name: fb_popularity_ad_10, dtype: float64


In [47]:
df.fb_popularity_ad_10_label.value_counts()

1    11151
2     3732
4     1861
3     1861
Name: fb_popularity_ad_10_label, dtype: int64

Pri jednotlivych zlozkach sme pri tomto rozdeleni nasli len 4 skupiny (lebo 1 == 2)

---

Jednoducha heuristika: ak je zdroj nedoveryhodny tak aj clanok je nedoveryhodny

In [9]:
df['is_fake_news_label'] = df.source_is_reliable.replace({0:1, 1:0})

---

In [13]:
label_names = list(filter(lambda x: x.endswith('_label'), df.columns))

In [14]:
for ln in label_names:
    df[ln] = pd.to_numeric(df[ln])

In [15]:
labels_df = pd.DataFrame()

In [15]:
# labely
labels_df = pd.concat([labels_df] + [df[label_name] for label_name in label_names], axis=1)

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20246 entries, 428781 to 812426
Data columns (total 87 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   title                          20246 non-null  object        
 1   perex                          20246 non-null  object        
 2   body                           20246 non-null  object        
 3   published_at                   20246 non-null  datetime64[ns]
 4   extracted_at                   20246 non-null  datetime64[ns]
 5   category                       13685 non-null  object        
 6   other_info                     20242 non-null  object        
 7   image_count                    20246 non-null  int64         
 8   video_count                    20246 non-null  int64         
 9   author_name                    20246 non-null  object        
 10  source_id                      20246 non-null  int64         
 11  source_na

## Rozdelenie dat

In [16]:
train, test, validation = tuple(split_data(df, sizes=[2, 2, 1], shuffle=True, np_random=np_random))

In [17]:
print([len(i) for i in [train,test,validation]])

[8099, 8098, 4049]


In [18]:
label_names

['fb_ad_15_reaction_count_label',
 'fb_ad_15_comment_count_label',
 'fb_ad_15_share_count_label',
 'fb_popularity_ad_15_label',
 'is_fake_news_label']

---

### Skupina 'metadata'

In [24]:
def media_count_total(df):
    res = pd.DataFrame(index=df.index)
    
    res['media_count_total'] = df['image_count'] + df['video_count']
    
    return res
    
def media_count_image(df):
    return column_feature(df, 'image_count')

def media_count_video(df):
    return column_feature(df, 'video_count')

---

In [32]:
data = split_X_y_all(train, test, validation, selected_label='is_fake_news_label', all_labels=label_names)

In [33]:
data.train.features = add_features(data.train.X)



In [34]:
data.test.features = add_features(data.test.X)



In [35]:
data.validation.features = add_features(data.validation.X)



In [36]:
label_names

['fb_ad_15_reaction_count_label',
 'fb_ad_15_comment_count_label',
 'fb_ad_15_share_count_label',
 'fb_popularity_ad_15_label',
 'is_fake_news_label']

In [37]:
from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import TaggedDocument


import multiprocessing

In [38]:
def tokenize_to_file(data, file):
    with open(file, 'w', encoding='utf-8') as f:
        for i in Pbar(data):
            f.write(f"{' '.join(tokenize(i))}\n")

In [39]:
cores = multiprocessing.cpu_count()
print(f'>>> {cores} cores available')

>>> 12 cores available


In [40]:
tokenize_to_file(data.train.X.body, './data/train_body_tokenized.txt')
tokenize_to_file(data.test.X.body, './data/test_body_tokenized.txt')
tokenize_to_file(data.validation.X.body, './data/validation_body_tokenized.txt')



In [41]:
d2v = Doc2Vec(corpus_file='./data/train_body_tokenized.txt', vector_size=300, min_count=2, epochs=15, workers=12)

2020-04-27 11:30:50,863 : INFO : collecting all words and their counts
2020-04-27 11:30:50,864 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2020-04-27 11:30:51,711 : INFO : collected 90408 word types and 8099 unique tags from a corpus of 8099 examples and 2789006 words
2020-04-27 11:30:51,712 : INFO : Loading a fresh vocabulary
2020-04-27 11:30:51,878 : INFO : effective_min_count=2 retains 51892 unique words (57% of original 90408, drops 38516)
2020-04-27 11:30:51,878 : INFO : effective_min_count=2 leaves 2750490 word corpus (98% of original 2789006, drops 38516)
2020-04-27 11:30:52,056 : INFO : deleting the raw counts dictionary of 90408 items
2020-04-27 11:30:52,059 : INFO : sample=0.001 downsamples 9 most-common words
2020-04-27 11:30:52,060 : INFO : downsampling leaves estimated 2721940 word corpus (99.0% of prior 2750490)
2020-04-27 11:30:52,213 : INFO : estimated required memory for 51892 words and 300 dimensions: 160205600 bytes
2020-04-27 11:3

In [42]:
def infer_d2v(d2v_model, data_file):
    
    res = []
    
    with open(data_file, 'r', encoding='utf-8') as f:
        for i in Pbar(f.readlines()):
            res.append(d2v_model.infer_vector(i.split(' '), steps=20, alpha=0.025)) 
    
    return res

def infer_for_df(df, d2v_model, data_file):
    lst = infer_d2v(d2v_model, data_file)
    d2v_df = pd.DataFrame(lst, index=df.index, columns=[f'd2v_{i}' for i in range(1, 301)] )
    
    return pd.concat([df, d2v_df], axis=1, sort=False)

In [43]:
data.train.features = infer_for_df(data.train.features, d2v, './data/train_body_tokenized.txt')



In [44]:
data.test.features = infer_for_df(data.test.features, d2v, './data/test_body_tokenized.txt')



In [45]:
from sklearn.metrics import classification_report

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [46]:
def fit_predict(clf, data):
    clf.fit(data.train.features, data.train.y)
    return clf.predict(data.test.features)

In [47]:
label_names

['fb_ad_15_reaction_count_label',
 'fb_ad_15_comment_count_label',
 'fb_ad_15_share_count_label',
 'fb_popularity_ad_15_label',
 'is_fake_news_label']

In [48]:
data.train.switch_label('is_fake_news_label')
data.test.switch_label('is_fake_news_label')

In [49]:
data.train.features['media_count_total'] = pd.to_numeric(data.train.features['media_count_total'])
data.train.features['image_count'] = pd.to_numeric(data.train.features['image_count'])
data.train.features['video_count'] = pd.to_numeric(data.train.features['video_count'])

data.test.features['media_count_total'] = pd.to_numeric(data.test.features['media_count_total'])
data.test.features['image_count'] = pd.to_numeric(data.test.features['image_count'])
data.test.features['video_count'] = pd.to_numeric(data.test.features['video_count'])

data.train.features['av_claims_false'] = pd.to_numeric(data.train.features['av_claims_false'])
data.train.features['av_claims_mostly_false'] = pd.to_numeric(data.train.features['av_claims_mostly_false'])
data.train.features['av_claims_mixture'] = pd.to_numeric(data.train.features['av_claims_mixture'])
data.train.features['av_claims_mostly_true'] = pd.to_numeric(data.train.features['av_claims_mostly_true'])
data.train.features['av_claims_true'] = pd.to_numeric(data.train.features['av_claims_true'])
data.train.features['av_claims_unknown'] = pd.to_numeric(data.train.features['av_claims_unknown'])


data.test.features['av_claims_false'] = pd.to_numeric(data.test.features['av_claims_false'])
data.test.features['av_claims_mostly_false'] = pd.to_numeric(data.test.features['av_claims_mostly_false'])
data.test.features['av_claims_mixture'] = pd.to_numeric(data.test.features['av_claims_mixture'])
data.test.features['av_claims_mostly_true'] = pd.to_numeric(data.test.features['av_claims_mostly_true'])
data.test.features['av_claims_true'] = pd.to_numeric(data.test.features['av_claims_true'])
data.test.features['av_claims_unknown'] = pd.to_numeric(data.test.features['av_claims_unknown'])

In [50]:
data.train.features.drop(columns=['perex_word_count', 'perex_char_length'], inplace=True)
data.test.features.drop(columns=['perex_word_count', 'perex_char_length'], inplace=True)

In [51]:
data.train.y = pd.to_numeric(data.train.y)
data.test.y = pd.to_numeric(data.test.y)

In [52]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

In [53]:
classifiers = [
    RandomForestClassifier(n_estimators=100, class_weight='balanced', n_jobs=cores),
    XGBClassifier(n_jobs=cores, seed=RANDOM_STATE),
    GaussianNB(),
    LogisticRegression()
]

pbar_conf = {
    'refresh_rate': 1,
    'length': len(classifiers), 
    'pbar_width': 52,
    'action_names': [i.__class__.__name__ for i in classifiers]
}

predictions = list(Pbar((fit_predict(clf, data) for clf in classifiers), **pbar_conf))

for p in predictions:
    print(classification_report(data.test.y, p))
    print('-' * 54)



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


              precision    recall  f1-score   support

           0       0.88      0.96      0.92      4838
           1       0.93      0.81      0.87      3260

    accuracy                           0.90      8098
   macro avg       0.90      0.88      0.89      8098
weighted avg       0.90      0.90      0.90      8098

------------------------------------------------------
              precision    recall  f1-score   support

           0       0.90      0.94      0.92      4838
           1       0.91      0.84      0.87      3260

    accuracy                           0.90      8098
   macro avg       0.90      0.89      0.90      8098
weighted avg       0.90      0.90      0.90      8098

------------------------------------------------------
              precision    recall  f1-score   support

           0       0.77      0.88      0.82      4838
           1       0.77      0.61      0.68      3260

    accuracy                           0.77      8098
   macro avg      

In [None]:
[====================================================] -- 2 / 2 -- (finished)
              precision    recall  f1-score   support

           0       0.89      0.96      0.92      4838
           1       0.92      0.82      0.87      3260

    accuracy                           0.90      8098
   macro avg       0.91      0.89      0.89      8098
weighted avg       0.90      0.90      0.90      8098

------------------------------------------------------
              precision    recall  f1-score   support

           0       0.90      0.94      0.92      4838
           1       0.90      0.84      0.87      3260

    accuracy                           0.90      8098
   macro avg       0.90      0.89      0.89      8098
weighted avg       0.90      0.90      0.90      8098

------------------------------------------------------

[====================================================] -- 2 / 2 -- (finished)
              precision    recall  f1-score   support

           0       0.83      0.88      0.85      4838
           1       0.80      0.73      0.76      3260

    accuracy                           0.82      8098
   macro avg       0.81      0.80      0.81      8098
weighted avg       0.82      0.82      0.82      8098

------------------------------------------------------
              precision    recall  f1-score   support

           0       0.82      0.88      0.85      4838
           1       0.81      0.72      0.76      3260

    accuracy                           0.82      8098
   macro avg       0.82      0.80      0.81      8098
weighted avg       0.82      0.82      0.82      8098

------------------------------------------------------

In [54]:
display_all(pd.DataFrame((i for i in classifiers[0].feature_importances_), index=data.train.features.columns, columns=['importance']).sort_values(by=['importance'], ascending=False))

Unnamed: 0,importance
fb_ad_0_share_count,0.037344
fb_ad_0_reaction_count,0.032782
title_char_length,0.030757
fb_ad_0_comment_count,0.026981
content_word_count,0.025531
fb_popularity_ad_0,0.024881
title_word_count,0.021971
media_count_total,0.021888
image_count,0.021173
fb_ad_1_reaction_count,0.020109


In [None]:
data.train.y