In [58]:
import sys

import importlib
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sqlalchemy
import gensim
import logging

import common
import util
importlib.reload(common)
importlib.reload(util)

from common import create_engine
from common import display_all
from common import figsize
from common import save_df, load_df
from common import save_session, load_session

from util import show_importances
from util import split_X_y_all, split_X_y, split_data
from util import empty_features, column_feature, str_contains

from pbar import Pbar

from pandas.plotting import register_matplotlib_converters

register_matplotlib_converters() # converters e.g. for datetime in plots
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [59]:
RANDOM_STATE = 123
np_random = np.random.RandomState(RANDOM_STATE)

### Load dataset

In [210]:
df = load_df('final_data.pickle')

In [211]:
df.head()

Unnamed: 0_level_0,title,perex,body,published_at,extracted_at,source_id,category,other_info,image_count,video_count,...,fb_popularity_ad_6,fb_popularity_ad_7,fb_popularity_ad_8,fb_popularity_ad_9,fb_popularity_ad_10,fb_popularity_ad_11,fb_popularity_ad_12,fb_popularity_ad_13,fb_popularity_ad_14,fb_popularity_ad_15
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
428781,Want to Support Immunity? Look to Your Gut,<p>For thousands of years we’ve relied on our ...,For thousands of years we’ve relied on our mic...,2019-10-10 00:04:42,2019-10-10 07:13:11.637640,146,[gut health],"{'tags': ['gut health', 'immune system', 'immu...",0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
428783,NY Judge Denies Stay: Children Locked Out of S...,,\n \n,2019-10-10 01:01:32,2019-10-10 07:13:17.715180,148,,"{'tags': None, 'updated_at': '2019-10-09 23:01...",0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
428831,Cucumber + Turmeric = Gorgeous Skin and Your F...,<p>Face masks – the best means of expressing c...,Face masks – the best means of expressing care...,2019-10-10 02:21:28,2019-10-10 09:24:06.693998,227,[Beauty],"{'tags': [], 'updated_at': '2019-10-10T02:30:54'}",1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
428832,‘Sesame Street’ launches initiative to help ex...,"In a new initiative, “Sesame Street” is addres...",“Sesame Street” is introducing a new storyline...,2019-10-10 10:40:22,2019-10-10 12:02:48.264837,165,[Health],"{'tags': ['pediatrics', 'addiction'], 'keyword...",1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
428833,Silicosis outbreak highlights the 'malignant n...,Fourteen U.S. workers are killed on the job ev...,Cutting or polishing the quartz-based composit...,2019-10-10 10:35:03,2019-10-10 12:02:48.419273,165,[First Opinion],"{'tags': ['public health', 'government agencie...",1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [212]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18556 entries, 428781 to 808421
Data columns (total 80 columns):
title                      18556 non-null object
perex                      14772 non-null object
body                       18523 non-null object
published_at               18556 non-null datetime64[ns]
extracted_at               18556 non-null datetime64[ns]
source_id                  18556 non-null int64
category                   12426 non-null object
other_info                 18552 non-null object
image_count                18556 non-null int64
video_count                18556 non-null int64
author_name                18556 non-null object
source_id                  18556 non-null int64
source_name                18556 non-null object
source_url                 18556 non-null object
source_type                18556 non-null object
source_is_reliable         18556 non-null int64
fb_ad_0_comment_count      8282 non-null float64
fb_ad_1_comment_count      12906 non-null

-----

Rozdelenie hodnôt popularity do 5 skupín podľa kvantilov:

- `0 - 0.5`
- `0.5 - 0.75`
- `0.75 - 0.9`
- `0.9 - 0.95`
- `0.95 - 1`

In [213]:
def add_labels(df, quantiles, column='fb_popularity_ad_15'):
    """Bucket ``column`` into quantile-based integer labels.

    A new column named ``f'{column}_label'`` is added and the source
    ``column`` itself is removed from the returned frame. The input frame
    is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column``.
    quantiles : sequence of float
        Quantile levels; every consecutive pair ``(quantiles[k], quantiles[k + 1])``
        delimits bucket ``k + 1`` (labels start at 1).
    column : str
        Name of the numeric column to bucket.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the label column appended and ``column`` dropped.

    Notes
    -----
    Both ends of every bucket are inclusive, so a value lying exactly on a
    boundary ends up with the label of the *higher* bucket (later buckets
    overwrite earlier ones). Rows that fall into no bucket (e.g. NaN values)
    keep the sentinel label ``-1``.
    """
    label_str = f'{column}_label'

    result = df.copy()
    result[label_str] = -1

    values = result[column]
    # Evaluate every boundary once; the original recomputed each interior
    # quantile twice (as `high` of one bucket and `low` of the next).
    bounds = [values.quantile(q) for q in quantiles]

    for label, (low, high) in enumerate(zip(bounds, bounds[1:]), start=1):
        in_bucket = (low <= values) & (values <= high)
        result.loc[in_bucket, label_str] = label

    # `drop` already returns a new frame, so no further copy is needed.
    return result.drop(columns=[column])

In [214]:
# Quantile levels delimiting the five popularity groups.
quantiles = [0, .50, .75, .90, .95, 1]

# Target columns that get a `<column>_label` companion.
cols = [
    'fb_ad_15_reaction_count',
    'fb_ad_15_comment_count',
    'fb_ad_15_share_count',
    'fb_popularity_ad_15',
]

# Show the bucket boundaries of each target, then replace the raw column by its label.
# Note: add_labels drops the raw column, so this cell cannot be re-run on the same `df`.
for target_column in cols:
    print(df[target_column].quantile(quantiles))
    df = add_labels(df, quantiles, column=target_column)

0.00          0.0
0.50         26.0
0.75        319.0
0.90       1755.3
0.95       4988.5
1.00    1369290.0
Name: fb_ad_15_reaction_count, dtype: float64
0.00         0.00
0.50         4.00
0.75        67.00
0.90       445.60
0.95      1273.65
1.00    898615.00
Name: fb_ad_15_comment_count, dtype: float64
0.00         0.00
0.50        29.00
0.75       153.00
0.90       645.00
0.95      1604.55
1.00    404542.00
Name: fb_ad_15_share_count, dtype: float64
0.00          0.0
0.50         17.0
0.75        321.0
0.90       1983.0
0.95       5843.0
1.00    2566473.0
Name: fb_popularity_ad_15, dtype: float64


Pri jednotlivych zlozkach sme pri tomto rozdeleni nasli len 4 skupiny (lebo 1 == 2)

---

Jednoduchá heuristika: ak je zdroj nedôveryhodný, tak aj článok je nedôveryhodný.

In [215]:
df['is_fake_news_label'] = df.source_is_reliable.replace({0:1, 1:0})

---

### data cleanup

In [66]:
# clear body, perex, etc from html....

In [216]:
# Drop rows without a body or a title.
# The explicit copy makes `df` an independent frame: later cells assign to its
# columns in place, which on a bare boolean-mask slice is ambiguous
# (SettingWithCopyWarning).
df = df[df.body.notnull() & df.title.notnull()].copy()

In [217]:
from bs4 import BeautifulSoup
import unicodedata

def clear_text(text):
    """Strip HTML markup from ``text`` and normalise it for further processing.

    Steps, in order: extract the visible text with BeautifulSoup's
    ``html.parser``, lower-case it, remove carriage returns, turn newlines
    into single spaces and apply Unicode NFKD normalisation.

    Parameters
    ----------
    text : str or None
        Raw (possibly HTML) text. Missing values (``None``, NaN, ``pd.NA``)
        are accepted.

    Returns
    -------
    str
        The cleaned text; an empty string for missing input.
    """
    # pandas may represent a missing string either as None or as NaN/NA;
    # only None was handled before, NaN would have been handed to the parser.
    if text is None or (not isinstance(text, str) and pd.isna(text)):
        return ''

    text = BeautifulSoup(text, features='html.parser').text
    text = text.lower()
    text = text.replace('\r', '')
    text = text.replace('\n', ' ')
    text = unicodedata.normalize('NFKD', text)

    return text

def clear_column(df, column):
    """Clean a single text column of ``df`` in place.

    Every value of ``df[column]`` is passed through ``clear_text`` and the
    column is overwritten with the result. Nothing is returned.
    """
    cleaned = df[column].apply(clear_text)
    df[column] = cleaned

def clear_columns(df, columns):
    """Clean several text columns of ``df`` in place, showing a progress bar.

    ``columns`` is iterated through ``Pbar`` (the column names double as the
    progress bar's action names) and each column is cleaned with
    ``clear_column``.
    """
    progress = Pbar(columns, refresh_rate=1, action_names=columns)

    for column_name in progress:
        clear_column(df, column_name)

In [218]:
clear_columns(df, ['title', 'perex', 'body'])



In [219]:
label_names = list(filter(lambda x: x.endswith('_label'), df.columns))

In [220]:
for ln in label_names:
    df[ln] = pd.to_numeric(df[ln])

In [221]:
labels_df = pd.DataFrame()

In [222]:
# Labels: one column per entry of `label_names`, indexed like `df`.
# Selecting the columns directly (instead of concatenating onto the existing
# `labels_df`) keeps this cell idempotent: re-running it no longer appends the
# same label columns a second time.
labels_df = df[label_names].copy()

In [223]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18523 entries, 428781 to 808421
Data columns (total 81 columns):
title                            18523 non-null object
perex                            18523 non-null object
body                             18523 non-null object
published_at                     18523 non-null datetime64[ns]
extracted_at                     18523 non-null datetime64[ns]
source_id                        18523 non-null int64
category                         12396 non-null object
other_info                       18519 non-null object
image_count                      18523 non-null int64
video_count                      18523 non-null int64
author_name                      18523 non-null object
source_id                        18523 non-null int64
source_name                      18523 non-null object
source_url                       18523 non-null object
source_type                      18523 non-null object
source_is_reliable               18523 non-null 

## Rozdelenie dat

In [224]:
train, test, validation = tuple(split_data(df, sizes=[2, 2, 1], shuffle=True, np_random=np_random))

In [225]:
print([len(i) for i in [train,test,validation]])

[7410, 7409, 3704]


In [226]:
label_names

['fb_ad_15_reaction_count_label',
 'fb_ad_15_comment_count_label',
 'fb_ad_15_share_count_label',
 'fb_popularity_ad_15_label',
 'is_fake_news_label']

---

In [227]:
from sklearn.feature_extraction.text import CountVectorizer
from textblob import TextBlob

import spacy

nlp = spacy.load("en_core_web_sm")

In [228]:
def tokenize(text):
    """Split ``text`` into tokens with the module-level spaCy pipeline ``nlp``.

    The parser, tagger and NER components are disabled for the call. Tokens
    flagged by spaCy as stop words or punctuation are skipped; the remaining
    tokens are returned as plain strings in document order.
    """
    doc = nlp(text, disable=['parser', 'tagger', 'ner'])

    return [
        str(token)
        for token in doc
        if not (token.is_stop or token.is_punct)
    ]

In [229]:
def title_basic_features(df):
    """Compute simple size features of the article title.

    Returns a frame indexed like ``df`` with two columns:

    * ``title_word_count`` - number of tokens per title as counted by a
      default ``CountVectorizer`` fitted on the titles of ``df``;
    * ``title_char_length`` - ``len()`` of the title string.
    """
    token_counts = CountVectorizer().fit_transform(df.title)

    features = pd.DataFrame(index=df.index)
    # Row sums of the document-term matrix = tokens counted per document.
    features['title_word_count'] = token_counts.sum(axis=1)
    features['title_char_length'] = df.title.apply(len)

    return features

In [230]:
def perex_basic_features(df):
    """Compute simple size features of the article perex (lead paragraph).

    Returns a frame indexed like ``df`` with two columns:

    * ``perex_word_count`` - number of tokens per perex as counted by a
      default ``CountVectorizer`` fitted on the perexes of ``df``;
    * ``perex_char_length`` - ``len()`` of the perex string.
    """
    token_counts = CountVectorizer().fit_transform(df.perex)

    features = pd.DataFrame(index=df.index)
    # Row sums of the document-term matrix = tokens counted per document.
    features['perex_word_count'] = token_counts.sum(axis=1)
    features['perex_char_length'] = df.perex.apply(len)

    return features

In [231]:
def content_basic_features(df):
    """Compute simple size features of the article body.

    Returns a frame indexed like ``df`` with two columns:

    * ``content_word_count`` - number of tokens per body as counted by a
      default ``CountVectorizer`` fitted on the bodies of ``df``;
    * ``content_char_length`` - ``len()`` of the body string.
    """
    token_counts = CountVectorizer().fit_transform(df.body)

    features = pd.DataFrame(index=df.index)
    # Row sums of the document-term matrix = tokens counted per document.
    features['content_word_count'] = token_counts.sum(axis=1)
    features['content_char_length'] = df.body.apply(len)

    return features

In [232]:
def media_count_total(df):
    """Return a one-column frame with the total number of media items.

    The single column ``media_count_total`` is the row-wise sum of
    ``image_count`` and ``video_count``; the index is that of ``df``.
    """
    total = df['image_count'] + df['video_count']
    return total.to_frame('media_count_total')
    
def media_count_image(df):
    """Feature generator for the article's ``image_count`` column.

    Delegates to ``util.column_feature``; presumably that returns a frame
    holding just the named column - verify against ``util``.
    """
    return column_feature(df, 'image_count')

def media_count_video(df):
    """Feature generator for the article's ``video_count`` column.

    Delegates to ``util.column_feature``; presumably that returns a frame
    holding just the named column - verify against ``util``.
    """
    return column_feature(df, 'video_count')

In [233]:
def published_on_day(df):
    """Return a one-column frame with the weekday of publication.

    ``published_on_day`` is ``published_at.dt.weekday + 1``, i.e. Monday is 1
    and Sunday is 7. The index is that of ``df``.
    """
    weekday_one_based = df.published_at.dt.weekday + 1
    return weekday_one_based.to_frame('published_on_day')

In [234]:
def popularity_features(df):
    """Collect the early Facebook popularity counters as features.

    For each of the suffixes 0, 1 and 3 the reaction, comment and share
    counts plus the aggregated ``fb_popularity_ad_<n>`` value are copied from
    ``df`` (in that order). Missing values are replaced by the mean of the
    respective column, computed on the frame passed in.

    Returns a new frame indexed like ``df``.
    """
    selected = [
        name
        for n in (0, 1, 3)
        for name in (
            f'fb_ad_{n}_reaction_count',
            f'fb_ad_{n}_comment_count',
            f'fb_ad_{n}_share_count',
            f'fb_popularity_ad_{n}',
        )
    ]

    counters = df[selected].copy()

    # Impute gaps with the per-column mean of this very frame.
    return counters.fillna(counters.mean())

In [235]:
def is_collective_author(df):
    """Heuristically flag articles whose author is a collective, not a person.

    An author name is flagged when at least one of these holds:

    * ``str_contains(author_name, 'admin', case=False)`` is truthy;
    * it starts with one of two known agency/boilerplate prefixes;
    * it is one of a fixed list of known collective names;
    * it equals one of the ``source_name`` values present in ``df``.

    Returns a frame indexed like ``df`` with the single boolean column
    ``is_collective_author``.
    """
    known_collective_names = (
        'Neuroscience News',
        'Wake Up World',
        'Health Sciences Institute',
        'REALdeal',
        'nmheditor',
        'The Mind Unleashed',
        'Thinking Moms\' Revolution',
        'TheNewsDoctors',
        'clnews',
        'Associated Press',
        'HealthDay',
        'Infowars',
        'Natural News Editors',
        'https://www.facebook.com/WebMD',
        'naturalnews', 'peakconsciousness', 'HealingwithoutHurting',
        'HealthNutNews.com',
    )
    source_names = df.source_name.unique()

    def make_a_guess(author_name):
        # All checks are evaluated up front (no short-circuiting), as before.
        checks = (
            str_contains(author_name, 'admin', case=False),
            author_name.startswith('Neuroscience News Posts Science Research News Labs Universities Hospitals News Departments Around The World'),
            author_name in known_collective_names,
            author_name.startswith('The Associated Press'),
            # ' and ' in author_name,  # todo: is it a collective author when there are merely several of them?
            author_name in source_names,
        )
        return any(checks)

    flags = df.author_name.map(make_a_guess)
    return flags.to_frame('is_collective_author')

In [236]:
pd.Series(df.category.explode().unique())

0                                 gut health
1                                       None
2                                     Beauty
3                                     Health
4                              First Opinion
                        ...                 
1504    fc1580e2-2ac1-532a-bd5c-b5adeffb1c25
1505                                 Liberty
1506    6790bd7e-8e3f-5c8e-b184-17a7c91417fd
1507    3d59c851-2266-5959-85f7-6cdd51d6e939
1508    92972df9-7135-5d2f-9116-d4d0fd37393d
Length: 1509, dtype: object

In [237]:
features = [
    title_basic_features,
    perex_basic_features,
    content_basic_features,
    
    media_count_total,
    media_count_image,
    media_count_video,
    
    published_on_day,
    is_collective_author,
    
    popularity_features
]

In [238]:
def add_features(df):
    """Run every generator in the module-level ``features`` list on ``df``.

    Each generator receives ``df`` and returns a frame of feature columns;
    the results are joined column-wise in the order of ``features``.
    Progress is reported through ``Pbar`` using the generators' names.

    Returns
    -------
    pandas.DataFrame
        All generated feature columns side by side; an empty frame when
        ``features`` is empty.
    """
    progress = Pbar(
        features,
        refresh_rate=1,
        action_names=[generator.__name__ for generator in features],
    )

    frames = [feature_generator(df) for feature_generator in progress]
    if not frames:
        return pd.DataFrame()

    # Concatenate once: growing the result inside the loop re-copied all
    # previously built columns on every step and started from an empty frame
    # whose empty index had to be unioned with the real one.
    return pd.concat(frames, axis=1)

---

In [239]:
data = split_X_y_all(train, test, validation, selected_label='is_fake_news_label', all_labels=label_names)

In [240]:
data.train.features = add_features(data.train.X)



In [241]:
data.test.features = add_features(data.test.X)



In [242]:
data.validation.features = add_features(data.validation.X)



In [243]:
label_names

['fb_ad_15_reaction_count_label',
 'fb_ad_15_comment_count_label',
 'fb_ad_15_share_count_label',
 'fb_popularity_ad_15_label',
 'is_fake_news_label']

In [244]:
from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import TaggedDocument


import multiprocessing

In [245]:
def tokenize_to_file(data, file):
    """Tokenize every text in ``data`` and write one document per line.

    Each item of ``data`` is passed through ``tokenize``; its tokens are
    joined by single spaces and written as one UTF-8 line to ``file``
    (which is overwritten). Progress is shown through ``Pbar``.
    """
    with open(file, 'w', encoding='utf-8') as out:
        for text in Pbar(data):
            tokens = tokenize(text)
            out.write(' '.join(tokens) + '\n')

In [96]:
cores = multiprocessing.cpu_count()
print(f'>>> {cores} cores available')

>>> 8 cores available


In [246]:
tokenize_to_file(data.train.X.body, './data/train_body_tokenized.txt')
tokenize_to_file(data.test.X.body, './data/test_body_tokenized.txt')
tokenize_to_file(data.validation.X.body, './data/validation_body_tokenized.txt')



In [247]:
# Train Doc2Vec on the tokenized training bodies only (one document per line).
# The seed ties the model's random initialisation to RANDOM_STATE; note that
# gensim only guarantees fully deterministic training with a single worker
# thread, so runs with workers > 1 can still differ slightly.
d2v = Doc2Vec(corpus_file='./data/train_body_tokenized.txt', vector_size=300, min_count=2, epochs=15, workers=cores, seed=RANDOM_STATE)

2020-04-03 01:26:15,098 : INFO : collecting all words and their counts
2020-04-03 01:26:15,100 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2020-04-03 01:26:15,845 : INFO : collected 86257 word types and 7410 unique tags from a corpus of 7410 examples and 2531878 words
2020-04-03 01:26:15,845 : INFO : Loading a fresh vocabulary
2020-04-03 01:26:15,957 : INFO : effective_min_count=2 retains 49575 unique words (57% of original 86257, drops 36682)
2020-04-03 01:26:15,958 : INFO : effective_min_count=2 leaves 2495196 word corpus (98% of original 2531878, drops 36682)
2020-04-03 01:26:16,132 : INFO : deleting the raw counts dictionary of 86257 items
2020-04-03 01:26:16,134 : INFO : sample=0.001 downsamples 9 most-common words
2020-04-03 01:26:16,135 : INFO : downsampling leaves estimated 2470425 word corpus (99.0% of prior 2495196)
2020-04-03 01:26:16,303 : INFO : estimated required memory for 49575 words and 300 dimensions: 152659500 bytes
2020-04-03 01:2

In [248]:
def infer_d2v(d2v_model, data_file):
    """Infer a document vector for every line of a tokenized text file.

    Parameters
    ----------
    d2v_model
        Trained model exposing ``infer_vector(tokens, steps=..., alpha=...)``.
    data_file : str
        Path of a UTF-8 file holding one whitespace-tokenized document per line.

    Returns
    -------
    list
        One inferred vector per line, in file order.
    """
    res = []

    with open(data_file, 'r', encoding='utf-8') as f:
        for line in Pbar(f.readlines()):
            # split() without an argument drops the trailing newline and
            # collapses runs of whitespace; split(' ') left '\n' glued to the
            # last token and produced empty tokens, so those never matched
            # the vocabulary seen during training.
            tokens = line.split()
            res.append(d2v_model.infer_vector(tokens, steps=20, alpha=0.025))

    return res

def infer_for_df(df, d2v_model, data_file):
    """Append inferred document vectors to ``df`` as ``d2v_1 ... d2v_N`` columns.

    The vectors are inferred from ``data_file`` with ``infer_d2v`` and are
    matched to the rows of ``df`` by position, so the file must contain one
    line per row of ``df`` in the same order.

    Returns a new frame: ``df`` followed by the vector columns.
    """
    vectors = infer_d2v(d2v_model, data_file)

    d2v_df = pd.DataFrame(vectors, index=df.index)
    # Name the columns after the actual vector width instead of assuming 300,
    # so the helper keeps working when the model's vector_size changes.
    d2v_df.columns = [f'd2v_{i}' for i in range(1, d2v_df.shape[1] + 1)]

    return pd.concat([df, d2v_df], axis=1, sort=False)

In [249]:
data.train.features = infer_for_df(data.train.features, d2v, './data/train_body_tokenized.txt')



In [250]:
data.test.features = infer_for_df(data.test.features, d2v, './data/test_body_tokenized.txt')



In [251]:
from sklearn.metrics import classification_report

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [254]:
def fit_predict(clf, data):
    """Fit ``clf`` on the training split and predict the test split.

    ``clf`` is fitted on ``data.train.features`` / ``data.train.y`` and the
    predictions for ``data.test.features`` are returned.
    """
    train_split = data.train
    clf.fit(train_split.features, train_split.y)

    predictions = clf.predict(data.test.features)
    return predictions

In [255]:
label_names

['fb_ad_15_reaction_count_label',
 'fb_ad_15_comment_count_label',
 'fb_ad_15_share_count_label',
 'fb_popularity_ad_15_label',
 'is_fake_news_label']

In [263]:
data.train.switch_label('is_fake_news_label')
data.test.switch_label('is_fake_news_label')

In [264]:
# Coerce the media-count features of the train and test splits to numeric dtypes.
for split_part in (data.train, data.test):
    for media_col in ('media_count_total', 'image_count', 'video_count'):
        split_part.features[media_col] = pd.to_numeric(split_part.features[media_col])

In [265]:
# Exclude the perex size features from the model input.
# Non-inplace drop with errors='ignore' keeps the cell re-runnable: a second
# execution no longer fails with KeyError on the already removed columns.
data.train.features = data.train.features.drop(columns=['perex_word_count', 'perex_char_length'], errors='ignore')
data.test.features = data.test.features.drop(columns=['perex_word_count', 'perex_char_length'], errors='ignore')

In [266]:
data.train.y = pd.to_numeric(data.train.y)
data.test.y = pd.to_numeric(data.test.y)

In [267]:
# Both models are seeded with RANDOM_STATE so the reports below are
# reproducible; the random forest previously ran unseeded.
classifiers = [
    RandomForestClassifier(n_estimators=100, class_weight='balanced', n_jobs=cores, random_state=RANDOM_STATE),
    XGBClassifier(n_jobs=cores, seed=RANDOM_STATE),
]

pbar_conf = {
    'refresh_rate': 1,
    'length': len(classifiers), 
    'pbar_width': 52,
    'action_names': [i.__class__.__name__ for i in classifiers]
}

# The generator is consumed lazily by Pbar, so each classifier is fitted as the
# progress bar advances; `length` is passed because a generator has no len().
predictions = list(Pbar((fit_predict(clf, data) for clf in classifiers), **pbar_conf))

# One report per classifier, in the order of `classifiers`.
for p in predictions:
    print(classification_report(data.test.y, p))
    print('-' * 54)

              precision    recall  f1-score   support

           0       0.83      0.98      0.90      4333
           1       0.95      0.72      0.82      3076

    accuracy                           0.87      7409
   macro avg       0.89      0.85      0.86      7409
weighted avg       0.88      0.87      0.87      7409

------------------------------------------------------
              precision    recall  f1-score   support

           0       0.82      0.97      0.89      4333
           1       0.95      0.71      0.81      3076

    accuracy                           0.86      7409
   macro avg       0.89      0.84      0.85      7409
weighted avg       0.88      0.86      0.86      7409

------------------------------------------------------


In [268]:
# Feature importances of the first classifier (the random forest), highest first.
rf_importances = pd.DataFrame(
    {'importance': classifiers[0].feature_importances_},
    index=data.train.features.columns,
)
display_all(rf_importances.sort_values(by=['importance'], ascending=False))

Unnamed: 0,importance
fb_ad_0_reaction_count,0.035345
fb_ad_0_share_count,0.033777
fb_ad_0_comment_count,0.031055
title_char_length,0.030491
media_count_total,0.025735
title_word_count,0.024622
content_char_length,0.022564
content_word_count,0.021474
image_count,0.02143
fb_popularity_ad_0,0.021361


In [None]:
data.train.y