In [1]:
import sys

import importlib
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sqlalchemy
import gensim
import logging

import common
import util
importlib.reload(common)
importlib.reload(util)

from common import create_engine
from common import display_all
from common import figsize
from common import save_df, load_df
from common import save_session, load_session

from util import show_importances
from util import split_X_y_all, split_X_y, split_data
from util import empty_features, column_feature, str_contains

from pbar import Pbar

from pandas.plotting import register_matplotlib_converters

register_matplotlib_converters() # converters e.g. for datetime in plots
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
RANDOM_STATE = 123
np_random = np.random.RandomState(RANDOM_STATE)

### Load dataset

In [3]:
df = load_df('final_data.pickle')

In [4]:
df.head()

Unnamed: 0_level_0,id,title,perex,body,published_at,extracted_at,source_id,category,other_info,image_count,...,fb_ad_5_comment_count,fb_ad_6_comment_count,fb_ad_7_comment_count,fb_ad_8_comment_count,fb_ad_9_comment_count,fb_ad_10_comment_count,fb_ad_11_comment_count,fb_ad_12_comment_count,fb_ad_13_comment_count,fb_ad_14_comment_count
url,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
https://naturalnewsblogs.com/sugar-is-not-sweet/,430166,Sugar is not sweet,<p>Heart disease is the leading cause of death...,Heart disease is the leading cause of death in...,2019-10-14 17:14:23,2019-10-15 03:54:24.343870,142,[Health],"{'tags': [], 'updated_at': '2019-10-14T17:14:24'}",1,...,,,,,,,,,,
https://naturalnewsblogs.com/how-fluoride-a-toxin-got-in-our-water-and-iodine-a-critical-nutrient-disappeared-from-medical-school-textbooks/,430168,"How Fluoride (a toxin) got in our water, and I...","<p>If you were a woman with painful, cystic br...","If you were a woman with painful, cystic breas...",2019-10-14 17:46:13,2019-10-15 03:54:24.807962,142,"[Health, Science, Videos, Fluoride, Medicine, ...","{'tags': ['""The Iodine Crisis""', 'brominated f...",1,...,,,,,,,,,,
https://www.healthnutnews.com/vitamin-a-can-save-your-skin/,430178,Vitamin A Can Save Your Skin,"<p>Written by Joseph Mercola, D.O., Ph.D. Stor...","\n\n\nWritten by Joseph Mercola, D.O., Ph.D.\n...",2019-10-14 15:35:19,2019-10-15 05:45:32.471265,176,"[Health, Food]","{'tags': ['Astaxanthin', 'dangers of sunscreen...",1,...,,,,,,,,,,
https://www.healthnutnews.com/ny-times-in-the-pancreas-common-fungi-may-drive-cancer/,430180,"NY Times: In the Pancreas, Common Fungi May Dr...","<p>By now, you&#8217;ve probably heard that yo...","By now, you’ve probably heard that your body i...",2019-10-14 21:49:41,2019-10-15 05:45:35.141698,176,[Health],"{'tags': ['bacteria', 'digestive enzymes', 'fu...",1,...,,,,,,,,,,
https://hsionline.com/2019/10/14/federal-crackdown-milk-trick/,430181,Federal CRACKDOWN suppresses mind-sharpening b...,<p>You drive over to one of those “superstores...,You drive over to one of those “superstores”… ...,2019-10-14 18:00:21,2019-10-15 05:58:25.540033,177,"[Memory, Facebook, eAlert News]","{'tags': ['Alzheimer’s'], 'updated_at': '2019-...",0,...,,,,,,,,,,


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3435 entries, https://naturalnewsblogs.com/sugar-is-not-sweet/ to https://www.medicaldaily.com/diarrhea-could-be-first-sign-coronavirus-infection-study-says-450722
Data columns (total 69 columns):
id                         3435 non-null int64
title                      3435 non-null object
perex                      2646 non-null object
body                       3430 non-null object
published_at               3435 non-null datetime64[ns]
extracted_at               3435 non-null datetime64[ns]
source_id                  3435 non-null int64
category                   2085 non-null object
other_info                 3433 non-null object
image_count                3435 non-null int64
video_count                3435 non-null int64
veracity                   3435 non-null object
claims_false               3435 non-null int64
claims_mixture             3435 non-null int64
claims_mostly_false        3435 non-null int64
claims_mostly_true         34

-----

Rozdelenie hodnôt popularity do 5 skupín (podľa kvantilov):

- `0 - 0.5`
- `0.5 - 0.75`
- `0.75 - 0.9`
- `0.9 - 0.95`
- `0.95 - 1`

In [6]:
def add_labels(df, quantiles, column='fb_popularity'):
    """Replace `column` by a quantile-bin label column named `<column>_label`.

    Each pair of neighbouring entries in `quantiles` delimits one bin; bins are
    numbered from 1. A bin is closed on both sides, and bins are written in the
    order given, so a value lying exactly on a shared boundary ends up with the
    label of the later bin. Rows that match no bin (for example missing values)
    keep the label -1.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is left untouched.
    quantiles : sequence of float
        Quantile levels, e.g. ``[0, .5, .75, .9, .95, 1]``.
    column : str
        Name of the numeric column to discretise.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` without `column` and with the new `<column>_label` column.
    """
    label_str = f'{column}_label'

    labelled = df.copy()
    labelled[label_str] = -1
    values = labelled[column]

    bin_levels = zip(quantiles, quantiles[1:])
    for label, (q_low, q_high) in enumerate(bin_levels, start=1):
        low = values.quantile(q_low)
        high = values.quantile(q_high)

        inside_bin = (values >= low) & (values <= high)
        labelled.loc[inside_bin, label_str] = label

    return labelled.drop(columns=[column]).copy()

In [73]:
# Quantile levels delimiting the five groups (0-50 %, 50-75 %, 75-90 %, 90-95 %, 95-100 %).
quantiles = [
    0,
    .50,
    .75,
    .90,
    .95,
    1
]

# Facebook engagement columns that get a `<column>_label` column each.
cols = [
    'fb_ad_13_reaction_count',
    'fb_ad_13_comment_count',
    'fb_ad_13_share_count'    
]

# NOTE: `add_labels` drops the source column and `df` is re-bound here, so this
# cell cannot be run twice without reloading `df` (the column is gone on re-run).
for i in cols:
    print(df[i].quantile(quantiles))
    df = add_labels(df, quantiles, column=i)

0.00         0.0
0.50        28.0
0.75       227.0
0.90       995.0
0.95      2392.5
1.00    901984.0
Name: fb_ad_13_reaction_count, dtype: float64
0.00         0.0
0.50         3.0
0.75        40.0
0.90       234.0
0.95       648.0
1.00    140695.0
Name: fb_ad_13_comment_count, dtype: float64
0.00         0.0
0.50        31.0
0.75       132.5
0.90       472.0
0.95       990.5
1.00    168585.0
Name: fb_ad_13_share_count, dtype: float64


Pri jednotlivých zložkách sme pri tomto rozdelení našli len 4 skupiny (lebo 1 == 2).

---

Jednoduchá heuristika: ak je zdroj nedôveryhodný, tak aj článok je nedôveryhodný.

In [9]:
# Article label is the inverse of the source flag: source_is_reliable 1 -> 0, 0 -> 1.
df['is_fake_news_label'] = df.source_is_reliable.replace({0:1, 1:0})

---

### data cleanup

In [115]:
# clear body, perex, etc from html....

In [10]:
# drop rows with a missing body or a missing title
df = df[~df.body.isnull()]
df = df[~df.title.isnull()]

In [11]:
from bs4 import BeautifulSoup
import unicodedata

def clear_text(text):
    """Strip HTML markup from `text` and normalise it.

    The input is parsed with BeautifulSoup (``html.parser``) and reduced to its
    text content, lower-cased, carriage returns are removed, newlines become
    spaces and the result is Unicode-normalised to NFKD.

    Parameters
    ----------
    text : str or None
        Raw, possibly HTML, text. Missing values -- ``None`` or a float NaN,
        which is how pandas stores an empty cell -- give an empty string.

    Returns
    -------
    str
        The cleaned text, or ``''`` for a missing value.
    """
    # NaN is the only float that differs from itself. Treating it as missing
    # keeps `Series.apply(clear_text)` from failing inside BeautifulSoup when a
    # column holds NaN instead of None.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = BeautifulSoup(text, features='html.parser').text
    text = text.lower()
    text = text.replace('\r', '')
    text = text.replace('\n', ' ')
    text = unicodedata.normalize('NFKD', text)

    return text

def clear_column(df, column):
    """Clean one text column of `df` in place by applying `clear_text` to every cell."""
    cleaned = df[column].apply(clear_text)
    df[column] = cleaned

def clear_columns(df, columns):
    """Clean several text columns of `df` in place, one after another.

    The columns are iterated through `Pbar`, with the column names also passed
    as its `action_names`; each column is handed to `clear_column`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are overwritten with their cleaned text.
    columns : list of str
        Names of the columns to clean.
    """
    progress = Pbar(columns, refresh_rate=1, action_names=columns)
    for name in progress:
        clear_column(df, name)

In [12]:
clear_columns(df, ['title', 'perex', 'body'])



In [13]:
label_names = list(filter(lambda x: x.endswith('_label'), df.columns))

In [14]:
for ln in label_names:
    df[ln] = pd.to_numeric(df[ln])

In [15]:
labels_df = pd.DataFrame()

In [16]:
# labels: one column per `*_label` column of `df`
# `sort=False` keeps the row order as is and opts in to the behaviour the pandas
# FutureWarning for a non-aligned axis announces as the future default.
labels_df = pd.concat([labels_df] + [df[label_name] for label_name in label_names], axis=1, sort=False)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3430 entries, https://naturalnewsblogs.com/sugar-is-not-sweet/ to https://www.medicaldaily.com/diarrhea-could-be-first-sign-coronavirus-infection-study-says-450722
Data columns (total 70 columns):
id                         3430 non-null int64
title                      3430 non-null object
perex                      3430 non-null object
body                       3430 non-null object
published_at               3430 non-null datetime64[ns]
extracted_at               3430 non-null datetime64[ns]
source_id                  3430 non-null int64
category                   2080 non-null object
other_info                 3428 non-null object
image_count                3430 non-null int64
video_count                3430 non-null int64
veracity                   3430 non-null object
claims_false               3430 non-null int64
claims_mixture             3430 non-null int64
claims_mostly_false        3430 non-null int64
claims_mostly_true         34

## Rozdelenie dát

In [18]:
# Split into train / test / validation with relative sizes 2 : 2 : 1
# (1372 / 1372 / 686 rows in the recorded run). `np_random` is passed so the
# split uses the seeded RandomState.
train, test, validation = tuple(split_data(df, sizes=[2, 2, 1], shuffle=True, np_random=np_random))

In [19]:
print([len(i) for i in [train,test,validation]])

[1372, 1372, 686]


In [21]:
label_names

['is_fake_news_label']

---

In [22]:
from sklearn.feature_extraction.text import CountVectorizer
from textblob import TextBlob

import spacy

nlp = spacy.load("en_core_web_sm")

In [24]:
def tokenize(text):
    """Split `text` into tokens with the module-level spaCy pipeline `nlp`.

    The parser, tagger and NER components are disabled for the call. Stop words
    and punctuation tokens are left out; every remaining token is returned as a
    plain string.

    Parameters
    ----------
    text : str
        Text to tokenise.

    Returns
    -------
    list of str
    """
    doc = nlp(text, disable=['parser', 'tagger', 'ner'])
    return [str(token) for token in doc if not (token.is_stop or token.is_punct)]

In [25]:
def title_basic_features(df):
    """Compute basic length features of the article title.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column `title`.

    Returns
    -------
    pandas.DataFrame
        Indexed like `df`, with the columns
        ``title_word_count`` (number of tokens a default `CountVectorizer`
        finds in the title) and ``title_char_length`` (number of characters).

    NOTE(review): the default `CountVectorizer` token pattern skips
    one-character tokens, so the word count can be lower than a whitespace
    split would give -- confirm this is intended.
    """
    cv = CountVectorizer()
    counts = cv.fit_transform(df.title)

    res = pd.DataFrame(index=df.index)

    # The row sum of a sparse matrix is an (n, 1) `np.matrix`. Flatten it to a
    # plain 1-D array so the column assignment does not depend on how pandas
    # handles 2-D matrix input.
    res['title_word_count'] = np.asarray(counts.sum(axis=1)).ravel()
    res['title_char_length'] = df.title.apply(len)

    return res

In [26]:
def perex_basic_features(df):
    """Compute basic length features of the article perex (lead paragraph).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column `perex`.

    Returns
    -------
    pandas.DataFrame
        Indexed like `df`, with the columns
        ``perex_word_count`` (number of tokens a default `CountVectorizer`
        finds in the perex) and ``perex_char_length`` (number of characters).

    NOTE(review): the default `CountVectorizer` token pattern skips
    one-character tokens, so the word count can be lower than a whitespace
    split would give -- confirm this is intended.
    """
    cv = CountVectorizer()
    counts = cv.fit_transform(df.perex)

    res = pd.DataFrame(index=df.index)

    # The row sum of a sparse matrix is an (n, 1) `np.matrix`. Flatten it to a
    # plain 1-D array so the column assignment does not depend on how pandas
    # handles 2-D matrix input.
    res['perex_word_count'] = np.asarray(counts.sum(axis=1)).ravel()
    res['perex_char_length'] = df.perex.apply(len)

    return res

In [27]:
def content_basic_features(df):
    """Compute basic length features of the article body.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column `body`.

    Returns
    -------
    pandas.DataFrame
        Indexed like `df`, with the columns
        ``content_word_count`` (number of tokens a default `CountVectorizer`
        finds in the body) and ``content_char_length`` (number of characters).

    NOTE(review): the default `CountVectorizer` token pattern skips
    one-character tokens, so the word count can be lower than a whitespace
    split would give -- confirm this is intended.
    """
    content_cv = CountVectorizer()
    counts = content_cv.fit_transform(df.body)

    res = pd.DataFrame(index=df.index)

    # The row sum of a sparse matrix is an (n, 1) `np.matrix`. Flatten it to a
    # plain 1-D array so the column assignment does not depend on how pandas
    # handles 2-D matrix input.
    res['content_word_count'] = np.asarray(counts.sum(axis=1)).ravel()
    res['content_char_length'] = df.body.apply(len)

    return res

In [28]:
def media_count_total(df):
    """Total number of media items per row: `image_count` plus `video_count`.

    Returns a single-column frame (``media_count_total``) indexed like `df`.
    """
    total = df['image_count'] + df['video_count']
    return total.to_frame(name='media_count_total')
    
def media_count_image(df):
    """Feature generator for the `image_count` column, delegated to `util.column_feature`.

    NOTE(review): `column_feature` is not visible here; presumably it returns a
    frame holding just that column -- confirm against `util`.
    """
    return column_feature(df, 'image_count')

def media_count_video(df):
    """Feature generator for the `video_count` column, delegated to `util.column_feature`.

    NOTE(review): `column_feature` is not visible here; presumably it returns a
    frame holding just that column -- confirm against `util`.
    """
    return column_feature(df, 'video_count')

In [29]:
def published_on_day(df):
    """Day of the week of `published_at`, numbered 1 (Monday) to 7 (Sunday).

    Returns a single-column frame (``published_on_day``) indexed like `df`.
    """
    day_number = df['published_at'].dt.weekday + 1
    return day_number.to_frame(name='published_on_day')

In [30]:
def is_collective_author(df):
    """Guess, per row, whether `author_name` denotes a collective author.

    An author is flagged when the name
    - matches ``str_contains(author_name, 'admin', case=False)``,
    - starts with one of two known agency / site prefixes,
    - is one of a fixed list of known collective names, or
    - equals one of the values of `df.source_name`.

    A missing or non-string author name is reported as ``False`` instead of
    raising ``AttributeError`` on ``.startswith``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns `author_name` and `source_name`.

    Returns
    -------
    pandas.DataFrame
        Indexed like `df`, with one boolean column ``is_collective_author``.
    """
    # Built once per call: a set gives constant-time membership instead of
    # scanning the array returned by `.unique()` for every row.
    uniq_source_names = set(df.source_name.unique())

    known_collective_names = frozenset([
        'Neuroscience News',
        'Wake Up World',
        'Health Sciences Institute',
        'REALdeal',
        'nmheditor',
        'The Mind Unleashed',
        "Thinking Moms' Revolution",
        'TheNewsDoctors',
        'clnews',
        'Associated Press',
        'HealthDay',
        'Infowars',
        'Natural News Editors',
        'https://www.facebook.com/WebMD',
        'naturalnews', 'peakconsciousness', 'HealingwithoutHurting',
        'HealthNutNews.com',
    ])

    collective_prefixes = (
        'Neuroscience News Posts Science Research News Labs Universities Hospitals News Departments Around The World',
        'The Associated Press',
    )

    def make_a_guess(author_name):
        # Missing authors (None / NaN) have no string methods.
        if not isinstance(author_name, str):
            return False

        return bool(
            str_contains(author_name, 'admin', case=False)
            or author_name.startswith(collective_prefixes)
            or author_name in known_collective_names
            # todo: is it a collective author when there are simply several of them?
            # or ' and ' in author_name
            or author_name in uniq_source_names
        )

    res = pd.DataFrame(index=df.index)

    res['is_collective_author'] = df.author_name.map(make_a_guess)

    return res

In [31]:
pd.Series(df.category.explode().unique())

0                                    Health
1                                   Science
2                                    Videos
3                                  Fluoride
4                                  Medicine
                       ...                 
428    d419034f-f7e3-5029-8dd9-67fd1f5cfeb7
429    d258ff11-c795-5fdd-a013-d5c09188ed0f
430    7f78d6ae-1132-5d03-a932-f47e31f30167
431                         The Sacred Blog
432    ddba18a3-9044-5ba1-8bcd-1e2535e604e6
Length: 433, dtype: object

In [32]:
# Feature generators applied by `add_features`, in this order. Each one takes
# the article frame and returns a frame of new feature columns.
features = [
    title_basic_features,
    perex_basic_features,
    content_basic_features,
    
    media_count_total,
    media_count_image,
    media_count_video,
    
    published_on_day,
    is_collective_author,
]

In [33]:
def add_features(df):
    """Run every generator in the module-level `features` list on `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Article frame handed unchanged to each feature generator.

    Returns
    -------
    pandas.DataFrame
        The generators' outputs joined column-wise, in the order of `features`;
        an empty frame when `features` is empty.
    """
    pbar_conf = {
        'refresh_rate': 1,
        'action_names': [i.__name__ for i in features]
    }

    # Collect first and concatenate once. Growing a frame inside the loop copies
    # everything on each pass, and starting from an empty frame makes the first
    # concat join two non-aligned indexes.
    parts = [feature_generator(df) for feature_generator in Pbar(features, **pbar_conf)]

    if not parts:
        # `pd.concat` rejects an empty list; keep the old "empty frame" result.
        return pd.DataFrame()

    return pd.concat(parts, axis=1, sort=False)

---

In [34]:
data = split_X_y_all(train, test, validation, selected_label='is_fake_news_label', all_labels=label_names)

In [35]:
data.train.features = add_features(data.train.X)



2020-03-31 01:15:39,279 : INFO : NumExpr defaulting to 8 threads.




In [36]:
data.test.features = add_features(data.test.X)



In [37]:
data.validation.features = add_features(data.validation.X)



In [38]:
label_names

['is_fake_news_label']

In [39]:
from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import TaggedDocument


import multiprocessing

In [40]:
def tokenize_to_file(data, file):
    """Tokenise every text in `data` and write one space-joined line per text.

    Parameters
    ----------
    data : iterable of str
        Texts to tokenise with `tokenize`; iterated through `Pbar`.
    file : str
        Path of the UTF-8 output file; it is overwritten.
    """
    with open(file, 'w', encoding='utf-8') as out:
        for text in Pbar(data):
            line = ' '.join(tokenize(text))
            out.write(f"{line}\n")

In [41]:
cores = multiprocessing.cpu_count()
print(f'>>> {cores} cores available')

>>> 8 cores available


In [43]:
tokenize_to_file(data.train.X.body, './data/train_body_tokenized.txt')
tokenize_to_file(data.test.X.body, './data/test_body_tokenized.txt')
tokenize_to_file(data.validation.X.body, './data/validation_body_tokenized.txt')



In [44]:
# Seed the model with the notebook-wide RANDOM_STATE. NOTE: gensim is only fully
# deterministic with workers=1; with several worker threads the seed reduces
# run-to-run variation but does not remove it.
d2v = Doc2Vec(corpus_file='./data/train_body_tokenized.txt', vector_size=300, min_count=2, epochs=15, workers=cores, seed=RANDOM_STATE)

2020-03-31 01:16:38,776 : INFO : collecting all words and their counts
2020-03-31 01:16:38,778 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2020-03-31 01:16:38,963 : INFO : collected 35938 word types and 1372 unique tags from a corpus of 1372 examples and 619694 words
2020-03-31 01:16:38,963 : INFO : Loading a fresh vocabulary
2020-03-31 01:16:38,997 : INFO : effective_min_count=2 retains 21095 unique words (58% of original 35938, drops 14843)
2020-03-31 01:16:38,998 : INFO : effective_min_count=2 leaves 604851 word corpus (97% of original 619694, drops 14843)
2020-03-31 01:16:39,060 : INFO : deleting the raw counts dictionary of 35938 items
2020-03-31 01:16:39,062 : INFO : sample=0.001 downsamples 16 most-common words
2020-03-31 01:16:39,064 : INFO : downsampling leaves estimated 595804 word corpus (98.5% of prior 604851)
2020-03-31 01:16:39,118 : INFO : estimated required memory for 21095 words and 300 dimensions: 62821900 bytes
2020-03-31 01:16:39,

In [45]:
def infer_d2v(d2v_model, data_file):
    """Infer one Doc2Vec vector for every line of a pre-tokenised text file.

    Parameters
    ----------
    d2v_model : gensim Doc2Vec model
        Trained model; its `infer_vector` is called once per line.
    data_file : str
        Path to a UTF-8 file with one document per line, tokens separated by
        whitespace (the format written by `tokenize_to_file`).

    Returns
    -------
    list
        One inferred vector per line, in file order.

    NOTE(review): `steps` is the gensim 3.x name of this argument (later
    versions call it `epochs`) -- confirm the installed gensim version.
    """
    with open(data_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    # `split()` without an argument drops the trailing newline and never yields
    # empty strings. `split(' ')` left '\n' glued to the last token of every
    # line and produced '' tokens for repeated spaces.
    return [
        d2v_model.infer_vector(line.split(), steps=20, alpha=0.025)
        for line in Pbar(lines)
    ]

def infer_for_df(df, d2v_model, data_file):
    """Append inferred Doc2Vec vectors to `df` as columns ``d2v_1 .. d2v_N``.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame; row i must correspond to line i of `data_file`.
    d2v_model : gensim Doc2Vec model
        Trained model used for inference.
    data_file : str
        Pre-tokenised text file, one document per line.

    Returns
    -------
    pandas.DataFrame
        `df` with the vector columns appended on the right.
    """
    vectors = infer_d2v(d2v_model, data_file)
    d2v_df = pd.DataFrame(vectors, index=df.index)

    # Name the columns after the actual vector width instead of a hard-coded
    # 300, so a change of `vector_size` does not break this helper.
    d2v_df.columns = [f'd2v_{i}' for i in range(1, d2v_df.shape[1] + 1)]

    return pd.concat([df, d2v_df], axis=1, sort=False)

In [46]:
data.train.features = infer_for_df(data.train.features, d2v, './data/train_body_tokenized.txt')



In [47]:
data.test.features = infer_for_df(data.test.features, d2v, './data/test_body_tokenized.txt')



In [48]:
from sklearn.metrics import classification_report

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [49]:
def fit_predict(clf, data):
    """Fit `clf` on the training split of `data` and predict the test split.

    Parameters
    ----------
    clf : estimator
        Object with `fit(X, y)` and `predict(X)`.
    data : object
        Holds `train.features`, `train.y` and `test.features`.

    Returns
    -------
    Whatever `clf.predict` returns for `data.test.features`.
    """
    train_split = data.train
    clf.fit(train_split.features, train_split.y)

    predictions = clf.predict(data.test.features)
    return predictions

In [50]:
# Make sure the media-count features are numeric in both the train and test split.
for _split in (data.train, data.test):
    for _col in ('media_count_total', 'image_count', 'video_count'):
        _split.features[_col] = pd.to_numeric(_split.features[_col])

In [75]:
# Drop the perex-based features from both splits. Re-binding with
# `errors='ignore'` keeps the cell re-runnable; the in-place drop raised
# KeyError when the cell was executed a second time.
perex_feature_columns = ['perex_word_count', 'perex_char_length']
data.train.features = data.train.features.drop(columns=perex_feature_columns, errors='ignore')
data.test.features = data.test.features.drop(columns=perex_feature_columns, errors='ignore')

In [53]:
data.train.y = pd.to_numeric(data.train.y)
data.test.y = pd.to_numeric(data.test.y)

In [76]:
# Both models are seeded with the notebook-wide RANDOM_STATE so that repeated
# runs of this cell are comparable. The random forest was previously unseeded.
classifiers = [
    RandomForestClassifier(n_estimators=100, class_weight='balanced', n_jobs=cores, random_state=RANDOM_STATE),
    XGBClassifier(n_jobs=cores, seed=RANDOM_STATE),
]

pbar_conf = {
    'refresh_rate': 1,
    'length': len(classifiers), 
    'pbar_width': 52,
    'action_names': [i.__class__.__name__ for i in classifiers]
}

# The generator has no length, hence the explicit 'length' entry above; each
# classifier is fitted and evaluated as the progress bar pulls the next item.
predictions = list(Pbar((fit_predict(clf, data) for clf in classifiers), **pbar_conf))

# One report per classifier, in the order of `classifiers`.
for p in predictions:
    print(classification_report(data.test.y, p))
    print('-' * 54)

              precision    recall  f1-score   support

           0       0.86      0.87      0.87       750
           1       0.84      0.84      0.84       622

    accuracy                           0.85      1372
   macro avg       0.85      0.85      0.85      1372
weighted avg       0.85      0.85      0.85      1372

------------------------------------------------------
              precision    recall  f1-score   support

           0       0.88      0.84      0.86       750
           1       0.82      0.86      0.84       622

    accuracy                           0.85      1372
   macro avg       0.85      0.85      0.85      1372
weighted avg       0.85      0.85      0.85      1372

------------------------------------------------------


In [None]:
[====================================================] -- 2 / 2 -- (finished)fier) -- 1 / 2 -- 0 / 2
              precision    recall  f1-score   support

           0       0.91      0.89      0.90       750
           1       0.87      0.89      0.88       622

    accuracy                           0.89      1372
   macro avg       0.89      0.89      0.89      1372
weighted avg       0.89      0.89      0.89      1372

------------------------------------------------------
              precision    recall  f1-score   support

           0       0.92      0.92      0.92       750
           1       0.91      0.90      0.90       622

    accuracy                           0.91      1372
   macro avg       0.91      0.91      0.91      1372
weighted avg       0.91      0.91      0.91      1372


In [77]:
# Feature importances of the first classifier (the random forest), largest first.
rf_importances = pd.DataFrame(
    classifiers[0].feature_importances_,
    index=data.train.features.columns,
    columns=['importance'],
)
display_all(rf_importances.sort_values(by=['importance'], ascending=False))

Unnamed: 0,importance
title_char_length,0.031636
d2v_82,0.025878
title_word_count,0.024277
d2v_197,0.022545
d2v_84,0.015678
d2v_288,0.014582
d2v_21,0.014407
content_word_count,0.014356
d2v_194,0.013666
content_char_length,0.013029


In [68]:
show_importances(classifiers[1], data.train.features.columns)

Classifier XGBClassifier does not contain feature importance data


In [385]:
data.train.y

0        1
1        0
2        1
3        1
4        1
        ..
65684    1
65685    1
65686    0
65687    1
65688    1
Name: is_fake_news_label, Length: 65689, dtype: int64

In [2]:
import pandas as pd

In [22]:
x = pd.DataFrame(columns=["x"])

In [23]:
x.loc[1] = [[1,2,3,4]]
x.loc[2] = [[3,4,5,6]]
x.loc[3] = [[6,7,8,9]]
x.loc[4] = [[10,11,12,13]]
x.loc[5] = [[15]]
x.loc[6] = [[15,16]]

In [24]:
x

Unnamed: 0,x
1,"[1, 2, 3, 4]"
2,"[3, 4, 5, 6]"
3,"[6, 7, 8, 9]"
4,"[10, 11, 12, 13]"
5,[15]
6,"[15, 16]"


In [28]:
zle = pd.Series([3,6,15,16])

In [26]:
# `in` on a pandas Series tests the index labels, not the values, so the
# membership check has to run against the values themselves. (The recorded
# output was produced before `zle` was re-defined as a Series.)
zle_values = set(zle)
x['zle'] = x['x'].apply(lambda stlpec: sum(1 for i in stlpec if i in zle_values))

In [29]:
x

Unnamed: 0,x,zle
1,"[1, 2, 3, 4]",1
2,"[3, 4, 5, 6]",2
3,"[6, 7, 8, 9]",1
4,"[10, 11, 12, 13]",0
5,[15],1
6,"[15, 16]",2
