In [2]:
import sys

import importlib
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sqlalchemy
import gensim
import logging
import empath

import common
import util
importlib.reload(common)
importlib.reload(util)

from common import create_engine
from common import display_all
from common import figsize
from common import save_df, load_df
from common import save_session, load_session

from util import show_importances
from util import split_X_y_all, split_X_y, split_data
from util import empty_features, column_feature, str_contains

from pbar import Pbar

from pandas.plotting import register_matplotlib_converters

register_matplotlib_converters() # converters e.g. for datetime in plots
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [3]:
RANDOM_STATE = 123
np_random = np.random.RandomState(RANDOM_STATE)

### Load dataset

In [36]:
df = load_df('final_data.pickle')

In [29]:
df.head()

Unnamed: 0_level_0,title,perex,body,raw_body,published_at,extracted_at,category,other_info,image_count,video_count,...,fb_popularity_ad_6,fb_popularity_ad_7,fb_popularity_ad_8,fb_popularity_ad_9,fb_popularity_ad_10,fb_popularity_ad_11,fb_popularity_ad_12,fb_popularity_ad_13,fb_popularity_ad_14,fb_popularity_ad_15
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
428781,Want to Support Immunity? Look to Your Gut,<p>For thousands of years we’ve relied on our ...,For thousands of years we’ve relied on our mic...,"<p><span data-contrast=""auto"">For thousands of...",2019-10-10 00:04:42,2019-10-10 07:13:11.637640,[gut health],"{'tags': ['gut health', 'immune system', 'immu...",0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
428783,NY Judge Denies Stay: Children Locked Out of S...,,\n \n,"<p><a class=""asset-img-link"" href=""https://www...",2019-10-10 01:01:32,2019-10-10 07:13:17.715180,,"{'tags': None, 'updated_at': '2019-10-09 23:01...",0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
428831,Cucumber + Turmeric = Gorgeous Skin and Your F...,<p>Face masks – the best means of expressing c...,Face masks – the best means of expressing care...,\n<p>Face masks – the best means of expressing...,2019-10-10 02:21:28,2019-10-10 09:24:06.693998,[Beauty],"{'tags': [], 'updated_at': '2019-10-10T02:30:54'}",1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
428832,‘Sesame Street’ launches initiative to help ex...,"In a new initiative, “Sesame Street” is addres...",“Sesame Street” is introducing a new storyline...,<p>Parents and kids who are fans of &#x201C;Se...,2019-10-10 10:40:22,2019-10-10 12:02:48.264837,[Health],"{'tags': ['pediatrics', 'addiction'], 'keyword...",1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
428833,Silicosis outbreak highlights the 'malignant n...,Fourteen U.S. workers are killed on the job ev...,Cutting or polishing the quartz-based composit...,"<p>Across the United States, workers are suffe...",2019-10-10 10:35:03,2019-10-10 12:02:48.419273,[First Opinion],"{'tags': ['public health', 'government agencie...",1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20280 entries, 428781 to 812426
Data columns (total 86 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   title                    20280 non-null  object        
 1   perex                    16137 non-null  object        
 2   body                     20246 non-null  object        
 3   published_at             20280 non-null  datetime64[ns]
 4   extracted_at             20280 non-null  datetime64[ns]
 5   category                 13716 non-null  object        
 6   other_info               20276 non-null  object        
 7   image_count              20280 non-null  int64         
 8   video_count              20280 non-null  int64         
 9   author_name              20280 non-null  object        
 10  source_id                20280 non-null  int64         
 11  source_name              20280 non-null  object        
 12  source_url               2

-----

Rozdelenie hodnot popularity do 5 skupin

- `0 - 0.5`
- `0.5 - 0.75`
- `0.75 - 0.9`
- `0.9 - 0.95`
- `0.95 - 1`

In [22]:
def add_labels(df, quantiles, column='fb_popularity_ad_15'):
    df = df.copy()
    label_str = f'{column}_label'
    
    df[label_str] = -1
    
    label = 1    
    for i in range(len(quantiles) - 1):
        low = df[column].quantile(quantiles[i])
        high = df[column].quantile(quantiles[i + 1])
        
        df.loc[(low <= df[column]) & (df[column] <= high), label_str] = int(label)
        
        label += 1
    df = df.drop(columns=[column])    
    return df.copy()

In [8]:
quantiles = [
    0,
    .50,
    .75,
    .90,
    .95,
    1
]

cols = [
    'fb_ad_15_reaction_count',
    'fb_ad_15_comment_count',
    'fb_ad_15_share_count',
    'fb_popularity_ad_15'
]

for i in cols:
    print(df[i].quantile(quantiles))
    df = add_labels(df, quantiles, column=i)

0.00          0.0
0.50         27.0
0.75        336.0
0.90       1875.0
0.95       5344.7
1.00    1369290.0
Name: fb_ad_15_reaction_count, dtype: float64
0.00         0.00
0.50         4.00
0.75        72.00
0.90       482.70
0.95      1356.05
1.00    898615.00
Name: fb_ad_15_comment_count, dtype: float64
0.00         0.00
0.50        30.00
0.75       159.00
0.90       684.40
0.95      1754.35
1.00    404542.00
Name: fb_ad_15_share_count, dtype: float64
0.00          0.0
0.50         17.0
0.75        336.0
0.90       2112.5
0.95       6159.1
1.00    2566473.0
Name: fb_popularity_ad_15, dtype: float64


Pri jednotlivych zlozkach sme pri tomto rozdeleni nasli len 4 skupiny (lebo 1 == 2)

---

Jednoducha heuristika: ak je zdroj nedoveryhodny tak aj clanok je nedoveryhodny

In [9]:
df['is_fake_news_label'] = df.source_is_reliable.replace({0:1, 1:0})

---

In [13]:
label_names = list(filter(lambda x: x.endswith('_label'), df.columns))

In [14]:
for ln in label_names:
    df[ln] = pd.to_numeric(df[ln])

In [15]:
labels_df = pd.DataFrame()

In [15]:
# labely
labels_df = pd.concat([labels_df] + [df[label_name] for label_name in label_names], axis=1)

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20246 entries, 428781 to 812426
Data columns (total 87 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   title                          20246 non-null  object        
 1   perex                          20246 non-null  object        
 2   body                           20246 non-null  object        
 3   published_at                   20246 non-null  datetime64[ns]
 4   extracted_at                   20246 non-null  datetime64[ns]
 5   category                       13685 non-null  object        
 6   other_info                     20242 non-null  object        
 7   image_count                    20246 non-null  int64         
 8   video_count                    20246 non-null  int64         
 9   author_name                    20246 non-null  object        
 10  source_id                      20246 non-null  int64         
 11  source_na

## Rozdelenie dat

In [16]:
train, test, validation = tuple(split_data(df, sizes=[2, 2, 1], shuffle=True, np_random=np_random))

In [17]:
print([len(i) for i in [train,test,validation]])

[8099, 8098, 4049]


In [18]:
label_names

['fb_ad_15_reaction_count_label',
 'fb_ad_15_comment_count_label',
 'fb_ad_15_share_count_label',
 'fb_popularity_ad_15_label',
 'is_fake_news_label']

---

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
from textblob import TextBlob
import textstat

import spacy
importlib.reload(spacy)
nlp = spacy.load("en_core_web_sm")

In [20]:
def tokenize(text):
    doc = nlp(text, disable=['parser', 'tagger', 'ner'])
    
    res = []
    for i in doc:
        if i.is_stop:
            continue
        if i.is_punct:
            continue
            
        res.append(str(i))
    
    return res

In [None]:
# textblob repre

# spacy repre

# textstat repre

### Skupina 'forma obsahu'

In [None]:
class Content:
    # title word count
    # content word count
    
    # title length
    # content lenth
    
    # avg word len
    # sentences count

    # count of words over 5 chars
    
    # count of ?
    # count of !
    # count of ...
    
    # count of media
    
    # POS tags
    # stop words count
        

### Skupina 'metadata'

In [None]:
def title_basic_features(df):
    cv = CountVectorizer()
    data = cv.fit_transform(df.title)

    res = pd.DataFrame(index=df.index)
    
    res['title_word_count'] = data.sum(axis=1)
    res['title_char_length'] = df.title.apply(lambda x: len(x))
    
    return res

In [None]:
def perex_basic_features(df):
    cv = CountVectorizer()
    data = cv.fit_transform(df.perex)

    res = pd.DataFrame(index=df.index)    
    res['perex_word_count'] = data.sum(axis=1)
    res['perex_char_length'] = df.perex.apply(lambda x: len(x))
    
    return res

In [23]:
def content_basic_features(df):
    content_cv = CountVectorizer()
    data = content_cv.fit_transform(df.body)

    res = pd.DataFrame(index=df.index)    
    res['content_word_count'] = data.sum(axis=1)
    res['content_char_length'] = df.body.apply(lambda x: len(x))
    
    return res

In [24]:
def media_count_total(df):
    res = pd.DataFrame(index=df.index)
    
    res['media_count_total'] = df['image_count'] + df['video_count']
    
    return res
    
def media_count_image(df):
    return column_feature(df, 'image_count')

def media_count_video(df):
    return column_feature(df, 'video_count')

In [26]:
def popularity_features(df):
    res = pd.DataFrame(index=df.index)
    
    
    for i in [0,1,3]:
        res[f'fb_ad_{i}_reaction_count'] = df[f'fb_ad_{i}_reaction_count']
        res[f'fb_ad_{i}_comment_count'] = df[f'fb_ad_{i}_comment_count']
        res[f'fb_ad_{i}_share_count'] = df[f'fb_ad_{i}_share_count']
        res[f'fb_popularity_ad_{i}'] = df[f'fb_popularity_ad_{i}']
        
    
    res.fillna(res.mean(), inplace=True)
    
    return res

In [28]:
def is_collective_author(df):
    
    uniq_source_names = df.source_name.unique()
    def make_a_guess(author_name):
        return any((
                    str_contains(author_name, 'admin', case=False),
                    author_name.startswith('Neuroscience News Posts Science Research News Labs Universities Hospitals News Departments Around The World'),
                    author_name in ['Neuroscience News',
                                    'Wake Up World',
                                    'Health Sciences Institute',
                                    'REALdeal', 
                                    'nmheditor',
                                    'The Mind Unleashed',
                                    'Thinking Moms\' Revolution',
                                    'TheNewsDoctors',
                                    'clnews',
                                    'Associated Press',
                                    'HealthDay',
                                    'Infowars',
                                    'Natural News Editors',
                                    'https://www.facebook.com/WebMD',
                                    'naturalnews',
                                    'peakconsciousness',
                                    'HealingwithoutHurting',
                                    'HealthNutNews.com',
                                   ],
                    author_name.startswith('The Associated Press'),
                    # ' and ' in author_name, # todo: je to kolektivny autor ak ich je len viac?
                    author_name in uniq_source_names,   
        ))
    
    res = pd.DataFrame(index=df.index)
    
    res['is_collective_author'] = df.author_name.map(make_a_guess)
    
    return res

In [None]:
features = [
    title_basic_features,
    perex_basic_features,
    content_basic_features,
    
    media_count_total,
    media_count_image,
    media_count_video,
    
    published_on_day,
    is_collective_author,
    
    claim_counts,
    
    popularity_features,
    readability_features
]

In [31]:
def add_features(df):    
    pbar_conf = {
        'refresh_rate': 1,
        'action_names': [i.__name__ for i in features]
    }
    
    res = pd.DataFrame()
    for feature_generator in Pbar(features, **pbar_conf):
        res = pd.concat([res, feature_generator(df)], axis=1)

    return res

---

In [32]:
data = split_X_y_all(train, test, validation, selected_label='is_fake_news_label', all_labels=label_names)

In [33]:
data.train.features = add_features(data.train.X)



In [34]:
data.test.features = add_features(data.test.X)



In [35]:
data.validation.features = add_features(data.validation.X)



In [36]:
label_names

['fb_ad_15_reaction_count_label',
 'fb_ad_15_comment_count_label',
 'fb_ad_15_share_count_label',
 'fb_popularity_ad_15_label',
 'is_fake_news_label']

In [37]:
from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import TaggedDocument


import multiprocessing

In [38]:
def tokenize_to_file(data, file):
    with open(file, 'w', encoding='utf-8') as f:
        for i in Pbar(data):
            f.write(f"{' '.join(tokenize(i))}\n")

In [39]:
cores = multiprocessing.cpu_count()
print(f'>>> {cores} cores available')

>>> 12 cores available


In [40]:
tokenize_to_file(data.train.X.body, './data/train_body_tokenized.txt')
tokenize_to_file(data.test.X.body, './data/test_body_tokenized.txt')
tokenize_to_file(data.validation.X.body, './data/validation_body_tokenized.txt')



In [41]:
d2v = Doc2Vec(corpus_file='./data/train_body_tokenized.txt', vector_size=300, min_count=2, epochs=15, workers=12)

2020-04-27 11:30:50,863 : INFO : collecting all words and their counts
2020-04-27 11:30:50,864 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2020-04-27 11:30:51,711 : INFO : collected 90408 word types and 8099 unique tags from a corpus of 8099 examples and 2789006 words
2020-04-27 11:30:51,712 : INFO : Loading a fresh vocabulary
2020-04-27 11:30:51,878 : INFO : effective_min_count=2 retains 51892 unique words (57% of original 90408, drops 38516)
2020-04-27 11:30:51,878 : INFO : effective_min_count=2 leaves 2750490 word corpus (98% of original 2789006, drops 38516)
2020-04-27 11:30:52,056 : INFO : deleting the raw counts dictionary of 90408 items
2020-04-27 11:30:52,059 : INFO : sample=0.001 downsamples 9 most-common words
2020-04-27 11:30:52,060 : INFO : downsampling leaves estimated 2721940 word corpus (99.0% of prior 2750490)
2020-04-27 11:30:52,213 : INFO : estimated required memory for 51892 words and 300 dimensions: 160205600 bytes
2020-04-27 11:3

In [42]:
def infer_d2v(d2v_model, data_file):
    
    res = []
    
    with open(data_file, 'r', encoding='utf-8') as f:
        for i in Pbar(f.readlines()):
            res.append(d2v_model.infer_vector(i.split(' '), steps=20, alpha=0.025)) 
    
    return res

def infer_for_df(df, d2v_model, data_file):
    lst = infer_d2v(d2v_model, data_file)
    d2v_df = pd.DataFrame(lst, index=df.index, columns=[f'd2v_{i}' for i in range(1, 301)] )
    
    return pd.concat([df, d2v_df], axis=1, sort=False)

In [43]:
data.train.features = infer_for_df(data.train.features, d2v, './data/train_body_tokenized.txt')



In [44]:
data.test.features = infer_for_df(data.test.features, d2v, './data/test_body_tokenized.txt')



In [45]:
from sklearn.metrics import classification_report

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [46]:
def fit_predict(clf, data):
    clf.fit(data.train.features, data.train.y)
    return clf.predict(data.test.features)

In [47]:
label_names

['fb_ad_15_reaction_count_label',
 'fb_ad_15_comment_count_label',
 'fb_ad_15_share_count_label',
 'fb_popularity_ad_15_label',
 'is_fake_news_label']

In [48]:
data.train.switch_label('is_fake_news_label')
data.test.switch_label('is_fake_news_label')

In [49]:
data.train.features['media_count_total'] = pd.to_numeric(data.train.features['media_count_total'])
data.train.features['image_count'] = pd.to_numeric(data.train.features['image_count'])
data.train.features['video_count'] = pd.to_numeric(data.train.features['video_count'])

data.test.features['media_count_total'] = pd.to_numeric(data.test.features['media_count_total'])
data.test.features['image_count'] = pd.to_numeric(data.test.features['image_count'])
data.test.features['video_count'] = pd.to_numeric(data.test.features['video_count'])

data.train.features['av_claims_false'] = pd.to_numeric(data.train.features['av_claims_false'])
data.train.features['av_claims_mostly_false'] = pd.to_numeric(data.train.features['av_claims_mostly_false'])
data.train.features['av_claims_mixture'] = pd.to_numeric(data.train.features['av_claims_mixture'])
data.train.features['av_claims_mostly_true'] = pd.to_numeric(data.train.features['av_claims_mostly_true'])
data.train.features['av_claims_true'] = pd.to_numeric(data.train.features['av_claims_true'])
data.train.features['av_claims_unknown'] = pd.to_numeric(data.train.features['av_claims_unknown'])


data.test.features['av_claims_false'] = pd.to_numeric(data.test.features['av_claims_false'])
data.test.features['av_claims_mostly_false'] = pd.to_numeric(data.test.features['av_claims_mostly_false'])
data.test.features['av_claims_mixture'] = pd.to_numeric(data.test.features['av_claims_mixture'])
data.test.features['av_claims_mostly_true'] = pd.to_numeric(data.test.features['av_claims_mostly_true'])
data.test.features['av_claims_true'] = pd.to_numeric(data.test.features['av_claims_true'])
data.test.features['av_claims_unknown'] = pd.to_numeric(data.test.features['av_claims_unknown'])

In [50]:
data.train.features.drop(columns=['perex_word_count', 'perex_char_length'], inplace=True)
data.test.features.drop(columns=['perex_word_count', 'perex_char_length'], inplace=True)

In [51]:
data.train.y = pd.to_numeric(data.train.y)
data.test.y = pd.to_numeric(data.test.y)

In [52]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

In [53]:
classifiers = [
    RandomForestClassifier(n_estimators=100, class_weight='balanced', n_jobs=cores),
    XGBClassifier(n_jobs=cores, seed=RANDOM_STATE),
    GaussianNB(),
    LogisticRegression()
]

pbar_conf = {
    'refresh_rate': 1,
    'length': len(classifiers), 
    'pbar_width': 52,
    'action_names': [i.__class__.__name__ for i in classifiers]
}

predictions = list(Pbar((fit_predict(clf, data) for clf in classifiers), **pbar_conf))

for p in predictions:
    print(classification_report(data.test.y, p))
    print('-' * 54)



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


              precision    recall  f1-score   support

           0       0.88      0.96      0.92      4838
           1       0.93      0.81      0.87      3260

    accuracy                           0.90      8098
   macro avg       0.90      0.88      0.89      8098
weighted avg       0.90      0.90      0.90      8098

------------------------------------------------------
              precision    recall  f1-score   support

           0       0.90      0.94      0.92      4838
           1       0.91      0.84      0.87      3260

    accuracy                           0.90      8098
   macro avg       0.90      0.89      0.90      8098
weighted avg       0.90      0.90      0.90      8098

------------------------------------------------------
              precision    recall  f1-score   support

           0       0.77      0.88      0.82      4838
           1       0.77      0.61      0.68      3260

    accuracy                           0.77      8098
   macro avg      

In [None]:
[====================================================] -- 2 / 2 -- (finished)
              precision    recall  f1-score   support

           0       0.89      0.96      0.92      4838
           1       0.92      0.82      0.87      3260

    accuracy                           0.90      8098
   macro avg       0.91      0.89      0.89      8098
weighted avg       0.90      0.90      0.90      8098

------------------------------------------------------
              precision    recall  f1-score   support

           0       0.90      0.94      0.92      4838
           1       0.90      0.84      0.87      3260

    accuracy                           0.90      8098
   macro avg       0.90      0.89      0.89      8098
weighted avg       0.90      0.90      0.90      8098

------------------------------------------------------

[====================================================] -- 2 / 2 -- (finished)
              precision    recall  f1-score   support

           0       0.83      0.88      0.85      4838
           1       0.80      0.73      0.76      3260

    accuracy                           0.82      8098
   macro avg       0.81      0.80      0.81      8098
weighted avg       0.82      0.82      0.82      8098

------------------------------------------------------
              precision    recall  f1-score   support

           0       0.82      0.88      0.85      4838
           1       0.81      0.72      0.76      3260

    accuracy                           0.82      8098
   macro avg       0.82      0.80      0.81      8098
weighted avg       0.82      0.82      0.82      8098

------------------------------------------------------

In [54]:
display_all(pd.DataFrame((i for i in classifiers[0].feature_importances_), index=data.train.features.columns, columns=['importance']).sort_values(by=['importance'], ascending=False))

Unnamed: 0,importance
fb_ad_0_share_count,0.037344
fb_ad_0_reaction_count,0.032782
title_char_length,0.030757
fb_ad_0_comment_count,0.026981
content_word_count,0.025531
fb_popularity_ad_0,0.024881
title_word_count,0.021971
media_count_total,0.021888
image_count,0.021173
fb_ad_1_reaction_count,0.020109


In [None]:
data.train.y