In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
import sys

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sqlalchemy

from common import create_engine
from common import display_all
from common import figsize
from common import save_df, load_df
from common import save_session, load_session


from util import split_X_y_all, split_X_y, split_data, empty_features, column_feature, str_contains

from pbar import Pbar

from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters() # converters e.g. for datetime in plots

In [3]:
SESSION_FILE_NAME = 'method-impl.session-db'

load_session(SESSION_FILE_NAME)

In [4]:
RANDOM_STATE = 123
np_random = np.random.RandomState(RANDOM_STATE)

### Load dataset

In [5]:
df = load_df('final_data.pickle')

In [6]:
df.head()

Unnamed: 0,id,url,title,perex,body,published_at,extracted_at,source_id,category,other_info,...,source_id.1,source_name,source_url,source_type,source_is_reliable,fb_sync_date,fb_reaction_count,fb_comment_count,fb_share_count,fb_popularity
0,235048,https://www.naturalnewsblogs.com/the-secret-to...,The Secret to Happiness &#8211; Revealed!,<p>&#8220;Why are you so happy all of the time...,“Why are you so happy all of the time?”\nI am ...,2013-08-07 12:36:20,2019-09-05 06:45:28.429274,142,"[Health, Mental Health]","{'tags': ['choices', 'depression', 'happiness'...",...,142,naturalnewsblogs.com,http://naturalnewsblogs.com,news_website,0,2019-10-14 15:29:44.601408,0,0,0,0
1,235036,https://www.naturalnewsblogs.com/us-government...,US government claims 100% ownership over all y...,<p>(NaturalNews) The United States government ...,(NaturalNews) The United States government cla...,2013-05-19 18:50:37,2019-09-05 06:45:27.633650,142,"[Health, News, Science, Weird, Biotechnology, ...","{'tags': ['gene patents', 'genetic slavery', '...",...,142,naturalnewsblogs.com,http://naturalnewsblogs.com,news_website,0,2019-10-14 15:29:44.601408,0,0,0,0
2,235039,https://www.naturalnewsblogs.com/angelina-joli...,Angelina Jolie copied by men! Surgeons now cut...,<p>(NaturalNews) Beyond merely inspiring women...,(NaturalNews) Beyond merely inspiring women to...,2013-05-20 18:54:30,2019-09-05 06:45:28.083048,142,"[Health, Science, Weird, Celebrity, Hospitals ...","{'tags': ['Angelina Jolie', 'BRCA genes', 'pro...",...,142,naturalnewsblogs.com,http://naturalnewsblogs.com,news_website,0,2019-10-14 15:29:44.601408,0,0,0,0
3,235041,https://www.naturalnewsblogs.com/natural-news-...,Natural News releases latest laboratory test r...,"<p>(NaturalNews) As Natural News readers know,...","(NaturalNews) As Natural News readers know, we...",2013-05-23 19:47:30,2019-09-05 06:45:28.167110,142,"[Health, Food, Nutrition]","{'tags': ['clean chlorella', 'heavy metals', '...",...,142,naturalnewsblogs.com,http://naturalnewsblogs.com,news_website,0,2019-10-14 15:29:44.601408,0,0,0,0
4,235052,https://www.naturalnewsblogs.com/food-insecuri...,Food Insecurity: A Solution Grows Under Your Feet,<p>Food insecurity is a serious problem that p...,Food insecurity is a serious problem that plag...,2013-08-08 10:12:31,2019-09-05 06:45:28.607943,142,"[Health, Nutritional Medicine, Food, Nutrition]","{'tags': ['broadleaf plantain', 'food insecuri...",...,142,naturalnewsblogs.com,http://naturalnewsblogs.com,news_website,0,2019-10-14 15:29:44.601408,0,0,0,0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 164450 entries, 0 to 171743
Data columns (total 23 columns):
id                    164450 non-null int64
url                   164450 non-null object
title                 164450 non-null object
perex                 140260 non-null object
body                  164221 non-null object
published_at          164450 non-null datetime64[ns]
extracted_at          164450 non-null datetime64[ns]
source_id             164450 non-null int64
category              136428 non-null object
other_info            164446 non-null object
image_count           164450 non-null int64
video_count           164450 non-null int64
author_name           164450 non-null object
source_id             164450 non-null int64
source_name           164450 non-null object
source_url            164450 non-null object
source_type           164450 non-null object
source_is_reliable    164450 non-null int64
fb_sync_date          164450 non-null datetime64[ns]
fb_reaction_coun

-----

Rozdelenie hodnot popularity do 5 skupin

- `0 - 0.5`
- `0.5 - 0.75`
- `0.75 - 0.9`
- `0.9 - 0.95`
- `0.95 - 1`

In [8]:
def add_labels(df, quantiles, column='fb_popularity'):
    df = df.copy()
    label_str = f'{column}_label'
    
    df[label_str] = -1
    
    label = 1    
    for i in range(len(quantiles) - 1):
        low = df[column].quantile(quantiles[i])
        high = df[column].quantile(quantiles[i + 1])
        
        df.loc[(low <= df[column]) & (df[column] <= high), label_str] = int(label)
        
        label += 1
    
    return df.copy()

In [9]:
quantiles = [
    0,
    .50,
    .75,
    .90,
    .95,
    1
]

cols = [
    'fb_reaction_count',
    'fb_comment_count',
    'fb_share_count',
    'fb_popularity'    
]

for i in cols:
    print(df[i].quantile(quantiles))
    df = add_labels(df, quantiles, column=i)

0.00          0.00
0.50          0.00
0.75         53.00
0.90        459.00
0.95       1369.55
1.00    3929532.00
Name: fb_reaction_count, dtype: float64
0.00         0.0
0.50         0.0
0.75         8.0
0.90        92.0
0.95       310.0
1.00    797490.0
Name: fb_comment_count, dtype: float64
0.00         0.0
0.50         1.0
0.75        35.0
0.90       194.0
0.95       529.0
1.00    572708.0
Name: fb_share_count, dtype: float64
0.00          0.00
0.50          2.00
0.75        105.00
0.90        771.00
0.95       2251.55
1.00    5197586.00
Name: fb_popularity, dtype: float64


Pri jednotlivych zlozkach sme pri tomto rozdeleni nasli len 4 skupiny (lebo 1 == 2)

---

Jednoducha heuristika: ak je zdroj nedoveryhodny tak aj clanok je nedoveryhodny

In [10]:
df['is_fake_news_label'] = df.source_is_reliable.replace({0:1, 1:0})

---

### data cleanup

In [11]:
# clear body, perex, etc from html....

In [12]:
# drop rows without body
df = df[~df.body.isnull()]
df = df[~df.title.isnull()]

## Feature engineering

dve hlavne skupinky dat

- metadata (rozne)
- obsahove

In [13]:
label_names = list(filter(lambda x: x.endswith('_label'), df.columns))

In [37]:
for ln in label_names:
    df[ln] = pd.to_numeric(df[ln])

In [38]:
f_df = pd.DataFrame()

In [39]:
# labely
f_df = pd.concat([f_df] + [df[label_name] for label_name in label_names], axis=1)

df.info()

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
from textblob import TextBlob

In [17]:
def title_basic_features(df):
    content_cv = CountVectorizer()
    data = content_cv.fit_transform(df.body)

    res = pd.DataFrame(index=df.index)
    
    res['title_word_count'] = data.sum(axis=1)
    res['title_char_length'] = df.title.apply(lambda x: len(x))
    
    return res

def title_sentiment(df):
    return empty_features(df)

In [18]:
def content_basic_features(df):
    content_cv = CountVectorizer()
    # todo preprocess body
    data = content_cv.fit_transform(df.body)

    res = pd.DataFrame(index=df.index)
    
    res['content_word_count'] = data.sum(axis=1)
    res['content_char_length'] = df.body.apply(lambda x: len(x))
    
    return res

def content_sentiment(df):
    return empty_features(df)

In [19]:
def media_count_total(df):
    res = pd.DataFrame(index=df.index)
    
    res['media_count_total'] = df['image_count'] + df['video_count']
    
    return res
    
def media_count_image(df):
    return column_feature(df, 'image_count')

def media_count_video(df):
    return column_feature(df, 'video_count')

In [20]:
def published_on_day(df):
    res = pd.DataFrame(index=df.index)
    
    res['published_on_day'] = df.published_at.dt.weekday + 1
    
    return res

In [21]:
def is_collective_author(df):
    
    uniq_source_names = df.source_name.unique()
    def make_a_guess(author_name):

        return any((
                    str_contains(author_name, 'admin', case=False),
                    author_name.startswith('Neuroscience News Posts Science Research News Labs Universities Hospitals News Departments Around The World'),
                    author_name in ['Neuroscience News',
                                    'Wake Up World',
                                    'Health Sciences Institute',
                                    'REALdeal', 
                                    'nmheditor',
                                    'The Mind Unleashed',
                                    'Thinking Moms\' Revolution',
                                    'TheNewsDoctors',
                                    'clnews',
                                    'Associated Press',
                                    'HealthDay',
                                    'Infowars',
                                    'Natural News Editors',
                                    'https://www.facebook.com/WebMD',
                                    'naturalnews', 'peakconsciousness', 'HealingwithoutHurting',
                                    'HealthNutNews.com',
                                   ],
                    author_name.startswith('The Associated Press'),
                    # ' and ' in author_name, # todo: je to kolektivny autor ak ich je len viac?
                    author_name in uniq_source_names,   
        ))
    
    res = pd.DataFrame(index=df.index)
    
    res['is_collective_author'] = df.author_name.map(make_a_guess)
    
    return res

In [22]:
features = [
    title_basic_features,
    title_sentiment,
    
    content_basic_features,
    content_sentiment,
    
    media_count_total,
    media_count_image,
    media_count_video,
    
    published_on_day,
    is_collective_author,
]

In [23]:
def add_features(df, f_df):
    f_df = f_df.copy()
    
    pbar_conf = {
        'refresh_rate': 1,
        'action_names': [i.__name__ for i in features]
    }
    
    for feature_generator in Pbar(features, **pbar_conf):
        f_df = pd.concat([f_df, feature_generator(df)], axis=1)
        
    return f_df

In [24]:
final_df = add_features(df, f_df)



In [25]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 164221 entries, 0 to 171743
Data columns (total 14 columns):
fb_reaction_count_label    164221 non-null int64
fb_comment_count_label     164221 non-null int64
fb_share_count_label       164221 non-null int64
fb_popularity_label        164221 non-null int64
is_fake_news_label         164221 non-null int64
title_word_count           164221 non-null int64
title_char_length          164221 non-null int64
content_word_count         164221 non-null int64
content_char_length        164221 non-null int64
media_count_total          164221 non-null int64
image_count                164221 non-null int64
video_count                164221 non-null int64
published_on_day           164221 non-null int64
is_collective_author       164221 non-null bool
dtypes: bool(1), int64(13)
memory usage: 17.7+ MB


---

## Rozdelenie dat

In [26]:
train, test, validation = tuple(split_data(final_df, sizes=[2, 2, 1], shuffle=True, np_random=np_random))

In [27]:
print([len(i) for i in [train,test,validation]])

[65689, 65688, 32844]


In [28]:
label_names

['fb_reaction_count_label',
 'fb_comment_count_label',
 'fb_share_count_label',
 'fb_popularity_label',
 'is_fake_news_label']

In [29]:
data = split_X_y_all(train, test, validation, selected_label='fb_popularity_label', all_labels=label_names)

In [35]:
data.test.y.astype(int)

0        1
1        3
2        2
3        1
4        1
        ..
65683    3
65684    3
65685    2
65686    3
65687    1
Name: fb_popularity_label, Length: 65688, dtype: int32

In [30]:
from sklearn.metrics import classification_report

from sklearn.ensemble import RandomForestClassifier

In [31]:
def fit_predict(clf, data):
    clf.fit(data.train.X, data.train.y)
    return clf.predict(data.test.X)

In [42]:
classifiers = [
    RandomForestClassifier(n_estimators=100),

]

pbar_conf = {
    'refresh_rate': 1,
    'length': len(classifiers), 
    'pbar_width': 52,
    'action_names': [i.__class__.__name__ for i in classifiers]
}

predictions = list(Pbar((fit_predict(clf, data) for clf in classifiers), **pbar_conf))

for p in predictions:
    print(classification_report(data.test.y, p))
    print('-' * 54)

              precision    recall  f1-score   support

           1       0.60      0.80      0.69     32565
           2       0.36      0.31      0.33     16684
           3       0.31      0.20      0.24      9949
           4       0.14      0.04      0.06      3205
           5       0.15      0.04      0.07      3285

    accuracy                           0.51     65688
   macro avg       0.31      0.28      0.28     65688
weighted avg       0.45      0.51      0.47     65688

------------------------------------------------------


In [None]:
save_session(SESSION_FILE_NAME)