In [69]:
import importlib

import common
import util

importlib.reload(common)
importlib.reload(util)

<module 'util' from 'C:\\Users\\kamko\\projects\\fiit_masters_thesis_project\\jupyter\\util.py'>

In [70]:
%matplotlib inline

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sqlalchemy

from common import create_engine
from common import display_all
from common import figsize
from common import save_df, load_df

from util import split_data


from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters() # converters e.g. for datetime in plots

In [3]:
RANDOM_STATE = 123
np_random = np.random.RandomState(RANDOM_STATE)

### Load dataset

In [4]:
df = load_df('final_data.pickle')

In [6]:
df.head()

Unnamed: 0,id,url,title,perex,body,published_at,extracted_at,source_id,category,other_info,...,source_id.1,source_name,source_url,source_type,source_is_reliable,fb_sync_date,fb_reaction_count,fb_comment_count,fb_share_count,fb_popularity
0,355323,http://blog.healthadvocate.com/2013/09/low-cos...,Low-cost workout essentials,<p>People are often under the assumption that ...,People are often under the assumption that wor...,2013-09-20 15:46:28,2019-09-06 04:03:23.886423,223,{Fitness},"{'tags': [], 'updated_at': '2018-03-02T11:43:39'}",...,223,healthadvocate.com,http://healthadvocate.com,news_website,1,2019-10-14 21:53:24.384129,0,0,0,0
1,355818,http://blog.healthadvocate.com/2016/05/be-prep...,Be prepared! Essential inventory for your firs...,<p>Summertime often means more time outside at...,Summertime often means more time outside at th...,2016-05-29 16:33:38,2019-09-06 04:09:33.886336,223,"{""Health Tips"",""Health Information"",""Safety Ti...","{'tags': [], 'updated_at': '2018-03-02T11:31:38'}",...,223,healthadvocate.com,http://healthadvocate.com,news_website,1,2019-10-14 21:53:28.006933,9,0,7,16
2,281114,https://coconutoil.com/achalasia-improvement-a...,Achalasia: Improvement after taking coconut oil,<p>In slightly over 2 weeks after starting the...,\nby Chin (from Singapore)\nHealingWell.com\nF...,2012-07-03 12:49:50,2019-09-05 09:09:29.068001,173,{News},"{'tags': ['Achalasia', 'coconut oil', 'GERD'],...",...,173,coconutoil.com,http://coconutoil.com,news_website,0,2019-10-14 18:12:07.579041,0,0,0,0
3,281513,https://coconutoil.com/dr-brownstein-national-...,Dr. Brownstein: National Panel Reverses Idioti...,<p>As reported in the New York Times (2.20.15)...,\nby Dr. Brownstein\nDrbrownstein.com\nAs repo...,2015-02-24 18:05:23,2019-09-05 09:10:44.903351,173,{News},"{'tags': ['cholesterol'], 'updated_at': '2015-...",...,173,coconutoil.com,http://coconutoil.com,news_website,0,2019-10-14 18:12:17.327260,0,0,0,0
4,281124,https://coconutoil.com/drug-side-effect-discov...,Drug Side Effect Discovery from Online Patient...,<p>We find a highly significant correlation be...,"\nby Jingjing Liu, Alice Li and Stephanie Sene...",2012-07-24 07:11:27,2019-09-05 09:09:32.477872,173,{News},"{'tags': ['cholesterol', 'statins'], 'updated_...",...,173,coconutoil.com,http://coconutoil.com,news_website,0,2019-10-14 18:12:07.579198,0,0,0,0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 160897 entries, 0 to 168190
Data columns (total 21 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   id                  160897 non-null  int64         
 1   url                 160897 non-null  object        
 2   title               160897 non-null  object        
 3   perex               137532 non-null  object        
 4   body                160668 non-null  object        
 5   published_at        160897 non-null  datetime64[ns]
 6   extracted_at        160897 non-null  datetime64[ns]
 7   source_id           160897 non-null  int64         
 8   category            133794 non-null  object        
 9   other_info          160893 non-null  object        
 10  author_name         160897 non-null  object        
 11  source_id           160897 non-null  int64         
 12  source_name         160897 non-null  object        
 13  source_url          160897 no

-----

Rozdelenie hodnot popularity do 5 skupin

- `0 - 0.5`
- `0.5 - 0.75`
- `0.75 - 0.9`
- `0.9 - 0.95`
- `0.95 - 1`

In [8]:
def add_labels(df, quantiles, column='fb_popularity'):
    df = df.copy()
    label_str = f'{column}_label'
    
    df[label_str] = -1
    
    label = 1    
    for i in range(len(quantiles) - 1):
        low = df[column].quantile(quantiles[i])
        high = df[column].quantile(quantiles[i + 1])
        
        df.loc[(low <= df[column]) & (df[column] <= high), label_str] = label
        
        label += 1
    
    return df.copy()

In [9]:
quantiles = [
    0,
    .50,
    .75,
    .90,
    .95,
    1
]

cols = [
    'fb_reaction_count',
    'fb_comment_count',
    'fb_share_count',
    'fb_popularity'    
]

for i in cols:
    print(df[i].quantile(quantiles))
    df = add_labels(df, quantiles, column=i)

0.00          0.0
0.50          0.0
0.75         50.0
0.90        436.0
0.95       1301.0
1.00    3929532.0
Name: fb_reaction_count, dtype: float64
0.00         0.0
0.50         0.0
0.75         7.0
0.90        85.0
0.95       290.0
1.00    714968.0
Name: fb_comment_count, dtype: float64
0.00         0.0
0.50         1.0
0.75        33.0
0.90       185.0
0.95       503.0
1.00    572708.0
Name: fb_share_count, dtype: float64
0.00          0.0
0.50          1.0
0.75         99.0
0.90        734.0
0.95       2126.0
1.00    5197586.0
Name: fb_popularity, dtype: float64


Pri jednotlivych zlozkach sme pri tomto rozdeleni nasli len 4 skupiny (lebo 1 == 2)

---

Jednoducha heuristika: ak je zdroj nedoveryhodny tak aj clanok je nedoveryhodny

In [10]:
df['is_fake_news_label'] = df.source_is_reliable.replace({0:1, 1:0})

---

## Feature engineering

dve hlavne skupinky dat

- metadata (rozne)
- obsahove

In [11]:
label_names = list(filter(lambda x: x.endswith('_label'), df.columns))

In [12]:
f_df = pd.DataFrame()

In [13]:
# labely
f_df = pd.concat([f_df] + [df[label_name] for label_name in label_names], axis=1)

In [37]:
features = [
    
]

In [51]:
def add_features(df, f_df):
    f_df = f_df.copy()
    for feature_generator in features:
        f_df = pd.concat([f_df, feature_generator(df)], axis=1)
        
    return f_df

In [52]:
final_df = add_features(df, f_df)

In [53]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 160897 entries, 0 to 168190
Data columns (total 6 columns):
 #   Column                   Non-Null Count   Dtype
---  ------                   --------------   -----
 0   fb_reaction_count_label  160897 non-null  int64
 1   fb_comment_count_label   160897 non-null  int64
 2   fb_share_count_label     160897 non-null  int64
 3   fb_popularity_label      160897 non-null  int64
 4   is_fake_news_label       160897 non-null  int64
 5   abc                      160897 non-null  int64
dtypes: int64(6)
memory usage: 8.6+ MB


## Skalovanie a normalizacia

In [57]:
# todo: find out if needed

---

## Rozdelenie dat

In [75]:
train, test, validation = tuple(split_data(final_df, sizes=[2, 2, 1], shuffle=True, np_random=np_random))

In [76]:
print([len(i) for i in [train,test,validation]])

[64360, 64358, 32179]
