In [65]:
import importlib

import common

importlib.reload(common)

<module 'common' from 'C:\\Users\\j\\projects\\adaptive-web-project\\popularity\\common.py'>

In [66]:
%matplotlib inline

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sqlalchemy
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier



from common import create_engine
from common import display_all
from common import figsize
from common import save_df
from common import save_model, read_model


from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters() # converters e.g. for datetime in plots

In [6]:
engine = create_engine('db-conf.json', 'local')
pd.sql = lambda sql: pd.read_sql(sqlalchemy.text(sql), con=engine)

In [14]:
# get data from one source (NN)

fetch_data_query = '''
WITH fb_popularity AS (SELECT sbq.url, sbq.sync_date, sbq.reaction_count, sbq.comment_count, sbq.share_count
                       FROM (
                                SELECT afe.*, row_number() OVER (PARTITION BY url ORDER BY sync_date) as rn
                                FROM article_fb_engagement afe) sbq
                       WHERE sbq.rn = 1
)
SELECT a.id,
       a.url,
       title,
       perex,
       body,
       published_at,
       extracted_at,
       a.source_id,
       category,
       other_info,
       aut.name                                       as author_name,
       s.id                                           as source_id,
       s.name                                         as source_name,
       s.url                                          as source_url,
       stype                                          as source_type,
       is_reliable::integer                           as source_is_reliable,
       sync_date                                      as fb_sync_date,
       reaction_count                                 as fb_reaction_count,
       comment_count                                  as fb_comment_count,
       share_count                                    as fb_share_count,
       (reaction_count + comment_count + share_count) as fb_popularity
FROM article a
         JOIN source s on a.source_id = s.id
         JOIN (SELECT * FROM fb_popularity) p ON a.url = p.url
         JOIN author aut on a.author_id = aut.id
WHERE s.id = 142;
'''

df = pd.sql(fetch_data_query)

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17752 entries, 0 to 17751
Data columns (total 21 columns):
id                    17752 non-null int64
url                   17752 non-null object
title                 17752 non-null object
perex                 17752 non-null object
body                  17752 non-null object
published_at          17752 non-null datetime64[ns]
extracted_at          17752 non-null datetime64[ns]
source_id             17752 non-null int64
category              17752 non-null object
other_info            17752 non-null object
author_name           17752 non-null object
source_id             17752 non-null int64
source_name           17752 non-null object
source_url            17752 non-null object
source_type           17752 non-null object
source_is_reliable    17752 non-null int64
fb_sync_date          17752 non-null datetime64[ns]
fb_reaction_count     17752 non-null int64
fb_comment_count      17752 non-null int64
fb_share_count        17752 non-null 

In [46]:
# our blog has only title and body + label

df = df[['id', 'title', 'body', 'fb_popularity']]
df = df.set_index('id')

get popularity predictions (labels)

In [22]:
df

Unnamed: 0,id,title,body,fb_popularity
0,375231,10 Air Purifying Plants To Remove Harmful Indo...,Now a days air purifying plants have gained sp...,0
1,373614,10 Amazing Benefits Of Matcha Green Tea (Match...,Matcha green tea has a long history in Japanes...,0
2,374690,10 Amazing Benefits Of Okra/Lady’s Finger For ...,Okra is usually available fresh year-round and...,0
3,373633,10 Amazing Reasons to have Sex Daily,Sex really isn’t just about having fun or maki...,0
4,379834,10 Benefits of Himalayan Salt for Health and B...,Before you get to know important benefits of H...,4
...,...,...,...,...
17747,244742,Your Spiritual Right to GMO Labeling,"As you already figured out, I tried to find ou...",0
17748,243930,Your Weekly Food-Like-Product &#8211; Doritos ...,Doritos Pt. 2\n(5.) Red #40\nDyes are complex ...,1
17749,246785,Yummy Mint Chocolate Chip Ice Cream,Mint chocolate chip ice cream is a favorite de...,0
17750,242213,Zika virus in Brazil raises mosquito and vacci...,The Zika virus outbreak in Brazil has been bla...,1


In [24]:
df.fb_popularity.describe()

count     17752.000000
mean         86.906377
std        2294.931460
min           0.000000
25%           0.000000
50%           0.000000
75%           1.000000
max      203855.000000
Name: fb_popularity, dtype: float64

In [25]:
df['label'] = -1

In [31]:
df.fb_popularity.quantile([.65, .85, .95, .98])

0.65      0.00
0.85      4.00
0.95     45.00
0.98    409.84
Name: fb_popularity, dtype: float64

In [35]:
df.loc[df.fb_popularity >= 0, 'label'] = 1
df.loc[df.fb_popularity >= 4, 'label'] = 2
df.loc[df.fb_popularity >= 45, 'label'] = 3
df.loc[df.fb_popularity >= 409, 'label'] = 4

In [37]:
df.label.value_counts()

1    15007
2     1851
3      538
4      356
Name: label, dtype: int64

In [39]:
df = df.drop(columns=['fb_popularity'])

In [48]:
df

Unnamed: 0_level_0,title,body,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
375231,10 Air Purifying Plants To Remove Harmful Indo...,Now a days air purifying plants have gained sp...,1
373614,10 Amazing Benefits Of Matcha Green Tea (Match...,Matcha green tea has a long history in Japanes...,1
374690,10 Amazing Benefits Of Okra/Lady’s Finger For ...,Okra is usually available fresh year-round and...,1
373633,10 Amazing Reasons to have Sex Daily,Sex really isn’t just about having fun or maki...,1
379834,10 Benefits of Himalayan Salt for Health and B...,Before you get to know important benefits of H...,2
...,...,...,...
244742,Your Spiritual Right to GMO Labeling,"As you already figured out, I tried to find ou...",1
243930,Your Weekly Food-Like-Product &#8211; Doritos ...,Doritos Pt. 2\n(5.) Red #40\nDyes are complex ...,1
246785,Yummy Mint Chocolate Chip Ice Cream,Mint chocolate chip ice cream is a favorite de...,1
242213,Zika virus in Brazil raises mosquito and vacci...,The Zika virus outbreak in Brazil has been bla...,1


In [51]:
from sklearn.model_selection import train_test_split

target_df = df

y = target_df['label']
X = target_df.drop(columns='label')

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)

In [55]:
X_train

Unnamed: 0_level_0,title,body
id,Unnamed: 1_level_1,Unnamed: 2_level_1
247218,Prescription Drugs That Whack Your Thyroid,Many prescription drugs make life hard for you...
240192,Five Herbal Remedies for Skin Allergies,"Your skin may become red, itchy, irritated, bu..."
241688,Severe childhood asthma linked to new gene mut...,"A new study, which involved the collaborative ..."
372903,10 Healing Foods to Include in the Diet,More and more people – within the healthcare c...
240174,Calm menstrual trouble with blackstrap molasses,Blackstrap molasses may sound funny when said ...
...,...,...
374049,12 Symptoms Of Autoimmune Disease! How To Reve...,Autoimmune conditions affect over 50 million A...
379993,Top 6 Far-reaching Benefits of Edamame &#8211;...,It is highly certain that almost all of us are...
247063,This Tea May Be The Solution to Your Bloating,Whether from a gastrointestinal issue or from ...
243721,Why you should be drinking Kombucha and how to...,Kombucha is a living health drink that consist...


In [58]:
rdf = RandomForestClassifier()

In [60]:
# rdf.fit(X_train, y_train)

In [67]:
save_model(rdf)