In [1]:
import importlib

import common

importlib.reload(common)

<module 'common' from 'C:\\Users\\j\\projects\\adaptive-web-project\\popularity\\common.py'>

In [2]:
%matplotlib inline

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sqlalchemy
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, precision_score, recall_score, make_scorer



from common import create_engine
from common import display_all
from common import figsize
from common import save_df
from common import save_model, read_model, create_features


from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters() # converters e.g. for datetime in plots

In [3]:
engine = create_engine('db-conf.json', 'local')
pd.sql = lambda sql: pd.read_sql(sqlalchemy.text(sql), con=engine)

In [4]:
# get data from one source (NN)

fetch_data_query = '''
WITH fb_popularity AS (SELECT sbq.url, sbq.sync_date, sbq.reaction_count, sbq.comment_count, sbq.share_count
                       FROM (
                                SELECT afe.*, row_number() OVER (PARTITION BY url ORDER BY sync_date) as rn
                                FROM article_fb_engagement afe) sbq
                       WHERE sbq.rn = 1
)
SELECT a.id,
       a.url,
       title,
       perex,
       body,
       published_at,
       extracted_at,
       a.source_id,
       category,
       other_info,
       aut.name                                       as author_name,
       s.id                                           as source_id,
       s.name                                         as source_name,
       s.url                                          as source_url,
       stype                                          as source_type,
       is_reliable::integer                           as source_is_reliable,
       sync_date                                      as fb_sync_date,
       reaction_count                                 as fb_reaction_count,
       comment_count                                  as fb_comment_count,
       share_count                                    as fb_share_count,
       (reaction_count + comment_count + share_count) as fb_popularity
FROM article a
         JOIN source s on a.source_id = s.id
         JOIN (SELECT * FROM fb_popularity) p ON a.url = p.url
         JOIN author aut on a.author_id = aut.id
WHERE s.id = 145;
'''

df = pd.sql(fetch_data_query)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20328 entries, 0 to 20327
Data columns (total 21 columns):
id                    20328 non-null int64
url                   20328 non-null object
title                 20328 non-null object
perex                 0 non-null object
body                  20328 non-null object
published_at          20328 non-null datetime64[ns]
extracted_at          20328 non-null datetime64[ns]
source_id             20328 non-null int64
category              0 non-null object
other_info            20328 non-null object
author_name           20328 non-null object
source_id             20328 non-null int64
source_name           20328 non-null object
source_url            20328 non-null object
source_type           20328 non-null object
source_is_reliable    20328 non-null int64
fb_sync_date          20328 non-null datetime64[ns]
fb_reaction_count     20328 non-null int64
fb_comment_count      20328 non-null int64
fb_share_count        20328 non-null int64
fb

In [6]:
# our blog has only title and body + label

df = df[['id', 'title', 'body', 'fb_popularity']]
df = df.set_index('id')

get popularity predictions (labels)

In [7]:
df

Unnamed: 0_level_0,title,body,fb_popularity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
322528,Flashback: Local pharmacies donate free vitami...,\n\n\n(NaturalNews) Recognizing the dismal he...,0
322519,Are you chucking used batteries in the trash? ...,\n\n\n(NaturalNews) The average person genera...,0
322499,California state police and prison guard allie...,\n\n\n(NaturalNews) As California prepares to...,0
322523,While insisting marijuana has no medicinal val...,\n\n\n(NaturalNews) Hypocrisy on the part of ...,0
322495,Study: Brain Stem Cells can be activated and r...,\n\n\n (Natural News)\n The many health benef...,2
...,...,...,...
427883,The Mediterranean diet can stop overeating and...,\n\n\n (Natural News)\n The Mediterranean die...,37
427861,The black ridge oak from Asia found to inhibit...,\n\n\n (Natural News)\n Black ridge oak (Quer...,28
427858,The Greta Thunberg phenomenon is nothing but a...,\n\n\n (Natural News)\n Many Americans are sc...,115
427874,Study reveals surprising link between your ski...,\n\n\n (Natural News)\n The human body is mor...,74


In [8]:
df.fb_popularity.describe()

count    2.032800e+04
mean     1.246525e+03
std      4.193087e+04
min      0.000000e+00
25%      1.800000e+01
50%      8.900000e+01
75%      2.530000e+02
max      5.197586e+06
Name: fb_popularity, dtype: float64

In [9]:
df['label'] = -1

In [10]:
df.fb_popularity.quantile([.65, .85, .95, .98])

0.65     162.00
0.85     465.00
0.95    1667.60
0.98    4653.84
Name: fb_popularity, dtype: float64

In [11]:
df.loc[df.fb_popularity <= 4653, 'label'] = 4
df.loc[df.fb_popularity <= 1667, 'label'] = 3
df.loc[df.fb_popularity <= 465, 'label'] = 2
df.loc[df.fb_popularity <= 162, 'label'] = 1

In [12]:
df.label.value_counts()

 1    13223
 2     4062
 3     2026
 4      610
-1      407
Name: label, dtype: int64

In [13]:
df = df.drop(columns=['fb_popularity'])

In [14]:
df

Unnamed: 0_level_0,title,body,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
322528,Flashback: Local pharmacies donate free vitami...,\n\n\n(NaturalNews) Recognizing the dismal he...,1
322519,Are you chucking used batteries in the trash? ...,\n\n\n(NaturalNews) The average person genera...,1
322499,California state police and prison guard allie...,\n\n\n(NaturalNews) As California prepares to...,1
322523,While insisting marijuana has no medicinal val...,\n\n\n(NaturalNews) Hypocrisy on the part of ...,1
322495,Study: Brain Stem Cells can be activated and r...,\n\n\n (Natural News)\n The many health benef...,1
...,...,...,...
427883,The Mediterranean diet can stop overeating and...,\n\n\n (Natural News)\n The Mediterranean die...,1
427861,The black ridge oak from Asia found to inhibit...,\n\n\n (Natural News)\n Black ridge oak (Quer...,1
427858,The Greta Thunberg phenomenon is nothing but a...,\n\n\n (Natural News)\n Many Americans are sc...,1
427874,Study reveals surprising link between your ski...,\n\n\n (Natural News)\n The human body is mor...,1


In [15]:
df_with_features = create_features(df)

In [16]:
df_with_features['label'] = df['label'].array

In [17]:
from sklearn.model_selection import train_test_split

target_df = df_with_features

y = target_df['label']
X = target_df.drop(columns='label')

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)

In [18]:
X_train

Unnamed: 0,content_length,number_of_words_in_title,number_of_words_in_content,title_sentiment_polarity,title_sentiment_subjectivity,content_sentiment_polarity,content_sentiment_subjectivity
1018,4059,14,651,-0.195833,0.433333,0.454447,0.454447
13160,4415,13,722,0.300000,0.562500,0.548730,0.548730
9873,3412,20,528,0.411111,0.633333,0.402730,0.402730
2822,4075,16,691,-0.500000,0.875000,0.499228,0.499228
10688,4142,17,721,0.000000,0.125000,0.422559,0.422559
...,...,...,...,...,...,...,...
7763,2865,11,478,0.000000,1.000000,0.510815,0.510815
15377,4281,16,737,-0.500000,1.000000,0.368299,0.368299
17730,4634,16,759,-0.150000,0.250000,0.504259,0.504259
15725,4175,17,668,0.059524,0.351190,0.478026,0.478026


In [19]:
rdf = RandomForestClassifier()

In [20]:
rdf.fit(X_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [21]:
y_pred = rdf.predict(X_test)

In [22]:
print(confusion_matrix(y_test, y_pred))

[[   2   80   11    3    0]
 [   5 2998  240   64    7]
 [   0  914   68   18    5]
 [   2  465   31   11    4]
 [   4  132   11    5    2]]


In [23]:
accuracy_score(y_test, y_pred)

0.6062573789846517

In [24]:
f1_score(y_test, y_pred, average='macro')

0.19080873400812587

In [25]:
save_model(rdf)

In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20328 entries, 322528 to 427845
Data columns (total 3 columns):
title    20328 non-null object
body     20328 non-null object
label    20328 non-null int64
dtypes: int64(1), object(2)
memory usage: 635.2+ KB


In [27]:
df.sort_values(by='label', ascending=False)

Unnamed: 0_level_0,title,body,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
307628,Celebrity trainer warns: Sodas are worse than ...,"\n\n\n (Natural News)\n Just like cigarettes,...",4
247270,Medical police state cuts off research funding...,\n\n\n (Natural News)\n When it comes to the ...,4
269945,REPORT: Elizabeth Warren’s great-great-great g...,\n\n\n (Natural News)\n If you thought diggin...,4
297004,Dishonest CDC caught hiding data proving that ...,\n\n\n (Natural News)\n One of the Left’s big...,4
250325,How the National Popular Vote Interstate Compa...,\n\n\n (Natural News)\n It’s no longer diffic...,4
...,...,...,...
315329,Children need to be allowed to play in the dir...,\n\n\n (Natural News)\n While most parents wo...,-1
312715,Blood pressure can be significantly lowered in...,\n\n\n (Natural News)\n A study found that ev...,-1
313496,Swiss cheese found to contain powerful probiot...,\n\n\n (Natural News)\n Researchers have foun...,-1
253857,"Probiotics reduce the need for antibiotics, ne...",\n\n\n (Natural News)\n Probiotics are a cert...,-1


In [28]:
xx = X_test.copy()

In [29]:
xx['label'] = y_pred

In [30]:
xx[xx.label == 4]

Unnamed: 0,content_length,number_of_words_in_title,number_of_words_in_content,title_sentiment_polarity,title_sentiment_subjectivity,content_sentiment_polarity,content_sentiment_subjectivity,label
149,11234,19,2033,-0.155556,0.288889,0.52822,0.52822,4
8211,8636,15,1494,0.0,0.0,0.40907,0.40907,4
10917,7421,19,1215,0.25,0.333333,0.464348,0.464348,4
14952,7266,18,1261,0.175325,0.548701,0.432347,0.432347,4
14008,5906,18,985,-0.375,0.458333,0.496249,0.496249,4
16125,11392,17,1931,-0.5,0.55,0.421956,0.421956,4
3288,5567,25,884,0.0,0.0,0.379819,0.379819,4
4811,6536,20,1176,-0.321429,0.528571,0.464888,0.464888,4
15658,14498,22,2437,-0.7,0.85,0.425801,0.425801,4
10753,8007,20,1304,-0.857143,0.928571,0.433895,0.433895,4


In [34]:
from common import normalize
normalize(df.iloc[16712].body)



In [35]:
df.iloc[16712].title

'The REAL WAR for the future of humanity: Democrats increasingly possessed by demonic forces that seek the extermination of humanity?'

In [33]:
y_train

1018     1
13160    1
9873     4
2822     1
10688    3
        ..
7763     1
15377    1
17730    2
15725    2
19966    2
Name: label, Length: 15246, dtype: int64