In [1]:
import importlib

import common

importlib.reload(common)

<module 'common' from 'C:\\Users\\j\\projects\\adaptive-web-project\\popularity\\common.py'>

In [2]:
%matplotlib inline

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sqlalchemy
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, precision_score, recall_score, make_scorer



from common import create_engine
from common import display_all
from common import figsize
from common import save_df
from common import save_model, read_model, create_features


from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters() # converters e.g. for datetime in plots

In [3]:
engine = create_engine('db-conf.json', 'local')
pd.sql = lambda sql: pd.read_sql(sqlalchemy.text(sql), con=engine)

In [4]:
# get data from one source (NN)

fetch_data_query = '''
WITH fb_popularity AS (SELECT sbq.url, sbq.sync_date, sbq.reaction_count, sbq.comment_count, sbq.share_count
                       FROM (
                                SELECT afe.*, row_number() OVER (PARTITION BY url ORDER BY sync_date) as rn
                                FROM article_fb_engagement afe) sbq
                       WHERE sbq.rn = 1
)
SELECT a.id,
       a.url,
       title,
       perex,
       body,
       published_at,
       extracted_at,
       a.source_id,
       category,
       other_info,
       aut.name                                       as author_name,
       s.id                                           as source_id,
       s.name                                         as source_name,
       s.url                                          as source_url,
       stype                                          as source_type,
       is_reliable::integer                           as source_is_reliable,
       sync_date                                      as fb_sync_date,
       reaction_count                                 as fb_reaction_count,
       comment_count                                  as fb_comment_count,
       share_count                                    as fb_share_count,
       (reaction_count + comment_count + share_count) as fb_popularity
FROM article a
         JOIN source s on a.source_id = s.id
         JOIN (SELECT * FROM fb_popularity) p ON a.url = p.url
         JOIN author aut on a.author_id = aut.id
WHERE s.id = 145;
'''

df = pd.sql(fetch_data_query)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20328 entries, 0 to 20327
Data columns (total 21 columns):
id                    20328 non-null int64
url                   20328 non-null object
title                 20328 non-null object
perex                 0 non-null object
body                  20328 non-null object
published_at          20328 non-null datetime64[ns]
extracted_at          20328 non-null datetime64[ns]
source_id             20328 non-null int64
category              0 non-null object
other_info            20328 non-null object
author_name           20328 non-null object
source_id             20328 non-null int64
source_name           20328 non-null object
source_url            20328 non-null object
source_type           20328 non-null object
source_is_reliable    20328 non-null int64
fb_sync_date          20328 non-null datetime64[ns]
fb_reaction_count     20328 non-null int64
fb_comment_count      20328 non-null int64
fb_share_count        20328 non-null int64
fb

In [6]:
# our blog has only title and body + label

df = df[['id', 'title', 'body', 'fb_popularity']]
df = df.set_index('id')

get popularity predictions (labels)

In [7]:
df

Unnamed: 0_level_0,title,body,fb_popularity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
322528,Flashback: Local pharmacies donate free vitami...,\n\n\n(NaturalNews) Recognizing the dismal he...,0
322519,Are you chucking used batteries in the trash? ...,\n\n\n(NaturalNews) The average person genera...,0
322499,California state police and prison guard allie...,\n\n\n(NaturalNews) As California prepares to...,0
322523,While insisting marijuana has no medicinal val...,\n\n\n(NaturalNews) Hypocrisy on the part of ...,0
322495,Study: Brain Stem Cells can be activated and r...,\n\n\n (Natural News)\n The many health benef...,2
...,...,...,...
427883,The Mediterranean diet can stop overeating and...,\n\n\n (Natural News)\n The Mediterranean die...,37
427861,The black ridge oak from Asia found to inhibit...,\n\n\n (Natural News)\n Black ridge oak (Quer...,28
427858,The Greta Thunberg phenomenon is nothing but a...,\n\n\n (Natural News)\n Many Americans are sc...,115
427874,Study reveals surprising link between your ski...,\n\n\n (Natural News)\n The human body is mor...,74


In [8]:
df.fb_popularity.describe()

count    2.032800e+04
mean     1.246525e+03
std      4.193087e+04
min      0.000000e+00
25%      1.800000e+01
50%      8.900000e+01
75%      2.530000e+02
max      5.197586e+06
Name: fb_popularity, dtype: float64

In [9]:
df['label'] = 1

In [10]:
df.fb_popularity.quantile([.65, .85, .95])

0.65     162.0
0.85     465.0
0.95    1667.6
Name: fb_popularity, dtype: float64

In [11]:
df.loc[df.fb_popularity > 162, 'label'] = 2
df.loc[df.fb_popularity > 465, 'label'] = 3
df.loc[df.fb_popularity > 1667, 'label'] = 4

In [12]:
df.label.value_counts()

1    13223
2     4062
3     2026
4     1017
Name: label, dtype: int64

In [13]:
df = df.drop(columns=['fb_popularity'])

In [14]:
df

Unnamed: 0_level_0,title,body,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
322528,Flashback: Local pharmacies donate free vitami...,\n\n\n(NaturalNews) Recognizing the dismal he...,1
322519,Are you chucking used batteries in the trash? ...,\n\n\n(NaturalNews) The average person genera...,1
322499,California state police and prison guard allie...,\n\n\n(NaturalNews) As California prepares to...,1
322523,While insisting marijuana has no medicinal val...,\n\n\n(NaturalNews) Hypocrisy on the part of ...,1
322495,Study: Brain Stem Cells can be activated and r...,\n\n\n (Natural News)\n The many health benef...,1
...,...,...,...
427883,The Mediterranean diet can stop overeating and...,\n\n\n (Natural News)\n The Mediterranean die...,1
427861,The black ridge oak from Asia found to inhibit...,\n\n\n (Natural News)\n Black ridge oak (Quer...,1
427858,The Greta Thunberg phenomenon is nothing but a...,\n\n\n (Natural News)\n Many Americans are sc...,1
427874,Study reveals surprising link between your ski...,\n\n\n (Natural News)\n The human body is mor...,1


In [15]:
df_with_features = create_features(df)

In [16]:
df_with_features['label'] = df['label'].array

In [17]:
from sklearn.model_selection import train_test_split

target_df = df_with_features

y = target_df['label']
X = target_df.drop(columns='label')

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)

In [18]:
X_train

Unnamed: 0,content_length,number_of_words_in_title,number_of_words_in_content,title_sentiment_polarity,title_sentiment_subjectivity,content_sentiment_polarity,content_sentiment_subjectivity
1018,4059,14,651,-0.195833,0.433333,0.454447,0.454447
13160,4415,13,722,0.300000,0.562500,0.548730,0.548730
9873,3412,20,528,0.411111,0.633333,0.402730,0.402730
2822,4075,16,691,-0.500000,0.875000,0.499228,0.499228
10688,4142,17,721,0.000000,0.125000,0.422559,0.422559
...,...,...,...,...,...,...,...
7763,2865,11,478,0.000000,1.000000,0.510815,0.510815
15377,4281,16,737,-0.500000,1.000000,0.368299,0.368299
17730,4634,16,759,-0.150000,0.250000,0.504259,0.504259
15725,4175,17,668,0.059524,0.351190,0.478026,0.478026


In [19]:
rdf = RandomForestClassifier()

In [20]:
rdf.fit(X_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [21]:
y_pred = rdf.predict(X_test)

In [22]:
print(confusion_matrix(y_test, y_pred))

[[2986  244   68   16]
 [ 912   71   17    5]
 [ 457   34   13    9]
 [ 210   22    6   12]]


In [23]:
y_test.value_counts()

1    3314
2    1005
3     513
4     250
Name: label, dtype: int64

In [24]:
accuracy_score(y_test, y_pred)

0.6064541519086973

In [25]:
f1_score(y_test, y_pred, average='macro')

0.24637326200328424

In [26]:
save_model(rdf)

In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20328 entries, 322528 to 427845
Data columns (total 3 columns):
title    20328 non-null object
body     20328 non-null object
label    20328 non-null int64
dtypes: int64(1), object(2)
memory usage: 635.2+ KB


In [28]:
df.sort_values(by='label', ascending=False)

Unnamed: 0_level_0,title,body,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
252844,The top 10 immune system KILLERS of all time,\n\n\n (Natural News)\n Most health-conscious...,4
252860,The new “organic” meat: HUNTING animals sudden...,\n\n\n (Natural News)\n If you’re worried abo...,4
277708,"It’s TREASON! Pelosi, Schumer, Schiff and Warn...",\n\n\n (Natural News)\n It’s time to call for...,4
310930,VIDEO: Health Ranger demands FBI stop lying to...,\n\n\n (Natural News)\n The FBI is clearly ly...,4
309121,How this couple earns a six figure annual inco...,\n\n\n (Natural News)\n Can small-scale farmi...,4
...,...,...,...
314795,Dicamba “drift” destroying crops downwind from...,\n\n\n (Natural News)\n It’s one of the most ...,1
314781,Antibiotic drugs found to cause harmful damage...,\n\n\n (Natural News)\n A meta-analysis publi...,1
314783,Link confirmed between antidepressant use duri...,\n\n\n (Natural News)\n A recent study by res...,1
314707,CDC sounds alarm over exploding diabetes epide...,\n\n\n (Natural News)\n The figures in the la...,1


In [29]:
xx = X_test.copy()

In [35]:
xx['label'] = y_pred
xx['label2'] = y_test

In [36]:
xx[xx.label == 4]

Unnamed: 0,content_length,number_of_words_in_title,number_of_words_in_content,title_sentiment_polarity,title_sentiment_subjectivity,content_sentiment_polarity,content_sentiment_subjectivity,label,label2
9725,10732,22,1806,0.0,0.0,0.355554,0.355554,4,4
4110,5821,14,993,0.0,1.0,0.510719,0.510719,4,3
3668,5716,25,952,0.016667,0.283333,0.388026,0.388026,4,3
11277,7901,21,1326,0.0,0.0,0.360838,0.360838,4,3
13484,6503,16,1109,0.25,0.55,0.455623,0.455623,4,3
17714,5790,25,955,0.016234,0.192208,0.413775,0.413775,4,1
149,11234,19,2033,-0.155556,0.288889,0.52822,0.52822,4,1
8211,8636,15,1494,0.0,0.0,0.40907,0.40907,4,2
10917,7421,19,1215,0.25,0.333333,0.464348,0.464348,4,2
13872,4849,15,770,0.10101,0.351515,0.493905,0.493905,4,4


In [37]:
from common import normalize
normalize(df.iloc[13872].body)

'     (Natural News)  Young, healthy women who experience amenorrhea, ovarian failure, and infertility have likely been damaged by Merck’s HPV vaccine. A new study published in the Journal of Toxicology and Environmental Health analyzed data on pregnancy outcomes, comparing women who received an HPV vaccine with those who did not. The data analysis is titled, “A lowered probability of pregnancy in females in the USA aged 25 who received a human papillomavirus vaccine injection.” The study, uninfluenced by pharmaceutical money, analyzed data on 8 million women aged 25-29 living in the U.S. from 2007 and 2014. According to the analysis, conception rates would have fallen by 2 million if 100 percent of the females in the study would have received the HPV vaccine. The study warns that the HPV vaccine has a negative influence on fertility and more research is “warranted.” Despite mounting evidence of harm, Merck seeks to fast-track Gardasil on new age group Despite these grave concerns, the

In [38]:
df.iloc[13872].title

'New study proves the FDA is actively suppressing information about the harmful effects of Gardasil'

In [34]:
y_train

1018     1
13160    1
9873     4
2822     1
10688    3
        ..
7763     1
15377    1
17730    2
15725    2
19966    2
Name: label, Length: 15246, dtype: int64