In [28]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from IPython.core.display import display, HTML

from IPython.core.interactiveshell import InteractiveShell
import matplotlib.pyplot as plt
InteractiveShell.ast_node_interactivity = "all"
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
from collections import Counter


In [72]:
df = pd.read_csv("data/df_story_features_large_Jul29.csv")

In [494]:
# Print columns
# df.columns.tolist()

# display(HTML(df.head(1).to_html()))

# Generate binary dependent variable (for logit)

In [4]:
df['isPopular'] = (df['ClapCount_Story'] < df['ClapCount_Story'].median()).astype(int)

# Split train / test (80/20)

In [110]:
mask = np.random.rand(len(df)) < 1
train = df[mask]
test = df[~mask]

In [43]:
xFeatures = ['isPaywall', 'isPublication', 'WordNum', 'SentimentPolarity', 'SentimentSubjectivity',  "TextSyllableNum",
          "TextLexiconNum", "TextSentenceNum", "ReadabilityFleschEase", "ReadabilitySMOG", "ReadabilityFleschKincaid", "ReadabilityColemanLiau", "ReadabilityARI", "ReadabilityDaleChall", "ReadabilityDifficultWordsNum", "ReadabilityLinsearWriteFormula", "ReadabilityGunningFog", "ReadabilityReadingTime", "ReadabilityConsensus", 
        'ReadingTime', 'HasFeaturedImage', 'CodeInlineNum', 'TagNum',
        'CodeBlockNum', 'CodeBlockLengthSum', 'CodeBlockLengthMedian', 'CodeBlockLengthMean',
        'CodeBlockLengthStd', 'CodeBlockLengthMin', 'CodeBlockLengthMax',
        'ListOlNum', 'ListOlSum', 'ListOlMedian', 'ListOlMean', 'ListOlStd', 'ListOlMin', 'ListOlMax', 
        'ListUlNum', 'ListUlSum', 'ListUlMedian', 'ListUlMean', 'ListUlStd', 'ListUlMin', 'ListUlMax', 
        'ImgNum', 'LinkNum',
        'HlightNum', 'ParagraphNum', 'ItalicNum', 'BoldNum', 
        'AuthorArticlesClap_mean','AuthorArticlesClap_median','AuthorArticlesClap_count','AuthorArticlesClap_sum',
        'AuthorArticlesResponse_mean','AuthorArticlesResponse_median','AuthorArticlesResponse_sum',
        'AuthorArticlesVoter_mean','AuthorArticlesVoter_median','AuthorArticlesVoter_sum',
        
        'TagUseSum', 'TagUseMean', 'TagUseMedian', 
        'TagClapSum','TagClapMean', 'TagClapMedian',  
        'PublicationClap_mean','PublicationClap_median','PublicationClap_count','PublicationClap_sum',
        'PublicationReadingTime_mean','PublicationReadingTime_median','PublicationReadingTime_sum',
        'PublicationVoter_mean','PublicationVoter_median','PublicationVoter_sum',
        'PublicationisPaywall_mean','PublicationisPaywall_median','PublicationisPaywall_sum',]

yFeatures = 'ClapCount_Story' # For linreg
# yFeatures = 'isPopular' # For logreg

xTrain = train[xFeatures]
yTrain = train[yFeatures]

xTest = test[xFeatures]
yTest = test[yFeatures]

In [111]:
features = ["isPaywall","isPublication",
"SentimentPolarity","SentimentSubjectivity",
"WordNum","TextSyllableNum","TextLexiconNum","TextSentenceNum",
"ReadabilityFleschEase","ReadabilitySMOG","ReadabilityFleschKincaid","ReadabilityColemanLiau","ReadabilityARI","ReadabilityDaleChall","ReadabilityDifficultWordsNum","ReadabilityLinsearWriteFormula","ReadabilityGunningFog","ReadabilityReadingTime","ReadabilityConsensus",
"ReadingTime","HasFeaturedImage","CodeInlineNum","TagNum","CodeBlockNum",
"CodeBlockLengthSum","CodeBlockLengthMedian","CodeBlockLengthMean","CodeBlockLengthStd","CodeBlockLengthMin","CodeBlockLengthMax",
"ListOlNum","ListOlSum","ListOlMedian","ListOlMean","ListOlStd","ListOlMin","ListOlMax",
"ListUlNum","ListUlSum","ListUlMedian","ListUlMean","ListUlStd","ListUlMin","ListUlMax","ImgNum","LinkNum","HlightNum",
"ParagraphNum","ItalicNum","BoldNum",
# "AuthorArticlesClap_mean","AuthorArticlesClap_median","AuthorArticlesClap_count","AuthorArticlesClap_sum","AuthorArticlesResponse_mean",
"AuthorArticlesClap_count",
# "AuthorArticlesResponse_median","AuthorArticlesResponse_sum",
"AuthorArticlesVoter_mean","AuthorArticlesVoter_median","AuthorArticlesVoter_sum",
"TagUseSum","TagUseMean","TagUseMedian","TagClapSum","TagClapMean","TagClapMedian",
# "PublicationClap_mean","PublicationClap_median",
"PublicationClap_count","PublicationClap_sum","PublicationReadingTime_mean","PublicationReadingTime_median","PublicationReadingTime_sum","PublicationVoter_mean","PublicationVoter_median","PublicationVoter_sum","PublicationisPaywall_mean","PublicationisPaywall_median","PublicationisPaywall_sum"

]

depvar = 'ClapCount_Story'

xTrain = train[features]
yTrain = train[depvar]

xTest = test[features]
yTest = test[depvar]

# Investigate distribution of response variable

In [77]:
yTrain.median()
df[['VoterCount']].median()
len(df[df['VoterCount']<2])/len(df)

5.0

VoterCount    2.0
dtype: float64

0.46517206959914703

# Check NA columns

In [58]:
pd.set_option('display.max_rows', 1000)
display((df[df['TagNum']==0]))
xTrain.isna().any()

isPaywall                         False
isPublication                     False
WordNum                           False
SentimentPolarity                 False
SentimentSubjectivity             False
TextSyllableNum                   False
TextLexiconNum                    False
TextSentenceNum                   False
ReadabilityFleschEase             False
ReadabilitySMOG                   False
ReadabilityFleschKincaid          False
ReadabilityColemanLiau            False
ReadabilityARI                    False
ReadabilityDaleChall              False
ReadabilityDifficultWordsNum      False
ReadabilityLinsearWriteFormula    False
ReadabilityGunningFog             False
ReadabilityReadingTime            False
ReadabilityConsensus              False
ReadingTime                       False
HasFeaturedImage                  False
CodeInlineNum                     False
TagNum                            False
CodeBlockNum                      False
CodeBlockLengthSum                False


# Model data

In [112]:
reg = LinearRegression().fit(xTrain, yTrain)

In [113]:
reg.score(xTrain,yTrain)

0.07631876354359224

# statsmodel

In [47]:
Counter(df['isPublication'])
print(len(df[df['Company'].isnull()]))
print(df.sample()['Company'])

Counter({False: 44050, True: 35984})

0
64669    0
Name: Company, dtype: object


In [106]:
mod = sm.OLS(yTrain,xTrain.astype(float))
fii = mod.fit()

p_values = fii.pvalues
print(p_values[p_values<0.05].sort_values(ascending=True))
print(fii.summary())

PublicationClap_mean            6.836945e-64
ParagraphNum                    1.557539e-52
CodeInlineNum                   7.696876e-23
ReadingTime                     5.984957e-15
PublicationisPaywall_sum        2.297522e-12
PublicationVoter_median         2.313637e-10
LinkNum                         3.793769e-10
TextSyllableNum                 2.799455e-09
CodeBlockLengthSum              3.089377e-09
CodeBlockLengthMax              9.296785e-09
ReadabilityReadingTime          1.080890e-07
TagUseMedian                    2.408881e-06
PublicationReadingTime_sum      8.722049e-06
CodeBlockLengthStd              2.649779e-05
ReadabilitySMOG                 4.109375e-05
CodeBlockLengthMin              6.657543e-05
PublicationReadingTime_mean     2.109230e-04
TagClapSum                      3.996589e-04
TextLexiconNum                  4.330973e-04
PublicationClap_count           5.137085e-04
PublicationVoter_sum            6.751645e-04
CodeBlockNum                    9.392410e-04
TagNum    

# Logistic Regression

In [24]:
from sklearn.linear_model import LogisticRegression

In [25]:
logit = LogisticRegression()
logit.fit(xTrain,yTrain)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [26]:
logit.score(xTrain,yTrain)
importance = logit.coef_[0]
# summarize feature importance
# for i,v in enumerate(importance):
# 	print('Feature: %0d, Score: %.5f' % (i,v))


0.7158671586715867

In [27]:
# Get AUC
from sklearn import metrics

yPreds = logit.predict(xTest)
print("Predict", Counter(yPreds))
# print(type(yPreds))
print("Truth", Counter(yTest))

# print(yPreds)
fpr, tpr, thresholds = metrics.roc_curve(yTest, yPreds)
print(metrics.auc(fpr, tpr))

Predict Counter({1: 220, 0: 163})
Truth Counter({0: 201, 1: 182})
0.6960800393636215
