In [1]:
import pandas as pd
import numpy as np

### Import Data

In [2]:
def _load_scrape(csv_path):
    """Read one scrape CSV and return it without its 'Unnamed: 0' column."""
    return pd.read_csv(csv_path).drop(columns='Unnamed: 0')


# One frame per scrape snapshot, in file-name (timestamp) order.
scrape_1 = _load_scrape('../data/hot_scrape_20220526_184845.csv')
scrape_2 = _load_scrape('../data/hot_scrape_20220527_153856.csv')
scrape_3 = _load_scrape('../data/hot_scrape_20220528_112955.csv')
scrape_4 = _load_scrape('../data/hot_scrape_20220529_173947.csv')
scrape_5 = _load_scrape('../data/hot_scrape_20220530_104237.csv')

In [3]:
# Print the (rows, columns) shape of each raw scrape, in load order.
for raw_scrape in (scrape_1, scrape_2, scrape_3, scrape_4, scrape_5):
    print(raw_scrape.shape)

(7385, 24)
(7580, 24)
(7216, 24)
(7318, 24)
(8216, 24)


In [4]:
# Combine the first ROWS_PER_SCRAPE rows of every scrape into one frame with a
# fresh 0..n-1 index. pd.concat replaces the chained DataFrame.append calls,
# which are deprecated since pandas 1.4 and removed in pandas 2.0.
ROWS_PER_SCRAPE = 2000

scrapes = pd.concat(
    [raw_scrape.iloc[:ROWS_PER_SCRAPE, :]
     for raw_scrape in (scrape_1, scrape_2, scrape_3, scrape_4, scrape_5)],
    ignore_index=True,
)

In [5]:
# Confirm the (rows, columns) shape of the combined frame.
scrapes.shape

(10000, 24)

In [6]:
# Preview the first three rows of the combined frame.
scrapes.head(3)

Unnamed: 0,title,subreddit,time_sampled,created_utc,num_comments,author,author_premium,domain,edited,gilded,...,locked,over_18,post_id,score,spoiler,subreddit_id,subreddit_subscribers,upvote_ratio,post_age,post_age_min
0,I got new teeth today after having bad teeth m...,MadeMeSmile,1653609000.0,1653593000.0,2008,MrChris680,True,reddit.com,False,2,...,False,False,uyfchr,60494,False,t5_2uqcm,5491453,0.94,15546.906511,259.115109
1,My avocado today was 11/10.,oddlysatisfying,1653609000.0,1653591000.0,440,180secondideas,False,i.redd.it,False,0,...,False,False,uyek2k,26369,False,t5_2x93b,6922967,0.92,17679.90751,294.665125
2,Husband of teacher killed in Texas school shoo...,news,1653609000.0,1653589000.0,1855,twolf1,False,fox10phoenix.com,False,0,...,False,False,uydosg,27941,False,t5_2qh3l,24714016,0.96,20065.90751,334.431792


### Remove duplicate titles

In [7]:
# Keep the first occurrence of every title and renumber the rows.
# reset_index returns a new frame, so its result has to be assigned; the
# previous bare call discarded it and left gaps in the index.
scrapes = scrapes.drop_duplicates(subset=['title']).reset_index(drop=True)

### Handle Nulls

In [8]:
# (column, null count) for every column that contains at least one null.
null_counts = scrapes.isnull().sum()
[(col, null_counts[col]) for col in scrapes.columns if null_counts[col] > 0]

[('author', 3), ('author_premium', 3)]

In [9]:
# Discard rows missing either author field, then renumber the index.
scrapes = (
    scrapes
    .dropna(subset=['author', 'author_premium'])
    .reset_index(drop=True)
)

### Create Target Column

In [10]:
# Binary target: 1 when a post has more comments than the median post, else 0.
target_val = scrapes.num_comments.median()

# Vectorised comparison instead of a per-row lambda. Series.rename returns a
# new Series, so it is chained here; the earlier stand-alone call had no effect.
target = (scrapes['num_comments'] > target_val).astype(int).rename('target')

scrapes['target'] = target

In [11]:
# num_comments has been encoded into `target`, so remove the raw column.
scrapes = scrapes.drop(columns=['num_comments'])

### Review Correlations

In [12]:
# Correlation of every numeric/boolean feature with the target, as a single
# 'target' row. Restricting to numeric and bool dtypes first keeps this working
# on pandas >= 2.0, where DataFrame.corr() no longer silently skips object
# columns. Selecting by label replaces the position-dependent iloc[-1:, :-1].
numeric_features = scrapes.select_dtypes(include=['number', 'bool'])
numeric_features.corr().loc[['target']].drop(columns='target')

Unnamed: 0,time_sampled,created_utc,gilded,is_meta,is_original_content,is_reddit_media_domain,is_self,locked,over_18,score,spoiler,subreddit_subscribers,upvote_ratio,post_age,post_age_min
target,-0.064856,-0.098165,0.125162,,0.00493,-0.1588,0.156907,0.02136,0.03762,0.284643,0.030616,0.225188,-0.347287,0.292775,0.292775


In [13]:
# Remove the features whose correlation with the target lies in (-0.10, 0.10).
weakly_correlated = ['created_utc', 'is_original_content', 'locked', 'over_18', 'spoiler']
scrapes = scrapes.drop(columns=weakly_correlated)

In [14]:
# Remove the time_sampled column.
scrapes = scrapes.drop(columns=['time_sampled'])

In [15]:
# score and upvote_ratio measure post success and are outside our control,
# so they are removed from the feature set.
success_measures = ['score', 'upvote_ratio']
scrapes = scrapes.drop(columns=success_measures)

In [16]:
# Remove unique identifiers and the duplicative post_age column
# (post_age_min is kept).
ids_and_duplicates = ['post_id', 'subreddit_id', 'post_age']
scrapes = scrapes.drop(columns=ids_and_duplicates)

### Initial EDA

In [17]:
# List the remaining columns and their dtypes before per-column review.
scrapes.dtypes

title                      object
subreddit                  object
author                     object
author_premium             object
domain                     object
edited                     object
gilded                      int64
is_meta                      bool
is_reddit_media_domain       bool
is_self                      bool
subreddit_subscribers       int64
post_age_min              float64
target                      int32
dtype: object

#### Review `title`

In [18]:
# Summary statistics of title length, in characters.
title_lengths = scrapes['title'].map(len)
title_lengths.describe()

count    9553.000000
mean       49.946718
std        45.573350
min         1.000000
25%        21.000000
50%        36.000000
75%        63.000000
max       300.000000
Name: title, dtype: float64

**Observation:** There are no empty titles in our dataset (the minimum title length is 1 character). Additional cleaning will be completed during NLP.

#### Review `subreddit`

In [19]:
# Distribution of the per-subreddit sum of `target` (number of target == 1 posts).
# Selecting 'target' before aggregating avoids summing every other column,
# including the text columns.
scrapes.groupby('subreddit')['target'].sum().describe()

count    2486.000000
mean        1.904264
std         3.049769
min         0.000000
25%         0.000000
50%         1.000000
75%         3.000000
max        24.000000
Name: target, dtype: float64

In [20]:
# Index of subreddits whose count of target successes is more than 3 standard
# deviations above the mean. The cut-off is computed from the data rather than
# hard-coded; for the distribution shown above (mean ~1.90, std ~3.05) it is
# ~11.05, which selects the same subreddits as the former literal `> 11`
# because the counts are integers.
# NOTE(review): this feature is derived from the target over the whole dataset;
# if a train/test split happens downstream it leaks label information - confirm.
subreddit_successes = scrapes.groupby('subreddit')['target'].sum()
subreddit_cutoff = subreddit_successes.mean() + 3 * subreddit_successes.std()
three_sigma_subreddits = subreddit_successes[subreddit_successes > subreddit_cutoff].index

In [21]:
# 0/1 flag: 1 when the post's subreddit is in three_sigma_subreddits.
# Series.isin replaces the per-row lambda, and rename is chained because it
# returns a new Series (the earlier stand-alone call had no effect).
is_successful_subreddit = (
    scrapes['subreddit']
    .isin(three_sigma_subreddits)
    .astype(int)
    .rename('is_successful_subreddit')
)

scrapes['is_successful_subreddit'] = is_successful_subreddit

In [22]:
# The raw subreddit name is replaced by is_successful_subreddit; remove it.
scrapes = scrapes.drop(columns=['subreddit'])

#### Review `author`

In [23]:
# Number of distinct authors (a missing value would count as one).
scrapes['author'].nunique(dropna=False)

8477

In [24]:
# author is close to one distinct value per row, so the column is removed.
scrapes = scrapes.drop(columns=['author'])

#### Review `author_premium`

In [25]:
# Inspect the distinct values stored in author_premium.
scrapes.author_premium.unique()

array([True, False], dtype=object)

In [26]:
# Share of rows for each author_premium value (normalize=True gives proportions).
scrapes.author_premium.value_counts(normalize=True)

False    0.839422
True     0.160578
Name: author_premium, dtype: float64

In [27]:
# Correlation between the target and author_premium cast to 0/1.
# Series.corr compares just these two columns, so it avoids the drop/join
# round-trip and a whole-frame DataFrame.corr(), which raises on pandas >= 2.0
# while object columns such as title are still present.
scrapes['target'].corr(scrapes['author_premium'].astype(int))

0.11281181972388561

In [28]:
# Remove author_premium from the feature set.
# NOTE(review): its correlation with the target (~0.11) is outside the
# (-0.10, 0.10) band used earlier as the drop criterion - confirm this is intended.
scrapes = scrapes.drop(columns=['author_premium'])

#### Review `domain`

In [29]:
# Number of distinct domains (a missing value would count as one).
scrapes['domain'].nunique(dropna=False)

489

In [30]:
# Distribution of the per-domain sum of `target` (number of target == 1 posts).
# Selecting 'target' before aggregating avoids summing every other column,
# including the text columns.
scrapes.groupby('domain')['target'].sum().describe()

count     489.000000
mean        9.680982
std       119.291953
min         0.000000
25%         1.000000
50%         1.000000
75%         2.000000
max      2461.000000
Name: target, dtype: float64

In [31]:
# Index of domains whose count of target successes is more than 3 standard
# deviations above the mean. The cut-off is computed from the data rather than
# hard-coded; for the distribution shown above (mean ~9.68, std ~119.29) it is
# ~367.56, which selects the same domains as the former literal `> 367`
# because the counts are integers.
# NOTE(review): this feature is derived from the target over the whole dataset;
# if a train/test split happens downstream it leaks label information - confirm.
domain_successes = scrapes.groupby('domain')['target'].sum()
domain_cutoff = domain_successes.mean() + 3 * domain_successes.std()
three_sigma_domains = domain_successes[domain_successes > domain_cutoff].index

In [32]:
# 0/1 flag: 1 when the post's domain is in three_sigma_domains.
# Series.isin replaces the per-row lambda, and rename is chained because it
# returns a new Series (the earlier stand-alone call had no effect).
is_successful_domain = (
    scrapes['domain']
    .isin(three_sigma_domains)
    .astype(int)
    .rename('is_successful_domain')
)

scrapes['is_successful_domain'] = is_successful_domain

In [33]:
# The raw domain is replaced by is_successful_domain; remove it.
scrapes = scrapes.drop(columns=['domain'])

#### Review `edited`

In [34]:
# Peek at the first ten distinct values of edited.
scrapes.edited.unique()[:10]

array(['False', '1653605084.0', '1653586491.0', '1653605587.0',
       '1653598322.0', '1653585974.0', '1653581994.0', '1653598639.0',
       '1653587977.0', '1653600605.0'], dtype=object)

In [35]:
# Collapse edited into a 0/1 flag: 0 for the literal string 'False',
# 1 for any other value. The column keeps its position and is renamed is_edited.
is_edited = scrapes['edited'].ne('False').astype(int)

scrapes = scrapes.rename(columns={'edited': 'is_edited'})
scrapes['is_edited'] = is_edited

In [36]:
# Correlation between the target and the is_edited flag.
# Series.corr compares just these two columns; a whole-frame DataFrame.corr()
# raises on pandas >= 2.0 while the object column title is still present.
scrapes['target'].corr(scrapes['is_edited'])

0.09931148186932341

In [37]:
# Remove is_edited from the feature set.
scrapes = scrapes.drop(columns=['is_edited'])

#### Review `gilded`

In [38]:
# Share of rows for each gilded count (normalize=True gives proportions).
scrapes.gilded.value_counts(normalize=True)

0    0.967445
1    0.027740
2    0.003664
4    0.000523
3    0.000523
5    0.000105
Name: gilded, dtype: float64

In [39]:
# Remove gilded from the feature set.
# NOTE(review): presumably dropped because ~97% of rows are 0 - confirm.
scrapes = scrapes.drop(columns=['gilded'])

#### Review `is_meta`

In [40]:
# Inspect the distinct values stored in is_meta.
scrapes.is_meta.unique()

array([False])

In [41]:
# is_meta holds a single value (False) for every row, so remove it.
scrapes = scrapes.drop(columns=['is_meta'])

#### Review `is_reddit_media_domain`

In [42]:
# Share of rows for each is_reddit_media_domain value (normalize=True gives proportions).
scrapes.is_reddit_media_domain.value_counts(normalize=True)

True     0.775359
False    0.224641
Name: is_reddit_media_domain, dtype: float64

In [43]:
# Store the boolean flag as 0/1 integers.
scrapes['is_reddit_media_domain'] = scrapes['is_reddit_media_domain'].astype(int)

#### Review `is_self`

In [44]:
# Share of rows for each is_self value (normalize=True gives proportions).
scrapes.is_self.value_counts(normalize=True)

False    0.950801
True     0.049199
Name: is_self, dtype: float64

In [45]:
# Correlation between the target and is_self (cast to 0/1).
# Series.corr compares just these two columns; a whole-frame DataFrame.corr()
# raises on pandas >= 2.0 while the object column title is still present.
scrapes['target'].corr(scrapes['is_self'].astype(int))

0.15690703388659485

In [46]:
# Remove is_self from the feature set.
# NOTE(review): its correlation with the target (~0.16) is outside the
# (-0.10, 0.10) band used earlier as the drop criterion - confirm this is intended.
scrapes = scrapes.drop(columns=['is_self'])

#### Final dataframe pre-NLP

In [47]:
# Reorder the columns so that target comes first; the rest keep their order.
feature_columns = [col for col in scrapes.columns if col != 'target']
scrapes = scrapes[['target'] + feature_columns]

del feature_columns

In [48]:
# Final check of column order and dtypes before export.
scrapes.dtypes

target                       int32
title                       object
is_reddit_media_domain       int32
subreddit_subscribers        int64
post_age_min               float64
is_successful_subreddit      int32
is_successful_domain         int32
dtype: object

In [49]:
# Write the cleaned frame to CSV; index = False leaves the row index out of the file.
scrapes.to_csv('../data/pre_nlp_data.csv', index = False)