# Coding

In [68]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
%matplotlib inline

import math

# Data Pre-Processing

In [54]:
# Directory holding the monthly CSV exports. File names are appended to this
# string by plain concatenation, so the trailing slash is required.
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
path = 'C:/Users/Jasper/Desktop/Studying/Collecting and Analyzing Big Data/Project/Data/'

In [55]:
# Load the five monthly exports (2017-11 through 2018-03), reading every
# column as text so that no value is reinterpreted on the way in.
monthly_files = ('2017-11.csv', '2017-12.csv', '2018-01.csv', '2018-02.csv', '2018-03.csv')
data1, data2, data3, data4, data5 = (
    pd.read_csv(path + file_name, dtype=str) for file_name in monthly_files
)

In [56]:
# The monthly frames do not all carry the same columns, so each one is
# restricted to (and ordered by) the columns of data2.
cols = data2.columns
data1, data2, data3, data4, data5 = (
    frame[cols] for frame in (data1, data2, data3, data4, data5)
)

# Stack the five months row-wise into a single frame
df = pd.concat([data1, data2, data3, data4, data5], axis=0)

In [57]:
# In the following, we will deal with each column / preprocess on its own
df.columns

Index(['Unnamed: 0', 'author', 'author_cakeday', 'author_flair_css_class',
       'author_flair_text', 'body', 'created_utc', 'distinguished', 'edited',
       'id', 'link_id', 'parent_id', 'permalink', 'retrieved_on', 'score',
       'stickied', 'subreddit', 'subreddit_id'],
      dtype='object')

# Setting the Index

In [58]:
# Give the frame a running 0..n-1 index named 'Index' and drop the
# 'Unnamed: 0' column.
df = (
    df.assign(Index=range(df.shape[0]))
      .set_index('Index')
      .drop(columns='Unnamed: 0')
)

# Author

In [59]:
# Print every author entry whose text form is 'nan' (in any letter case).
for author in df['author']:
    if str(author).lower() != 'nan':
        continue
    print(author)

# So, can be left unchanged

# Author Cake Day

In [60]:
def authorcakeday_preprocess(elem):
    """Convert a raw ``author_cakeday`` entry into a boolean.

    The entry is turned into text and single quotes are stripped from it.
    The result is True only when the remaining text is exactly "True";
    anything else (including missing values) gives False.
    """
    cleaned = str(elem).replace("'", "")
    return cleaned == "True"

# Turn the raw cake-day entries into booleans.
df['author_cakeday'] = df['author_cakeday'].apply(authorcakeday_preprocess)

# Cast flair text to str; missing values become the literal text 'nan'.
df['author_flair_text'] = df['author_flair_text'].astype(str)

# NOTE: a loop of the form `for elem in ...: elem = elem.capitalize()` used to
# follow here. It only rebound its loop variable and never wrote anything back
# to the column, so it was removed; the flair text is kept in its original
# letter case. If capitalised flair is wanted, assign the result of
# `.str.capitalize()` back to the column explicitly.

# Author Flair CSS Class

In [61]:
# I think we can drop this column altogether
#df['author_flair_css_class'].unique()

# Author Flair Text

In [62]:
df['author_flair_text'].unique()

array(['nan', 'Redditor for 1 month.', 'Student', ...,
       'Developing https://alltimehigh.ly', 'Gridcoin fan', '\\/eChain'],
      dtype=object)

# Body 

In [63]:
df['body']

Index
0                                          Because I sold it
1          They probably just airdropped tokens into your...
2                                           soon since $5000
3          Here is a list of threads in other subreddits ...
4          MEW is only set up for ERC20 tokens so make su...
                                 ...                        
2160614               I'm studying thermo-physics thanks bud
2160615                                            [removed]
2160616    So. Sell 10k ZIL for some Matrix Ai Network. Z...
2160617    I ever said anywhere its "dooms day".  Lots of...
2160618    Hey Bane, I'm just going to keep posting these...
Name: body, Length: 2160619, dtype: object

# Created utc

In [64]:
# Convert created_utc from epoch seconds to datetime
import datetime

# Drop rows without a creation time: after the cast to str, missing values
# show up as the text 'nan'.
df['created_utc'] = df['created_utc'].astype(str)

# Compare on lower-cased text so that any letter case of 'nan' is dropped.
# (An earlier element-wise loop calling .lower() only rebound its loop
# variable and never changed the column.)
df = df[df['created_utc'].str.lower() != 'nan']

def epoch_to_time(elem):
    """Convert a Unix timestamp in seconds into a naive UTC ``datetime``.

    Parameters
    ----------
    elem : str or int
        Anything ``int()`` accepts, e.g. ``'1509494409'``.

    Returns
    -------
    datetime.datetime
        Timezone-naive datetime expressing the instant in UTC.

    Raises
    ------
    ValueError
        If ``elem`` cannot be parsed by ``int()``.
    """
    # datetime.utcfromtimestamp() is deprecated since Python 3.12. Build an
    # aware UTC datetime instead and strip tzinfo to return the same naive value.
    aware = datetime.datetime.fromtimestamp(int(elem), tz=datetime.timezone.utc)
    return aware.replace(tzinfo=None)

# Replace each epoch string with its datetime, then show how often each
# timestamp occurs.
df['created_utc'] = df['created_utc'].apply(epoch_to_time)
df['created_utc'].value_counts()

2018-03-15 17:43:16    18
2018-02-07 17:07:49    11
2017-12-30 10:41:43     8
2018-02-13 03:24:25     7
2018-01-16 22:34:21     7
                       ..
2018-03-15 14:42:37     1
2018-01-12 11:03:00     1
2018-02-19 15:42:17     1
2018-01-10 05:18:24     1
2018-01-08 19:36:48     1
Name: created_utc, Length: 1920407, dtype: int64

# Distinguished

In [65]:
df['distinguished'].unique()

def moderator_func(elem):
    """Return True when the entry's text form is exactly 'moderator', else False."""
    return str(elem) == 'moderator'

# Turn 'distinguished' into a boolean moderator flag and expose it under the
# column name 'Moderator'.
is_moderator = df['distinguished'].apply(moderator_func)
df = df.assign(distinguished=is_moderator).rename(columns={'distinguished': 'Moderator'})

# Edited

In [66]:
# Cast to str so that missing values become the text 'nan', which edit_func
# checks for.
df['edited'] = df['edited'].astype(str)


def edit_func(elem):
    """Convert a raw ``edited`` entry into a datetime, or NaN when missing.

    Parameters
    ----------
    elem : str
        Text of the entry: 'nan' (in any letter case) for a missing value,
        otherwise something ``int()`` can parse as a Unix timestamp in seconds.

    Returns
    -------
    datetime.datetime or float
        Naive UTC datetime for a timestamp, ``math.nan`` for a missing value.

    Raises
    ------
    ValueError
        If the text is neither 'nan' nor parseable by ``int()``.
    """
    if elem.lower() == 'nan':
        return math.nan

    # datetime.utcfromtimestamp() is deprecated since Python 3.12. Build an
    # aware UTC datetime instead and strip tzinfo to return the same naive value.
    aware = datetime.datetime.fromtimestamp(int(elem), tz=datetime.timezone.utc)
    return aware.replace(tzinfo=None)

# Replace each entry with its edit datetime (NaN where the entry was missing).
df['edited'] = df['edited'].apply(edit_func)

In [67]:
df['edited'].unique()

array([                          'NaT', '2017-11-01T00:12:54.000000000',
       '2017-11-01T01:59:02.000000000', ...,
       '2018-03-20T05:21:54.000000000', '2018-03-20T05:20:38.000000000',
       '2018-03-20T13:17:02.000000000'], dtype='datetime64[ns]')

# ID 

In [70]:
# can be left as is
df['id']

Index
0          dp61rvp
1          dp61ttv
2          dp61usd
3          dp61v3p
4          dp61wr9
            ...   
2160614    dwlqlrc
2160615    dwlqm07
2160616    dwlqmnm
2160617    dwlqmuz
2160618    dwlqnf1
Name: id, Length: 2160610, dtype: object

# Link ID

In [73]:
df['link_id'].value_counts()

t3_7owmot    15088
t3_7qqddk    12486
t3_7qz1vp    11430
t3_7vcvi0    10163
t3_7ugrgt     9848
             ...  
t3_7v9t36        1
t3_7pia9e        1
t3_7o827a        1
t3_7iwqoi        1
t3_7qnwvr        1
Name: link_id, Length: 98975, dtype: int64

# Parent ID 

In [75]:
df['parent_id'].value_counts()

t3_7owmot     3389
t3_7qqddk     3249
t3_7qz1vp     2674
t3_7pdp8e     2464
t3_7vcvi0     2400
              ... 
t1_dujh19r       1
t1_dufnmky       1
t1_ds2j5lx       1
t1_dsidip9       1
t1_dv6psu7       1
Name: parent_id, Length: 1039463, dtype: int64

# Permalink

In [79]:
df['permalink'].value_counts()

/r/CryptoCurrency/comments/7glaeg/tomorrow_there_will_be_a_lot_of_newbies_here_lets/dql6ih6/    1
/r/CryptoCurrency/comments/7zb9fv/what_is_vertcoin_lightning_network_segwit_enabled/dunrnq0/    1
/r/CryptoCurrency/comments/7j64f7/there_is_so_much_innovation_in_the_space_i_feel/dr4segr/      1
/r/CryptoCurrency/comments/7zlz33/daily_general_discussion_february_23_2018/duq56np/            1
/r/CryptoCurrency/comments/7nzqpm/test/ds5s7ap/                                                 1
                                                                                               ..
/r/CryptoCurrency/comments/7soghc/arsenal_fc_become_first_in_premier_league_to/dt697v6/         1
/r/CryptoCurrency/comments/7s3vio/daily_general_discussion_january_22_2018/dt29a23/             1
/r/CryptoCurrency/comments/7wjsoi/daily_general_discussion_february_10_2018/du1cq4a/            1
/r/CryptoCurrency/comments/7o2o44/welcome_to_the_age_of_monkey_darts/ds6x5zv/                   1
/r/CryptoCurrency/co

# Retrieved On 

In [None]:
# Cast to str so that missing values become the text 'nan', which
# retrieved_func checks for.
df['retrieved_on'] = df['retrieved_on'].astype(str)


def retrieved_func(elem):
    """Convert a raw ``retrieved_on`` entry into a datetime, or NaN when missing.

    Parameters
    ----------
    elem : str
        Text of the entry: 'nan' (in any letter case) for a missing value,
        otherwise something ``int()`` can parse as a Unix timestamp in seconds.

    Returns
    -------
    datetime.datetime or float
        Naive UTC datetime for a timestamp, ``math.nan`` for a missing value.

    Raises
    ------
    ValueError
        If the text is neither 'nan' nor parseable by ``int()``.
    """
    if elem.lower() == 'nan':
        return math.nan

    # datetime.utcfromtimestamp() is deprecated since Python 3.12. Build an
    # aware UTC datetime instead and strip tzinfo to return the same naive value.
    aware = datetime.datetime.fromtimestamp(int(elem), tz=datetime.timezone.utc)
    return aware.replace(tzinfo=None)

# Replace each entry with its retrieval datetime (NaN where the entry was
# missing), then show how often each value occurs.
df['retrieved_on'] = df['retrieved_on'].apply(retrieved_func)
df['retrieved_on'].value_counts()

# Score

In [89]:
# The CSVs were read with dtype=str, so the scores are text; convert them to integers.
df['score'] = df['score'].astype(int)

In [94]:
df['score']

Index
0          10
1           1
2           0
3           1
4           1
           ..
2160614     1
2160615     1
2160616     1
2160617     1
2160618     1
Name: score, Length: 2160610, dtype: int32

# Stickied

In [100]:
def stickied_func(elem):
    """Convert a raw ``stickied`` entry into a boolean.

    Parameters
    ----------
    elem : object
        Raw entry; it is converted to text before the comparison.

    Returns
    -------
    bool
        True when the text form of ``elem`` equals 'true' in any letter case
        (e.g. 'True', 'true', or the boolean True); False otherwise,
        including for missing values.
    """
    # Compare against lower-case 'true': the text has just been lower-cased,
    # so the former comparison with 'True' could never succeed and every row
    # was mapped to False.
    return str(elem).lower() == 'true'

# Replace each raw entry with the boolean produced by stickied_func.
df['stickied'] = df['stickied'].apply(stickied_func)

In [101]:
df['stickied']

Index
0          False
1          False
2          False
3          False
4          False
           ...  
2160614    False
2160615    False
2160616    False
2160617    False
2160618    False
Name: stickied, Length: 2160610, dtype: bool

# Subreddit

In [104]:
df['subreddit'].unique()
# to be expected

array(['CryptoCurrency'], dtype=object)

# Subreddit ID

In [105]:
df['subreddit_id'].unique()

array(['t5_2wlj3'], dtype=object)

# Pre-processing done!

In [106]:
df

Unnamed: 0_level_0,author,author_cakeday,author_flair_css_class,author_flair_text,body,created_utc,Moderator,edited,id,link_id,parent_id,permalink,retrieved_on,score,stickied,subreddit,subreddit_id
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,ShytTalkingScrub,False,noflair,,Because I sold it,2017-11-01 00:00:09,False,NaT,dp61rvp,t3_79zwnf,t3_79zwnf,/r/CryptoCurrency/comments/79zwnf/why_is_bitco...,NaT,10,False,CryptoCurrency,t5_2wlj3
1,Threat-Level-Midnite,False,training,Redditor for 1 month.,They probably just airdropped tokens into your...,2017-11-01 00:01:20,False,NaT,dp61ttv,t3_79txig,t1_dp5hu0f,/r/CryptoCurrency/comments/79txig/daily_genera...,NaT,1,False,CryptoCurrency,t5_2wlj3
2,e_x_p,False,transition,,soon since $5000,2017-11-01 00:01:54,False,NaT,dp61usd,t3_79xisk,t3_79xisk,/r/CryptoCurrency/comments/79xisk/a_trend_i_st...,NaT,0,False,CryptoCurrency,t5_2wlj3
3,DuplicatesBot,False,training,Redditor for 1 month.,Here is a list of threads in other subreddits ...,2017-11-01 00:02:06,False,NaT,dp61v3p,t3_79zyc6,t3_79zyc6,/r/CryptoCurrency/comments/79zyc6/what_exactly...,NaT,1,False,CryptoCurrency,t5_2wlj3
4,Threat-Level-Midnite,False,training,Redditor for 1 month.,MEW is only set up for ERC20 tokens so make su...,2017-11-01 00:03:03,False,NaT,dp61wr9,t3_79txig,t1_dp60kf0,/r/CryptoCurrency/comments/79txig/daily_genera...,NaT,1,False,CryptoCurrency,t5_2wlj3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2160614,Bigvardaddy,False,Training,Redditor for 8 months.,I'm studying thermo-physics thanks bud,2018-03-31 23:58:58,False,NaT,dwlqlrc,t3_887zgf,t1_dwixhcv,/r/CryptoCurrency/comments/887zgf/til_the_supp...,2018-03-31 23:59:00,1,False,CryptoCurrency,t5_2wlj3
2160615,[deleted],False,,,[removed],2018-03-31 23:59:07,False,NaT,dwlqm07,t3_88mpgv,t3_88mpgv,/r/CryptoCurrency/comments/88mpgv/how_to_ta_co...,2018-03-31 23:59:08,1,False,CryptoCurrency,t5_2wlj3
2160616,radarmike,False,Warning-level1,6 - 7 years account age. 350 - 700 comment karma.,So. Sell 10k ZIL for some Matrix Ai Network. Z...,2018-03-31 23:59:27,False,NaT,dwlqmnm,t3_88gupa,t3_88gupa,/r/CryptoCurrency/comments/88gupa/daily_genera...,2018-03-31 23:59:28,1,False,CryptoCurrency,t5_2wlj3
2160617,arsonbunny,False,Analyst,Analyst,"I ever said anywhere its ""dooms day"". Lots of...",2018-03-31 23:59:34,False,NaT,dwlqmuz,t3_88khpp,t1_dwlqemq,/r/CryptoCurrency/comments/88khpp/bitcoin_you_...,2018-03-31 23:59:36,1,False,CryptoCurrency,t5_2wlj3
