In [1]:
import os
import os.path as path
import sys
sys.path.append(path.abspath(path.join(os.getcwd(), os.pardir)))
from datetime import datetime
import pytz
import pandas as pd
import nltk
from src.utils import TIMEZONES

In [62]:
# Read the three raw CSV exports (paths are relative to the notebook directory).
tweets, reddit_posts, reddit_comments = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/tweets_201803262044.csv',
        '../data/reddit_posts_201803271026.csv',
        '../data/reddit_comments_201803271026.csv',
    )
)

# Keep the frames together so summary cells can loop over all of them.
all_data = [tweets, reddit_posts, reddit_comments]

In [63]:
# One (rows, columns) tuple per loaded frame, each on its own line.
print(*(frame.shape for frame in all_data), sep='\n')

(10209, 9)
(50, 13)
(2351, 10)


In [None]:
# Per-column count of missing values for every loaded frame.
for frame in all_data:
    null_counts = frame.isnull().sum()
    print(null_counts)

In [None]:
# Clean the raw tweets frame in place: drop incomplete/duplicate rows, derive
# time features, normalise the `source` column, and drop helper columns.

# City key (as stored in tweets['city']) -> pytz timezone of that city.
# NOTE(review): `TIMEZONES` is imported from src.utils at the top of the
# notebook but not used here; if it holds this same mapping, reuse it -- confirm.
timezones = {
    'toronto': pytz.timezone('America/Toronto'),
    'new_york': pytz.timezone('America/New_York'),
    'chicago': pytz.timezone('America/Chicago'),
    'los_angeles': pytz.timezone('America/Los_Angeles'),
    'vancouver': pytz.timezone('America/Vancouver')
}

# drop rows with missing values and duplicates
# (in place, so the reference held in `all_data` stays in sync)
tweets.dropna(axis=0, how='any', inplace=True)
tweets.drop_duplicates('id', inplace=True)

# make user id a string
tweets['user'] = tweets['user'].map(lambda x: 'UID%019d' % x)

# save utc and local times
tweets['utc_time'] = tweets['created_at'].map(
    lambda x: datetime.strptime(x, '%a %b %d %H:%M:%S +0000 %Y').replace(tzinfo=pytz.utc)
)
# astimezone() is the documented way to convert an aware datetime; pytz's
# normalize() reads the private `_utcoffset` attribute of the source tzinfo and
# therefore fails when the UTC tzinfo is not a pytz object.
tweets['local_time'] = tweets.apply(
    lambda row: row['utc_time'].astimezone(timezones[row['city']]),
    axis=1
)
tweets['local_hour'] = tweets['local_time'].map(lambda x: x.hour)
tweets['weekday'] = tweets['local_time'].map(lambda x: x.weekday())
tweets['month'] = tweets['local_time'].map(lambda x: x.month)

# extract source: keep only the text between the first '>' and the following '<'
tweets['source'] = tweets['source'].map(
    lambda x: x.split('>')[1].split('<')[0].strip())

# Sources in the tail of the frequency distribution (cumulative count beyond
# 95% of all tweets) are collapsed into a single 'Other' label.
source_counts = tweets['source'].value_counts()
btm_95pct_sources = source_counts[source_counts.cumsum() > source_counts.sum() * 0.95]
# Assign the result back instead of a chained in-place replace, which is not
# guaranteed to write through to `tweets` and needs one pass per source.
tweets['source'] = tweets['source'].where(
    ~tweets['source'].isin(btm_95pct_sources.index), 'Other')

# Drop the raw columns that are no longer needed. `columns=` is required:
# a bare label list is interpreted as row labels and raises KeyError.
tweets.drop(columns=['created_at', 'id'], inplace=True)
tweets.head()

In [64]:
# Per-column count of missing values in the comments frame.
reddit_comments.isnull().sum()

id              0
author         41
created        13
created_utc    13
subreddit      13
score          13
ups            13
downs          13
gilded         13
body           13
dtype: int64

In [7]:
# Show the rows at positions 1708-1711, a neighbourhood that contains records
# with a missing author. Only the last expression of a cell is rendered, so the
# cell holds just this one expression.
reddit_comments.iloc[1708:1712]

Unnamed: 0,id,author,created_utc,subreddit,score,ups,downs,gilded,body
1708,dwakqjr,CptCrankyPants,1522038000.0,toronto,3.0,3.0,0.0,0.0,Did the tin foil hat blow off with the wind to...
1709,dwb24op,sieradz,1522071000.0,toronto,1.0,1.0,0.0,0.0,You dont need to read the article. The entire ...
1710,dwacc2p,,,,,,,,
1711,dwa9od1,,1522025000.0,toronto,33.0,33.0,0.0,0.0,[removed]


In [81]:
# drop deleted/removed comments, missing values, and duplicates
# (rows with a missing body pass this filter and are removed by dropna below)
reddit_comments = reddit_comments[
    ~reddit_comments['body'].isin(['[deleted]', '[removed]'])
]
# author: null if this is a promotional link
# The index is reset last and with drop=True so it is contiguous after all row
# removals, no stale 'index' column is added, and the cell can be re-run.
reddit_comments = (
    reddit_comments
    .dropna(axis=0, subset=['body', 'author'])
    .drop_duplicates('id')
    .reset_index(drop=True)
)

# save utc time
# not able to extract local (submitter's) time
# Pass tz= so the epoch is interpreted as UTC directly; fromtimestamp() without
# tz returns machine-local wall time, which must not be relabelled as UTC.
reddit_comments['utc_time'] = reddit_comments['created_utc'].map(
    lambda x: datetime.fromtimestamp(x, tz=pytz.utc)
)
reddit_comments['weekday'] = reddit_comments['utc_time'].map(lambda x: x.weekday())
reddit_comments['month'] = reddit_comments['utc_time'].map(lambda x: x.month)

In [89]:
# Distinct comment authors as a plain Python list.
reddit_comments['author'].unique().tolist()

['sprungy',
 'nerkidner',
 'CDunzz',
 'thenewoldschool55',
 'CptCrankyPants',
 'Noobodybelievesyou',
 'Tonezinator',
 'Rex_Reynolds',
 'ApprehensivePickle',
 'Sh4ggyD00',
 'nedsucks',
 'YoungZM',
 'arsentis',
 'Zeppelanoid',
 'CthulhusCall',
 'BoogerSlug',
 'bananacrumble',
 'CommonSenseAvenger',
 'Nimku',
 'Asrivak',
 'section111',
 'decmcc',
 'Ehau',
 'beautiful_bwoi',
 'zenmaster91',
 'Astro493',
 'miurabucho',
 'ramon13',
 'IJUSTENDWDU',
 'ZingerGombie',
 'MarkShapiro',
 'jasonrego',
 'letsboop',
 'natalie_01',
 'Rs1000000',
 'mcreeves',
 'frowningcat',
 'TrOuBLeDbOyXD',
 'watchme3',
 'alvin545',
 'AlwaysStranger2046',
 'rob987654321',
 'goodkiddadcity',
 'GMammoliti',
 'contact-',
 'Artificial_Ghost',
 'CrispyChicken69',
 'ahtchan',
 'ieGod',
 'bbqmeh',
 'Knife_-_Wrench',
 'Davidaaronbanks',
 'sunguilt23',
 'autotldr',
 'POP_TART_TACO',
 'FBIUAreOnTheListFBI',
 'ikarun',
 'GMichaelThomas',
 'Juergenator',
 'RambleMan',
 'NewMilleniumBoy',
 'ktreektree',
 'a-gay-canadian',
 'IHaveA

In [83]:
# Preview the first five posts.
reddit_posts.head(n=5)

Unnamed: 0,id,author,created,created_utc,subreddit,score,ups,downs,gilded,num_comments,over_18,selftext,title
0,87i4ut,torontothingstodo,1522184000.0,1522155000.0,toronto,10,10,0,0,1,False,* **[Raptors vs Denver Nuggets](https://www1.t...,27-MAR-2018 - Things to do Today
1,87cdpy,TheMikeGShow,1522125000.0,1522096000.0,toronto,603,603,0,0,570,False,,Toronto restaurant shocks vegans protesting meat
2,87edkv,CptCrankyPants,1522141000.0,1522112000.0,toronto,177,177,0,0,110,False,,Anti-Doug Ford posters surface in Toronto
3,87ihg9,jasounseebourne,1522187000.0,1522158000.0,toronto,15,15,0,0,24,False,,Ontario budget to fund free child care for pre...
4,87ihzr,BlocBoyJB,1522187000.0,1522158000.0,toronto,10,10,0,0,10,False,,Accused in autistic man beating sports bruises...


In [85]:
# Summary statistics (count / unique / top / freq) for the string and boolean
# columns of the posts frame.
(
    reddit_posts
    .select_dtypes(include=['object', 'bool'])
    .describe()
)

Unnamed: 0,id,author,subreddit,over_18,selftext,title
count,50,50,50,50,17,50
unique,50,39,5,1,16,50
top,87fph9,AutoModerator,LosAngeles,False,"Hello! This thread is for discussions, questio...",Morning hike at the Griffith
freq,1,6,10,50,2,1
