In [1]:
import os
import os.path as path
import sys
sys.path.append(path.abspath(path.join(os.getcwd(), os.pardir)))
from datetime import datetime
import pytz
import pandas as pd
import nltk
from src.utils import TIMEZONES

In [62]:
# Load the three raw CSV exports (tweets, Reddit posts, Reddit comments).
# Paths are relative to the notebook's working directory.
tweets, reddit_posts, reddit_comments = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/tweets_201803262044.csv',
        '../data/reddit_posts_201803271026.csv',
        '../data/reddit_comments_201803271026.csv',
    )
)

# Keep the frames together so later cells can loop over all of them.
all_data = [tweets, reddit_posts, reddit_comments]

In [63]:
# Report the (rows, columns) shape of every loaded frame, one per line.
for frame in all_data:
    print(frame.shape)

(10209, 9)
(50, 13)
(2351, 10)


In [None]:
# Count the missing values in each column of every loaded frame.
for frame in all_data:
    print(frame.isnull().sum())

In [None]:
# Clean the raw tweets and derive time and source features.
#
# The frame is modified in place throughout so that the object stored in
# `all_data` stays in sync with `tweets`. As a consequence this cell is not
# idempotent: reload the CSVs before running it a second time (the `user`
# formatting and the `created_at` parsing both expect the raw columns).

# pytz zones, keyed by the values looked up from the `city` column.
# NOTE(review): `TIMEZONES` is imported from src.utils at the top of the
# notebook and may already provide this mapping — confirm before deduplicating.
timezones = {
    'toronto': pytz.timezone('America/Toronto'),
    'new_york': pytz.timezone('America/New_York'),
    'chicago': pytz.timezone('America/Chicago'),
    'los_angeles': pytz.timezone('America/Los_Angeles'),
    'vancouver': pytz.timezone('America/Vancouver')
}

# drop rows with missing values and duplicates
tweets.dropna(axis=0, how='any', inplace=True)
tweets.drop_duplicates('id', inplace=True)

# make user id a string (zero-padded to 19 digits, prefixed with "UID")
tweets['user'] = tweets['user'].map(lambda x: 'UID%019d' % x)

# save utc and local times
# `created_at` carries a literal "+0000" offset, so the parsed value is UTC.
tweets['utc_time'] = tweets['created_at'].map(
    lambda x: datetime.strptime(x, '%a %b %d %H:%M:%S +0000 %Y').replace(tzinfo=pytz.utc)
)
# Convert with `astimezone` rather than pytz's `normalize`: `normalize` reads
# a pytz-private attribute of the source tzinfo, whereas `astimezone` is the
# standard conversion and works for any tz-aware value.
tweets['local_time'] = tweets.apply(
    lambda row: row['utc_time'].astimezone(timezones[row['city']]),
    axis=1
)
tweets['local_hour'] = tweets['local_time'].map(lambda x: x.hour)
tweets['weekday'] = tweets['local_time'].map(lambda x: x.weekday())
tweets['month'] = tweets['local_time'].map(lambda x: x.month)

# extract source: keep the text between the first '>' and the next '<'
tweets['source'] = tweets['source'].map(
    lambda x: x.split('>')[1].split('<')[0].strip())

# `value_counts` sorts by descending frequency, so the entries whose running
# total exceeds 95% of all tweets are the rare tail; lump them into "Other".
# Assign through `.loc` instead of a chained `inplace` replace on the column,
# which is not guaranteed to write back to the frame.
source_counts = tweets['source'].value_counts()
btm_95pct_sources = source_counts[source_counts.cumsum() > source_counts.sum() * 0.95]
tweets.loc[tweets['source'].isin(btm_95pct_sources.index), 'source'] = 'Other'

# Drop the raw columns that are no longer needed. `columns=` is required:
# a bare list is interpreted as row labels, and without `inplace`/assignment
# the result would be thrown away.
tweets.drop(columns=['created_at', 'id'], inplace=True)

tweets.head()

In [64]:
# Missing values per column of the Reddit comments frame.
reddit_comments.isnull().sum()

id              0
author         41
created        13
created_utc    13
subreddit      13
score          13
ups            13
downs          13
gilded         13
body           13
dtype: int64

In [7]:
# Inspect the comments that have no author, then the rows at positions
# 1708-1711. A cell only renders its last bare expression, so the first
# lookup is shown explicitly with `display` (provided by the IPython kernel);
# otherwise it is computed and silently discarded.
display(reddit_comments[reddit_comments['author'].isnull()])
reddit_comments.iloc[1708:1712]

Unnamed: 0,id,author,created_utc,subreddit,score,ups,downs,gilded,body
1708,dwakqjr,CptCrankyPants,1522038000.0,toronto,3.0,3.0,0.0,0.0,Did the tin foil hat blow off with the wind to...
1709,dwb24op,sieradz,1522071000.0,toronto,1.0,1.0,0.0,0.0,You dont need to read the article. The entire ...
1710,dwacc2p,,,,,,,,
1711,dwa9od1,,1522025000.0,toronto,33.0,33.0,0.0,0.0,[removed]


In [65]:
# drop duplicates, missing values, and deleted/removed comments
# Clean the Reddit comments: remove deleted/removed bodies, rows missing a
# body or author, and duplicate ids.
# author: null if this is a promotional link (per the original note)
# The index is reset last, and with drop=True, so the result has a gap-free
# index and no stale `index` column is added (which also made the cell fail
# on repeated runs).
reddit_comments = (
    reddit_comments[~reddit_comments['body'].isin(['[deleted]', '[removed]'])]
    .dropna(axis=0, subset=['body', 'author'])
    .drop_duplicates('id')
    .reset_index(drop=True)
)

# save utc time
# not able to extract local (submitter's) time
# `created_utc` holds epoch seconds. Parse them directly as UTC:
# `datetime.fromtimestamp(x)` without a tz yields the machine's *local* time,
# so relabelling that as UTC shifted every timestamp by the local UTC offset.
reddit_comments['utc_time'] = pd.to_datetime(
    reddit_comments['created_utc'], unit='s', utc=True
)
reddit_comments['weekday'] = reddit_comments['utc_time'].dt.weekday
reddit_comments['month'] = reddit_comments['utc_time'].dt.month

In [67]:
# Interpret the `created` epoch values as naive datetimes in the machine's
# local timezone and preview the first five.
reddit_comments['created'].map(datetime.fromtimestamp).head()

0   2018-03-27 17:57:23
1   2018-03-27 00:53:59
2   2018-03-27 05:57:20
3   2018-03-27 00:41:23
4   2018-03-27 00:29:22
Name: created, dtype: datetime64[ns]

In [80]:
# Tag each comment's subreddit with the gap, in hours, between its `created`
# and `created_utc` epoch values, then tally every distinct combination.
(
    reddit_comments['subreddit']
    + ((reddit_comments['created'] - reddit_comments['created_utc']) / 3600).map(str)
).value_counts()

toronto8.0       940
vancouver8.0     690
chicago8.0       307
LosAngeles8.0    195
nyc8.0           178
dtype: int64

In [None]:
# Display the current `reddit_comments` frame.
reddit_comments