# Part 1: Extracting the data from www.reddit.com and converting it to a pandas DataFrame
##  

In [None]:
import json
import urllib.request
from matplotlib import pyplot as plt

In [None]:
# Target number of posts: the download loop keeps requesting pages until at
# least this many posts are collected, and this many posts are loaded into
# the data frame.
NUM_OF_POSTS = 900

In [None]:
import time

# Custom User-Agent header sent with every request.
hdr = {'User-Agent': 'osx:r/relationships.single.result:v1.0 (by /u/python_machine_learn)'}
# First page of the all-time top posts, 100 posts per page.
url = 'https://www.reddit.com/r/relationships/top/.json?sort=top&t=all&limit=100'


def _fetch_posts(page_url):
    """Download one listing page and return its list of post objects.

    Args:
        page_url: Full URL of the listing page to request.

    Returns:
        The list stored under ``['data']['children']`` of the decoded JSON
        payload (empty when the listing has no further posts).
    """
    req = urllib.request.Request(page_url, headers=hdr)
    # The context manager closes the HTTP response even if reading fails.
    with urllib.request.urlopen(req) as response:
        text_data_json = response.read().decode('utf8')
    # Look the listing up by key instead of taking the "first" value of the
    # payload dict, which silently depends on dict ordering.
    return json.loads(text_data_json)['data']['children']


posts_all = _fetch_posts(url)
# Keep paging until enough posts are collected. The `posts_all` guard avoids
# indexing an empty list when the very first page comes back empty.
while posts_all and len(posts_all) < NUM_OF_POSTS:
    time.sleep(2)  # pause between consecutive requests
    # The name of the last post received is the cursor for the next page.
    last = posts_all[-1]['data']['name']
    post = _fetch_posts('%s&after=%s' % (url, last))
    if not post:
        # No more posts available: stop instead of requesting the same
        # empty page forever.
        break
    posts_all += post

In [None]:
# Report how many posts were collected in total.
print(len(posts_all))

In [None]:
# One empty list per extracted field; each list is a separate object.
post_title, post_flair, post_date = [], [], []
post_numComments, post_score = [], []

In [None]:
# Use at most the first NUM_OF_POSTS posts. Slicing (instead of indexing every
# position up to NUM_OF_POSTS) does not raise IndexError when fewer posts
# were downloaded.
selected_posts = [entry['data'] for entry in posts_all[:NUM_OF_POSTS]]

# Rebuild every list from scratch so that re-running this cell does not
# append the same posts a second time.
post_title = [fields['title'] for fields in selected_posts]
post_flair = [fields['link_flair_text'] for fields in selected_posts]
post_date = [fields['created_utc'] for fields in selected_posts]
post_numComments = [fields['num_comments'] for fields in selected_posts]
post_score = [fields['score'] for fields in selected_posts]

In [None]:
import pandas as pd
from pandas import DataFrame

# Assemble the per-field lists into one table with a fixed column order.
column_order = ['Date', 'Title', 'Flair', 'Comments', 'Score']
column_data = {
    'Date': post_date,
    'Title': post_title,
    'Flair': post_flair,
    'Comments': post_numComments,
    'Score': post_score,
}
posts_df = DataFrame(column_data, columns=column_order)

In [None]:
# Display the first rows of posts_df as a quick visual check.
posts_df.head()

# Part 2: Cleaning the data
## Below we clean the data and extract some useful information from the post titles
##  

In [None]:
# The 'Date' column holds timestamps in seconds; turn them into pandas
# datetimes and store them back in the same column.
epoch_seconds = posts_df['Date'].values
posts_df['Date'] = pd.to_datetime(epoch_seconds, unit='s')

In [None]:
# Display the first rows of posts_df to check the converted 'Date' column.
posts_df.head()

In [None]:
# Does the post at index label 1 carry the locked-post flair text?
second_post_flair = posts_df['Flair'][1]
second_post_flair == '◉ Locked Post ◉'

In [None]:
import numpy as np

# Treat the flair text found on the post at index label 1 as "no flair":
# every occurrence of that value in the 'Flair' column becomes NaN.
# NOTE(review): this presumes that post carries the locked-post flair;
# if the downloaded data changes, another flair value would be blanked
# out instead - confirm before reusing.
replace_value = posts_df['Flair'][1]
flair_column = posts_df['Flair']
posts_df['Flair'] = flair_column.replace(replace_value, np.nan)

In [None]:
# Display the first rows of posts_df to check the updated 'Flair' column.
posts_df.head()

In [None]:
# Count the posts whose 'Flair' value is missing.
posts_df['Flair'].isnull().sum()

In [None]:
import re

# Titles that begin with an "UPDATE" marker, optionally bracketed and/or
# preceded by a few words, matched case-insensitively (e.g. "[UPDATE]" or
# "Final update:"). The raw string keeps the regex escapes \[ and \] out of
# Python's own string-escape processing.
cond1 = posts_df['Title'].str.contains(r'^\[?[a-z!?A-Z ]*UPDATE\]?:?', flags=re.IGNORECASE)
# Posts that currently have no flair.
cond2 = posts_df['Flair'].isnull()
# Every row selected by cond2 has a missing flair, so the label can be
# assigned directly instead of going through a NaN replacement.
posts_df.loc[(cond1 & cond2), 'Flair'] = 'Updates'

In [None]:
# Display the first rows of posts_df to check the filled-in 'Flair' values.
posts_df.head()

In [None]:
# Find the poster's own age/gender tag in each title: a first-person word
# (I'm, I, my, me) directly followed by a bracketed tag such as "[25M]",
# "(F/31)" or "[27 female]", matched case-insensitively.
# - The raw string keeps the regex escapes out of Python's string-escape
#   processing.
# - The gender class is [mf]; a comma inside a character class would be
#   matched as a literal comma.
# Column 0 of the result is the outermost group, i.e. the whole match.
poster_age_gender = posts_df['Title'].str.extract(
    r"((i'm|i|my|me)\s?(\[|\()(m|f)?(\s|/)?[0-9]{1,2}(\s|/)?([mf]|male|female)?(\]|\)))",
    flags=re.IGNORECASE)[0]
poster_age_gender.head()

In [None]:
# Drop the leading first-person word (and one optional whitespace character)
# so that only the bracketed tag remains.
# regex=True is passed explicitly: recent pandas versions treat the pattern
# as a literal string by default, which would leave the pronoun in place.
# The ^ anchor limits the removal to the start of the extracted text.
poster_age_gender = poster_age_gender.str.replace(
    r"^(i'm|i|my|me)\s?", "", flags=re.IGNORECASE, regex=True)

In [None]:
# Pull the age digits and the first gender letter out of each tag.
# expand=False makes a single-group extract return a Series; with the
# default of recent pandas versions it returns a DataFrame, which the
# following pd.to_numeric / .str.upper() calls cannot handle.
poster_age = poster_age_gender.str.extract('([0-9]{1,2})', expand=False)
# [mf] rather than [m,f]: the comma would otherwise be accepted as a "gender".
poster_gender = poster_age_gender.str.extract('([mf])', flags=re.IGNORECASE, expand=False)

In [None]:
# Store the age as a number and the gender letter in upper case.
age_as_number = pd.to_numeric(poster_age)
posts_df['Poster Age'] = age_as_number
gender_upper = poster_gender.str.upper()
posts_df['Poster Gender'] = gender_upper

In [None]:
# Display the first rows of posts_df to check the new 'Poster Age' and
# 'Poster Gender' columns.
posts_df.head()

In [None]:
# Count the posts for which no poster age was extracted.
posts_df['Poster Age'].isnull().sum()

In [None]:
# Count the posts for which no poster gender was extracted.
posts_df['Poster Gender'].isnull().sum()

In [None]:
# Day-of-week number of each post's date, stored in a new 'Day' column.
weekday_number = posts_df['Date'].dt.dayofweek
posts_df['Day'] = weekday_number

In [None]:
# Map the day-of-week numbers 0..6 to short day names.
day_names = ['Mon', 'Tues', 'Weds', 'Thurs', 'Fri', 'Sat', 'Sun']
days = dict(enumerate(day_names))
posts_df['Day'] = posts_df['Day'].map(days)
posts_df.head()

# Part 3: Describing (Statistical Analysis of) the data
##  Age Distribution

In [None]:
# Summary statistics of the 'Poster Age' column.
posts_df['Poster Age'].describe()

In [None]:
# Histogram of the ages that could be extracted (rows without an age are
# left out).
has_age = posts_df['Poster Age'].notnull()
poster_age_not_null = posts_df.loc[has_age, 'Poster Age']
plt.hist(poster_age_not_null)
plt.show()

In [None]:
# Mean age of the posters identified as female. Only the gender is filtered
# here; the rows are not filtered on age.
is_female = posts_df['Poster Gender'] == 'F'
poster_age_not_null_female = posts_df.loc[is_female, 'Poster Age']
poster_age_not_null_female.mean()

In [None]:
# Mean age of the posters identified as male. Only the gender is filtered
# here; the rows are not filtered on age.
is_male = posts_df['Poster Gender'] == 'M'
poster_age_not_null_male = posts_df.loc[is_male, 'Poster Age']
poster_age_not_null_male.mean()

## Distribution of posts by day of the week

In [None]:
# Convert the day names back to the numbers 0..6 so they can be plotted in
# weekday order.
days_to_nums = {'Mon': 0, 'Tues': 1, 'Weds': 2, 'Thurs': 3, 'Fri': 4,
                'Sat': 5, 'Sun': 6}
post_day_of_week = posts_df['Day'].map(days_to_nums)
post_day_of_week_not_null = post_day_of_week.loc[post_day_of_week.notnull()]
# One bin per day, centred on the integer day number (edges at -0.5 .. 6.5).
# The default of 10 bins spreads the 7 possible values over unevenly spaced
# bars with empty bins in between.
day_bin_edges = [edge - 0.5 for edge in range(8)]
plt.hist(post_day_of_week_not_null, bins=day_bin_edges)
plt.show()

## Distribution of the gender

In [None]:
# Number of posts per extracted poster gender.
posts_df['Poster Gender'].value_counts()

## Gender percentage 

In [None]:
# Share of each gender, in percent of the posts that have a gender value.
gender_counts = posts_df['Poster Gender'].value_counts()
known_gender_total = posts_df['Poster Gender'].notnull().sum()
100 * gender_counts / known_gender_total

In [None]:
# Histogram of the gender values, leaving out posts without one.
has_gender = posts_df['Poster Gender'].notnull()
known_genders = posts_df.loc[has_gender, 'Poster Gender']
plt.hist(known_genders)
plt.show()

## Distribution of Flairs

In [None]:
# Number of posts per flair value.
posts_df['Flair'].value_counts()

In [None]:
# Share of each flair, in percent of the posts that have a flair value.
flair_counts = posts_df['Flair'].value_counts()
known_flair_total = posts_df['Flair'].notnull().sum()
100 * flair_counts / known_flair_total

## Distribution of Comments

In [None]:
# Summary statistics of the 'Comments' column.
posts_df['Comments'].describe()

In [None]:
# Histogram of the number of comments per post.
comment_counts = posts_df['Comments']
plt.hist(comment_counts)
plt.show()

## Distribution of scores

In [None]:
# Summary statistics of the 'Score' column.
posts_df['Score'].describe()

In [None]:
# Histogram of the post scores.
post_scores = posts_df['Score']
plt.hist(post_scores)
plt.show()

In [None]:
# Print the title and body text of the post at position 7, separated by a
# dashed line.
sample_post = posts_all[7]['data']
print(sample_post['title'])
print('-----------------------------------')
print(sample_post['selftext'])