<h1>EECS 549 Final Project</h1>

Data exploration & basic mainipulation

In [1]:
import os 
import pandas as pd
import json
import csv
from tqdm import tqdm
import numpy as np

In [2]:
np.random.seed(42)

In [3]:
posts = pd.read_csv("../data/ten-million-reddit-answers-posts.csv")

comments = pd.read_csv("../data/ten-million-reddit-answers-comments.csv")

<h2>Combine datasets</h2>

In [4]:
posts['created_date'] = pd.to_datetime(posts['created_utc'], unit = 's')

In [5]:
comments['post permalink'] = comments['permalink'].apply(lambda s: "/".join(s.split("/")[:-2]) + "/")

In [6]:
#prepare data
posts_subset = posts[['title', 'permalink', 'created_date']]
posts_subset = posts_subset.rename(columns = {'created_date': 'post_created'})
comments = comments.drop(columns = ['permalink'])

merge = posts_subset.merge(comments, how = 'left', left_on = 'permalink', right_on = 'post permalink')

In [7]:
merge.shape

(9999980, 13)

In [8]:
merge.groupby('title')['body'].nunique().agg(['mean', 'median', 'min', 'max'])

mean         15.300088
median        4.000000
min           0.000000
max       51673.000000
Name: body, dtype: float64

<h2>Summary statistics about post dates</h2>

In [9]:
posts['created_date'].agg(['min', 'max'])

min   2010-02-18 22:44:06
max   2020-11-30 23:59:39
Name: created_date, dtype: datetime64[ns]

<h2>Create new dataset made up of full posts</h2>

In [10]:
# new numeric index 
merge = merge.reset_index()
merge = merge.drop(columns = ["id"])
merge = merge.rename(columns = {'index': 'docid'})

In [11]:
merge = merge.sort_values(by = 'created_utc', ascending = True)

In [12]:
merge.to_csv("../data/ask_reddit.csv", index = False)

In [13]:
titles = list(merge['title'].astype(str).values)
comments = list(merge['body'].astype(str).values)
idx = list(merge['docid'].values)

In [14]:
used_idx = np.random.choice(idx, size = 300000, replace = False)

In [15]:
idx_to_post = {}

for i in tqdm(range(len(used_idx))):
    docid = idx[i]
    title = titles[i]
    comment = comments[i]
    if '[deleted]' in comment or '[removed]' in comment or '**PLEASE READ THIS MESSAGE IN ITS ENTIRETY BEFORE TAKING ACTION.*' in comment: 
        continue
    if id not in list(idx_to_post.values()):
        idx_to_post[docid] = title + " " + comment + " "
        continue
    curr = idx_to_post[docid]
    if len(curr) <= 30000:
        idx_to_post[docid] = idx_to_post[docid] + comment + " "

100%|████████████████████████████████████████████████████████████████████| 300000/300000 [14:31<00:00, 344.39it/s]


In [21]:
idx_to_post[7188056]

'Every time someone lies to you, $100 gets deposited to your bank account. What is the fastest way for you to get rich? Where is the pen they keep the politicians in? '

In [20]:
list(idx_to_post.keys())

[7188056,
 5710468,
 3399481,
 2538133,
 2191117,
 2883281,
 9917579,
 3955741,
 2538132,
 3971040,
 3971039,
 612505,
 7430223,
 7014874,
 175408,
 9177447,
 5236590,
 3602594,
 9917577,
 3955740,
 6589510,
 7188053,
 7188054,
 877657,
 6645444,
 2315015,
 7188052,
 3971038,
 3452321,
 3397275,
 2603170,
 4080940,
 8456249,
 7188051,
 3971037,
 7430222,
 6522629,
 9017218,
 1608532,
 7188050,
 6645443,
 9177446,
 5379334,
 7188049,
 7188047,
 7188048,
 1839985,
 7912669,
 7188045,
 7188046,
 7188043,
 6318711,
 2844340,
 9177445,
 3971036,
 7188042,
 3512965,
 2191116,
 5987166,
 6477553,
 1154639,
 9177444,
 2187085,
 7188041,
 7188040,
 7188039,
 5906754,
 7343508,
 764629,
 3955739,
 6645442,
 9177443,
 5987165,
 7188038,
 6205882,
 9177442,
 9177441,
 9688930,
 6624065,
 3397274,
 7188037,
 4660969,
 4660968,
 7188036,
 5210533,
 2690933,
 7188035,
 7188034,
 7188033,
 9177440,
 9917576,
 4943241,
 2315014,
 7188032,
 7188030,
 2844339,
 7188028,
 7188031,
 7188026,
 7188027,
 718

<h2>Write dataset to file</h2>

In [None]:
with open("../data/ask_reddit_posts_v2.jsonl", "w") as f:
    for k, v in list(idx_to_post.items()):
        d = {'docid': k, 'text': v}
        print(json.dumps(d), file = f)