In [1]:
# Install any dependencies
!pip install pandas
!pip install praw
!pip install python-dotenv
!pip install pyarrow



In [2]:
import pandas as pd
import praw
import os
from os.path import join, dirname
from dotenv import main

# Make sure you create a .env file and fill in all the necessary information in the same folder as this script!
main.load_dotenv(join(dirname(os.path.realpath('__file__')), '.env'))

reddit = praw.Reddit(
   client_id=os.environ.get("CLIENT_ID"),
   client_secret=os.environ.get("CLIENT_SECRET"),
   user_agent="CMV_Scraper",
)


In [3]:
# load the data
import tarfile
import os.path
import json
import re
from bz2 import BZ2File
from urllib import request
from io import BytesIO

import numpy as np


fname = "cmv.tar.bz2"
url = "https://chenhaot.com/data/cmv/" + fname

# download if not exists
if not os.path.isfile(fname):
    f = BytesIO()
    with request.urlopen(url) as resp, open(fname, 'wb') as f_disk:
        data = resp.read()
        f_disk.write(data)  # save to disk too
        f.write(data)
        f.seek(0)
else:
    f = open(fname, 'rb')




In [8]:
#tar = tarfile.open(fileobj=f, mode="r:bz2")
tar = tarfile.open(fileobj=f, mode="r")

# Extract the file we are interested in

train_fname = "op_task/train_op_data.jsonlist.bz2"
test_fname = "op_task/heldout_op_data.jsonlist.bz2"

train_bzlist = tar.extractfile(train_fname)

ReadError: file could not be opened successfully

In [9]:
# Load the jsonlist file into a dataframe
df = pd.read_json(train_bzlist, orient='list', lines=True)

NameError: name 'train_bzlist' is not defined

In [None]:
# Function to check if the posts still exists on reddit
def try_get_post(post_id):
    try:
        submission = reddit.submission(post_id)
        submission.name
        return True
    except Exception as e:
        return False

In [None]:
# Create the function that will be handling all the data gathering

def get_top_comment_and_clean_data(post_id):
    print(post_id)

    # Grab the post
    submission = reddit.submission(post_id)

    # Grab the highest rated comment on root layer
    submission.submission_type = 'best'
    submission.comments.replace_more(limit=0)
    replies = list(submission.comments)[0].replies.list()

    # Just some variables
    responses = pros = cons = []
    last_author = submission.author.name
    is_pro_argument = False

    for comment in replies:
        print(comment)
        responses.append(comment.body)

        # Sometimes for some reason duplicate entries exist
        # Also remove automated message with "Δ" in it
        if comment.body in responses or "Δ" in responses:
            continue
        
        # If redditor object doesn't exist, the account is invalid/deleted
        if not comment.author:
            author = "[deleted]"
            last_author = author
            # There's no body text, so skip.
            continue
        else:
            author = comment.author.name

        # Assume that whenever the user changes, they are countering the previous person
        if comment.author.name != last_author:
            is_pro_argument = !is_pro_argument
        
        # Add to the respective argument type        
        if is_pro_argument:
            pros.append(comment.body)
        else:
            cons.append(comment.body)
        
        last_author = comment.author.name
        
    # Pros = arguments for the Title of this post
    # Cons = arguments against the title of this post
    
    return pros, cons, responses

In [None]:
df.head(1)

In [None]:
# load in our data. this will take a while.
dataset = df.head(101)

In [None]:


all_pros = all_names = all_titles = []

for i in range(dataset.shape[0]):
    post = dataset.iloc[i]
    print(post.name)

    if not try_get_post(post.name):
        print("This shouldn't happen")
        continue

    pros, _cons, _response = get_top_comment_and_clean_data(post['name'])
    #print(pros)

    all_pros.append(pros)
    all_names.append(post.name)
    all_titles.append(post.titles.replace('CMV', "Change my mind"))

In [None]:
# Place it all into a Pandas Dataframe
clean_df = pd.DataFrame({
    "INSTRUCTION": all_titles,
    "RESPONSE": all_pros,
    "SOURCE": "Reddit"
}, index=all_names
)

In [None]:
# Create Apache Paquete file

import pyarrow as pa
import pyarrow.parquet as pq

table = pa.Table.from_pandas(clean_df)
pq.write_table(table,"output.parquet")