In [None]:
# Install any dependencies
!pip install pandas
!pip install praw
!pip install python-dotenv
!pip install pyarrow
!pip install detoxify



In [22]:
# Number of posts to load from the top of the training set (passed to df.head(ENTRIES_COUNT))
ENTRIES_COUNT = 100

# Comments whose Detoxify "toxicity" score is above this threshold are removed
TOXIC_THRESHOLD = 0.95

In [None]:
import pandas as pd
import praw
import os
from os.path import join, dirname
from dotenv import main

# Make sure you create a .env file and fill in all the necessary information in the same folder as this script!
# Note: '__file__' is a quoted literal here, not the module attribute, so realpath() resolves it
# against the current working directory and the .env file is looked up in that directory.
main.load_dotenv(join(dirname(os.path.realpath('__file__')), '.env'))

# Reddit API client. The credentials are read from the CLIENT_ID and CLIENT_SECRET
# environment variables; os.environ.get yields None when a variable is not set.
reddit = praw.Reddit(
   client_id=os.environ.get("CLIENT_ID"),
   client_secret=os.environ.get("CLIENT_SECRET"),
   user_agent="CMV_Scraper",
)


In [None]:
# load the data
import tarfile
import os.path
import json
import re
from bz2 import BZ2File
from urllib import request
from io import BytesIO

import numpy as np


fname = "cmv.tar.bz2"
url = "https://chenhaot.com/data/cmv/" + fname

# Download the archive only if it is not on disk yet.
# The data is streamed to a temporary ".part" file and renamed once complete, so an
# interrupted download can never be mistaken for a finished archive on the next run.
if not os.path.isfile(fname):
    import shutil  # only needed for the one-off download

    partial_fname = fname + ".part"
    with request.urlopen(url) as resp, open(partial_fname, 'wb') as f_disk:
        shutil.copyfileobj(resp, f_disk)  # stream in chunks instead of buffering it all
    os.replace(partial_fname, fname)

# Binary handle on the archive, positioned at the start; consumed by tarfile below.
f = open(fname, 'rb')




In [None]:
# Open the downloaded archive from the file object `f`.
# mode="r" is used here; the explicit alternative would be mode="r:bz2".
tar = tarfile.open(fileobj=f, mode="r")

# Extract the file we are interested in

# Training split of the OP task (itself a bz2-compressed JSON-lines file)
train_fname = "op_task/train_op_data.jsonlist.bz2"
# Held-out split; defined for reference but not extracted in the cells shown here
test_fname = "op_task/heldout_op_data.jsonlist.bz2"

# File-like object over the still-compressed training list inside the archive
train_bzlist = tar.extractfile(train_fname)

In [None]:
# Deserialize the JSON list: every line of the decompressed stream is one JSON document.
# The context manager closes the decompressor as soon as all lines have been read.
with BZ2File(train_bzlist) as train_stream:
    original_posts_train = [
        json.loads(line.decode('utf-8'))
        for line in train_stream
    ]

In [None]:
# Preview a few records only; echoing the whole list floods the notebook output.
original_posts_train[:3]

In [None]:
# Load the parsed posts into a dataframe.
# The records are already Python objects, so the DataFrame constructor is used rather than pd.read_json.
df = pd.DataFrame(original_posts_train)

In [None]:
# Checks whether a post can still be fetched from reddit
def try_get_post(post_id):
    """Return True when the submission `post_id` can be loaded, False otherwise.

    The submission's `name` attribute is read purely to force the lookup;
    any exception raised along the way is reported as "post not available".
    """
    try:
        # Reading an attribute is what actually triggers the request.
        getattr(reddit.submission(id=post_id), "name")
    except Exception:
        return False
    return True

In [None]:
# Set up the detoxifier model:
from detoxify import Detoxify

In [None]:
import re

# Strips quoted lines, separators, the CMV footer and edit notes from a post body
def cleanup_body_text(cmv_post):
    """Return `cmv_post` without its boilerplate lines.

    A line is dropped when, ignoring leading whitespace, it starts with an
    HTML-escaped quote marker ("&gt;"), a "____" separator or the subreddit
    footer sentence, or when "edit" occurs within its first two
    whitespace-separated words (case-insensitive). The surviving lines are
    joined with newlines.
    """
    dropped_prefixes = (
        "&gt;",
        "____",
        "So go forth and CMV, noble redditors!",
    )

    kept = []
    for raw_line in cmv_post.splitlines():
        if raw_line.lstrip().startswith(dropped_prefixes):
            continue

        leading_words = " ".join(raw_line.lower().split()[:2])
        if "edit" in leading_words:
            continue

        kept.append(raw_line)

    return "\n".join(kept)

# Shared Detoxify model. Loading it is expensive, so it is created once on first use
# instead of once per comment.
_DETOXIFY_MODEL = None


def _get_detoxify_model():
    """Return the shared multilingual Detoxify model, loading it on the first call."""
    global _DETOXIFY_MODEL
    if _DETOXIFY_MODEL is None:
        _DETOXIFY_MODEL = Detoxify("multilingual")
    return _DETOXIFY_MODEL


# Drops subreddit meta lines from a comment and blanks the comment if it is toxic
def cleanup_comment(selected_comment):
    """Clean a single comment body.

    Lines mentioning "change my view" or "CMV" (case-insensitive) are removed.
    If the Detoxify "toxicity" score of the remaining text is above
    TOXIC_THRESHOLD, the text is printed for inspection and replaced by "".

    Args:
        selected_comment: Raw comment text.

    Returns:
        The cleaned text, or "" when the comment was classified as toxic.
    """
    # Remove meta talk about the subreddit
    kept_lines = [
        line for line in selected_comment.splitlines()
        if not re.search(r"(?i)(Change\smy\sview|CMV)", line)
    ]
    lines = "\n".join(kept_lines)

    # Nothing left to classify: skip the (slow) model call entirely.
    if not lines.strip():
        return lines

    # Remove toxic comments
    if _get_detoxify_model().predict(lines)["toxicity"] > TOXIC_THRESHOLD:
        print(lines)
        print("Identified toxic comment, ignoring...")
        lines = ""

    return lines


# Create the function that will be handling all the data gathering
def get_top_comment_and_clean_data(post_id):
    """Fetch a post and sort the replies under its first top-level comment.

    The replies below the first top-level comment are walked in the order
    PRAW returns them. Whenever a reply's author differs from the author of
    the previously kept comment, the stance flips, on the assumption that a
    new speaker is countering the previous one. Replies from deleted
    accounts and from DeltaBot are skipped, as are exact duplicates and
    replies that are empty after cleaning.

    Args:
        post_id: Reddit submission id, with or without the "t3_" prefix.

    Returns:
        A tuple (pros, cons, responses) of three independent lists holding
        cleaned comment bodies: pros are arguments for the post title, cons
        are arguments against it, and responses contains every kept comment
        in order. All three are empty when the post has no comments.
    """
    # Remove the fullname prefix explicitly: str.lstrip("t3_") strips any leading
    # 't', '3' and '_' characters and would therefore eat the start of some ids.
    if post_id.startswith("t3_"):
        post_id = post_id[len("t3_"):]

    # Grab the post
    submission = reddit.submission(id=post_id)

    # Grab the highest rated comment on root layer.
    # "confidence" is the API name of reddit's "best" ordering.
    submission.comment_sort = "confidence"
    submission.comments.replace_more(limit=0)
    top_level_comments = list(submission.comments)

    # Three separate lists (a chained `a = b = c = []` would alias a single list)
    pros, cons, responses = [], [], []

    if not top_level_comments:
        return pros, cons, responses

    replies = top_level_comments[0].replies.list()

    # If the post author doesn't exist this submission was deleted (submission.deleted doesn't work)
    if submission.author is None:
        last_author = "[deleted]"
    else:
        last_author = submission.author.name

    is_pro_argument = False
    seen_bodies = set()  # raw bodies of the comments already kept

    for comment in replies:
        raw_body = comment.body

        # Sometimes for some reason duplicate entries exist
        if raw_body in seen_bodies:
            continue

        # If redditor object doesn't exist, the account is invalid/deleted
        if comment.author is not None:
            author = comment.author.name
        else:
            author = "[deleted]"

        # Assume that whenever the user changes, they are countering the previous person
        if author != last_author:
            is_pro_argument = not is_pro_argument

        if author == "[deleted]" or author == "DeltaBot":
            continue

        seen_bodies.add(raw_body)
        last_author = author

        # Clean into a local variable instead of overwriting the PRAW comment object
        cleaned_body = cleanup_comment(raw_body.replace("[deleted]", ""))
        if not cleaned_body.strip():
            # Toxic or meta-only comment: nothing worth keeping
            continue

        # Pros = arguments for the Title of this post
        # Cons = arguments against the title of this post
        if is_pro_argument:
            pros.append(cleaned_body)
        else:
            cons.append(cleaned_body)

        responses.append(cleaned_body)

    return pros, cons, responses

In [None]:

print(f"Loading in {ENTRIES_COUNT} posts")
# Work on an independent copy so that columns added later are not written into a slice of `df`
dataset = df.head(ENTRIES_COUNT).copy()


In [None]:
# Accessing `.name` on a dataframe row returns the row's index label rather than the "name" column, so the column is copied to "post_id"

import warnings
# assign() returns a new frame that includes the extra column, so no chained-assignment
# warning is raised and the global warning filters do not have to be modified.
dataset = dataset.assign(post_id=dataset["name"])

In [21]:
%%time

# Reset variables for if we run this multiple times
all_pros = []
all_names = []
all_titles = []
all_sources = []

# load in our data. this will take a while.

for i in range(dataset.shape[0]):

    post = dataset.iloc[i]
    modified_title = post.title.replace('CMV', "Change my mind")
    print(f"\n Loading entry {i+1}/{dataset.shape[0]}:\n\t\"{modified_title}\"")

    # Skip removed posts before spending time on the reddit request and the toxicity model
    if post.title == "[deleted]":
        continue

    # Sanity check: post_id must hold the reddit id, not the positional row index
    assert post.post_id != i, "post_id unexpectedly equals the row index"

    pros, _cons, _response = get_top_comment_and_clean_data(post.post_id)

    pros = " ".join(pros)
    pros = pros.replace("[deleted]", "")

    # Keep the cleaned text in a local variable; `post` is only a copy of the row
    body_text = cleanup_body_text(post.selftext)

    # Reddit URLs use the bare id, without the "t3_" prefix
    if post.post_id.startswith("t3_"):
        short_id = post.post_id[len("t3_"):]
    else:
        short_id = post.post_id

    all_titles.append(modified_title + " " + body_text)
    all_pros.append(pros)
    # `post.name` would be the row's index label, so the reddit id is taken from post_id
    all_names.append(post.post_id)
    all_sources.append(f"https://reddit.com/r/changemyview/comments/{short_id}")
    #print(post.title)




Saying "I hate black people" is a clearly racist statement showing intolerance and prejudice. It is wrong because black people have no choice in the colour of their skin, and so insulting them for it makes you an asshole because you know that what you are mocking them for is something that they can never change and that doesn't effect who they are. 

Drawing Muhammad is not an inherently bad thing, because you are showing disrespect for a *belief* which can be changed at any point. 
Identified toxic comment, ignoring...
> North Korea hacked Sony because they are so paranoid and insecure that they can't stand their leader being mocked. Some Muslims are so irrationally offended and self-important that they believe they have both the imperative and the right to kill people for drawing pictures. Neither of these examples illustrate problems with the speakers, they show the unhinged reactions of listeners.

Damn. I think you just changed my view. That's what I needed, thank you.
Identified 

KeyboardInterrupt: 

In [None]:
# Place it all into a Pandas Dataframe:
# one row per processed post, indexed by the values collected in all_names.
clean_df = pd.DataFrame({
    "INSTRUCTION": all_titles,  # modified title followed by the cleaned post body
    "RESPONSE": all_pros,       # joined "pro" comments for the post
    "SOURCE": all_sources       # URL of the post
}, index=all_names
)

In [None]:
# Preview the first nine rows of the assembled dataset
clean_df.head(9)

In [None]:
# Create the Apache Parquet file

import pyarrow as pa
import pyarrow.parquet as pq

# Convert the dataframe to an Arrow table and write it to output.parquet
table = pa.Table.from_pandas(clean_df)
pq.write_table(table,"output.parquet")

In [None]:
# Test to see if it was successful: read the file back and display it as a dataframe
table = pq.read_table("output.parquet")
table.to_pandas()