In [None]:
# Pre-Process CMV Data
# ====================

import pandas as pd
import json
import re
from IPython.display import Markdown
import random

In [None]:
# Load the raw pairs file: one JSON object per line.
# The context manager closes the handle deterministically, the encoding is
# pinned to UTF-8 (matching the writer at the end of the notebook), and blank
# lines are skipped so a trailing newline cannot crash json.loads.
with open("./data/pairs.jsonl", "r", encoding="utf-8") as pairs_file:
    cmv = [json.loads(ln) for ln in pairs_file if ln.strip()]

In [None]:
# Pull the four fields of interest out of every record into parallel lists
# (position i in each list belongs to record i of `cmv`).
# Comprehensions are used instead of a `for _ in cmv` loop so that no loop
# variable is left behind in the notebook namespace; `_` in particular is
# IPython's "last output" name and should not be bound to a record dict.
id_ = [pair["submission"]["id"] for pair in cmv]
claims = [pair["submission"]["title"] for pair in cmv]
args = [pair["submission"]["selftext"] for pair in cmv]
# Only the first comment of the delta-awarded reply is kept as the counter.
counters = [pair["delta_comment"]["comments"][0]["body"] for pair in cmv]

In [None]:
# Pick a random record index for spot-checking.
# randrange excludes its upper bound; the previous randint(0, len(cmv)) is
# inclusive on both ends and could return len(cmv), which raises IndexError
# on the lookup below.
# NOTE: the name `_` is kept because a later "Explore Post" cell indexes with it.
_ = random.randrange(len(cmv))

print(_, "/", len(counters))
counters[_]

In [None]:
# Number of extracted records. The four lists are used as parallel columns
# further down, so fail loudly here if they ever get out of step.
assert len(id_) == len(claims) == len(args) == len(counters), "extracted lists are misaligned"
print(len(id_))

In [None]:
# Clean Post
def cleanup(cmv_post):
    """Drop boilerplate lines from a post and return the remaining text.

    A line is discarded when, ignoring leading whitespace, it starts with
    ``&gt;`` or ``____``, or when the substring "edit" occurs (case-insensitively)
    anywhere within its first two whitespace-separated words.

    Args:
        cmv_post: the post body as a single string.

    Returns:
        The surviving lines, in their original order, joined with "\\n".
    """
    kept = []
    for raw_line in cmv_post.splitlines():
        if raw_line.lstrip().startswith(("&gt;", "____")):
            continue
        # Substring test on the first two words: "EDIT:" and "my edit" match,
        # and so does any word that merely contains "edit".
        leading_words = " ".join(raw_line.lower().split()[:2])
        if "edit" in leading_words:
            continue
        kept.append(raw_line)
    return "\n".join(kept)

# Display Post IPython Markdown
def show_post(title, cmv_post, counter):
    """Build a Markdown block quote showing a post and its counter-argument.

    Args:
        title: text rendered in bold as the heading.
        cmv_post: post body; it is passed through ``cleanup`` before rendering.
        counter: counter-argument text, shown under a bold "Counter" label.

    Returns:
        An ``IPython.display.Markdown`` object in which every line of the
        rendered text is prefixed with "> ".
    """
    template = "**{}** \n \n {} \n \n **Counter** \n \n {}"
    rendered = template.format(title, cleanup(cmv_post), counter)
    # Prefix each line so the whole post renders as a single quote block.
    quoted = "\n".join("> " + row for row in rendered.splitlines())
    return Markdown(quoted)

# Clean Text
def full_clean(data):
    """Normalise raw CMV strings into lowercase text of letters and ``. ! ?``.

    Each item is lowercased, passed through ``cleanup``, stripped of every
    "cmv" occurrence and of ``http...`` / ``www...`` tokens, reduced to ASCII
    letters, whitespace and ``. ! ?``, and finally whitespace-collapsed and
    trimmed. Apostrophes are removed by the character filter, so contractions
    collapse ("don't" -> "dont").

    Args:
        data: iterable of strings.

    Returns:
        A list of cleaned strings, in the same order as ``data``.
    """
    cleaned = []

    for text in data:
        # CMV clean-up
        clean = cleanup(text.lower())

        # RegEx clean-up
        # The text is already lowercase, so only the lowercase tag can occur;
        # a leftover ":" after it is removed by the character filter below.
        clean = re.sub(r"cmv", " ", clean)

        # Remove URL tokens while line breaks and punctuation still delimit
        # them, and before whitespace is collapsed, so no double spaces remain.
        clean = re.sub(r"http\S+", "", clean)
        clean = re.sub(r"www\S+", "", clean)

        # Keep letters, whitespace and sentence punctuation only.
        clean = re.sub(r"[^a-zA-Z\s.!?]", "", clean)

        # Collapse every whitespace run, line breaks included, into one space.
        # Deleting "\n" outright would glue the last word of a line onto the
        # first word of the next one.
        clean = re.sub(r"\s+", " ", clean)

        cleaned.append(clean.strip())

    return cleaned

In [None]:
### CONSTRUCT DATAFRAME OBJECTS ###

# One column per extracted field, taken from the four parallel lists.
args_obj = dict(
    id=id_,
    claim=claims,
    argument=args,
    counter=counters,
)

# Every column is cast to str so the text columns can be searched with `.str`.
args_df = pd.DataFrame(data=args_obj).astype(str)
args_df

In [None]:
# ### CONSTRUCT DATAFRAME OBJECTS ###
#
# arg_load = []
# for line in open('../data/train_cmv.jsonlist', 'r'):
#     arg_load.append(json.loads(line))
#
# args = pd.DataFrame(arg_load)
# titles = args["op_title"]
# props = args["op_text"]
# id = args["op_name"]
#
# wins = [
#     args["positive"][i]["comments"][0]["body"] for i in range(0, len(args))
# ]
#
# debate = {
#     "id": id,
#     "Titles": titles,
#     "Arguments": props,
#     "Counters": wins
# }
#
# debate = pd.DataFrame(data = debate, columns = ["id", "Titles", "Arguments", "Counters"]).astype(str)

In [None]:
# Exploratory Keyword Search
keyword = "Philosophy"
# regex=False matches the keyword as literal text, so characters such as "."
# or "+" in a keyword are not interpreted as regular-expression syntax.
args_df[args_df["counter"].str.contains(keyword, case=False, regex=False)]

In [None]:
# Explore Post
# Look the row up once by its index label, then render its three text fields.
post = args_df.loc[5]
show_post(post["claim"], post["argument"], post["counter"])

In [None]:
# Clean the three text columns; ids are carried over unchanged.
titles_clean = full_clean(claims)
args_clean = full_clean(args)
counters_clean = full_clean(counters)

# The column mapping gets its own name (parallel to `args_obj`) instead of
# rebinding `args_clean`, which previously switched from a list of strings to
# a dict containing that list within the same cell.
args_clean_obj = {
    "id": id_,
    "claim": titles_clean,
    "argument": args_clean,
    "counter": counters_clean
}

args_clean_df = pd.DataFrame(args_clean_obj)
args_clean_df

In [None]:
# Explore Post
# `_` is the random index chosen in the spot-check cell near the top.
# The title is the selected post's claim; passing args_clean_df["id"] here
# rendered the entire id column as the heading.
show_post(args_clean_df["claim"][_], args_clean_df["argument"][_], args_clean_df["counter"][_])

In [None]:
# Exploratory Search Keywords; Assert Clean (URLs: 'http', 'www')
keyword = "www"
# regex=False matches the keyword as literal text, so a probe such as "www."
# would look for an actual dot rather than any character.
args_clean_df[args_clean_df["counter"].str.contains(keyword, case=False, regex=False)]

In [None]:
# Search Keywords; Assert missing values at start of sentence (known argument issue)
keyword = "harassment"
# regex=False matches the keyword as literal text rather than as a regular expression.
args_clean_df[args_clean_df["counter"].str.contains(keyword, case=False, regex=False)]

In [None]:
# Output JSON List .jsonl
# `json` is imported in the notebook's first cell, so it is not re-imported here.

# One dict per row, restricted to the four exported fields; this replaces the
# row-by-row iterrows() accumulation with a single vectorised conversion.
data = args_clean_df[["id", "claim", "argument", "counter"]].to_dict(orient="records")

# One JSON object per line.
with open("./data/cmv_cleaned.jsonl", "w", encoding='utf-8') as f:
    for d in data:
        f.write(json.dumps(d))
        f.write("\n")