In [1]:
# Pre-Process CMV Data
# ====================

import pandas as pd
import json
import re
from IPython.display import Markdown
import random

In [2]:
cmv = [json.loads(ln) for ln in open("./data/pairs.jsonl", "r")]

In [3]:
id_ = []
claims = []
args = []
counters = []

for _ in cmv:
    id_.append(_["submission"]["id"])
    claims.append(_["submission"]["title"])
    args.append(_["submission"]["selftext"])
    counters.append(_["delta_comment"]["comments"][0]["body"])

In [4]:
_ = random.randint(0, len(cmv))

print(_, "/", len(counters))
counters[_]

6561 / 10303


"I will confess, I don't think anyone *needs* to watch anything, but there are a few reasons to watch it with a different outlook.  It's actually a great story about resilience, making amends for past transgressions, and being courageous against the odds.  It really challenges the definition of what justice is and a transformation of a dude that had every reason to just fold.  It's set to some catchy tunes which you can sing to (and dance to, if you so wish).  If you can get yourself around these really powerful themes, I think you'll find it far from bland.\n\nEdit: minor grammar"

In [42]:
print(len(id_))

10303


In [43]:
# Clean Post
def cleanup(cmv_post):
    lines = [
        line for line in cmv_post.splitlines()
        if not line.lstrip().startswith("&gt;")
        and not line.lstrip().startswith("____")
        and "edit" not in " ".join(line.lower().split()[:2])
    ]
    return "\n".join(lines)

# Display Post IPython Markdown
def show_post(title, cmv_post, counter):
    cmv_post = cleanup(cmv_post)
    md_format = "**{}** \n \n {} \n \n **Counter** \n \n {}".format(title, cmv_post, counter)
    md_format = "\n".join(["> " + line for line in md_format.splitlines()])

    return Markdown(md_format)

# Clean Text
def full_clean(data):
    # TODOs: Consider .strip()
    cleaned = []

    for i in data:
        # CMV clean-up
        i = i.lower()
        clean = cleanup(i)

        # RegEx clean-up
        clean = re.sub("CMV:", " ", clean)
        clean = re.sub("CMV", " ", clean)
        clean = re.sub("cmv", " ", clean)
        clean = re.sub(r'(\.)(?:[A-Z])',r'\1\n', clean)
        clean = re.sub(r"http\S+", "", clean)
        clean = re.sub(r"\n", "", clean)
        clean = re.sub(r'(?<=[a-z])\'(?=[a-z])', '', clean)
        clean = re.sub('([^a-zA-Z\s.!?])', "", clean)
        clean = re.sub('\s+', ' ', clean)

        clean = re.sub(r"www\S+", "", clean)
        cleaned.append(clean.strip())

        #clean = re.sub("^\s", "", clean)

    return cleaned

In [44]:
### CONSTRUCT DATAFRAME OBJECTS ###

args_obj = {
    "id": id_,
    "claim": claims,
    "argument": args,
    "counter": counters
}

args_df = pd.DataFrame(args_obj).astype(str)
args_df

Unnamed: 0,id,claim,argument,counter
0,t3_30oi71,CMV: We should strengthen the traditional safe...,## Section I: Why is Basic Income Increasingly...,The majority of your points seem predicated on...
1,t3_30oi71,CMV: We should strengthen the traditional safe...,## Section I: Why is Basic Income Increasingly...,&gt; Section V: Does the Welfare Trap Truly Ex...
2,t3_30oi71,CMV: We should strengthen the traditional safe...,## Section I: Why is Basic Income Increasingly...,"First, your disagreements about different fund..."
3,t3_30oi71,CMV: We should strengthen the traditional safe...,## Section I: Why is Basic Income Increasingly...,your position rests on the assumption that mos...
4,t3_4gdj35,CMV: Males and Females are socially and emotio...,Hi CMV! \n\nI am of the belief that there cann...,You seem to be basing your entire argument on ...
...,...,...,...,...
10298,t3_3h69kp,"CMV:The criticism of ""nice guys"" (as seen in r...",The reasoning behind this view is that everyon...,&gt; everyone is entitled to human companionsh...
10299,t3_3h69kp,"CMV:The criticism of ""nice guys"" (as seen in r...",The reasoning behind this view is that everyon...,"Hey OP, so what WOULD change your view?\nAre y..."
10300,t3_1tqlde,"I believe that people who say ""Why don't you j...",There have been a number of discussions on var...,If you remove the entire religious aspect of C...
10301,t3_1qiccr,I don't see the point of feminism in the West....,Feminism seems to me to be about acquiring equ...,&gt;Feminism seems to me to be about acquiring...


In [45]:
# ### CONSTRUCT DATAFRAME OBJECTS ###
#
# arg_load = []
# for line in open('../data/train_cmv.jsonlist', 'r'):
#     arg_load.append(json.loads(line))
#
# args = pd.DataFrame(arg_load)
# titles = args["op_title"]
# props = args["op_text"]
# id = args["op_name"]
#
# wins = [
#     args["positive"][i]["comments"][0]["body"] for i in range(0, len(args))
# ]
#
# debate = {
#     "id": id,
#     "Titles": titles,
#     "Arguments": props,
#     "Counters": wins
# }
#
# debate = pd.DataFrame(data = debate, columns = ["id", "Titles", "Arguments", "Counters"]).astype(str)

In [46]:
# Exploritory Keyword Search
keyword = "Philosophy"
args_df[args_df['counter'].str.contains(keyword,case=False)]

Unnamed: 0,id,claim,argument,counter
9,t3_5aceoz,CMV: Apple are falsely equating simplicity wit...,"**Update**\n\nThanks for all the replies, ther...",I'm not so much trying to change your view wit...
349,t3_31kfkt,"CMV: It is MY right to decide my fate, and tha...",***EDIT (please read)***: thank you all for yo...,I won't try to change your view that ones righ...
363,t3_2c0frm,CMV: /r/atheism should be renamed to /r/antith...,If you go to /r/atheism or even /r/atheismrebo...,There already is an /r/Antitheism which gets m...
499,t3_6y8y6j,CMV: All our motivations ultimately do (and ma...,"In my opinion, all our motivations and instinc...",Well slight correction here that actually is h...
507,t3_4zoqt4,CMV: Suicide by Aging is a thing.,"So, I have this very acid opinion of the Abrah...",OK. Freed from the blinders of my suicide cul...
...,...,...,...,...
10025,t3_4gmeoo,CMV: Black people need to begin accepting thei...,**[This post is politically incorrect. This po...,While a large portion of people have made very...
10041,t3_5qv6d6,CMV: There is no useful role for God/religion ...,The concept of a creator explained why things ...,I'm going to need several sources and bibliogr...
10044,t3_5ohwa6,CMV: I don't understand people who deny evolution,What about the bacterias? Why do we get sick? ...,I was raised a creationist. I denied evolution...
10104,t3_1kmwmn,I believe Conservative stances and policies ar...,"My overall problem with conservatism, whether ...","You're half right. Generally speaking, conserv..."


In [47]:
# Explore Post
show_post(args_df["claim"][5], args_df["argument"][5], args_df["counter"][5])

> **CMV: Males and Females are socially and emotionally incompatible and heterosexual relationships are unrealistic and farcical.** 
>  
>  Hi CMV! 
> 
> I am of the belief that there cannot exist a legitimate emotional bond between a male and a female that lead to a loving relationship (that is - one of romance). Not once have I seen a couple that did not have an enormous gap in interests, composure, speech patterns, and emotional engagement. There's always and awkward disparity between each partner, and they seem to be tolerating each other, whilst there is little to grasp on why they are interested in each other aside from basic biology. 
> 
> As a result of this, homosexual relationships have always seemed purely the logical and socially optimal choice for anyone pursuing a relationship, and heterosexual relationships continue only because of a heteronormitive society and the biology of reproduction (which I believe is immoral, but that's another topic). I may be biased, but from my perspective and experience I've simply never seen a straight couple that was legitimately compatible in most ways. 
>  
>  **Counter** 
>  
>  &gt; Not once have I seen a couple that did not have an enormous gap in interests
> 
> For what it's worth, there are two large biases here with you need to contend with.
> 
> 1. The couples which you know and see will be influenced heavily with who you associate with and is not representative of relationships in general or the potential which some relationships can reach
> 
> 2. This also depends heavily upon your judgment of those relationships. Which has the following problems 
> 
>   * You might not have complete indepth knowledge between the dynamics of these couples
> 
>   * Even if you did, you might midjudge aspects of a relationship. Do you know exactly what the other person needs or is looking for? Do you know what elements create successful relationships? Can you judge those correctly 100% of the time?
> 
> Also,
> 
> &gt; and heterosexual relationships continue only because of a heteronormitive society and the biology of reproduction 
> 
> What about relationships which last past this stage? This statement obviously doesn't hold up either.
> 
> &gt; I may be biased, but from my perspective and experience I've simply never seen a straight couple that was legitimately compatible in most ways.
> 
> That's basically your argument "i've never seen it work before therefore it cant work". It's not a very good position to defend because, as you've already awknowledged, its covered in bias which prevents it from being logically valid.
> 
> 
> 
> ___
> 
> I'm personally in a working relationship and the differences between thinking which come from being different sexes is quite apparent, that doesn't mean we cant work with it though.
> 
> &gt;they seem to be tolerating each other
> 
> Yes, there's a lot of that in a relationship. After all of the love hormones wear off after a couple of years there is an element of toleration since your partners flaws begin to show and you become to get to know them. The point is though that we're also able to accept each others flaws and let on with our lives despite them.
> 
> The reason why we can ignore them is because more of the time we are supportive of each other. We communicate openly and honestly and we are good listeners. We enjoy each others company much more often than not. We also accept that if the other falls short for some reason that they are only human and that's to be expected.

In [37]:
titles_clean = full_clean(claims)
args_clean = full_clean(args)
counters_clean = full_clean(counters)

args_clean = {
    "id": id_,
    "claim": titles_clean,
    "argument": args_clean,
    "counter": counters_clean
}

args_clean_df = pd.DataFrame(args_clean)
args_clean_df

Unnamed: 0,id,claim,argument,counter
0,t3_30oi71,we should strengthen the traditional safety ne...,section i why is basic income increasingly pop...,the majority of your points seem predicated on...
1,t3_30oi71,we should strengthen the traditional safety ne...,section i why is basic income increasingly pop...,try searching for welfare trap welfare cliff u...
2,t3_30oi71,we should strengthen the traditional safety ne...,section i why is basic income increasingly pop...,first your disagreements about different fundi...
3,t3_30oi71,we should strengthen the traditional safety ne...,section i why is basic income increasingly pop...,your position rests on the assumption that mos...
4,t3_4gdj35,males and females are socially and emotionally...,hi ! i am of the belief that there cannot exis...,you seem to be basing your entire argument on ...
...,...,...,...,...
10298,t3_3h69kp,the criticism of nice guys as seen in rniceguy...,the reasoning behind this view is that everyon...,why? everyone should have it and society shoul...
10299,t3_3h69kp,the criticism of nice guys as seen in rniceguy...,the reasoning behind this view is that everyon...,hey op so what would change your view?are you ...
10300,t3_1tqlde,i believe that people who say why dont you jus...,there have been a number of discussions on var...,if you remove the entire religious aspect of c...
10301,t3_1qiccr,i dont see the point of feminism in the west.,feminism seems to me to be about acquiring equ...,this is not complete its not just about legal ...


In [38]:
# Explore Post
show_post(args_clean_df["id"], args_clean_df["argument"][_], args_clean_df["counter"][_])

> **0        t3_30oi71
> 1        t3_30oi71
> 2        t3_30oi71
> 3        t3_30oi71
> 4        t3_4gdj35
>            ...    
> 10298    t3_3h69kp
> 10299    t3_3h69kp
> 10300    t3_1tqlde
> 10301    t3_1qiccr
> 10302    t3_1bc54q
> Name: id, Length: 10303, dtype: object** 
>  
>  i like musicals. when i was little i watched high school musical and loved it. i sing i dance and im in a show choir that performs showtunes. just from being around theatre geeks i know most of the words to can you hear the people sing...they all love this musical. ive only seen the first five minutes of it and i want to motivate myself to watch it again but it just seems so....bland. dull. please and convince me that i need to watch this. i am of course referring to the version. 
>  
>  **Counter** 
>  
>  i will confess i dont think anyone needs to watch anything but there are a few reasons to watch it with a different outlook. its actually a great story about resilience making amends for past transgressions and being courageous against the odds. it really challenges the definition of what justice is and a transformation of a dude that had every reason to just fold. its set to some catchy tunes which you can sing to and dance to if you so wish. if you can get yourself around these really powerful themes i think youll find it far from bland.

In [39]:
# Exploritory Search Keywords; Assert Clean (URLs: 'http', 'www')
keyword = "www"
args_clean_df[args_clean_df['counter'].str.contains(keyword,case=False)]

Unnamed: 0,id,claim,argument,counter


In [40]:
# Search Keywords; Assert missing values at start of sentence (known argument issue)
keyword = "harassment"
args_clean_df[args_clean_df['counter'].str.contains(keyword,case=False)]

Unnamed: 0,id,claim,argument,counter
354,t3_1i2u57,i believe that kids who are bullied are partia...,so since youre reading this you probably haven...,i was unaware that having puberty later than o...
421,t3_3ia43m,it is neither derogatory nor a promotion of se...,i believe that in our current society heterose...,harassment is nothing more than aggressive pre...
582,t3_1zwqb4,i think that crimes committed while acting in ...,i personally think betrayal of trust is one of...,theres three basic reasons that this is frowne...
779,t3_54wr8w,you shouldnt have to pay more taxes just becau...,im a firm believer in a meritocracy.financiall...,thats pretty much the point everyone wants fre...
835,t3_2lqcby,catcalling and street harassment should be a t...,with the recent discussion of street harassmen...,well theres a pretty big free speech problem. ...
1118,t3_4n5nmv,there is a difference between being happy with...,i came across the instagram user glitterandlaz...,society doesnt do anything to help anorexics b...
1156,t3_1sw6ya,as a proud liberal i believe that framing left...,i believe that leftwingliberal parties are not...,they support more government involvement in pe...
1239,t3_2131f9,banning picketing outside funerals is a good i...,its hard to counterargue slippery slope critic...,youre sort of right but i think for the wrong ...
1448,t3_2kk7pc,revenge porn should not be a criminal offence.,revenge porn can be extremely hurtful there is...,in several highprofile cases where gossip has ...
1590,t3_5cy2u9,porn only brings harm and should be banned.,im not saying from a specific country or regio...,youre forgetting that porn is a job that helps...


In [41]:
# Output JSON List .jsonl
import json

data = []
for idx, row in args_clean_df.iterrows():
    data.append({
        "id": row["id"],
        "claim": row["claim"],
        "argument": row["argument"],
        "counter": row["counter"]
    })

with open("./data/cmv_cleaned.jsonl", "w", encoding='utf-8') as f:
    for d in data:
        f.write(json.dumps(d))
        f.write("\n")