In [22]:
import random
random.seed(119)
import pandas as pd
import csv
import itertools

In [2]:
def read(fn):
    fh = open(fn, 'rt')
    data = []
    for line in fh:
        line = line.split('\t')
        label = line[0].strip('!')
        temp = line[1].strip('!').split('<|endoftext|>')
        question, response = temp[0], temp[1]
        data.append((label, question, response))
    return data

In [4]:
men_data = read('../8-21_help-pipeline_on_gendered-questions/vanilla_sample_high_conf_equal.csv')
men_data = pd.DataFrame(men_data)
men_data.columns = ['gender', 'question', 'response']

In [5]:
def pronouns(x):
    keywords = [' he ', ' him ', ' she ', 'her', ' they ', ' them ', ' it ', 'is that', ' this ']
    return any(i in x.lower() for i in keywords)

def has_or(x):
    return 'or' in x.lower()

def rhetoric(x):
    keywords = ['shouldn\'t', 'wouldn\'t', 'couldn\'t']
    return any(x.lower().startswith(i) for i in keywords)

def too_short(x):
    return len(x.split(' ')) <= 5

In [6]:
men_data

Unnamed: 0,gender,question,response
0,M,Is winning?,"Of a bit more than 6 1 now, it's not."
1,W,Is she trans?,Nah she's a dude.
2,W,Did she live?,She lived a few minutes after the accident.
3,M,ISIS twitter?,.
4,W,Are you my ex?,If she's still around I'll send you the pics.
...,...,...,...
59995,W,Do people in areas where the temperatures go b...,"Yes, often the people who forget their kids di..."
59996,M,Should we make anyone who wants to play Call o...,I see some of these on here. They sound scary....
59997,M,Wasnt the bill just a change in definition of ...,I'll believe this one.
59998,M,Are you seriously saying that the internet (wh...,Did you think he never said it?


In [7]:
men_data['q_pronoun'] = men_data.question.apply(pronouns)
men_data['q_has_or'] = men_data.question.apply(has_or)
men_data['q_rhetoric'] = men_data.question.apply(rhetoric)
men_data['q_too_short'] = men_data.question.apply(too_short)

In [8]:
good_qs = men_data[~men_data.q_pronoun & ~men_data.q_has_or & ~men_data.q_rhetoric & ~men_data.q_too_short]
good_qs

Unnamed: 0,gender,question,response,q_pronoun,q_has_or,q_rhetoric,q_too_short
106,W,Do you mix in a bowl?,No. This was my first homebrew and I was not u...,False,False,False,False
231,M,do you want a gaming PC?,"Yes, yes he do.",False,False,False,False
233,M,Did u manage to get any?,A few but nothing of worth,False,False,False,False
256,W,Do you tend to stay pale?,"Not really, I usually go with pale.",False,False,False,False
259,W,Are you sure thats a she?,Lucky I didn't even see the question mark,False,False,False,False
...,...,...,...,...,...,...,...
59880,M,Do you honestly believe that raising taxes on ...,This is the new way of the world. It's all ove...,False,False,False,False
59908,W,Are you that insecure about yourself that when...,Is that a reasonable reaction I wonder if ther...,False,False,False,False
59932,M,do you think that the tax-free compounding of ...,"Not sure, never held one myself.",False,False,False,False
59964,M,Do any of those allegations have anything to d...,No comment.,False,False,False,False


In [9]:
len(good_qs.question.unique()) / len(men_data.question.unique())

0.22977509231285667

In [10]:
len(good_qs.question.unique())

1369

In [11]:
good_qs.gender.value_counts()

M    7740
W    6000
Name: gender, dtype: int64

In [12]:
m_good_qs = good_qs[good_qs.gender == 'M']
w_good_qs = good_qs[good_qs.gender == 'W']

In [27]:
writer = csv.writer(open('./highly_conf.csv', 'wt'))
writer.writerow(['index', 'text', 'label', 'gender'])

index = 0
for i, row in itertools.chain(m_good_qs.iterrows(), w_good_qs.iterrows()):
    writer.writerow((index, '[CLS] %s [SEP] %s [SEP]' % (row['question'], row['response']), 'n', row['gender']))
    index += 1