In [95]:
import pandas as pd
pd.options.display.max_colwidth = 100
import nltk
import numpy as np

### Data

In [44]:
reddit_posts = pd.read_csv('/nas/home/jwei/biases/rtgender/reddit_posts.csv')
facebook_public_figures = pd.read_csv('/nas/home/jwei/biases/rtgender/facebook_wiki_posts.csv')
facebook_politicians = pd.read_csv('/nas/home/jwei/biases/rtgender/facebook_congress_posts.csv')
fitocracy_posts = pd.read_csv('/nas/home/jwei/biases/rtgender/fitocracy_posts.csv')

### Precision qy

In [45]:
def rule_2(sentence):
    keywords = ['do', 'does', 'did', 'are', 'is', 'was', 'should']
    startswith = any(sentence.lower().startswith(i) for i in keywords)
    return startswith and sentence.endswith('?')

In [46]:
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

def first_satisfies_rule2(x):
    for x in sent_detector.tokenize(x):
        if rule_2(x):
            return x
    return None

In [47]:
reddit_posts['qy'] = reddit_posts.post_text.apply(first_satisfies_rule2)
reddit_posts['has_qy'] = ~reddit_posts.qy.isna()

In [49]:
reddit_posts[reddit_posts.has_qy].head(3)

Unnamed: 0,op_id,op_gender,post_id,post_text,subreddit,op_gender_visible,qy,has_qy
5,SlayerMaster,M,5,Are they giving it to everyone for free at the...,2007scape,False,Are they giving it to everyone for free at the...,True
12,Kastoli,M,12,"Are blues still the most expensive, or are pur...",2007scape,False,"Are blues still the most expensive, or are pur...",True
31,SlayerMaster,M,31,Are steel drags pretty good money?,2007scape,False,Are steel drags pretty good money?,True


In [51]:
reddit_posts[reddit_posts.has_qy].op_gender.value_counts('op_gender')

M    0.771802
W    0.228198
Name: op_gender, dtype: float64

### Rank qy

In [52]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [81]:
reddit_qys = reddit_posts[reddit_posts.has_qy].copy()
text = reddit_qys['qy']
genders = reddit_qys['op_gender'] == 'M'
np.mean(genders)

0.7718015992003998

In [82]:
vectorizer = CountVectorizer(max_features=10000, min_df=10, binary=True)
X = vectorizer.fit_transform(text)
y = genders
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [83]:
lr = LogisticRegression().fit(X_train, y_train)



In [84]:
# test set accuracy
np.mean(lr.predict(X_test) == y_test)

0.758881861482002

In [85]:
reddit_features = sorted(list(zip([i for i in lr.coef_[0]], vectorizer.get_feature_names())), key=lambda x: x[0])
print('Female:')
print(reddit_features[:20])
print('Male:')
print(reddit_features[-20:])

Female:
[(-1.7408111063838958, 'comfort'), (-1.5480281914473364, 'frequently'), (-1.53385533246326, 'apartment'), (-1.5311280043360171, 'penises'), (-1.51676075097512, 'therapy'), (-1.477753920661345, 'lips'), (-1.4395560917112706, 'volunteer'), (-1.431799697333022, 'wedding'), (-1.3640081851581358, 'orders'), (-1.3618311201093258, 'feeds'), (-1.3400268623421676, 'advice'), (-1.3044269933286, 'anal'), (-1.2960153947384943, 'turkey'), (-1.269356207459313, 'calls'), (-1.268732206171965, 'publicly'), (-1.2384475123897651, 'flowers'), (-1.2227580146389017, 'consequences'), (-1.2042028142595465, 'mothers'), (-1.1821604190781054, 'vagina'), (-1.1727750303359112, 'paul')]
Male:
[(1.2400484815084851, 'citizen'), (1.2487812944082428, 'meal'), (1.2491706152225248, 'logical'), (1.2675785229232932, 'team'), (1.2721304102205777, 'rub'), (1.2828602954044828, 'killer'), (1.2927325961045428, 'lucky'), (1.2949014359309388, 'forgetting'), (1.341129971333667, 'bench'), (1.3782669575806241, 'winning'), (1

In [89]:
X = vectorizer.transform(reddit_qys.qy)
y = lr.predict_proba(X)
reddit_qys['score'] = y

In [97]:
reddit_qys.sort_values('score')[['qy', 'score']]

Unnamed: 0,qy,score
945014,Do the Lakers really have a chance at winning a championship if they gave Kobe the big contract ...,0.001539
1006080,Do you realize that by using TDs and ignoring the differences in the talent surrounding them.......,0.001642
679834,"Do you want to know how it works on a mechanical level (transistors, and so), how its built (the...",0.001699
366908,"Do you specifically mean internet acronyms, or does it bother you when someone talks about laser...",0.001894
1139762,"Do these people really think the most powerful, far-reaching government on the planet doesnt hav...",0.002691
...,...,...
903668,Is your mum one of those mothers who loves to cook and stuff people with delicious food until th...,0.942110
74776,Does this apply to ANY potential pregnancy (even with a serious SO/wife) or just with casual sex...,0.945138
172946,Do you seriously think having 2 parents that no longer love each other but feel forced to stay t...,0.971977
402009,"Did your dads have a special female friend or aunt for you to talk to about adolescence, fashion...",0.983818


### Save sets

50% random male samples, 50% random female samples

In [121]:
for i in range(1, 4):
    trial1 = pd.concat([reddit_qys[reddit_qys.op_gender == 'W'].sample(3000), reddit_qys[reddit_qys.op_gender == 'M'].sample(3000)])
    trial1[['op_gender', 'qy']].to_csv('random_equal_t%d.csv' % i, sep='\t', header=False, index=False)

50% high confidence male samples, 50% high confidence female samples

In [128]:
df = pd.concat([reddit_qys[reddit_qys.op_gender == 'W'].sort_values('score', ascending=False).head(3000),
           reddit_qys[reddit_qys.op_gender == 'M'].sort_values('score', ascending=True).head(3000)])
df[['op_gender', 'qy']].to_csv('high_conf_equal.csv', sep='\t', header=False, index=False)

random male/female samples

In [129]:
df = reddit_qys[reddit_qys.op_gender == 'W'].sample(6000)
df[['score', 'qy']].to_csv('random_women.csv', sep='\t', header=False, index=False)

df = reddit_qys[reddit_qys.op_gender == 'M'].sample(6000)
df[['score', 'qy']].to_csv('random_men.csv', sep='\t', header=False, index=False)