In [1]:
import pandas as pd
pd.options.display.max_colwidth = 100
import nltk
import numpy as np

### Data

In [2]:
reddit_posts = pd.read_csv('/nas/home/jwei/biases/rtgender/reddit_posts.csv')
facebook_public_figures = pd.read_csv('/nas/home/jwei/biases/rtgender/facebook_wiki_posts.csv')
facebook_politicians = pd.read_csv('/nas/home/jwei/biases/rtgender/facebook_congress_posts.csv')
fitocracy_posts = pd.read_csv('/nas/home/jwei/biases/rtgender/fitocracy_posts.csv')

# All questions

In [14]:
def q_rule(sentence):
    return sentence.endswith('?')

sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
def first_satisfies_rule(x, rule):
    if not type(x) == str:
        return None
    
    for x in sent_detector.tokenize(x):
        if rule(x):
            return x
    return None

In [15]:
for dataset in [reddit_posts, facebook_public_figures, facebook_politicians, fitocracy_posts]:
    dataset['q'] = dataset.post_text.apply(lambda x: first_satisfies_rule(x, q_rule))
    dataset['has_q'] = ~dataset.q.isna()

In [23]:
facebook_politicians[facebook_politicians.has_q].tail(2000).head(100)

Unnamed: 0,op_id,op_gender,post_id,post_text,post_type,q,has_q
508102,52515090,W,508102,Thousands of Nebraskans continue to contact me with stories of higher premiums and lost coverage...,photo,What's your story?,True
508204,52515090,W,508204,"ICYMI: Yesterday, I delivered remarks on the Senate floor outlining in detail my concerns with t...",video,"Did you know that in 2006 and 2007, a biometric entry/exit system at all points of entry – air, ...",True
508235,52515090,W,508235,"On top of the IRS' unfair, unconstitutional targeting of conservative groups, it's shocking to l...",link,How can the American people have confidence in this agency to carry out its basic responsibiliti...,True
508285,52515090,W,508285,Earth Day was first officially commemorated 43 years ago. Conservation is a priority of my famil...,status,What is your favorite place to go in our state to appreciate nature?,True
508315,52515090,W,508315,Have you submitted your ideas on how to cut the red tape of overregulation yet? I recently launc...,link,Have you submitted your ideas on how to cut the red tape of overregulation yet?,True
...,...,...,...,...,...,...,...
510265,6439709,M,510265,Can you imagine having to pay as much as $400 just to talk to your loved one for 15 minutes? For...,link,Can you imagine having to pay as much as $400 just to talk to your loved one for 15 minutes?,True
510290,6439709,M,510290,Republicans' first priority in 2016? Taking healthcare away from millions of Americans. Shocker.,link,Republicans' first priority in 2016?,True
510293,6439709,M,510293,A handful of the wealthiest Americans are spending millions of dollars on lobbying to save thems...,link,Now?,True
510306,6439709,M,510306,My response to Donald J. Trump calling for an end to Muslim immigration: Most people know more ...,status,Do we really want someone who doesn't understand our values speaking to the world on our behalf?,True


### Precision qy

In [3]:
def qy_rule(sentence):
    keywords = ['do', 'does', 'did', 'are', 'is', 'was', 'should']
    startswith = any(sentence.lower().startswith(i) for i in keywords)
    return startswith and sentence.endswith('?')

In [3]:
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

def first_satisfies_rule2(x):
    for x in sent_detector.tokenize(x):
        if rule_2(x):
            return x
    return None

In [5]:
reddit_posts['qy'] = reddit_posts.post_text.apply(first_satisfies_rule2)
reddit_posts['has_qy'] = ~reddit_posts.qy.isna()

In [6]:
reddit_posts[reddit_posts.has_qy].head(3)

Unnamed: 0,op_id,op_gender,post_id,post_text,subreddit,op_gender_visible,qy,has_qy
5,SlayerMaster,M,5,Are they giving it to everyone for free at the events or are they just going to drop it the same...,2007scape,False,Are they giving it to everyone for free at the events or are they just going to drop it the same...,True
12,Kastoli,M,12,"Are blues still the most expensive, or are purples more expensive, since there wasnt the dupe bu...",2007scape,False,"Are blues still the most expensive, or are purples more expensive, since there wasnt the dupe bu...",True
31,SlayerMaster,M,31,Are steel drags pretty good money?,2007scape,False,Are steel drags pretty good money?,True


In [7]:
reddit_posts[reddit_posts.has_qy].op_gender.value_counts('op_gender')

M    0.771802
W    0.228198
Name: op_gender, dtype: float64

### Rank qy

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [9]:
reddit_qys = reddit_posts[reddit_posts.has_qy].copy()
text = reddit_qys['qy']
genders = reddit_qys['op_gender'] == 'M'
np.mean(genders)

0.7718015992003998

In [21]:
vectorizer = CountVectorizer(max_features=10000, min_df=10, binary=True)
X = vectorizer.fit_transform(text)
y = genders
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [22]:
lr = LogisticRegression().fit(X_train, y_train)



In [23]:
# test set accuracy
np.mean(lr.predict(X_test) == y_test)

0.758881861482002

In [24]:
reddit_features = sorted(list(zip([i for i in lr.coef_[0]], vectorizer.get_feature_names())), key=lambda x: x[0])
print('Female:')
print(reddit_features[:40])
print('Male:')
print(reddit_features[-40:])

Female:
[(-1.7408111063838958, 'comfort'), (-1.5480281914473364, 'frequently'), (-1.53385533246326, 'apartment'), (-1.5311280043360171, 'penises'), (-1.51676075097512, 'therapy'), (-1.477753920661345, 'lips'), (-1.4395560917112706, 'volunteer'), (-1.431799697333022, 'wedding'), (-1.3640081851581358, 'orders'), (-1.3618311201093258, 'feeds'), (-1.3400268623421676, 'advice'), (-1.3044269933286, 'anal'), (-1.2960153947384943, 'turkey'), (-1.269356207459313, 'calls'), (-1.268732206171965, 'publicly'), (-1.2384475123897651, 'flowers'), (-1.2227580146389017, 'consequences'), (-1.2042028142595465, 'mothers'), (-1.1821604190781054, 'vagina'), (-1.1727750303359112, 'paul'), (-1.1485156017954192, 'healing'), (-1.147958251793627, 'jury'), (-1.1456843739950395, 'growing'), (-1.1220878546380995, 'alex'), (-1.1178403943579738, 'obviously'), (-1.115676009110142, 'shave'), (-1.1106386792249876, 'accepting'), (-1.1097000644004906, 'adults'), (-1.1043699500068593, 'depend'), (-1.09618336823463, 'treatme

In [25]:
X = vectorizer.transform(reddit_qys.qy)
y = lr.predict_proba(X)
reddit_qys['score'] = y

ValueError: Wrong number of items passed 2, placement implies 1

In [None]:
reddit_qys.sort_values('score')[['qy', 'gender', 'score']]

### Most predictive

In [29]:
reddit_qys[reddit_qys.post_text.str.contains('gaming')].op_gender.value_counts()

M    143
W     21
Name: op_gender, dtype: int64

### Save sets

50% random male samples, 50% random female samples

In [121]:
for i in range(1, 4):
    trial1 = pd.concat([reddit_qys[reddit_qys.op_gender == 'W'].sample(3000), reddit_qys[reddit_qys.op_gender == 'M'].sample(3000)])
    trial1[['op_gender', 'qy']].to_csv('random_equal_t%d.csv' % i, sep='\t', header=False, index=False)

50% high confidence male samples, 50% high confidence female samples

In [128]:
df = pd.concat([reddit_qys[reddit_qys.op_gender == 'W'].sort_values('score', ascending=False).head(3000),
           reddit_qys[reddit_qys.op_gender == 'M'].sort_values('score', ascending=True).head(3000)])
df[['op_gender', 'qy']].to_csv('high_conf_equal.csv', sep='\t', header=False, index=False)

random male/female samples

In [129]:
df = reddit_qys[reddit_qys.op_gender == 'W'].sample(6000)
df[['score', 'qy']].to_csv('random_women.csv', sep='\t', header=False, index=False)

df = reddit_qys[reddit_qys.op_gender == 'M'].sample(6000)
df[['score', 'qy']].to_csv('random_men.csv', sep='\t', header=False, index=False)