In [1]:
import pandas as pd
pd.options.display.max_colwidth = 100
import nltk
import numpy as np

### Data

In [2]:
# Directory holding the rtgender CSV exports. Kept in one place so the notebook can be
# pointed at another copy of the data by editing a single line.
DATA_DIR = '/nas/home/jwei/biases/rtgender'

# One DataFrame per source. Later cells read the `post_text` and `op_gender` columns of each.
reddit_posts = pd.read_csv(DATA_DIR + '/reddit_posts.csv')
facebook_public_figures = pd.read_csv(DATA_DIR + '/facebook_wiki_posts.csv')
facebook_politicians = pd.read_csv(DATA_DIR + '/facebook_congress_posts.csv')
fitocracy_posts = pd.read_csv(DATA_DIR + '/fitocracy_posts.csv')

# All questions

In [3]:
def q_rule(sentence):
    """Tell whether `sentence` (a string) ends with a question mark."""
    ends_with_question_mark = sentence.endswith('?')
    return ends_with_question_mark

# NLTK's English Punkt sentence tokenizer, loaded once at module level and used by
# first_satisfies_rule (via .tokenize) to split a post into sentences.
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
def first_satisfies_rule(x, rule):
    """Return the first sentence of `x` for which `rule(sentence)` is truthy.

    Parameters
    ----------
    x : object
        The text to scan. Anything that is not a string (e.g. a missing value)
        yields None without being tokenized.
    rule : callable
        Predicate called with one sentence string at a time.

    Returns
    -------
    str or None
        The first matching sentence as produced by `sent_detector.tokenize`,
        or None when `x` is not a string or no sentence matches.
    """
    # isinstance (rather than an exact type comparison) also accepts str subclasses.
    if not isinstance(x, str):
        return None

    # A separate name for each sentence keeps the parameter `x` intact; next() stops at
    # the first match and falls back to None when the generator is exhausted.
    return next(
        (sentence for sentence in sent_detector.tokenize(x) if rule(sentence)),
        None,
    )

In [4]:
# For every corpus, store the first sentence of each post that ends in '?' in column `q`
# (None when there is no such sentence or the post text is not a string), and a boolean
# `has_q` flag marking the posts where one was found.
for dataset in (reddit_posts, facebook_public_figures, facebook_politicians, fitocracy_posts):
    first_question = dataset['post_text'].apply(first_satisfies_rule, args=(q_rule,))
    dataset['q'] = first_question
    dataset['has_q'] = dataset['q'].notna()

In [5]:
# Show the last politician post in which a question sentence was found.
facebook_politicians.loc[facebook_politicians['has_q']].tail(1)

Unnamed: 0,op_id,op_gender,post_id,post_text,post_type,q,has_q
548219,18876566,M,548219,if you were giving President Obama a grade for his first 100 days -- what would it be?,status,if you were giving President Obama a grade for his first 100 days -- what would it be?,True


### High-precision yes/no questions (qy)

In [6]:
def rule_2(sentence):
    """Flag sentences that open with an auxiliary verb and end with '?'.

    The first word must be exactly one of the keywords below, optionally in its
    negative contraction ("isn't", "dont", ...). Matching is case-insensitive and
    requires a word boundary, so words that merely share a prefix with a keyword
    ("Dogs", "Israel", "Wash") are not counted.

    Parameters
    ----------
    sentence : str or None
        A single sentence. Missing values (None, NaN, any non-string) are
        passed through as None.

    Returns
    -------
    bool or None
        True when the sentence starts with a keyword and ends with '?',
        False otherwise, None when `sentence` is not a string.
    """
    import re  # local import: this notebook does not import `re` at the top

    if not isinstance(sentence, str):
        return None

    keywords = ['do', 'does', 'did', 'are', 'is', 'was', 'should']
    # keyword, optional "n't" / "n’t" / "nt", then a word boundary so that a keyword
    # is only accepted as a whole first word.
    pattern = r"(?:%s)(?:n['’]?t)?\b" % '|'.join(keywords)
    starts_with_keyword = re.match(pattern, sentence.lower()) is not None
    return starts_with_keyword and sentence.endswith('?')

In [7]:
# Classify each extracted question with rule_2 and keep the verdict in column `is_qy`.
# Posts without a question have q = None, for which rule_2 returns None.
for dataset in (reddit_posts, facebook_public_figures, facebook_politicians, fitocracy_posts):
    dataset['is_qy'] = dataset['q'].apply(rule_2)

### Rank questions

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [23]:
def score(text, genders):
    """Fit a bag-of-words classifier predicting `genders` from `text` and score every document.

    Parameters
    ----------
    text : iterable of str
        Documents to vectorize. Features are binary term indicators with English
        stop words removed, min_df=10 and at most 10,000 features.
    genders : array-like
        One label per document, aligned with `text`.

    Returns
    -------
    y : ndarray
        Column 0 of `predict_proba` for every document, i.e. the probability of
        the class stored in `lr.classes_[0]`. This covers all of `text`,
        including the documents the model was trained on.
    oov : 2-D boolean column (one row per document)
        True where a document contains none of the vocabulary terms. It is the
        result of `np.sum(..., axis=1) < 1.` on the sparse matrix and is returned
        unflattened.
    acc : float
        Accuracy on the held-out 20% split (random_state=0).
    features : list of (coefficient, term) tuples
        Sorted by ascending logistic-regression coefficient.
    """
    vectorizer = CountVectorizer(max_features=10000, min_df=10, binary=True, stop_words='english')
    X_all = vectorizer.fit_transform(text)
    X_train, X_test, y_train, y_test = train_test_split(X_all, genders, test_size=0.2, random_state=0)
    lr = LogisticRegression(solver='lbfgs', max_iter=200).fit(X_train, y_train)

    acc = np.mean(lr.predict(X_test) == y_test)

    # fit_transform already produced the document-term matrix for all of `text`,
    # so it is reused here instead of vectorizing the corpus a second time.
    oov = (np.sum(X_all, axis=1)) < 1.
    y = lr.predict_proba(X_all)[:, 0]  # probability scores for lr.classes_[0]

    # get_feature_names() was removed in scikit-learn 1.2; prefer its replacement and
    # fall back to the old name on releases that predate get_feature_names_out().
    get_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
    features = sorted(zip(lr.coef_[0], get_names()), key=lambda pair: pair[0])

    return y, oov, acc, features

In [24]:
for dataset in [reddit_posts, facebook_public_figures, facebook_politicians, fitocracy_posts]:
    print('Processing dataset...')
    # Only posts in which a question sentence was found are used to fit and score the model.
    dataset_questions = dataset[dataset.has_q]
    scores, oov, acc, features = score(dataset_questions['post_text'], dataset_questions['op_gender'])

    # `score` returns `oov` as a 2-D column. ravel() flattens it to one value per question
    # row whether it arrives as a matrix or a plain array, instead of relying on how
    # np.squeeze treats np.matrix.
    dataset.loc[dataset.has_q, 'oov'] = np.asarray(oov).ravel()
    dataset.loc[dataset.has_q, 'q_score'] = scores

    # Re-slice so `dataset_questions` also carries the `oov` and `q_score` columns just added.
    dataset_questions = dataset[dataset.has_q]
    print(dataset_questions.op_gender.value_counts(normalize=True))
    print(acc)

Processing dataset...




M    0.786438
W    0.213562
Name: op_gender, dtype: float64
0.7942478852519309
Processing dataset...
W    0.636587
M    0.363413
Name: op_gender, dtype: float64
0.8609406952965235
Processing dataset...
M    0.778471
W    0.221529
Name: op_gender, dtype: float64
0.8084924965893588
Processing dataset...
M    0.583724
W    0.416276
Name: op_gender, dtype: float64
0.6217887725975262


### Write to files

In [25]:
# Peek at the augmented columns on a frame referenced by name, rather than through a
# loop variable left over from an earlier cell.
fitocracy_posts.head(1)

Unnamed: 0,op_id,op_gender,post_id,post_text,q,has_q,is_qy,oov,q_score
0,102,W,0,"Thanks for the follow! I followed back :) I wanna kick some butt, too :) Let's do it!",,False,,,


In [26]:
for name, dataset in [('reddit', reddit_posts), ('facebook_public', facebook_public_figures), ('facebook_politicians', facebook_politicians), ('fitocracy', fitocracy_posts)]:
    # Keep only the posts that contain a question, ordered by ascending q_score.
    ranked_questions = dataset[dataset.has_q].sort_values('q_score')
    # Restrict to the exported columns, in this order.
    exported = ranked_questions[['op_id', 'op_gender', 'post_id', 'q', 'is_qy', 'q_score', 'oov']]
    # Tab-separated output, one file per dataset in the working directory.
    exported.to_csv('%s_questions.tsv' % name, sep='\t')

### Make samples