In [1]:
import pandas as pd
pd.set_option('display.max_colwidth', 300)
import numpy as np
import pickle

# Datasets

In [2]:
reddit_posts = pd.read_csv('/nas/home/jwei/biases/rtgender/reddit_posts.csv')
facebook_public_figures = pd.read_csv('/nas/home/jwei/biases/rtgender/facebook_wiki_posts.csv')

### Reddit dataset

In [3]:
reddit_posts.head()

Unnamed: 0,op_id,op_gender,post_id,post_text,subreddit,op_gender_visible
0,Kastoli,M,0,slayer task perhaps?,2007scape,False
1,Kastoli,M,1,"Black DHide legs, possibly an initiate pure?",2007scape,False
2,DCBizzle,M,2,Whats a tonk? lol,2007scape,False
3,ordona,M,3,Do the Stronghold of Security for a free 10k to start.,2007scape,False
4,SlayerMaster,M,4,I cant tell if this guy just doesnt speak English or if he is a total retard. Probably a combination of the two.,2007scape,False


In [4]:
reddit_posts['op_gender'].value_counts()

M    1148591
W     304921
Name: op_gender, dtype: int64

In [5]:
reddit_posts['subreddit'].unique()

array(['2007scape', 'AdviceAnimals', 'Amd', 'anime', 'AskMen', 'AskOuija',
       'AskReddit', 'asoiaf', 'aww', 'baseball', 'BigBrother', 'Bitcoin',
       'BlackPeopleTwitter', 'buildapc', 'canada', 'cars',
       'CasualConversation', 'CFB', 'conspiracy', 'counting',
       'CringeAnarchy', 'dankmemes', 'DBZDokkanBattle', 'DestinyTheGame',
       'de', 'DotA2', 'ethtrader', 'europe', 'explainlikeimfive',
       'FFBraveExvius', 'ffxiv', 'FIFA', 'FireEmblemHeroes', 'Fitness',
       'formula1', 'funny', 'gameofthrones', 'Games', 'gaming', 'gifs',
       'GlobalOffensiveTrade', 'GlobalOffensive', 'gonewild',
       'hearthstone', 'heroesofthestorm', 'hiphopheads', 'hockey',
       'Ice_Poseidon', 'india', 'Jokes', 'leagueoflegends', 'magicTCG',
       'marvelstudios', 'me_irl', 'mildlyinteresting', 'MMA', 'movies',
       'Music', 'nba', 'neoliberal', 'news', 'nfl', 'NintendoSwitch',
       'nottheonion', 'OkCupid', 'Overwatch', 'pathofexile',
       'pcmasterrace', 'personalfinance', 

In [21]:
almost_balanced = []
for i, group in reddit_posts.groupby('subreddit'):
    p = sum(group['op_gender'] == 'M') / len(group)
    print(i, p)
    if 0.5 < p < 0.56:
        almost_balanced.append((i,p))
almost_balanced

2007scape 0.9547413793103449
AdviceAnimals 0.7515089062269984
Amd 0.9783845278725825
AskMen 0.8254330943847072
AskOuija 0.8376068376068376
AskReddit 0.6827032520325204
BigBrother 0.4340518816222141
Bitcoin 0.9351100811123986
BlackPeopleTwitter 0.8197347266881029
CFB 0.9736661466458658
CasualConversation 0.559511084258958
CringeAnarchy 0.8620162932790224
DBZDokkanBattle 1.0
DestinyTheGame 0.9579165880354784
DotA2 0.9032136462161907
FFBraveExvius 0.9439655172413793
FIFA 0.995069033530572
FireEmblemHeroes 0.5605839416058395
Fitness 0.8890602991481655
Games 0.9413909397502507
GlobalOffensive 0.9512104283054004
GlobalOffensiveTrade 0.9985315712187959
Ice_Poseidon 0.9459459459459459
Jokes 0.8596179475788538
MMA 0.8317161477190441
Music 0.8531013881567387
NintendoSwitch 0.729632945389436
OkCupid 0.6248697845276605
Overwatch 0.9047882646000553
PUBATTLEGROUNDS 0.9933554817275747
Philippines 0.7405405405405405
Rainbow6 0.9853249475890985
RocketLeague 0.9835787089467724
RocketLeagueExchange 0.986

[('CasualConversation', 0.559511084258958)]

In [22]:
for i, j in almost_balanced:
    print(i, sum(reddit_posts.subreddit == i))

CasualConversation 21517


# Features

In [26]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

### Reddit posts

In [27]:
balanced_reddit = reddit_posts[reddit_posts.subreddit.isin([i[0] for i in almost_balanced])]
text = balanced_reddit['post_text']
genders = balanced_reddit['op_gender'] == 'M'

In [28]:
print(np.mean(genders))

0.559511084258958


In [67]:
vectorizer = CountVectorizer(min_df=0.01, binary=True)
X = vectorizer.fit_transform(text)
y = genders
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

In [68]:
lr = LogisticRegression(penalty='l1').fit(X_train, y_train)



In [69]:
np.mean(lr.predict(X_test) == y_test)

0.5876612059951203

In [70]:
reddit_features = sorted(list(zip([i for i in lr.coef_[0]], vectorizer.get_feature_names())), key=lambda x: x[0])
print('Female:')
print(reddit_features[:40])
print('Male:')
print(reddit_features[-40:])

Female:
[(-0.6529920921031099, 'sounds'), (-0.6110909977894239, 'am'), (-0.5910956461703697, 'hes'), (-0.5251775702614715, 'thanks'), (-0.4450506721454017, 'night'), (-0.4407194541481424, 'he'), (-0.4327254622097197, 'definitely'), (-0.40228303588353426, 'why'), (-0.3990047125804721, 'sorry'), (-0.3976853910609456, 'job'), (-0.38436117620734983, 'everyone'), (-0.3777102962525746, 'oh'), (-0.36527516738247157, 'ever'), (-0.36079237249293006, 'awesome'), (-0.3525995582056733, 'show'), (-0.34572210261723557, 'talk'), (-0.34144036811564693, 'may'), (-0.33999066731053035, 'him'), (-0.3315994287116268, 'tell'), (-0.3209531410505501, 'yes'), (-0.3160446705879145, 'make'), (-0.30583945387913236, 'super'), (-0.2964435280867572, 'hate'), (-0.2803164608686426, 'look'), (-0.2741491872186317, 'even'), (-0.26265269391373053, 'year'), (-0.2552153143985445, 'wanted'), (-0.25241270451235687, 'making'), (-0.2498439844859993, 'didnt'), (-0.24918664197493376, 'idea'), (-0.24905376660673903, 'because'), (-

In [38]:
y = lr.predict_proba(X)
balanced_reddit['score'] = y[:,0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


##### Examples with male features

In [71]:
balanced_reddit.sort_values('score')[['post_text', 'score']].head(30)

Unnamed: 0,post_text,score
604230,"See, I read tarot and futhark for a good bit of time, and people like that piss me off. How many times in your life can you honestly say something has happened that is literally impossible to recover from? Shit, even dying... You probably wont really have time to care about it, thats pretty much...",0.033212
607271,"While you say Samsung, Id urge you to consider the other brands. I personally have a Sony Z3 Compact, and I absolutely love it. Its not a huge phone, so it can be used even with one small hand, but still powerful, its battery life is pretty good (I can generally get through two days without char...",0.035283
598051,"I dont use AV either. Basic common sense is AV enough in and of itself. You know, as much as we think of the current gen is pretty computer proficient, Id say most of them are stupidly dumb when its more complicated than Empty recycle bin. I know more than a few 18-20 year olds who wouldnt be a...",0.049202
610919,"Yeah, runs isnt something Id recommend until youre in decent shape (weightwise), it can be pretty bad for your joints if you carry excess weight, and asthma makes it hard. You might like swimming more, its isnt quite as taxing on your body but is still a great workout. Walking is a good thing ...",0.060126
597950,"Alright, Im from Denmark, so I can only comment on Copenhagen (although Im not from Copenhagen myself, so I might miss out on some of the cool spots). Some of the more famous places are Christiania and New Haven (Dont use the restaurants, theyre serious tourist traps and way too pricy compared ...",0.064142
610825,"I wont rate out of 10, but I can give my thoughts: Paris: Terrible. Its dirty, people are terrible drivers, so much pickpocketing. Antalya: Good as far as I remember. Id love to visit Turkey again. London: Pretty good, but its kind of average big city thing. Prague: Beautiful city, amazing o...",0.066108
601343,"Back when I used to play EVE, that was awesome. Being in a close-knit corp in nullsec, theres not really much to do, so most of your time in-game is just spent chilling on teamspeak talking with everyone. Ive been in & around a fair few competitive communities as well, most of them pretty cool ...",0.069056
599615,"It was pretty rainy when we were there. Which was actually kind of nice because it gets hot as shit in the valleys sometimes. It helped us cool off. But at the same time it made us a lot wetter and thus a lot colder at night and on the mountains, lol.",0.072674
609964,The story of your life is getting Reply didnt sent? Thats pretty cool. Youre on a hike as well? Thats pretty adventurous! I dont think itd be my thing either. I prefer to just chill at home without clothes rather than walk around outside without clothes.,0.072755
599888,"Sydneys a pretty big city :P There is one person I might IM every so often after I leave, which I think is pretty good. The weird thing is, so Im a consultant, theres no one else I work with as a consultant there, its all just the client. Its kinda weird, but hey getting to know people! Woo!",0.074238


##### Examples with female features

In [72]:
balanced_reddit.sort_values('score')[['post_text', 'score']].tail(30)

Unnamed: 0,post_text,score
603747,">Yes I do, but even as I write this I feel like someone is going to read through my comment history and be like alright this dude is a creep. We are all creeps to someone. Someone definitely thinks I am creepy. >and even if I was a creep I wouldnt date another American... Is it because we are...",0.866687
604640,"Wow this sounds so like me, except Ive only been with the company for four years! Ive been wanting to leave for months but I cant bring myself to actually apply for other jobs. I keep searching for them and making lists but never actually applying. I think Ive realised its because Im not actuall...",0.868054
599093,"I started with Windows 7 and still use it. It hast gotten slower, but thats Windows, not the laptop itself, but I am too lazy to reinstall it. Heat-wise I have no problems, but why live with a CPU temperature of 50-55°C, even when its hot, when I can get it down to 40°C-50°C with a cloth-wrappe...",0.868825
607297,"Oh yes, Ill be happy to update! My second wedding was very simple, a family-only garden affair, with barbecue and cheesecake afterward. The marriage sucked, but the ceremony was very nice, stress-free and relatively inexpensive. I think this one will be simpler still, at least until we start ...",0.869947
597767,"I have a friend (?) who Ill call Chris. Now, I came out as trangender about a month ago (though my close friends have known for 4-17 months) and started my authentic life full-time, and the reaction from my friends has ranged from pretending I dont exist, to inviting me to their wedding even th...",0.871454
615995,"Thank you, hes not active on social media or anywhere else so I cant look him up even if I wanted to :) I think some hard parts for me are when I get in my car I just think of the times he was with me and I associate the old memories together, there arent that many new memories you can make whil...",0.872614
612813,"I try to think positive whenever I feel I am overthinking. Sometimes it works but most of the time it seems to be only very weak. I always have the other thoughts tingling in the back of my head. I found out that I need to dristract myself from it, exercising or talking to people, but it is hard...",0.874194
614918,"Ugh, yeah that sounds difficult. Even if it ends on kinda good terms, its hard when youve been trying for so long. To answer your actual question, Im having an okay day. Im at work and its rainy, so I dont have much to do. I saw youre going on a trip in August, so am I! Im going with a friend t...",0.876823
613817,"Oh boy, I have so many: I hate the way I look and cant accept that my boyfriend is attracted to me because he appeared to be more into me when I was thinner... Im bisexual, only my boyfriend and a few other friends know that one. I get off on so many thoughts that my friends think are creepy ...",0.877957
617092,"I like the idea of kids, but I cant have my own naturally and neither my husband nor I want to go through the tremendous effort, expense, and emotional stress of either IVF or adoption at this point. Sometimes it makes me sad. Other times, I look at what my parent friends go through and Im so ...",0.881459


### Save

In [30]:
pickle.dump(reddit_features, open('./reddit_features.pkl', 'wb'))
pickle.dump(balanced_reddit, open('./balanced_reddit.pkl', 'wb'))