In [1]:
import os
import pickle

import numpy as np
import pandas as pd

pd.set_option('display.max_colwidth', 300)

# Datasets

In [2]:
# Directory holding the RtGender CSV files. It defaults to the location this notebook
# was originally run from; set the RTGENDER_DIR environment variable to point the
# notebook at a different copy of the data without editing the cell.
RTGENDER_DIR = os.environ.get('RTGENDER_DIR', '/nas/home/jwei/biases/rtgender')

# Both frames are used below through their `post_text` and `op_gender` columns.
reddit_posts = pd.read_csv(os.path.join(RTGENDER_DIR, 'reddit_posts.csv'))
facebook_public_figures = pd.read_csv(os.path.join(RTGENDER_DIR, 'facebook_wiki_posts.csv'))

### Reddit dataset

In [3]:
# Peek at the first five Reddit rows to see which columns are available.
reddit_posts.head(5)

Unnamed: 0,op_id,op_gender,post_id,post_text,subreddit,op_gender_visible
0,Kastoli,M,0,slayer task perhaps?,2007scape,False
1,Kastoli,M,1,"Black DHide legs, possibly an initiate pure?",2007scape,False
2,DCBizzle,M,2,Whats a tonk? lol,2007scape,False
3,ordona,M,3,Do the Stronghold of Security for a free 10k to start.,2007scape,False
4,SlayerMaster,M,4,I cant tell if this guy just doesnt speak English or if he is a total retard. Probably a combination of the two.,2007scape,False


In [4]:
# Number of Reddit posts per author-gender label.
reddit_posts.op_gender.value_counts()

M    1148591
W     304921
Name: op_gender, dtype: int64

In [5]:
# Distinct subreddits present in the Reddit data, in order of first appearance.
reddit_posts.subreddit.unique()

array(['2007scape', 'AdviceAnimals', 'Amd', 'anime', 'AskMen', 'AskOuija',
       'AskReddit', 'asoiaf', 'aww', 'baseball', 'BigBrother', 'Bitcoin',
       'BlackPeopleTwitter', 'buildapc', 'canada', 'cars',
       'CasualConversation', 'CFB', 'conspiracy', 'counting',
       'CringeAnarchy', 'dankmemes', 'DBZDokkanBattle', 'DestinyTheGame',
       'de', 'DotA2', 'ethtrader', 'europe', 'explainlikeimfive',
       'FFBraveExvius', 'ffxiv', 'FIFA', 'FireEmblemHeroes', 'Fitness',
       'formula1', 'funny', 'gameofthrones', 'Games', 'gaming', 'gifs',
       'GlobalOffensiveTrade', 'GlobalOffensive', 'gonewild',
       'hearthstone', 'heroesofthestorm', 'hiphopheads', 'hockey',
       'Ice_Poseidon', 'india', 'Jokes', 'leagueoflegends', 'magicTCG',
       'marvelstudios', 'me_irl', 'mildlyinteresting', 'MMA', 'movies',
       'Music', 'nba', 'neoliberal', 'news', 'nfl', 'NintendoSwitch',
       'nottheonion', 'OkCupid', 'Overwatch', 'pathofexile',
       'pcmasterrace', 'personalfinance', 

In [6]:
# Collect (subreddit, male_share) for every subreddit whose share of posts written by
# male authors lies strictly between 40% and 60%.
#
# The share is the mean of the boolean mask `op_gender == 'M'`, i.e. male posts divided
# by all posts in the subreddit. (The gender labels in this data are 'M' and 'W'.)
almost_balanced = []
for subreddit, group in reddit_posts.groupby('subreddit'):
    male_share = (group['op_gender'] == 'M').mean()
    if 0.4 < male_share < 0.6:
        almost_balanced.append((subreddit, male_share))
almost_balanced

[('BigBrother', 0.4340518816222141),
 ('CasualConversation', 0.559511084258958),
 ('FireEmblemHeroes', 0.5605839416058395),
 ('aww', 0.4890519061909525)]

In [7]:
# Print how many posts each near-balanced subreddit contributes.
for name, _share in almost_balanced:
    n_posts = (reddit_posts['subreddit'] == name).sum()
    print(name, n_posts)

BigBrother 8211
CasualConversation 21517
FireEmblemHeroes 685
aww 8997


### Facebook public figures

In [8]:
# Peek at the first five Facebook rows to see which columns are available.
facebook_public_figures.head(5)

Unnamed: 0,op_id,op_gender,post_id,post_text,post_type
0,11679984,M,0,"Tornado watch in effect tonight. Be safe. Please feel free to share this with friends, so they know.",photo
1,11679984,M,1,,photo
2,11679984,M,2,Temps warming up for the holiday weekend! I will see you next Tuesday. Have a great weekend!,photo
3,11679984,M,3,Showers heading this way ..,photo
4,11679984,M,4,Storm potential update...feel free to share this with friends so they know when the storms will hit.,photo


In [9]:
# Number of Facebook posts per author-gender label.
facebook_public_figures['op_gender'].value_counts()

W    99692
M    49756
Name: op_gender, dtype: int64

In [10]:
# Keep only the first occurrence of each distinct post text.
# Note: this rebinds `facebook_public_figures`, so the frame displayed above is the
# pre-deduplication version.
facebook_public_figures = facebook_public_figures.drop_duplicates(subset='post_text')

# Features

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

### Reddit posts

In [12]:
# Restrict the Reddit data to the near-balanced subreddits found above.
balanced_subreddits = [name for name, _share in almost_balanced]

# .copy() makes `balanced_reddit` an independent frame rather than a slice of
# `reddit_posts`, so adding columns to it later does not raise SettingWithCopyWarning.
balanced_reddit = reddit_posts[reddit_posts.subreddit.isin(balanced_subreddits)].copy()

# Model inputs: raw post text, and a boolean target that is True for male authors.
text = balanced_reddit['post_text']
genders = balanced_reddit['op_gender'] == 'M'

In [13]:
# Fraction of the selected Reddit posts written by male authors (the class prior).
print(genders.mean())

0.5173052524739914


In [14]:
# Binary bag-of-words over the 10,000 most frequent tokens: each feature records
# whether a token occurs in the post, not how often.
vectorizer = CountVectorizer(binary=True, max_features=10000)
X, y = vectorizer.fit_transform(text), genders

# 60/40 train/test split with a fixed seed so the split is reproducible.
reddit_split = train_test_split(X, y, test_size=0.4, random_state=0)
X_train, X_test, y_train, y_test = reddit_split

In [15]:
# Logistic regression with scikit-learn's default settings, fitted on the training split.
lr = LogisticRegression()
lr = lr.fit(X_train, y_train)



In [16]:
# Held-out accuracy: share of test posts whose predicted label matches the true one.
test_predictions = lr.predict(X_test)
np.mean(test_predictions == y_test)

0.6141207815275311

In [17]:
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2. Prefer
# get_feature_names_out() and fall back only on older releases that lack it.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocabulary = vectorizer.get_feature_names_out()
else:
    vocabulary = vectorizer.get_feature_names()

# (coefficient, token) pairs sorted by ascending coefficient. The target is
# `op_gender == 'M'`, so the most negative weights point towards a non-male (female)
# author and the most positive towards a male author.
# Values are converted to plain float/str so the printed and pickled pairs look the
# same regardless of the NumPy version in use.
reddit_features = sorted(
    ((float(weight), str(token)) for weight, token in zip(lr.coef_[0], vocabulary)),
    key=lambda pair: pair[0],
)
print('Female:')
print(reddit_features[:20])
print('Male:')
print(reddit_features[-20:])

Female:
[(-2.5549717908231457, 'tiff'), (-2.1699673030496176, 'accounting'), (-1.813619665985787, 'behaviour'), (-1.802988138114478, 'hahah'), (-1.7965707576393026, 'afternoon'), (-1.794535071380404, 'boys'), (-1.6480417011662425, 'lemon'), (-1.6110787123161356, 'plastic'), (-1.5562517715090045, 'stephen'), (-1.4982892097252096, 'vancouver'), (-1.4933823786199054, 'confident'), (-1.4899613633357782, 'showmances'), (-1.4876972280568086, 'cuteness'), (-1.4691017909671564, 'nat'), (-1.4582542694284963, 'jeez'), (-1.4332323844310002, 'removed'), (-1.4276670012780082, 'hubby'), (-1.4256149209243347, 'unicorn'), (-1.4203542107420337, 'pinterest'), (-1.4174269175337033, 'boyfriend')]
Male:
[(1.397995802472274, 'beard'), (1.4004780170623503, 'dates'), (1.4220930614170975, 'dinners'), (1.4801541210324538, 'ginamarie'), (1.4966896891825756, 'race'), (1.4969641530473543, 'intro'), (1.5171810547726792, 'beer'), (1.5587637858265555, 'pinball'), (1.5826514358597328, 'da'), (1.6190214021500755, 'surp

In [18]:
# Class probabilities for every selected Reddit post (training and test rows alike).
# predict_proba orders its columns like lr.classes_, which is [False, True] for the
# boolean target, so column 0 is P(op_gender != 'M').
# NOTE: from here on `y` holds probabilities, no longer the labels.
y = lr.predict_proba(X)

# assign() returns a new frame with the extra column instead of writing into what
# pandas may consider a slice of `reddit_posts`, which avoids SettingWithCopyWarning.
balanced_reddit = balanced_reddit.assign(score=y[:, 0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


##### Examples with male features (lowest `score`, where `score` is the predicted probability that the author is not male)

In [19]:
# The 30 posts with the lowest `score`.
balanced_reddit[['post_text', 'score']].sort_values('score').head(30)

Unnamed: 0,post_text,score
597950,"Alright, Im from Denmark, so I can only comment on Copenhagen (although Im not from Copenhagen myself, so I might miss out on some of the cool spots). Some of the more famous places are Christiania and New Haven (Dont use the restaurants, theyre serious tourist traps and way too pricy compared ...",1.154365e-11
597495,"Alright, let me take a crack at it! For the record, Ive got some beef with the show, but its still entirely worth watching! #Production Quality The first thing I noticed when watching the show was the production quality. This is a TV show, yet it doest look cheap like a lot of TV shows are won...",1.667573e-06
617149,"Some of my favourite memories: * **Barcelona, Spain**: Walking around the city centre after Barcelona won the Champions League in 2015. Kids setting off fireworks in a tiny alleyway. The fireworks sounded like gunfire. Prostitutes of all genders, races, shapes, sizes lining the street like a pl...",2.865443e-06
600883,"Im currently reading 21 :) Story based: * Gunnerkrieg Court - a young girl finds herself in a most unusual school. The story is fantastic and its great for re-reading, as the author tends to put small details here and there that you only notice after youve read a few chapters ahead. * Girl Gen...",3.800511e-06
610082,"Is that 1,000 USD before or after the flight? It depends how cheaply you can live. Scandinavia might be a stretch if you need a hotel and stuff. Especially if you use the fancy ones. Lonelyplanet.com recommends ~$120/day if you use dorm beds in Denmark. I think Sweden is marginally cheaper and...",8.902927e-06
608175,"The one I drink has Anise, the same thing that makes sambuca. It gives it a liquorice flavour, which I absolutely love. Its also flavored with hemp (although it doesnt include THC). Vodka is indeed a one-size-fit-all alcohol. Well, as long as you dont buy the absolute bottom cheap stuff. I main...",3.63465e-05
563189,"I dont smoke cigs, so tell me if my math is wrong. 5 cartons = 50 packs = 1,000 cigs. According to Wikipedia, this season will be 98 days long. That means you can smoke 10.2 per day and make them last the full season....So lets just say 10 per day to be safe (save some so that people can bum a...",4.499031e-05
596680,"All through life, until after I graduated high school, I was really really into anime and gaming. It was my life and, besides going to school, was the only thing I did. Then when I moved off to college, I lost a ton of interest in it all of a sudden. I dont watch anime anymore, I still have ...",5.49727e-05
612988,"I havent finished the Manga yet. The Anime has 2 endings. The first is that of the TV Series, and the other is the movie The End of Evangelion. On its own, I really dig the TV Ending, even if it suddenly goes for this abstract, artsy approach having to do with budget constraints. As a follow up ...",6.28537e-05
599380,"That depends. I mean, I love the War Ensign of the Kaiserliche Marine. The 1858-1917 Imperial Standard of Russia is also pretty cool. The flag of the Kingdom of Bhutan is pretty damn badass. I also have a pretty patriotic like of the Nordic Crosses, I just find them pleasant. The flag of the Pap...",7.19021e-05


##### Examples with female features (highest `score`, where `score` is the predicted probability that the author is not male)

In [20]:
# The 30 posts with the highest `score`.
balanced_reddit[['post_text', 'score']].sort_values('score').tail(30)

Unnamed: 0,post_text,score
613068,"Congrats! I wish my husband could see this. He is 32 and has been working for the last 10 years at his job, but hes kind of scared to move on, for the same reasons that you are sad. Myself, I didnt even know my last week was it at my last job! I worked at a campground, and usually, at the end o...",0.999816
565604,"No idea, I dont know whether he actually has a strategy or is just going by who Paulie was wanting out. Its weird because Tiff trusts Paul and Day is likely to put her sights on him soon because he is competing for the spot that she wants. There are so many times where there have been conversati...",0.999819
548410,"Kittens arent usually aggressive, they do tend to nibble a bit but after 3 to 5 months old they generally learn bite inhibition on their own. I highly suggest if you have a large place to leave them in one room or floor of the house for the first few days. They are however loud and hyper and tak...",0.999849
564155,"tiff walked in on Zakiyah and Paulie together in the HoH room and then bolted downstairs. Not sure if she just actually wanted to nap, whether shes worried about Zak being annoyed or whether she was taken off guard seeing them together.",0.999852
541439,"This may or may not matter to you, but Im compelled to point out that heterosexual couples do things that screwy and even screwier in order to have a child. AI, IVF, sperm donation, egg donation, centrifugal fractionation, embryo freezing, genetic testing, surrogacy, and so on. Its a new era a...",0.999875
606786,"I am new to CC, so I keep waiting for the assholes to show up and shit all over it. But they havent! Everyone is so nice. I hate that I am suspicious. I am in Australia. I work in a lab, but we have a lot of staff and everyone gets their own computer, so I often find myself just sitting around ...",0.999883
613069,"I love your outlook, and Im sure it will be awesome. :) That is basically the exact same spot my husband is stuck in, making only a couple bucks more than when he started. Hes actually bringing home less now than he did five years ago because his health plan has shot up in cost and now costs mo...",0.999884
564099,8:59 cam 3/4 - the girls continue to talk. They discuss that if someone (Tiffany or Bronte?) goes home then the two girls on the other side will be loose. Day says that she thinks Frank is closer to the boys. Day tells Nicole about the fishing she did with Frank about getting Tiff out and how sh...,0.999891
564572,"Lets look at the available people. First: Frank, Michelle, Bridgette and Paulie are safe Noms: Tiff/Paul That leaves Zakiyah, Day, Nicole, James, Corey, Bronte, Natalie Take out 8pack: Zakiyah, Day, Nicole, James, Corey That leaves Bronte or Natalie. Take out the one person other than Ti...",0.999893
564036,"Tiffany later said to Natalie, Bridgette and Bronte that shes been picking up some strange vibes and she feels like something is going on - because there IS something going on. Nobody is talking about potentially sending James out so of course hes just fine; whereas last year when there was a ch...",0.99991


### Facebook public figures

In [21]:
# Drop repeated post texts, then drop every row that has a missing value in any column.
facebook_public_figures = (
    facebook_public_figures
    .drop_duplicates(subset='post_text')
    .dropna()
)

# Model inputs: raw post text, and a boolean target that is True for male authors.
text = facebook_public_figures.post_text
genders = facebook_public_figures.op_gender == 'M'

In [22]:
# Fraction of the remaining Facebook posts written by male authors (the class prior).
male_fraction = genders.mean()
print(male_fraction)

0.3295730001864628


In [23]:
# Same feature pipeline as for Reddit: binary bag-of-words over the 10,000 most
# frequent tokens. This rebinds `vectorizer`, `X`, `y` and the split variables.
vectorizer = CountVectorizer(binary=True, max_features=10000)
X, y = vectorizer.fit_transform(text), genders

# 60/40 train/test split with a fixed seed so the split is reproducible.
facebook_split = train_test_split(X, y, test_size=0.4, random_state=0)
X_train, X_test, y_train, y_test = facebook_split

In [24]:
# Logistic regression with default settings on the Facebook training split.
# This rebinds `lr`, replacing the Reddit model.
lr = LogisticRegression()
lr = lr.fit(X_train, y_train)



In [25]:
# Held-out accuracy: share of test posts whose predicted label matches the true one.
test_predictions = lr.predict(X_test)
np.mean(test_predictions == y_test)

0.8627949888316986

In [26]:
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2. Prefer
# get_feature_names_out() and fall back only on older releases that lack it.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocabulary = vectorizer.get_feature_names_out()
else:
    vocabulary = vectorizer.get_feature_names()

# (coefficient, token) pairs sorted by ascending coefficient. The target is
# `op_gender == 'M'`, so the most negative weights point towards a non-male (female)
# author and the most positive towards a male author.
# Values are converted to plain float/str so the printed pairs look the same
# regardless of the NumPy version in use.
features = sorted(
    ((float(weight), str(token)) for weight, token in zip(lr.coef_[0], vocabulary)),
    key=lambda pair: pair[0],
)
print('Female:')
print(features[:20])
print('Male:')
print(features[-20:])

Female:
[(-4.437578465512176, 'wgn'), (-4.410792930024218, 'hellogiggles'), (-4.012383186467645, 'nbcla'), (-3.9598381333901562, 'kateesackhoff'), (-3.909514675092159, 'yhoo'), (-3.7928127380232923, 'hln'), (-3.6009365122833783, 'realtalkkim'), (-3.4736666897284914, 'xop'), (-3.264940053491449, 'tracilords'), (-3.199623206833977, 'chrissie'), (-3.167962636953035, 'maityontheroad'), (-3.1583347737522924, 'longmire'), (-3.15215374604655, 'janetleemusic'), (-3.106901883891946, 'ootd'), (-3.0784068666084536, 'janeunchained'), (-3.022004239894846, 'thegivingkeys'), (-2.9962739449519415, 'billie'), (-2.9772191611853107, 'keek'), (-2.975877124176065, 'katee'), (-2.9714641504295316, 'victoriajustice')]
Male:
[(3.0476846468063177, 'fresno'), (3.059502981333088, 'gopdebate'), (3.0622865620161965, 'vasurge'), (3.1115354792482863, 'maher'), (3.1681479239658312, 'ch5'), (3.1719940193200284, 'georgelopez'), (3.179546983666228, 'rickie'), (3.2662723184641025, 'mohr'), (3.284725982479939, 'jayni'), (3

In [27]:
# Class probabilities for every Facebook post (training and test rows alike).
# NOTE: from here on `y` holds probabilities, no longer the labels.
y = lr.predict_proba(X)

# Column 0 belongs to the first entry of lr.classes_, i.e. the False class of the
# boolean `op_gender == 'M'` target.
p_not_male = y[:, 0]
facebook_public_figures['score'] = p_not_male

##### Examples with male features (lowest `score`, where `score` is the predicted probability that the author is not male)

In [28]:
# The 30 posts with the lowest `score`.
facebook_public_figures[['post_text', 'score']].sort_values('score').head(30)

Unnamed: 0,post_text,score
134099,"My great-great-great-great-great-grandmother, Mary Morin Scott, who served as an army nurse and spy during the American Revolutionary War. Daughter of John Morin Scott and Helena Rutgers, she was born in a house off the ""Indian Path"" (now Broadway) at what is now West 43rd Street, on July 17, 17...",0.0
99102,"So What Did I Miss? Ha. Back after our wonderful family vacation in Northern Italy…Erica, Sol and I had a fabulous time…from seeing Da Vinci's Last Supper, visiting the World Expo in Milan, taking gondolas in Venice, a boat ride past George Clooney's house on breathtakingly beautiful Lake Cuomo...",1.554312e-15
6782,"By Michael Moore (@MMFlint) Bill Maher is a friend of mine. He stood up for me when I was attacked after my Oscar speech (given on the fourth night of the Iraq War, a war Bill publicly opposed while 70% of the country, including the majority of Democrats in the U.S. Senate, supported it), and I...",1.776357e-15
100340,"Geraldo Rivera: Diminished But Still Formidable By the time the president strode onto the big stage at the jammed packed Time Warner Cable Arena Thursday night in Charlotte North Carolina, the Democratic delegates, alternates, and VIP guests had been primed for delirium. Although distracted and...",4.662937e-15
134777,"My great-great-grandfather's personal launch in San Francisco Bay on April 19, 1906. The previous morning, shortly after 5 am, a deadly earthquake devastated San Francisco and the resulting fires consumed over three-quarters of the city. Thousands died, but through the frantic efforts of rescuer...",5.995204e-15
4243,"For more than twenty years, Bill Maher has set the boundaries of where funny, political talk can go on American television. First on ""Politically Incorrect"" (Comedy Central, ABC, 1993-2002), and for the last fourteen years on HBO's ""Real Time,"" Maher's combination of unflinching honesty and big...",1.421085e-14
98772,"Geraldo Of Arabia: Tora Bora To Trump by Geraldo Rivera | May 24, 2016 This is an open letter to former Pentagon spokesman Colonel David Lapan USMC (ret.); former Editor, Baltimore Sun William Marimow, and to former Sun television writer David Folkenflik. Subject: Urgent: Lost Tapes Found ...",1.44329e-14
99056,"Who won the debate? Vote now @ http://WABCradio.com Results on my 77WABC radio show at 10am-noon. My thoughts: 1- On Donald Trump, who was targeted by most on stage rivals; the billionaire businessman fought boredom and handled most attacks including Governor Jeb Bush's spirited defense of spea...",3.352874e-14
99372,Who runs our country's foreign policy? Prime Minister Benjamin Netanyahu's disrespectful decision to go behind President Obama's back and secretly accept a deceitful invitation from the Republican Speaker of the House John Boehner to address the Congress of the United States is an insult to al...,4.107825e-14
99024,"Say Goodbye to the Sixties Bernie Sanders reminds me of me 50 years ago when I didn't trust anyone over thirty and my friends and I were going to bring ""the man"" to heel. The world is unfair, the war sucks, the government is owned by Wall Street and income re-distribution, social change and forc...",3.017586e-13


##### Examples with female features (highest `score`, where `score` is the predicted probability that the author is not male)

In [29]:
# The 30 posts with the highest `score`.
facebook_public_figures[['post_text', 'score']].sort_values('score').tail(30)

Unnamed: 0,post_text,score
74810,"FELIZ, HONRADA y AGRADECIDA con la invitación a dar el discurso de Graduación este próximo Mayo en mi universidad @elonuniversity 📚🎓 Me llena de mucha emoción pensar que hace 10 años fue mi graduación. Recuerdo como si fuera ayer estar sentada entre mis compañeros, lista y ansiosa por cumplir ...",1.0
74727,"#Gracias a mi gran jefa, amiga y mentora @luzmadoria por tus palabras, tu cariño y tu apoyo incondicional. ❣️ #Repost @luzmadoria with @repostapp ・・・ Feliz cumpleaños @maityinteriano La ganadora del Emmy. La nominada a ser una de las Poderosas de People. La periodista. La soñadora. La que hoy l...",1.0
74687,"¡Gracias, Gracias y mas Gracias! Mi corazón esta en paz y lleno de mucha alegría y agradecimiento por todas sus muestras de cariño y apoyo. Hoy mas que nunca seguiré trabajando por #FundacionAmoryVida y dando a conocer las necesidades de estos jóvenes que son y serán mi familia. No quedamos en l...",1.0
119512,"Chers amis, Je voudrais vous inviter à vous joindre à la marche d'Idle No More ce vendredi à Ottawa. Idle No More est un mouvement citoyen autochtone nés des attaques du projet de loi C-45 de Stephen Harper et ses attaques contre nos voies navigables. Le mouvement s'oppose au manque de respect ...",1.0
84709,"#shelfie time! After a hard week of training and dieting, I feel my self tightening up for my trip #stateside next week! I have had one shoot this week, and am really looking forward to catching up with the rest of Team Optimum Nutrition in Chicago on Monday! Here is my Back & Tris workout from...",1.0
74704,Gracias a sus votos y apoyo incondicional estamos en la recta final de las #25Poderosas de People en Español que se anuncia este VIERNES! No dejemos de votar aqui: http://peopleenespanol.com/celebridades/25-mujeres-mas-poderosas-votacion-ultima-semana/ #Gracias por el interes y apoyo que han d...,1.0
119687,"Elizabeth May, Member of Parliament for Saanich-Gulf Islands and Leader of the Green Party of Canada, will be attending the ""Defend our Coast"" sit-in. B.C. Premier Christy Clark is talking about putting a price on the west coast. The view is that for enough money we should ignore the voices of...",1.0
74697,#GRACIAS @karlamonroig! Feliz de ver tanto apoyo por #FundacionAmoryVida. A seguir VOTANDO! #Repost @karlamonroig with @repostapp ・・・ Ya ella es poderosa pero queremos verla en la lista de @PeopleenEspañol Voten x @maityinteriano! Su labor merece ser reconocida y su causa escuchada y apoyada p...,1.0
127156,"Este joven trabaja en un restaurante cerca del parque Woodley en DC... Estaba cansado, sudado, lucia agotado y muy ocupado. Caminaba de prisa de una esquina del restaurante a otra. Las 2 horas que estuve en ese sitio lo ví limpiando mesas, barriendo, sirviendo agua, llevando y recogiendo plato...",1.0
75823,"✈️ Otro viaje...pero este es muy especial. Por primera vez, los 33 niños y jóvenes de #FundacionAmoryVida de #Honduras cumplirán su sueño de viajar a #EEUU y visitar #Disney. Gracias a Dios y el esfuerzo incansable de angeles en la tierra, estos jóvenes no solo visitaran un lugar soñado por much...",1.0


### Save

In [30]:
# Persist the sorted Reddit (coefficient, token) pairs and the scored Reddit posts.
# The `with` blocks close each file as soon as it is written, so the pickles are
# fully flushed to disk and no file handles are left open in the kernel.
with open('./reddit_features.pkl', 'wb') as features_file:
    pickle.dump(reddit_features, features_file)

with open('./balanced_reddit.pkl', 'wb') as posts_file:
    pickle.dump(balanced_reddit, posts_file)