In [137]:
#import all tools for project
import requests
import pandas as pd
from bs4 import BeautifulSoup
import json
import numpy as np
import datetime
import matplotlib.pyplot as plt
import re
from time import sleep
pd.set_option('display.max_rows', 50)

In [58]:
'''
Takes in a url to any specific doctor and returns all of their reviews as strs in a list
'''
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
response = requests.get('https://checkyourrecc.com/posts', headers=headers)

#use Beautiful Soup to prettify the content
soup = BeautifulSoup(response.content)
soup.prettify()
content = str(soup.findAll('p'))
data = content.split('}, {')
data

['[<p>[{"gender": -1, "content": "my name is REDACTED REDACTED and this is a test.", "id": 1',
 '"gender": -1, "content": "REDACTED is so REDACTEDoyinging", "id": 2',
 '"gender": -1, "content": "REDACTED is so REDACTEDoyinging", "id": 3',
 '"gender": -1, "content": "redacted is REDACTEDredactedoyinging", "id": 4',
 '"gender": -1, "content": "redacted is REDACTEDredactedoyinging", "id": 5',
 '"gender": -1, "content": "redacted is redactedredactedoyinging REDACTEDREDACTEDREDACTEDREDACTED", "id": 6',
 '"gender": -1, "content": "So basically for the rest of the msec values, keep taking the slope between the Q point and a different point ((1,2) works) and then you\\u2019ll see that the slope gets closer and closer to 2, so the slope at the tangent line is ~2", "id": 7',
 '"gender": -1, "content": "dear review committee members,\\r\\n\\r\\nit is with great pleasure that i very strongly recommend REDACTED for the johns hopkins university\\u2019s summer internship program.\\r\\n\\r\\nin summer

In [64]:
revs = {}
for entry in data:
    genderSearch = re.search(r'\-?\d+', entry)
    genderNum = int(genderSearch.group(0))
    
    letter = re.search(r'"content": "(.*)", "id"', entry)
    letterString = letter.group(0)[12:-7]
    revs[letterString] = genderNum

In [67]:
lettersByGender = pd.DataFrame(list(revs.items()),columns = ['Review','Gender'])
lettersByGender

Unnamed: 0,Review,Gender
0,my name is REDACTED REDACTED and this is a test.,-1
1,REDACTED is so REDACTEDoyinging,-1
2,redacted is REDACTEDredactedoyinging,-1
3,redacted is redactedredactedoyinging REDACTEDR...,-1
4,"So basically for the rest of the msec values, ...",-1
...,...,...
97,I am writing to highly recommend that you acce...,0
98,i am writing to highly recommend that you acce...,0
99,i am writing to highly recommend that you acce...,0
100,\r\ni am writing to highly recommend that you ...,0


In [121]:
letters = pd.read_csv("./letters_tidied.csv")
letters.dropna(inplace = True)
letters.columns = ['Letter', 'Gender']
letters

Unnamed: 0,Letter,Gender
0,"dear review committee members,\r\n\r\nit is wi...",0.0
1,this letter addresses some of my thoughts and ...,0.0
2,I am writing this letter to recommend REDACTED...,1.0
4,it is my sincere pleasure to nominate dr. REDA...,0.0
5,i am writing to strongly recommend REDACTED RE...,1.0
...,...,...
92,I am writing to highly recommend that you acce...,0.0
93,i am writing to highly recommend that you acce...,0.0
94,i am writing to highly recommend that you acce...,0.0
95,\r\ni am writing to highly recommend that you ...,0.0


### Ideas from meeting w Culotta:
    - fit function per instance weights
    - calculate statistics against prior research
    - calculate gender of recommendee with our current algorithm, show it works

In [122]:
letters['Gender'] = letters['Gender'].replace(0.0, 'Female')
letters['Gender'] = letters['Gender'].replace(1, 'Male')
letters

Unnamed: 0,Letter,Gender
0,"dear review committee members,\r\n\r\nit is wi...",Female
1,this letter addresses some of my thoughts and ...,Female
2,I am writing this letter to recommend REDACTED...,Male
4,it is my sincere pleasure to nominate dr. REDA...,Female
5,i am writing to strongly recommend REDACTED RE...,Male
...,...,...
92,I am writing to highly recommend that you acce...,Female
93,i am writing to highly recommend that you acce...,Female
94,i am writing to highly recommend that you acce...,Female
95,\r\ni am writing to highly recommend that you ...,Female


# Determining the Frequency of Gendered Language

In [123]:
# Vectorize the whole thing...
import sklearn
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split


# Vectorize and play with token sizes...
vec = TfidfVectorizer(min_df = 2, 
                      max_df = 0.98, 
                      ngram_range=(1,1)) # play with min_df and max_df

reviews = letters['Letter']

# transform this into a sparse vector!
vec.fit(reviews)
tf_idf_sparse = vec.transform(reviews)
tf_idf_sparse

#only 222 unique words due to min_df 0.03 --> change to 2, maybe change to 5 (could get meaningless words)

<89x2304 sparse matrix of type '<class 'numpy.float64'>'
	with 16415 stored elements in Compressed Sparse Row format>

In [124]:
revs = dict(zip(letters.Letter, letters.Gender))

In [125]:
#get list of cleaned, stemmed adjectives, for later analysis. 
vect = CountVectorizer()
vect.fit(revs)
words = vect.get_feature_names()

adjectives = []
stop_words = list(set(stopwords.words('english')))

# remove stopwords 
stopped = [w for w in words if not w in stop_words]
    
# parts of speech tagging for each word 
pos = nltk.pos_tag(stopped)

# make a list of  all adjectives identified by the allowed word types list above
for w in pos:
    if w[1][0] == 'J':
        adjectives.append(w[0].lower())

In [126]:
print(tf_idf_sparse[0, :])

  (0, 2286)	0.04930504095563128
  (0, 2278)	0.03407535496352742
  (0, 2275)	0.04449428958445482
  (0, 2272)	0.11673429844604141
  (0, 2268)	0.04711795373848037
  (0, 2266)	0.19239352898161854
  (0, 2263)	0.030644702744229105
  (0, 2253)	0.029876575987526573
  (0, 2244)	0.06977249906908796
  (0, 2229)	0.024369560370073148
  (0, 2209)	0.2196094538651588
  (0, 2208)	0.18566695596691343
  (0, 2202)	0.06687192394700639
  (0, 2191)	0.06016055251343332
  (0, 2190)	0.062143067873788795
  (0, 2180)	0.07317589484219926
  (0, 2164)	0.05017598996833564
  (0, 2159)	0.08028652435560235
  (0, 2109)	0.08281509784404091
  (0, 2107)	0.07740193026438712
  (0, 2099)	0.06435933148943418
  (0, 2084)	0.08281509784404091
  (0, 2078)	0.03455174730942665
  (0, 2067)	0.020114696426848364
  (0, 2064)	0.08281509784404091
  :	:
  (0, 463)	0.06687192394700639
  (0, 440)	0.04012643083849744
  (0, 394)	0.056729900294135
  (0, 389)	0.04619989397675426
  (0, 358)	0.06977249906908796
  (0, 350)	0.05131673271448122
  (0, 

In [127]:
# We can now use this to classify the reviews!! but we need to test/train split again.

# Split..
X_train, X_test, y_train, y_test = train_test_split(tf_idf_sparse, 
                                                    letters['Gender'], 
                                                    test_size=0.2)

In [128]:
logisticRegr = LogisticRegression(max_iter=100000, class_weight='balanced') 
model = logisticRegr.fit(X_train, y_train)



In [129]:
logisticRegr.classes_

array(['Female', 'Male'], dtype=object)

In [131]:
logisticRegr.coef_[:,0]

array([-0.04880409])

In [132]:
# Make a dataframe with the words, coefficients, and classes...
recs = []

for w,i in vec.vocabulary_.items():
    recs.append([str(w)] + list(logisticRegr.coef_[:,i]))
# If we only have one class then we only get weight..
# df_weights = pd.DataFrame(tripples, columns=['word']+list(logisticRegr.classes_))
df_weights = pd.DataFrame(recs, columns=['word', 'weight'])

In [138]:
df_weights.sort_values('weight', ascending=False)[:25]

Unnamed: 0,word,weight
261,he,1.770687
265,his,1.550729
708,mr,0.612067
289,him,0.599119
615,engineering,0.364294
1343,jackson,0.322904
769,american,0.284305
2046,josh,0.252226
768,latin,0.23188
5,is,0.227377


In [139]:
#print only adjectives
#male adjectives
maleAdjectives = df_weights[df_weights['word'].isin(adjectives)].sort_values('weight', ascending=False)[:25]
maleAdjectives

Unnamed: 0,word,weight
769,american,0.284305
1052,wonderful,0.128181
715,exceptional,0.114448
168,medical,0.112191
628,degree,0.103111
402,young,0.101088
640,extracurricular,0.096514
721,structural,0.093456
751,confident,0.084104
16,u2019s,0.083381


In [140]:
df_weights.sort_values('weight', ascending=True)[:25]

Unnamed: 0,word,weight
50,her,-1.298386
57,she,-1.040801
13,redacted,-0.404082
31,was,-0.271744
1064,ms,-0.269518
1233,architecture,-0.239739
18,internship,-0.22837
662,experience,-0.219877
672,academic,-0.217672
24,of,-0.205452


In [141]:
#female adjectives
femaleAdjectives = df_weights[df_weights['word'].isin(adjectives)].sort_values('weight', ascending=True)[:25]
#maybe take out adjective specification
#vocabulary being pruned down too aggressively
femaleAdjectives

Unnamed: 0,word,weight
18,internship,-0.22837
672,academic,-0.217672
1768,christine,-0.144539
971,current,-0.128965
935,deep,-0.113537
4,nit,-0.107937
2254,roman,-0.102546
48,first,-0.096663
1711,meaningful,-0.095936
1144,vertical,-0.095794
