In [71]:
!pip3 install lxml
#import all tools for project
import requests
import pandas as pd
from bs4 import BeautifulSoup
import json
import numpy as np
import datetime
import matplotlib.pyplot as plt
import re
from time import sleep



# Determining the Frequency of Gendered Language

### References
Scikit-learn: Machine Learning in Python, Pedregosa et al., JMLR 12, pp. 2825-2830, 2011.

Dr. Mattei, Tulane University. SimpleText Notebook. 2020. 
Bo Pang and Lillian Lee, A Sentimental Education: Sentiment Analysis Using Subjectivity Summarization Based on Minimum Cuts, Proceedings of ACL 2004.

In [74]:
#Import CSV containing mass amount of scraped reviews for analysis
reviewsByGender = pd.read_csv("./revs_gendered.csv")
reviewsByGender.dropna(inplace = True)
reviewsByGender.columns = ['Review', 'Gender']
reviewsByGender

Unnamed: 0,Review,Gender
0,"Exelent he really care about you, he is very p...",Male
1,Dr. A-Rahim was very knowledgeable about my lo...,Male
2,Horrible physician treats you like a kid. Pts ...,Female
3,Dr Aaberg has been a very good dr for my husba...,Male
4,Dr. Aaberg has been treating my macular degene...,Male
...,...,...
8889,Very helpful and sincere about my pain needs! ...,Male
8890,"Several of my children see Dr. Zach, she is a ...",Female
8891,Amazing Pediatric Neurologist stays up to date...,Female
8892,Dr. Zach has been a Godsend for our daughter! ...,Female


In [75]:
# Vectorize the whole thing...
import sklearn
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split


# Vectorize and play with token sizes...
vec = TfidfVectorizer(min_df = 2, 
                      max_df = 0.98, 
                      ngram_range=(1,1)) # play with min_df and max_df

reviews = reviewsByGender['Review']

# transform this into a sparse vector!
vec.fit(reviews)
tf_idf_sparse = vec.transform(reviews)
tf_idf_sparse

#only 222 unique words due to min_df 0.03 --> change to 2, maybe change to 5 (could get meaningless words)

<8894x7365 sparse matrix of type '<class 'numpy.float64'>'
	with 322901 stored elements in Compressed Sparse Row format>

In [76]:
revs = dict(zip(reviewsByGender.Review, reviewsByGender.Gender))

In [77]:
#get list of cleaned, stemmed adjectives, for later analysis. 
vect = CountVectorizer()
vect.fit(revs)
words = vect.get_feature_names()

adjectives = []
stop_words = list(set(stopwords.words('english')))

# remove stopwords 
stopped = [w for w in words if not w in stop_words]
    
# parts of speech tagging for each word 
pos = nltk.pos_tag(stopped)

# make a list of  all adjectives identified by the allowed word types list above
for w in pos:
    if w[1][0] == 'J':
        adjectives.append(w[0].lower())

In [78]:
print(tf_idf_sparse[0, :])

  (0, 7326)	0.18151009711676813
  (0, 7004)	0.14519230383170104
  (0, 6487)	0.3553485932395106
  (0, 5284)	0.2785536271904824
  (0, 5073)	0.23861251000309455
  (0, 3489)	0.12256067027657984
  (0, 2968)	0.26339587134769354
  (0, 2375)	0.6439667942787456
  (0, 2052)	0.10518473469832991
  (0, 1396)	0.2868414706993512
  (0, 1126)	0.20412772247184602
  (0, 466)	0.0871391073049557
  (0, 192)	0.20851297268798946


In [79]:
# We can now use this to classify the reviews!! but we need to test/train split again.

# Split..
X_train, X_test, y_train, y_test = train_test_split(tf_idf_sparse, 
                                                    reviewsByGender['Gender'], 
                                                    test_size=0.2)

In [80]:
logisticRegr = LogisticRegression(max_iter=100000, class_weight='balanced') 
model = logisticRegr.fit(X_train, y_train)



In [81]:
logisticRegr.classes_

array(['Female', 'Male'], dtype=object)

In [82]:
logisticRegr.coef_[:,0]

array([0.21094106])

In [83]:
# Make a dataframe with the words, coefficients, and classes...
recs = []

for w,i in vec.vocabulary_.items():
    recs.append([str(w)] + list(logisticRegr.coef_[:,i]))
# If we only have one class then we only get weight..
# df_weights = pd.DataFrame(tripples, columns=['word']+list(logisticRegr.classes_))
df_weights = pd.DataFrame(recs, columns=['word', 'weight'])

In [84]:
df_weights.sort_values('weight', ascending=False)[:25]

Unnamed: 0,word,weight
1,he,13.413569
85,his,8.740351
84,him,7.26163
328,surgery,2.921846
920,man,1.739105
86,outstanding,1.445992
546,knee,1.440757
87,staff,1.425463
13,was,1.407302
222,surgeon,1.263276


In [85]:
#print only adjectives
#male adjectives
maleAdjectives = df_weights[df_weights['word'].isin(adjectives)].sort_values('weight', ascending=False)[:25]
maleAdjectives

Unnamed: 0,word,weight
86,outstanding,1.445992
1111,arrogant,1.055663
66,good,1.042893
20,great,1.005547
4753,faber,0.963013
944,positive,0.926504
2291,kindness,0.791062
561,efficient,0.757025
37,er,0.663317
364,surgical,0.636397


In [86]:
df_weights.sort_values('weight', ascending=True)[:25]

Unnamed: 0,word,weight
43,she,-13.800402
396,her,-11.209317
4699,ma,-3.015258
5981,kabbash,-1.778111
588,woman,-1.689117
7309,yablonski,-1.351773
6774,qadir,-1.339115
7007,saadai,-1.30026
4072,amato,-1.271246
4442,eady,-1.269676


In [87]:
#female adjectives
femaleAdjectives = df_weights[df_weights['word'].isin(adjectives)].sort_values('weight', ascending=True)[:25]
#maybe take out adjective specification
#vocabulary being pruned down too aggressively
femaleAdjectives

Unnamed: 0,word,weight
4442,eady,-1.269676
1452,sweet,-1.090542
2633,pediatrician,-1.07191
691,primary,-1.048746
699,smart,-0.917067
14,knowledgeable,-0.903161
303,unprofessional,-0.897514
3512,cabalona,-0.820959
999,ok,-0.789854
712,happy,-0.773196


In [88]:
#verify results
agentic_terms = ['assertive','confident','aggressive','ambitious','dominan','forceful','independent','daring','outspoken','intellectua','earn','gain','know','insight','think']
communal_terms = ['sympathetic','kind','help','affection','sensitive','nurtur','agreeab','tactful','interpersonal','warm','car','tactful']
socio_communal = ['husband','wife','kid','babies','brother','child','colleague','family']

print('Results for most commonly found female adjectives:')
print('Agentic: ', femaleAdjectives['word'].isin(agentic_terms).sum())
print('Communal: ', femaleAdjectives['word'].isin(communal_terms).sum())
print('Socio-communal: ', femaleAdjectives['word'].isin(socio_communal).sum())
print('')
print('Results for most commonly found male adjectives:')
print('Agentic: ', maleAdjectives['word'].isin(agentic_terms).sum())
print('Communal: ', maleAdjectives['word'].isin(communal_terms).sum())
print('Socio-communal: ', maleAdjectives['word'].isin(socio_communal).sum())

Results for most commonly found female adjectives:
Agentic:  0
Communal:  0
Socio-communal:  0

Results for most commonly found male adjectives:
Agentic:  0
Communal:  0
Socio-communal:  0


In [89]:
#can use medical reviews to run with recommnedation letter reviews to predict which is which, then throw out 
#heavily medical terms

# recommendation system
# maybe seek out positive words that don't distinguish between male and female
# 4-class problem rather than two-class problem: take in star data and add pos/neg dimension to the data
# word vector stuff from NLP class: use language models and figure out how often it appears in male context vs
# female context. Ex: look at similarities between words: compare "leadership" to "man/men" 

#doubt raisers may not come out from this analysis: "she may not be the best in her class..."
# subtle, hard to catch -> maybe the word "not"
# append "not" to nearest adjective, negated adjectives might indicate doubt raisers
# hedging terms to weaken a statement (weasel words): refusing to take a stand/hesitant
# "somewhat", "mostly"

# how to rephrase it: female-correlated to neutral, remove hedging terms
# take an education based approach: "this can be conveyed as ___. If that is not your intention, consider ___"
# crowdsourcing: emphasize anonymization

## How well does our healthgrades data predict the gender of our reference letter subjects?

In [90]:
letters = pd.read_csv("./letters_tidied.csv")
letters.dropna(inplace = True)
letters.columns = ['Letter', 'Gender']


# testing data:
letters

Unnamed: 0,Letter,Gender
0,"dear review committee members,\r\n\r\nit is wi...",0.0
1,this letter addresses some of my thoughts and ...,0.0
2,I am writing this letter to recommend REDACTED...,1.0
4,it is my sincere pleasure to nominate dr. REDA...,0.0
5,i am writing to strongly recommend REDACTED RE...,1.0
...,...,...
92,I am writing to highly recommend that you acce...,0.0
93,i am writing to highly recommend that you acce...,0.0
94,i am writing to highly recommend that you acce...,0.0
95,\r\ni am writing to highly recommend that you ...,0.0


In [91]:
# Training data:

reviewsByGender['Gender'] = reviewsByGender['Gender'].replace('Female', 0.0)
reviewsByGender['Gender'] = reviewsByGender['Gender'].replace('Male', 1.0)
reviewsByGender.dropna(inplace = True)
reviewsByGender


Unnamed: 0,Review,Gender
0,"Exelent he really care about you, he is very p...",1.0
1,Dr. A-Rahim was very knowledgeable about my lo...,1.0
2,Horrible physician treats you like a kid. Pts ...,0.0
3,Dr Aaberg has been a very good dr for my husba...,1.0
4,Dr. Aaberg has been treating my macular degene...,1.0
...,...,...
8889,Very helpful and sincere about my pain needs! ...,1.0
8890,"Several of my children see Dr. Zach, she is a ...",0.0
8891,Amazing Pediatric Neurologist stays up to date...,0.0
8892,Dr. Zach has been a Godsend for our daughter! ...,0.0


In [101]:
x_train = reviewsByGender['Review']
y_train = reviewsByGender['Gender']
x_new = letters['Letter']

In [102]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import classification_report
from sklearn import metrics

vec = CountVectorizer(max_df=100, min_df=2, binary=False, strip_accents = None)
X_train = vec.fit_transform(x_train)
X_test = vec.transform(x_new)

In [103]:
nb = BernoulliNB()
nb.fit(X_train, y_train)
gender_pred = nb.predict(X_test)
print(classification_report(letters['Gender'], gender_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.69      0.97      0.81        61
           1       0.50      0.07      0.12        28

    accuracy                           0.69        89
   macro avg       0.60      0.52      0.47        89
weighted avg       0.63      0.69      0.59        89



In [104]:
print("Accuracy:",metrics.accuracy_score(letters['Gender'], gender_pred))

Accuracy: 0.6853932584269663


In [105]:
lr = LogisticRegression(C=1e25, class_weight="balanced")
lr.fit(X_train, y_train)
gender_pred = lr.predict(X_test)
print(classification_report(letters['Gender'], gender_pred, labels=[0,1]))



              precision    recall  f1-score   support

           0       0.67      0.13      0.22        61
           1       0.31      0.86      0.46        28

    accuracy                           0.36        89
   macro avg       0.49      0.49      0.34        89
weighted avg       0.55      0.36      0.29        89



In [106]:
print("Accuracy:",metrics.accuracy_score(letters['Gender'], gender_pred))

Accuracy: 0.3595505617977528


### This means: Logistic regression predicts accurate gender of letters at 46%
### Naive Bayes predicts accurate gender at 69%

Super small sample size though, so results are not definite

In [121]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=2)
knn.fit(X_train, y_train)
gender_pred = knn.predict(X_test)
print(classification_report(letters['Gender'], gender_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.78      0.46      0.58        61
           1       0.38      0.71      0.49        28

    accuracy                           0.54        89
   macro avg       0.58      0.59      0.54        89
weighted avg       0.65      0.54      0.55        89



In [122]:
print("Accuracy:",metrics.accuracy_score(letters['Gender'], gender_pred))

Accuracy: 0.5393258426966292


In [123]:
from sklearn import svm
sv = svm.SVC(kernel='linear')
sv.fit(X_train, y_train)
gender_pred = sv.predict(X_test)
print(classification_report(letters['Gender'], gender_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.81      0.48      0.60        61
           1       0.40      0.75      0.52        28

    accuracy                           0.56        89
   macro avg       0.60      0.61      0.56        89
weighted avg       0.68      0.56      0.57        89



In [124]:
print("Accuracy:",metrics.accuracy_score(letters['Gender'], gender_pred))

Accuracy: 0.5617977528089888
