In [1]:
#import all tools for project
import requests
import pandas as pd
from bs4 import BeautifulSoup
import json
import numpy as np
import datetime
import matplotlib.pyplot as plt
import re
from time import sleep

# Determining the Frequency of Gendered Language

### References
Scikit-learn: Machine Learning in Python, Pedregosa et al., JMLR 12, pp. 2825-2830, 2011.

Dr. Mattei, Tulane University. SimpleText Notebook. 2020. 
Bo Pang and Lillian Lee, A Sentimental Education: Sentiment Analysis Using Subjectivity Summarization Based on Minimum Cuts, Proceedings of ACL 2004.

## - - - - - - - - > Datasets:

In [2]:
#Import CSV containing mass amount of scraped reviews for analysis
hg = pd.read_csv("./revs_gendered.csv")
hg.dropna(inplace = True)
hg.columns = ['Review', 'Gender']
hg

Unnamed: 0,Review,Gender
0,"Exelent he really care about you, he is very p...",Male
1,Dr. A-Rahim was very knowledgeable about my lo...,Male
2,Horrible physician treats you like a kid. Pts ...,Female
3,Dr Aaberg has been a very good dr for my husba...,Male
4,Dr. Aaberg has been treating my macular degene...,Male
...,...,...
8889,Very helpful and sincere about my pain needs! ...,Male
8890,"Several of my children see Dr. Zach, she is a ...",Female
8891,Amazing Pediatric Neurologist stays up to date...,Female
8892,Dr. Zach has been a Godsend for our daughter! ...,Female


In [3]:
letters = pd.read_csv("./letters_tidied.csv")
letters.dropna(inplace = True)
letters.columns = ['Review', 'Gender']
letters['Gender'] = letters['Gender'].replace(0.0, 'Female')
letters['Gender'] = letters['Gender'].replace(1, 'Male')
letters

Unnamed: 0,Review,Gender
0,"dear review committee members,\r\n\r\nit is wi...",Female
1,this letter addresses some of my thoughts and ...,Female
2,I am writing this letter to recommend REDACTED...,Male
4,it is my sincere pleasure to nominate dr. REDA...,Female
5,i am writing to strongly recommend REDACTED RE...,Male
...,...,...
92,I am writing to highly recommend that you acce...,Female
93,i am writing to highly recommend that you acce...,Female
94,i am writing to highly recommend that you acce...,Female
95,\r\ni am writing to highly recommend that you ...,Female


In [4]:
# concatenate two dataframes

allData = pd.concat([hg, letters])
allData

Unnamed: 0,Review,Gender
0,"Exelent he really care about you, he is very p...",Male
1,Dr. A-Rahim was very knowledgeable about my lo...,Male
2,Horrible physician treats you like a kid. Pts ...,Female
3,Dr Aaberg has been a very good dr for my husba...,Male
4,Dr. Aaberg has been treating my macular degene...,Male
...,...,...
92,I am writing to highly recommend that you acce...,Female
93,i am writing to highly recommend that you acce...,Female
94,i am writing to highly recommend that you acce...,Female
95,\r\ni am writing to highly recommend that you ...,Female


## - - - - - - - - > Logistic Regression

In [14]:
import sklearn
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn import svm

In [15]:
# Vectorize
vec = TfidfVectorizer(min_df = 2, max_df = 0.98, ngram_range=(1,1)) 

reviews = allData['Review']

# transform this into a sparse vector!
vec.fit(reviews)
tf_idf_sparse = vec.transform(reviews)
tf_idf_sparse

<8983x8350 sparse matrix of type '<class 'numpy.float64'>'
	with 341059 stored elements in Compressed Sparse Row format>

In [16]:
revs = dict(zip(allData.Review, allData.Gender))

##  - - - - - - > create list of adjectives 

In [17]:
# create list of cleaned, stemmed adjectives, for later analysis. 

vect = CountVectorizer()
vect.fit(revs)
words = vect.get_feature_names()

adjectives = []
stop_words = list(set(stopwords.words('english')))

# remove stopwords 
stopped = [w for w in words if not w in stop_words]
    
# parts of speech tagging for each word 
pos = nltk.pos_tag(stopped)

# make a list of  all adjectives identified by the allowed word types list above
for w in pos:
    if w[1][0] == 'J':
        adjectives.append(w[0].lower())

## - - - - - - - 

In [18]:
print(tf_idf_sparse[0, :])

  (0, 8308)	0.18087671068924527
  (0, 7963)	0.14515562205290422
  (0, 7385)	0.35561253933942405
  (0, 6038)	0.27852469718081846
  (0, 5793)	0.2373437823422639
  (0, 3983)	0.12189971460612926
  (0, 3374)	0.2636511185535702
  (0, 2703)	0.6446316283629488
  (0, 2326)	0.10587678031682882
  (0, 1571)	0.28720820096031574
  (0, 1248)	0.20441648315214442
  (0, 519)	0.0869707345247959
  (0, 197)	0.2070634329049377


In [19]:
# We can now use this to classify the reviews!! but we need to test/train split again.

# Split..
X_train, X_test, y_train, y_test = train_test_split(tf_idf_sparse, 
                                                    allData['Gender'], 
                                                    test_size=0.2)

In [20]:
sv = svm.SVC(kernel='linear')
model = sv.fit(X_train, y_train)

In [21]:
sv.classes_

array(['Female', 'Male'], dtype=object)

In [22]:
sv.coef_[:,0]

<1x1 sparse matrix of type '<class 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [23]:
# Make a dataframe with the words, coefficients, and classes...
recs = []

for w,i in vec.vocabulary_.items():
    recs.append([str(w)] + list(sv.coef_[:,i]))

df_weights = pd.DataFrame(recs, columns=['word', 'weight'])

In [24]:
df_weights.sort_values('weight', ascending=False)[:25]

Unnamed: 0,word,weight
1,he,"(0, 0)\t7.298275262830276"
86,his,"(0, 0)\t4.678069508348017"
85,him,"(0, 0)\t4.069097372992144"
1362,mother,"(0, 0)\t1.9193244517777643"
329,surgery,"(0, 0)\t1.4202533090375775"
922,man,"(0, 0)\t1.281093323011754"
7669,wacks,"(0, 0)\t1.1848268725834115"
55,because,"(0, 0)\t1.0928887301126506"
852,mom,"(0, 0)\t1.07080707929212"
50,told,"(0, 0)\t1.0653623676786523"


In [25]:
#print only adjectives
#male adjectives
maleAdjectives = df_weights[df_weights['word'].isin(adjectives)].sort_values('weight', ascending=False)[:25]
maleAdjectives

Unnamed: 0,word,weight
2303,kindness,"(0, 0)\t1.0239926279636076"
1939,referral,"(0, 0)\t0.8595801649237742"
4358,amy,"(0, 0)\t0.8584852715001047"
4838,faber,"(0, 0)\t0.8249978492074967"
2328,suffered,"(0, 0)\t0.8178006074856657"
878,ignorant,"(0, 0)\t0.8123280358245024"
7525,udall,"(0, 0)\t0.7855125788588441"
3283,rudest,"(0, 0)\t0.7742862513320286"
1114,arrogant,"(0, 0)\t0.7588759576371599"
87,outstanding,"(0, 0)\t0.7099488062041794"


In [26]:
df_weights.sort_values('weight', ascending=True)[:25]

Unnamed: 0,word,weight
43,she,"(0, 0)\t-9.221319999052016"
397,her,"(0, 0)\t-8.055058620612881"
4510,eady,"(0, 0)\t-2.1573284226896194"
6156,kabbash,"(0, 0)\t-2.0405060982070706"
6970,pabolu,"(0, 0)\t-1.9924900484010042"
4782,ma,"(0, 0)\t-1.9696478619719204"
7812,redacted,"(0, 0)\t-1.896617039050697"
6380,lafaver,"(0, 0)\t-1.6871755183669923"
6740,nabors,"(0, 0)\t-1.640660702945808"
5355,cheryl,"(0, 0)\t-1.4985540426227937"


In [27]:
#female adjectives
femaleAdjectives = df_weights[df_weights['word'].isin(adjectives)].sort_values('weight', ascending=True)[:25]
#maybe take out adjective specification
#vocabulary being pruned down too aggressively
femaleAdjectives

Unnamed: 0,word,weight
4510,eady,"(0, 0)\t-2.1573284226896194"
3549,cabalona,"(0, 0)\t-1.3342415977348798"
304,unprofessional,"(0, 0)\t-1.3218169756147122"
2649,pediatrician,"(0, 0)\t-1.18272704935407"
2361,unhelpful,"(0, 0)\t-1.164047336978915"
3983,spoken,"(0, 0)\t-0.9739136727442044"
5596,gal,"(0, 0)\t-0.9461638396759378"
3152,insightful,"(0, 0)\t-0.9343649435150208"
4174,ambrosia,"(0, 0)\t-0.8741335326178087"
693,primary,"(0, 0)\t-0.7826765184060172"
