In [2]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import json
import numpy as np
import datetime
import matplotlib.pyplot as plt
import re
from time import sleep

## - - - - - - - - > Import and Combine Datasets:

In [27]:
# import HealthGrades Data
hg = pd.read_csv("./revs_gendered.csv")
hg.dropna(inplace = True)
hg.columns = ['Review', 'Gender']
hg

Unnamed: 0,Review,Gender
0,"Exelent he really care about you, he is very p...",Male
1,Dr. A-Rahim was very knowledgeable about my lo...,Male
2,Horrible physician treats you like a kid. Pts ...,Female
3,Dr Aaberg has been a very good dr for my husba...,Male
4,Dr. Aaberg has been treating my macular degene...,Male
...,...,...
9133,Very helpful and sincere about my pain needs! ...,Male
9134,"Several of my children see Dr. Zach, she is a ...",Female
9135,Amazing Pediatric Neurologist stays up to date...,Female
9136,Dr. Zach has been a Godsend for our daughter! ...,Female


In [4]:
# import crowdsourced letters of recommendation

letters = pd.read_csv("./letters_tidied.csv")
letters.dropna(inplace = True)
letters.columns = ['Review', 'Gender']
letters['Gender'] = letters['Gender'].replace(0.0, 'Female')
letters['Gender'] = letters['Gender'].replace(1, 'Male')
letters

Unnamed: 0,Review,Gender
0,"dear review committee members,\r\n\r\nit is wi...",Female
1,this letter addresses some of my thoughts and ...,Female
2,I am writing this letter to recommend REDACTED...,Male
4,it is my sincere pleasure to nominate dr. REDA...,Female
5,i am writing to strongly recommend REDACTED RE...,Male
...,...,...
92,I am writing to highly recommend that you acce...,Female
93,i am writing to highly recommend that you acce...,Female
94,i am writing to highly recommend that you acce...,Female
95,\r\ni am writing to highly recommend that you ...,Female


In [5]:
# concatenate both dataframes

allData = pd.concat([hg, letters])
allData

Unnamed: 0,Review,Gender
0,"Exelent he really care about you, he is very p...",Male
1,Dr. A-Rahim was very knowledgeable about my lo...,Male
2,Horrible physician treats you like a kid. Pts ...,Female
3,Dr Aaberg has been a very good dr for my husba...,Male
4,Dr. Aaberg has been treating my macular degene...,Male
...,...,...
92,I am writing to highly recommend that you acce...,Female
93,i am writing to highly recommend that you acce...,Female
94,i am writing to highly recommend that you acce...,Female
95,\r\ni am writing to highly recommend that you ...,Female


## Support Vector Machine

In [12]:
import sklearn
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn import svm

In [13]:
# vectorize
vec = TfidfVectorizer(min_df = 2, max_df = 0.98, ngram_range=(1,1)) 
reviews = allData['Review']

# transform into a sparse vector
vec.fit(reviews)
tf_idf_sparse = vec.transform(reviews)

#create dictionary of reviews
revs = dict(zip(allData.Review, allData.Gender))
revs

{'Exelent he really care about you, he is very professional,  and , compassionate.  Thanks Dr A ‘ dbojeri.': 'Male',
 'Dr. A-Rahim was very knowledgeable about my long term condition and spent a great deal of time explaining my treatment plan to me and my husband.': 'Male',
 'Horrible physician treats you like a kid. Pts c/o in er is never addressed by this md , she dances around anything you come in for. You will be told call your internal medicine dr because nothing is wrong, horrible bedside manners and argues with basic pt care ': 'Female',
 'Dr Aaberg has been a very good dr for my husband  Very kind and answers all your questions ': 'Male',
 'Dr. Aaberg has been treating my macular degeneration for five years. I could not ask for a better, smarter, professional doctor. I highly recommend him and his outstanding staff.  They all care about me as a person and as a patient.': 'Male',
 'I am privileged to have Dr Aaberg in charge of my eye care.': 'Male',
 'I have been seeing Dr. Aab

In [14]:
# create list of cleaned, stemmed adjectives, for later analysis. 

vect = CountVectorizer()
vect.fit(revs)
words = vect.get_feature_names()

adjectives = []
stop_words = list(set(stopwords.words('english')))

# remove stopwords 
stopped = [w for w in words if not w in stop_words]
    
# parts of speech tagging for each word 
pos = nltk.pos_tag(stopped)

# make a list of  all adjectives identified by the allowed word types list above
for w in pos:
    if w[1][0] == 'J':
        adjectives.append(w[0].lower())

In [19]:
print(tf_idf_sparse[0, :])

  (0, 8308)	0.18087671068924527
  (0, 7963)	0.14515562205290422
  (0, 7385)	0.35561253933942405
  (0, 6038)	0.27852469718081846
  (0, 5793)	0.2373437823422639
  (0, 3983)	0.12189971460612926
  (0, 3374)	0.2636511185535702
  (0, 2703)	0.6446316283629488
  (0, 2326)	0.10587678031682882
  (0, 1571)	0.28720820096031574
  (0, 1248)	0.20441648315214442
  (0, 519)	0.0869707345247959
  (0, 197)	0.2070634329049377


In [20]:
# split dtata into test and training sets
X_train, X_test, y_train, y_test = train_test_split(tf_idf_sparse, allData['Gender'], test_size=0.2)

In [21]:
# use SVM to test

sv = svm.SVC(kernel='linear')
model = sv.fit(X_train, y_train)

In [24]:
#create dataframe of data
recs = []

for w,i in vec.vocabulary_.items():
    recs.append([str(w)] + list(sv.coef_[:,i]))

df_weights = pd.DataFrame(recs, columns=['word', 'weight'])

In [25]:
# print most gendered male words in set of preferred parts of speech
maleAdjectives = df_weights[df_weights['word'].isin(adjectives)].sort_values('weight', ascending=False)[:30]
maleAdjectives

Unnamed: 0,word,weight
2303,kindness,"(0, 0)\t0.9898557727443765"
1114,arrogant,"(0, 0)\t0.9235297567142859"
4358,amy,"(0, 0)\t0.8584852715001047"
2328,suffered,"(0, 0)\t0.6821493150463475"
7041,smartest,"(0, 0)\t0.6709904308656427"
87,outstanding,"(0, 0)\t0.6708220405966364"
2448,thoughtful,"(0, 0)\t0.6643555195069476"
67,good,"(0, 0)\t0.6562346550203494"
7525,udall,"(0, 0)\t0.6394898328500922"
2252,uti,"(0, 0)\t0.6317971723593443"


In [26]:
# print most gendered female words in set of preferred parts of speech
femaleAdjectives = df_weights[df_weights['word'].isin(adjectives)].sort_values('weight', ascending=True)[:50]
femaleAdjectives

Unnamed: 0,word,weight
4510,eady,"(0, 0)\t-1.9459689281183565"
2649,pediatrician,"(0, 0)\t-1.35320067997141"
3549,cabalona,"(0, 0)\t-1.0794953330678463"
304,unprofessional,"(0, 0)\t-1.0555093671171656"
1456,sweet,"(0, 0)\t-0.9243701689912094"
2361,unhelpful,"(0, 0)\t-0.9147937923273024"
3149,angel,"(0, 0)\t-0.9125759428419388"
1487,current,"(0, 0)\t-0.8586584385192475"
2708,nasty,"(0, 0)\t-0.8492948856767133"
2008,warm,"(0, 0)\t-0.843669932247232"
