In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import json
import numpy as np
import datetime
import matplotlib.pyplot as plt
import re
from time import sleep

# Importing Testing and Training Data

In [7]:
# Load the HealthGrades reviews: one review text plus a gender label per row.
reviewsByGender = pd.read_csv("./revs_gendered.csv")
reviewsByGender.columns = ['Review', 'Gender']
# Discard rows with a missing review or label (done in place on the loaded frame).
reviewsByGender.dropna(inplace = True)

# training data:
reviewsByGender

Unnamed: 0,Review,Gender
0,"Exelent he really care about you, he is very p...",Male
1,Dr. A-Rahim was very knowledgeable about my lo...,Male
2,Horrible physician treats you like a kid. Pts ...,Female
3,Dr Aaberg has been a very good dr for my husba...,Male
4,Dr. Aaberg has been treating my macular degene...,Male
...,...,...
8889,Very helpful and sincere about my pain needs! ...,Male
8890,"Several of my children see Dr. Zach, she is a ...",Female
8891,Amazing Pediatric Neurologist stays up to date...,Female
8892,Dr. Zach has been a Godsend for our daughter! ...,Female


In [8]:
# Load the letters data: one letter text plus a numeric gender code per row.
letters = pd.read_csv("./letters_tidied.csv")
letters.columns = ['Letter', 'Gender']
# Discard rows with a missing letter or label before decoding the labels.
letters.dropna(inplace = True)

# Translate the numeric codes into the 'Female' / 'Male' string labels that the
# reviews data uses, so both datasets share one label vocabulary.
letters['Gender'] = letters['Gender'].replace({0.0: 'Female', 1.0: 'Male'})

# testing data:
letters

Unnamed: 0,Letter,Gender
0,"dear review committee members,\r\n\r\nit is wi...",Female
1,this letter addresses some of my thoughts and ...,Female
2,I am writing this letter to recommend REDACTED...,Male
4,it is my sincere pleasure to nominate dr. REDA...,Female
5,i am writing to strongly recommend REDACTED RE...,Male
...,...,...
92,I am writing to highly recommend that you acce...,Female
93,i am writing to highly recommend that you acce...,Female
94,i am writing to highly recommend that you acce...,Female
95,\r\ni am writing to highly recommend that you ...,Female


In [35]:
import sklearn
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [36]:
# TF-IDF features over the review text (unigrams only).
reviews = reviewsByGender['Review']
vec = TfidfVectorizer(ngram_range=(1,1), min_df = 2, max_df = 0.98)

# Learn the vocabulary and build the sparse document-term matrix in a single
# pass; this yields the same matrix as calling fit() and then transform().
tf_idf_sparse = vec.fit_transform(reviews)
tf_idf_sparse

<8894x7365 sparse matrix of type '<class 'numpy.float64'>'
	with 322901 stored elements in Compressed Sparse Row format>

## How well does our HealthGrades data predict the gender of our reference letter subjects?
We tested four different machine learning techniques: Logistic Regression, Naive Bayes, Support Vector Machine, and K-Nearest Neighbors.

In [40]:
# import machine learning tools
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn import metrics
from sklearn.metrics import classification_report

In [41]:
# Training texts and labels come from the reviews; the letters are the new,
# unseen texts the fitted models are evaluated on.
x_train, y_train = reviewsByGender['Review'], reviewsByGender['Gender']
x_new = letters['Letter']

# Raw term counts. The vocabulary is learned from the reviews only and then
# reused unchanged to encode the letters.
# NOTE(review): an integer max_df is an absolute document count in scikit-learn,
# so terms occurring in more than 100 reviews are discarded -- confirm this is
# intended rather than a proportion such as 1.0.
vec = CountVectorizer(
    min_df=2,
    max_df=100,
    binary=False,
    strip_accents=None,
)
X_train = vec.fit_transform(x_train)
X_test = vec.transform(x_new)

## Tests
# ---------------------------------------------------------------------------

In [42]:
# Naive Bayes: fit on the review counts, then score predictions on the letters.
nb = BernoulliNB().fit(X_train, y_train)
gender_pred = nb.predict(X_test)

# True labels of the letters, used by both the report and the accuracy.
gender_true = letters['Gender']
print(classification_report(gender_true, gender_pred, labels=['Female', 'Male']))
print("Naive Bayes Accuracy:", metrics.accuracy_score(gender_true, gender_pred))

              precision    recall  f1-score   support

      Female       0.69      0.97      0.81        61
        Male       0.50      0.07      0.12        28

    accuracy                           0.69        89
   macro avg       0.60      0.52      0.47        89
weighted avg       0.63      0.69      0.59        89

Naive Bayes Accuracy: 0.6853932584269663


In [43]:
# Logistic Regression: fit on the review counts, then score predictions on the letters.
# NOTE(review): C=1e25 makes the regularization penalty negligible -- confirm an
# effectively unregularized fit is intended. class_weight="balanced" reweights
# the classes by their frequency in y_train.
lr = LogisticRegression(class_weight="balanced", C=1e25).fit(X_train, y_train)
gender_pred = lr.predict(X_test)

# True labels of the letters, used by both the report and the accuracy.
gender_true = letters['Gender']
print(classification_report(gender_true, gender_pred, labels=['Female', 'Male']))
print("Logistic Regression Accuracy:", metrics.accuracy_score(gender_true, gender_pred))



              precision    recall  f1-score   support

      Female       0.67      0.13      0.22        61
        Male       0.31      0.86      0.46        28

    accuracy                           0.36        89
   macro avg       0.49      0.49      0.34        89
weighted avg       0.55      0.36      0.29        89

Logistic Regression Accuracy: 0.3595505617977528


In [44]:
# K-Nearest Neighbors: fit on the review counts, then score predictions on the letters.
# NOTE(review): with an even n_neighbors and two classes the neighbor vote can
# tie -- confirm the resulting tie-breaking is acceptable.
knn = KNeighborsClassifier(n_neighbors=2).fit(X_train, y_train)
gender_pred = knn.predict(X_test)

# True labels of the letters, used by both the report and the accuracy.
gender_true = letters['Gender']
print(classification_report(gender_true, gender_pred, labels=['Female', 'Male']))
print("KNN Accuracy:", metrics.accuracy_score(gender_true, gender_pred))

              precision    recall  f1-score   support

      Female       0.78      0.46      0.58        61
        Male       0.38      0.71      0.49        28

    accuracy                           0.54        89
   macro avg       0.58      0.59      0.54        89
weighted avg       0.65      0.54      0.55        89

KNN Accuracy: 0.5393258426966292


In [45]:
# Support Vector Machine (linear kernel): fit on the review counts, then score
# predictions on the letters.
sv = svm.SVC(kernel='linear').fit(X_train, y_train)
gender_pred = sv.predict(X_test)

# True labels of the letters, used by both the report and the accuracy.
gender_true = letters['Gender']
print(classification_report(gender_true, gender_pred, labels=['Female', 'Male']))
print("SVM Accuracy:", metrics.accuracy_score(gender_true, gender_pred))

              precision    recall  f1-score   support

      Female       0.81      0.48      0.60        61
        Male       0.40      0.75      0.52        28

    accuracy                           0.56        89
   macro avg       0.60      0.61      0.56        89
weighted avg       0.68      0.56      0.57        89

SVM Accuracy: 0.5617977528089888
