In [1]:
import spacy
import textacy
import gensim
import os
import re
import numpy as np
import pandas as pd
import scipy
from tqdm import tnrange,tqdm_notebook

In [2]:
# this command will enable Jupyter to interactively display outputs from all expressions
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
# this command will enable Jupyter to hide warning messages
import warnings
warnings.filterwarnings("ignore")

# Prepare data and clarify gender

### Import raw data

In [3]:
review_panel = pd.read_csv('./data/reviews_panel_t8.csv')
review_panel.head()

Unnamed: 0,Id,review_id,href,overall_rating,Ease_of_Appointment,Promptness,Courteous_Staff,Accurate_Diagnosis,Bedside_Manner,Spends_Time_with_Me,...,Advanced_Technology,Caring_Manner,Pain_Minimized,Satisfaction,date,reviewer,title,content,helpful_vote,crawl_date
0,12949966,23507540,/dentists/Dr_Aarika_Anderson_Elter,5,4.0,,,,,,...,,5.0,5.0,,2012-11-16,,Great results,Dr. Anderson explained in detail my options fo...,,2016-09-06 20:54:01
1,12949967,27184431,/dentists/Dr_Aamir_Wahab,5,5.0,,,,,,...,5.0,5.0,5.0,5.0,2015-03-15,Brad s.,implant,Had an implant done and it was painless. I cou...,,2016-09-06 20:54:10
2,12949968,26307282,/dentists/Dr_Aanal_Parikh,1,1.0,,,,,,...,1.0,1.0,1.0,1.0,2014-10-12,Bill johnson,Warning ..would not see this dentist,Do not go to this dentist office they scam peo...,,2016-09-06 20:54:13
3,12949969,28904504,/dentists/Dr_Aaron_Aguilar,5,5.0,,,,,,...,5.0,5.0,5.0,5.0,2015-06-11,,,Very thoughtful Dr. Communicates/bedside mann...,,2016-09-06 20:54:14
4,12949970,28380953,/dentists/Dr_Aaron_D_Larsen,4,3.0,,,,,,...,4.0,4.0,4.0,5.0,2015-03-31,,Dr larsen great others no,Would continue to see Dr. Larsen however his ...,,2016-09-06 20:54:18


In [4]:
review_panel.shape

(1796204, 23)

In [94]:
# focus on text comments
review = review_panel.loc[:,['review_id','href','title','content']]
review.head()

Unnamed: 0,review_id,href,title,content
0,23507540,/dentists/Dr_Aarika_Anderson_Elter,Great results,Dr. Anderson explained in detail my options fo...
1,27184431,/dentists/Dr_Aamir_Wahab,implant,Had an implant done and it was painless. I cou...
2,26307282,/dentists/Dr_Aanal_Parikh,Warning ..would not see this dentist,Do not go to this dentist office they scam peo...
3,28904504,/dentists/Dr_Aaron_Aguilar,,Very thoughtful Dr. Communicates/bedside mann...
4,28380953,/dentists/Dr_Aaron_D_Larsen,Dr larsen great others no,Would continue to see Dr. Larsen however his ...


In [95]:
# combine the title and content as final mateiral
review_panel_text = review_panel['title'].fillna('').astype(str) + " " + review_panel['content'].fillna('')
review['comments']=review_panel_text
review.drop(['title','content'], axis=1, inplace=True)
review.head()

Unnamed: 0,review_id,href,title,content,comments
0,23507540,/dentists/Dr_Aarika_Anderson_Elter,Great results,Dr. Anderson explained in detail my options fo...,Great results Dr. Anderson explained in detai...
1,27184431,/dentists/Dr_Aamir_Wahab,implant,Had an implant done and it was painless. I cou...,implant Had an implant done and it was painles...
2,26307282,/dentists/Dr_Aanal_Parikh,Warning ..would not see this dentist,Do not go to this dentist office they scam peo...,Warning ..would not see this dentist Do not go...
3,28904504,/dentists/Dr_Aaron_Aguilar,,Very thoughtful Dr. Communicates/bedside mann...,Very thoughtful Dr. Communicates/bedside man...
4,28380953,/dentists/Dr_Aaron_D_Larsen,Dr larsen great others no,Would continue to see Dr. Larsen however his ...,Dr larsen great others no Would continue to se...


### Predict gender using first name

In [104]:
doc_list = pd.read_csv("./data/doctors_list_t8.csv").loc[:,['href','name']]
review = pd.merge(review, doc_list, on=['href'])
review.head()

Unnamed: 0,review_id,href,title,content,comments,name
0,23507540,/dentists/Dr_Aarika_Anderson_Elter,Great results,Dr. Anderson explained in detail my options fo...,Great results Dr. Anderson explained in detai...,Aarika D Anderson Elter
1,27184431,/dentists/Dr_Aamir_Wahab,implant,Had an implant done and it was painless. I cou...,implant Had an implant done and it was painles...,Aamir Wahab
2,26307282,/dentists/Dr_Aanal_Parikh,Warning ..would not see this dentist,Do not go to this dentist office they scam peo...,Warning ..would not see this dentist Do not go...,Aanal K Parikh
3,28904504,/dentists/Dr_Aaron_Aguilar,,Very thoughtful Dr. Communicates/bedside mann...,Very thoughtful Dr. Communicates/bedside man...,Aaron D Aguilar
4,28380953,/dentists/Dr_Aaron_D_Larsen,Dr larsen great others no,Would continue to see Dr. Larsen however his ...,Dr larsen great others no Would continue to se...,Aaron D Larsen


In [105]:
def firstname(x):
    name = str(x).lower().split(' ')
    return name[0]

review['name']=review['name'].apply(firstname)
review.head()

Unnamed: 0,review_id,href,title,content,comments,name
0,23507540,/dentists/Dr_Aarika_Anderson_Elter,Great results,Dr. Anderson explained in detail my options fo...,Great results Dr. Anderson explained in detai...,aarika
1,27184431,/dentists/Dr_Aamir_Wahab,implant,Had an implant done and it was painless. I cou...,implant Had an implant done and it was painles...,aamir
2,26307282,/dentists/Dr_Aanal_Parikh,Warning ..would not see this dentist,Do not go to this dentist office they scam peo...,Warning ..would not see this dentist Do not go...,aanal
3,28904504,/dentists/Dr_Aaron_Aguilar,,Very thoughtful Dr. Communicates/bedside mann...,Very thoughtful Dr. Communicates/bedside man...,aaron
4,28380953,/dentists/Dr_Aaron_D_Larsen,Dr larsen great others no,Would continue to see Dr. Larsen however his ...,Dr larsen great others no Would continue to se...,aaron


In [36]:
from __future__ import absolute_import, division, print_function
from six.moves import urllib
import os
from zipfile import ZipFile
import csv
import io


class simpleGenderPredictor():
    
    def __init__(self):
        self.name_dict = self.extractNamesDict()

    def downloadNames(self):
        urllib.request.urlretrieve(
            'https://www.ssa.gov/oact/babynames/names.zip', 'names.zip')

    def extractNamesDict(self):
        """
        download names.zip from SSA if necessary
        construct a dict from SSA name data: NAME: [number of M, number of F]
        """
        if not os.path.exists('names.zip'):
            print('names.zip does not exist, downloading from ssa.gov')
            self.downloadNames()
        else:
            print('names.zip exists, not downloading')
        zf = ZipFile('names.zip', 'r')
        filenames = zf.namelist()

        names = dict()
        genderMap = {'M': 0, 'F': 1}

        for filename in filenames:
            if filename.endswith('.txt'):
                file = zf.open(filename, 'r')
                rows = csv.reader(
                    io.TextIOWrapper(file, encoding="latin-1"), delimiter=',')

                for row in rows:
                    name = row[0].upper()
                    gender = genderMap[row[1]]
                    count = int(row[2])

                    if name not in names:
                        names[name] = [0, 0]
                    names[name][gender] = names[name][gender] + count
                file.close()
        print('name dictionary constructed')
        return names

    def predict_name(self, a_name):
        """
        Outputs a tuple: prediction, probability from historical data
        """
        freq = self.name_dict.get(a_name.upper())
        if freq is None:
            return 'Unknown', None
        elif freq[0] >= freq[1]:
            return 'M', freq[0] / (freq[0] + freq[1])
        else:
            return 'F', freq[1] / (freq[0] + freq[1])

In [106]:
p = simpleGenderPredictor()
firstNameList = list(review.name)

gender_pred = []
for i in tqdm_notebook(firstNameList):
    pred = p.predict_name(str(i))
    gender_pred += [pred[0]]

names.zip exists, not downloading
name dictionary constructed


In [108]:
review['name_gender']=pd.Series(gender_pred).values
review.head()

Unnamed: 0,review_id,href,title,content,comments,name,name_gender
0,23507540,/dentists/Dr_Aarika_Anderson_Elter,Great results,Dr. Anderson explained in detail my options fo...,Great results Dr. Anderson explained in detai...,aarika,F
1,27184431,/dentists/Dr_Aamir_Wahab,implant,Had an implant done and it was painless. I cou...,implant Had an implant done and it was painles...,aamir,M
2,26307282,/dentists/Dr_Aanal_Parikh,Warning ..would not see this dentist,Do not go to this dentist office they scam peo...,Warning ..would not see this dentist Do not go...,aanal,Unknown
3,28904504,/dentists/Dr_Aaron_Aguilar,,Very thoughtful Dr. Communicates/bedside mann...,Very thoughtful Dr. Communicates/bedside man...,aaron,M
4,28380953,/dentists/Dr_Aaron_D_Larsen,Dr larsen great others no,Would continue to see Dr. Larsen however his ...,Dr larsen great others no Would continue to se...,aaron,M


### Predict gender using comments    & Filter (nurse, helper, ...)

In [176]:
female_pronoun=set(['she','her'])
male_pronoun=set(['he','his','him'])
special_positions=set(['nurse','secretary','receptionist','assistant','nurses','nurseassistant','scheduler','helper'])

def contentHe(s):
    s = s.lower().strip() 
    s = re.sub('[^a-z]',' ',s)
    s = s.split(' ')
    content_he= int()
    if any(word in s for word in male_pronoun):
          content_he= 1
    return content_he

def contentShe(s):
    s = s.lower().strip() 
    s = re.sub('[^a-z]',' ',s)
    s = s.split(' ')
    content_she= int()
    if any(word in s for word in female_pronoun):
          content_she= 1
    return content_she

def positionFilter(s):
    s = s.lower().strip() 
    s = re.sub('[^a-z]',' ',s)
    s = s.split(' ')
    sp_positions = int()
    if any(word in s for word in special_positions):
        sp_positions = 1
    return sp_positions

In [177]:
# increate columns as marks (~10 mins)
review['content_he']=review['comments'].map(contentHe)
review['content_she']=review['comments'].apply(contentShe)
review['sp_positions']=review['comments'].apply(positionFilter)

review.head(10)

Unnamed: 0,review_id,href,title,content,comments,name,name_gender,content_he,content_she,sp_positions
0,23507540,/dentists/Dr_Aarika_Anderson_Elter,Great results,Dr. Anderson explained in detail my options fo...,Great results Dr. Anderson explained in detai...,aarika,F,0,0,0
1,27184431,/dentists/Dr_Aamir_Wahab,implant,Had an implant done and it was painless. I cou...,implant Had an implant done and it was painles...,aamir,M,1,0,0
2,26307282,/dentists/Dr_Aanal_Parikh,Warning ..would not see this dentist,Do not go to this dentist office they scam peo...,Warning ..would not see this dentist Do not go...,aanal,Unknown,0,0,0
3,28904504,/dentists/Dr_Aaron_Aguilar,,Very thoughtful Dr. Communicates/bedside mann...,Very thoughtful Dr. Communicates/bedside man...,aaron,M,0,0,0
4,28380953,/dentists/Dr_Aaron_D_Larsen,Dr larsen great others no,Would continue to see Dr. Larsen however his ...,Dr larsen great others no Would continue to se...,aaron,M,1,0,1
5,30421069,/dentists/Dr_Aaron_Acres,,Dr. Acres and his staff are an exceptionally p...,Dr. Acres and his staff are an exceptionally ...,aaron,M,1,0,0
6,24080817,/dentists/Dr_Aaron_A_Johnson,Not Friendly,After smiling and shaking my hand he was just ...,Not Friendly After smiling and shaking my hand...,aaron,M,1,1,0
7,26586170,/dentists/Dr_Aaron_Larock,Dr. LaRock,"There are three doctors, I've seen (Dr Sara, D...","Dr. LaRock There are three doctors, I've seen ...",aaron,M,0,0,0
8,30409668,/dentists/Dr_Aaron_K_Lee,Dental office,Dr. Lee is the best dentist I have seen in my ...,Dental office Dr. Lee is the best dentist I h...,aaron,M,1,0,0
9,30156562,/dentists/Dr_Aaron_K_Lee,wonderful office,Very clean office. Very modern. Staff is ext...,wonderful office Very clean office. Very mo...,aaron,M,0,0,0


In [182]:
review.comments[6]

"Not Friendly After smiling and shaking my hand he was just cold. I was basically ordered where to sit because I was too slow for him I guess. Didn't ask me how I was or try to get to know me in the least. I'm used to friendly dentists who try to work a little humanity into a visit but he must see people as just financial resources, oh maybe I didn't rate any real courtesy because he wasn't making much off me the day I was there .  Explained my dental problems well enough, I think, but he really didn't want me to say ANYTHING, tried to interrupt if I had a question. Walked off like he was God's gift after he was done talking about what work my teeth needed. He's practicing now with Progressive Dentistry.  I was totally turned off, and stunned. I wouldn't go back to see him if he was the last dentist on the planet. His dental hygienist was wonderful. I have nothing bad to say about her or my teeth cleaning experience, but Dr. Johnson needs charm lessons. "

In [185]:
review.sp_positions.sum()/len(review)
sum(review.name_gender=='Unknown')/len(review)

0.0645310793188714

0.06724944393289427

### Gender clarification

In [223]:
# column 'classify' is the final gender inference result
review['classify']=review['name_gender']
review.loc[(review.name_gender=='Unknown') & (review.content_he==1) & (review.content_she==1), 'classify']='Conflict'
review.loc[(review.name_gender=='Unknown') & (review.content_he==0) & (review.content_she==0), 'classify']='Unknown'
review.loc[(review.name_gender=='Unknown') & (review.content_he==1) & (review.content_she==0), 'classify']='M'
review.loc[(review.name_gender=='Unknown') & (review.content_he==0) & (review.content_she==1), 'classify']='F'

print('Unknown:', sum(review.classify=='Unknown'))
print('Confict:', sum(review.classify=='Confict'))
print('F:', sum(review.classify=='F'))
print('M:', sum(review.classify=='M')) 

Unknown: 34328
Confict: 0
F: 485657
M: 1269320


In [276]:
# column 'gender_conflict' indicates where name_gender conflicts with content_he/content_she 
# or content_he conflicts with content_she
review['gender_conflict']=0
review.loc[(review.name_gender=='F') & (review.content_he==1), 'gender_conflict']=1
review.loc[(review.name_gender=='M') & (review.content_she==1), 'gender_conflict']=1
review.loc[(review.classify=='Conflict'), 'gender_conflict']=1

print('Confict:', sum(review.gender_conflict), sum(review.gender_conflict)/len(review))
review.head()

Confict: 120345 0.06703721095790602


Unnamed: 0,review_id,href,comments,name,name_gender,content_he,content_she,sp_positions,classify,gender_conflict
0,23507540,/dentists/Dr_Aarika_Anderson_Elter,Great results Dr. Anderson explained in detai...,aarika,F,0,0,0,F,0
1,27184431,/dentists/Dr_Aamir_Wahab,implant Had an implant done and it was painles...,aamir,M,1,0,0,M,0
2,26307282,/dentists/Dr_Aanal_Parikh,Warning ..would not see this dentist Do not go...,aanal,Unknown,0,0,0,Unknown,0
3,28904504,/dentists/Dr_Aaron_Aguilar,Very thoughtful Dr. Communicates/bedside man...,aaron,M,0,0,0,M,0
4,28380953,/dentists/Dr_Aaron_D_Larsen,Dr larsen great others no Would continue to se...,aaron,M,1,0,1,M,0


In [277]:
clr_review=review[(review.sp_positions!=1) & (review.gender_conflict!=1)]
clr_review.shape
len(clr_review)/len(review)

(1586809, 10)

0.8839191464780746

### Save this cleaned and marked data

In [278]:
# save the data as a CSV file (20 seconds)
clr_review.to_csv("./data/clr_review_df.csv", index=False)

- Review_test dataset

In [220]:
# use first 100 comments to do experiment
review_test = review.iloc[:100,:]
review_test['name']= 1


# obtain first name
def firstname(x):
    name = str(x).lower().split('_')
    return name[1]
review_test['name']=review_test['href'].apply(firstname)    

# predict gender using first name
p = simpleGenderPredictor()
firstNameList = list(review_test.name)

gender_pre = []
for i in tqdm_notebook(firstNameList):
    pred = p.predict_name(str(i))
    gender_pre += [pred[0]]

review_test['name_gender']=pd.Series(gender_pre).values


# Predict gender using comments & Filter {nurse, helper,...}
review_test['content_he'] = 0
review_test['content_she'] = 0
review_test['sp_positions'] = 0

female_pronoun=set(['she','her'])
male_pronoun=set(['he','his','him'])
special_positions=set(['nurse','secretary','receptionist','assistant','nurses','nurseassistant','scheduler','helper'])

for i in tnrange(len(review_test)):
    s = review_test.iloc[i,4]  # read in comments
    s = s.lower().strip() 
    s = re.sub('[^a-z]',' ',s)
    s = s.split(' ')
    if any(word in s for word in male_pronoun):
        review_test.iloc[i,7] = 1   
    if any(word in s for word in female_pronoun):
        review_test.iloc[i,8] = 1        
    if any(word in s for word in special_positions):
        review_test.iloc[i,9] = 1

        
# column 'classify' is the final gender result
review_test['classify']=review_test['name_gender']
review_test.loc[(review_test.name_gender=='Unknown') & (review_test.content_he==1) & (review_test.content_she==1), 'classify']='Conflict'
review_test.loc[(review_test.name_gender=='Unknown') & (review_test.content_he==0) & (review_test.content_she==0), 'classify']='Unknown'
review_test.loc[(review_test.name_gender=='Unknown') & (review_test.content_he==1) & (review_test.content_she==0), 'classify']='M'
review_test.loc[(review_test.name_gender=='Unknown') & (review_test.content_he==0) & (review_test.content_she==1), 'classify']='F'

print('Unknown:', sum(review_test.classify=='Unknown'))
print('Confict:', sum(review_test.classify=='Confict'))
print('F:', sum(review_test.classify=='F'))
print('M:', sum(review_test.classify=='M'))        
        

# column 'gender_conflict' indicates where name_gender conflicts with content_he/content_she
review_test['gender_conflict']=0
review_test.loc[(review_test.name_gender=='F') & (review_test.content_he==1), 'gender_conflict']=1
review_test.loc[(review_test.name_gender=='M') & (review_test.content_she==1), 'gender_conflict']=1
review_test.loc[(review_test.classify=='Conflict'), 'gender_conflict']=1
print('Confict:', sum(review_test.gender_conflict==1))


# Now we have a clean review dataset to be our corpus
clr_review_test=review_test[(review_test.sp_positions!=1) & (review_test.gender_conflict!=1)]
clr_review_test.shape
len(clr_review_test)/len(review_test)

review_test

Unnamed: 0,review_id,href,comments,name,name_gender,content_he,content_she,sp_positions,classify,gender_conflict
0,23507540,/dentists/Dr_Aarika_Anderson_Elter,Great results Dr. Anderson explained in detai...,aarika,F,0,0,0,F,0
1,27184431,/dentists/Dr_Aamir_Wahab,implant Had an implant done and it was painles...,aamir,M,0,1,0,M,0
2,26307282,/dentists/Dr_Aanal_Parikh,Warning ..would not see this dentist Do not go...,aanal,Unknown,0,0,0,Unknown,0
3,28904504,/dentists/Dr_Aaron_Aguilar,Very thoughtful Dr. Communicates/bedside man...,aaron,M,0,0,0,M,0
4,28380953,/dentists/Dr_Aaron_D_Larsen,Dr larsen great others no Would continue to se...,aaron,M,0,1,1,M,0
5,30421069,/dentists/Dr_Aaron_Acres,Dr. Acres and his staff are an exceptionally ...,aaron,M,0,1,0,M,0
6,24080817,/dentists/Dr_Aaron_A_Johnson,Not Friendly After smiling and shaking my hand...,aaron,M,1,1,0,M,1
7,26586170,/dentists/Dr_Aaron_Larock,"Dr. LaRock There are three doctors, I've seen ...",aaron,M,0,0,0,M,0
8,30409668,/dentists/Dr_Aaron_K_Lee,Dental office Dr. Lee is the best dentist I h...,aaron,M,0,1,0,M,0
9,30156562,/dentists/Dr_Aaron_K_Lee,wonderful office Very clean office. Very mo...,aaron,M,0,0,0,M,0


# Pre-process via spaCy

### Punctuation removal & Lemmatization

In [234]:
nlp = spacy.load('en')

In [279]:
# take comments as text mateiral
review_panel_text = clr_review['comments'].fillna('').astype(str)
sample_s = review_panel_text.tolist()

In [280]:
# lemmatize and fitler documents; deal with punctuation(3h 30mins)
terms_list=[]
words_to_keep = set(['he','his','him','she','her','I','my','me','mine'])
def lemma(x):
    if x.text in words_to_keep: 
        return x.text
    elif x.text not in words_to_keep:
        return x.lemma_

for doc in tqdm_notebook(sample_s):
    doc_cleared = textacy.preprocess_text(doc, lowercase=False, no_punct=True)
    doc_new = nlp(doc_cleared)
    tokens=[lemma(x) for x in doc_new]  #to keep pronouns
    terms_list.append(tokens)

print(terms_list[:5])




### Spellcheck and correction
- Manually collect the most frequent typos of attribute words and gender words.
- This method can __obtain better correction results__ since we focused on key words we need.

In [242]:
words_to_filter={
    'heshe':'she','shehe':'he','hisher':'her','herhis':'his','himher':'her','herhim':'him',
    'womenas':'women','womena':'women','womans':'women','womens':'women',
    'females':'female','femalei':'female',
    'heri':'her','herand':'her',
    'hershe':'she','doctorshe':'she','babieshe':'she','physicianshe':'she','professionalshe':'she',
    'doctorsh':'she','drshe':'she','womanshe':'she',
    'ladys':'lady','girls':'girl','blond':'blonde','brunet':'brunette',
    'himhe':'he','himi':'him','hime':'him','himand':'him','hin':'him','himso':'him','hom':'him','himvery':'him',
    'himthat':'him','himwe':'him',
    'mens':'men','manand':'man',
    'boys':'boy','guys':'guy','males':'male',
    'doctorhe':'he','guyhe':'he','drhe':'he','manhe':'he','childrenhe':'he','personhe':'he','physicianhe':'he',
    'experiencehe':'he','professionalhe':'he',
    
    'intellegent':'intelligent','inteligent':'intelligent', 
    'conscientous':'conscientious',
    'imformative':'informative',
    'skillfull':'skillful',
    'efficent':'efficient',
    'helpfull':'helpful','helping':'helpful','helful':'helpful',
    'welleducat':'welleducated',
    'welltrain':'welltrained', 
    'preventative':'preventive','preventitive':'preventive',
    'effectivei':'efficacious',
    'diagnositic':'diagnostic','diagnosic':'diagnostic','dianostic':'diagnostic','diagnotic':'diagnostic',
    'ableto':'able',
    'qualifed':'qualified','qualify':'qualified',
    'competant':'competent','compentent':'competent',
    'curteous':'courteous','curtious':'courteous','courtious':'courteous','couteous':'courteous',
    'proffesional':'professional','proffessional':'professional','profesional':'professional',
    'professionali':'professional','professinal':'professional','helpfulprofessional':'professional',
    'knowledgable':'knowledgeable','knowlegable':'knowledgeable','knowlegeable':'knowledgeable',
    'knowledable':'knowledgeable','knowledgeble':'knowledgeable',
    'listen':'listening',
    'thoughful':'thoughtful',
    'pleasureable':'pleasurable',
    'attentative':'attentive',
    'compasionate':'compassionate','compationate':'compassionate',
    'conceren':'concerned','concered':'concerned','consern':'concerned','concerend':'concerned','concer':'concerned',
    'behavioural':'behavioral',
    'pleasent':'pleasant','plesant':'pleasant',
    'freindly':'friendly','friendlyi':'friendly','friendy':'friendly',
    'empathic':'empathetic',
    'careing':'caring','carring':'caring',
    'releif':'relief','relif':'relief','relive':'relief','reliefhe':'relief',
    'therapists':'therapist',
    'technic':'technique',
    'expereince':'experience','experiance':'experience','expierence':'experience','experence':'experience',
    'experince':'experience','experiencei':'experience','expirience':'experience','expierience':'experience',
    'exerience':'experience','expirence':'experience','experienc':'experience','experiece':'experience',
    'exprience':'experience','experiene':'experience','experiences':'experience','expericence':'experience',
    'expeience':'experience','exp':'experience','expeirence':'experience',
    'theraphy':'therapy','theropy':'therapy','therepy':'therapy','therapyi':'therapy','therpy':'therapy',
    'pt':'therapy','theapy':'therapy',
    'improvment':'improvement','improved':'improvement', 
    'dx':'diagnosis','diagnoses':'diagnosis','diagnosishe':'diagnosis','dignosis':'diagnosis',
    'diagnosiss':'diagnosis','diagnosisprognosis':'diagnosis','diagnosisvery':'diagnosis',
    'diagonsis':'diagnosis','diagnosisi':'diagnosis','dianosis':'diagnosis','diagnosisshe':'diagnosis',
    'diagonosis':'diagnosis','disgnosis':'diagnosis','diagnoisis':'diagnosis','diognosis':'diagnosis',
    'diagnois':'diagnosis','diagnosic':'diagnosis','diagnosisno':'diagnosis',
    'treament':'treatment','treatement':'treatment','treatmenti':'treatment','treatmentshe':'treatment',
    'medicationtreatment':'treatment','caretreatment':'treatment','diagnosistreatment':'treatment',
    'treatmenthe':'treatment',
    'standards':'standard','standar':'standard',
    'abilty':'ability',
    'skills':'skill','skillsi':'skill','skillset':'skill','skil':'skill','skillshe':'skill',
    'knowlege':'knowledge','knoweldge':'knowledge','knowlegde':'knowledge',
    'competance':'competence',
    'profesionalism':'professionalism','professionalismi':'professionalism',
    'professionalsim':'professionalism','proffesionalism':'professionalism',
    'expertize':'expertise','expertice':'expertise',
    'resultsi':'result','reult':'result','results':'result','resultsand':'result','resultsshe':'result',
    'resultshe':'result','resultswhich':'result','resut':'result','resultsthe':'result','resultsdr':'result',
    'resultsa':'result','resultsmy':'result','resulst':'result','resultsnot':'result','resutls':'result',
    'resultsit':'result','resulti':'result',
    'outcomei':'outcome',
    'listner':'listener',
    'comunicator':'communicator',
    'smilei':'smiley','smily':'smiley',
    'repoire':'rapport','repor':'rapport','repore':'rapport','raport':'rapport',
    'repoir':'rapport','rapore':'rapport','rapor':'rapport',
    'compasion':'compassion','compation':'compassion','compassioni':'compassion','compasssion':'compassion',
    'curtesy':'courtesy','courtsey':'courtesy',
    'repect':'respect','respecti':'respect','respectand':'respect','respecthe':'respect',
    'attn':'attention','attentioni':'attention','atttention':'attention','atention':'attention',
    'attnetion':'attention','attentionand':'attention','attenton':'attention','attetion':'attention',
    'attentionthe':'attention','attenion':'attention',
    'tust':'trust','turst':'trust',
    'behaviour':'behavior',
    'additude':'attitude','attitute':'attitude','attidude':'attitude',
    'atitude':'attitude','attitud':'attitude','attitudei':'attitude','attitiude':'attitude',
    'comunication':'communication','communications':'communication',
    'communcation':'communication','commuication':'communication',
    'realationship':'relationship','relationshipi':'relationship',
    'relationshiphe':'relationship','relatioship':'relationship',
    'manor':'manner','mannor':'manner','mannerit':'manner','mannerhe':'manner','manneri':'manner',
    'maner':'manner','mannershe':'manner','mannervery':'manner','mannersshe':'manner','mannerand':'manner',
    'mannera':'manner','mannerpersonality':'manner','mannersand':'manner','manneralway':'manner',
    'amnner':'manner','mammer':'manner','manne':'manner','mannar':'manner','mannerno':'manner',
    'mannerthe':'manner','mannerhave':'manner','mannerthere':'manner','manori':'manner',
    'mannerhis':'manner','mannerone':'manner','mannerher':'manner','mannersa':'manner','mannerexplain':'manner',
    'manneryou':'manner','mannerexcellent':'manner','manneris':'manner','mannerdr':'manner',
    'mannerwill':'manner','mannerwhen':'manner','mannernot':'manner','mannerin':'manner','mannernever':'manner',
    'manneroffice':'manner','mannerhave':'manner','mannerwould':'manner','mannerlisten':'manner',
    'mannerbut':'manner','mannerdid':'manner','mannersvery':'manner',
    'emphathy':'empathy',
    'pleasent':'pleasant','plesant':'pleasant',
    'accomodating':'accommodating','accomodat':'accommodating','accomadat':'accommodating',
    'accomidat':'accommodating','accomidating':'accommodating','accomadating':'accommodating'
}

In [281]:
# spellcheck the documents and correct spellings (35 seconds)
for line in tqdm_notebook(terms_list, mininterval=2):
    for i,word in enumerate(line):
        if word in words_to_filter:
            line[i]=words_to_filter[word]




### Save & import trained corpus

In [282]:
InteractiveShell.ast_node_interactivity = "none"  # speed this process

In [283]:
# save the corpus as a txt file (40 seconds)
with open("./data/clr_terms_list.txt","w") as thefile:
    for item in tqdm_notebook(terms_list):
      thefile.write("%s\n" % item)




In [284]:
InteractiveShell.ast_node_interactivity = "all" # resume the output option

In [None]:
# read in corpus (3 mins)
with open('./data/clr_terms_list.txt','r') as file:
    terms_list=[]
    for line in file.readlines():
        tmp=[]
        for i in line[1:-2].split(','):
            tmp.append(i.strip(" '"))
        terms_list.append(tmp)

# Build model via word2vec

In [285]:
# train word2vec on the corpus (10 mins)
model = gensim.models.Word2Vec(sentences=terms_list, size=100, window=7, min_count=5, workers=4)
# to save a trained model
model.save("./data/model_clr_comments")

- The analogic results seem better than the old model before clearance

In [249]:
# check any misspelling of key attribute words
model.wv.most_similar_cosmul(positive=['knowledgeable'],topn=10)

[('personable', 0.9074916243553162),
 ('intelligent', 0.8988879919052124),
 ('approachable', 0.8902322053909302),
 ('competent', 0.8805422186851501),
 ('compassionate', 0.8744218349456787),
 ('thoughtful', 0.8704231977462769),
 ('thorough', 0.8680344820022583),
 ('conscientious', 0.8632931709289551),
 ('informative', 0.8625794649124146),
 ('attentive', 0.8617123365402222)]

In [250]:
# found prevalant typos of gender words
model.wv.most_similar_cosmul(positive=['her'],topn=10)

[('she', 0.8664829730987549),
 ('-PRON-', 0.7346817851066589),
 ('shed', 0.7100279927253723),
 ('shell', 0.6926476359367371),
 ('overweight', 0.6855077743530273),
 ('that', 0.6851862668991089),
 ('daughter', 0.6832113265991211),
 ('alarm', 0.6810519695281982),
 ('parent', 0.674568235874176),
 ('obese', 0.6724981665611267)]

In [270]:
vocab_obj = model.wv.vocab["friendly"]
vocab_obj.count

30494

In [251]:
model.wv['I']  # numpy vector of a word

array([-0.64495885,  0.59357613,  0.05354114, -0.5356201 ,  0.03679908,
        1.2303874 ,  1.0802594 ,  0.43073598,  1.1777514 , -0.19657007,
       -0.61229813, -1.3695918 ,  3.3395555 ,  0.9843808 ,  0.5450105 ,
       -2.0221436 , -4.035487  , -0.3032979 ,  0.74440515, -0.19493127,
       -1.0600715 ,  2.2638512 ,  0.37473533,  1.253145  ,  0.7289774 ,
        0.3971539 ,  0.39925283,  2.3111734 , -1.6316954 ,  1.8963535 ,
        0.32695824, -0.561304  , -3.1125395 , -2.1620324 , -0.6255989 ,
       -1.2920182 , -0.28051597,  1.0827161 ,  1.9925852 ,  0.6398726 ,
        1.7055993 , -3.584166  , -1.2345035 , -0.78807765, -0.23221698,
        1.2614936 ,  1.2686561 , -2.348656  ,  1.522818  , -1.0812918 ,
       -0.5052574 ,  3.5710568 ,  0.21126084, -1.8686764 , -1.6131155 ,
       -2.6090913 , -1.5247493 , -0.38346907, -0.8097224 , -2.7224376 ,
        0.2064798 ,  4.6430526 , -1.1784403 ,  1.0314777 , -1.4779274 ,
       -1.8528348 , -0.32165417,  1.1619599 ,  2.2512786 ,  0.32

# Measure the gender bias
- via comparing comments on female and male physicians

In [10]:
# load the trained model
model = gensim.models.Word2Vec.load("./data/model_clr_comments")

### Build sets of gender words & attribute words

In [11]:
# gender words collected
Gender_words=pd.read_excel('./data/Gender_words.xlsx')
Gender_words

Unnamed: 0,man,woman
0,he,she
1,his,her
2,him,her
3,himself,herself
4,man,woman
5,men,women
6,male,female
7,boy,girl
8,guy,gal
9,gentleman,lady


In [12]:
# final gender word sets
man_set= set(Gender_words.man.dropna())  #n=11
woman_set= set(Gender_words.woman.dropna())-{'blonde','brunette'} #n=10
print(man_set)
print(woman_set)

{'gentleman', 'man', 'his', 'he', 'boy', 'guy', 'male', 'him', 'dude', 'men', 'himself'}
{'girl', 'chick', 'women', 'woman', 'herself', 'gal', 'her', 'female', 'she', 'lady'}


In [13]:
# attribute words collected
Vocabulary=pd.read_excel('./data/Vocabulary.xlsx', na_values="")
Vocabulary

Unnamed: 0,interaction,technique,enviroment,administration,interpersonal,technical,enviromental,administrative
0,empathy,outcome,facility,arrangemnet,responsive,expert,comfortable,timely
1,manner,result,atmosphere,timeliness,reliable,professional,smelly,organized
2,responsiveness,expert,tangible,waiting,enthusiastic,competent,clean,admin
3,relationship,expertise,surrounding,appointment,supportive,knowledgeable,sterile,managed
4,interplay,professionalism,smell,operation,caring,qualified,decorative,efficient
5,communication,competency,layout,admin,empathetic,skilled,clean,accessable
6,attitude,competence,color,administration,interactive,standard,enviromental,costly
7,behavior,reliability,look,coordination,friendly,able,,affordable
8,interaction,assurance,decoration,organization,pleasant,productive,,administrative
9,mutuality,knowledge,lighting,integration,behavioral,working,,


In [14]:
# only keep words existing in our model
def set_filter(att_set):
    new_set=[]
    del_word=[]
    for word in att_set:
        try:
            if model[word].all():
                new_set.append(word)
        except KeyError:
            del_word.append(word)
    print(del_word)
    return set(new_set)

In [15]:
# final attribute word sets
att_n_1= set(Vocabulary.interaction.dropna())
att_n_2= set(Vocabulary.technique.dropna())
att_adj_1= set(Vocabulary.interpersonal.dropna())
att_adj_2= set(Vocabulary.technical.dropna())

att_n_1=set_filter(att_n_1)
att_n_2=set_filter(att_n_2)
att_adj_1=set_filter(att_adj_1)
att_adj_2=set_filter(att_adj_2)

print(len(att_n_1),len(att_n_2),len(att_adj_1),len(att_adj_2))

['mutuality']
[]
[]
[]
40 38 39 35


In [16]:
# combine adj. and n. into one set
att_1=att_n_1.union(att_adj_1)
att_2=att_n_2.union(att_adj_2)
print(len(att_n_1), len(att_adj_1), len(att_1))
print(len(att_n_2), len(att_adj_2), len(att_2))

40 39 79
38 35 71


## Method 1: word analogies generated by model
- __analogy__ 'he' + (adj.) - 'she' = ? |  __converse__ 'she' + (adj.) - 'he' = ?
- __analogy__ 'his' + (noun) - 'her' = ? |  __converse__ 'her' + (noun) - 'his' = ?
- __To be noticed: Results heavily relies on the choice of keyword pairs__

### Pattern 1: analogy  'he' + (adj.) - 'she' = ?  |  converse 'she' + (adj.) - 'he' = ?
- attribute_words_adj_1 = {'nice','friendly','courteous','thoughtful','polite'}
- attribute_words_adj_2 = {'professional',__'effective'__,__'competent'__,'skilled',__'knowledgeable'__}

#### attribute words (adj) interpersonal (1-5)

In [316]:
# (1)
model.wv.most_similar(positive=['he', 'nice'], negative=['she'], topn=10)
# converse
model.wv.most_similar(positive=['she', 'nice'], negative=['he'], topn=10)

[('likable', 0.5753893256187439),
 ('cool', 0.5695706605911255),
 ('polite', 0.5640220642089844),
 ('likeable', 0.5541913509368896),
 ('friendly', 0.5536156892776489),
 ('pleasant', 0.5103829503059387),
 ('charming', 0.4990135431289673),
 ('handsome', 0.4929262101650238),
 ('talkative', 0.4812774956226349),
 ('cordial', 0.47065290808677673)]

[('sweet', 0.7454693913459778),
 ('friendly', 0.6398050785064697),
 ('lovely', 0.6141872406005859),
 ('pleasant', 0.6100215911865234),
 ('polite', 0.5795028209686279),
 ('welcoming', 0.5432094931602478),
 ('helpful', 0.5264031887054443),
 ('unfriendly', 0.5235804319381714),
 ('delightful', 0.508702278137207),
 ('cheery', 0.5075200796127319)]

In [320]:
# (2)
model.wv.most_similar(positive=['he', 'friendly'], negative=['she'], topn=10)
model.wv.most_similar(positive=['she', 'friendly'], negative=['he'], topn=10)

[('polite', 0.7373115420341492),
 ('courteous', 0.7035645246505737),
 ('cordial', 0.6462650299072266),
 ('nice', 0.630955696105957),
 ('pleasant', 0.6214845776557922),
 ('accommodating', 0.5908196568489075),
 ('welcoming', 0.5776514410972595),
 ('helpful', 0.5761107206344604),
 ('efficient', 0.5692339539527893),
 ('personable', 0.5680350661277771)]

[('polite', 0.6983291506767273),
 ('courteous', 0.6801798343658447),
 ('pleasant', 0.670880913734436),
 ('helpful', 0.664108395576477),
 ('efficient', 0.6403647661209106),
 ('welcoming', 0.6318656802177429),
 ('sweet', 0.6254236698150635),
 ('cheerful', 0.6135001182556152),
 ('accommodating', 0.5995230078697205),
 ('cordial', 0.594363272190094)]

In [322]:
# (3)
model.wv.most_similar(positive=['he', 'courteous'], negative=['she'], topn=10)
model.wv.most_similar(positive=['she', 'courteous'], negative=['he'], topn=10)

[('polite', 0.7063361406326294),
 ('friendly', 0.6599416136741638),
 ('cordial', 0.6510959267616272),
 ('efficient', 0.596300482749939),
 ('accommodating', 0.5921949744224548),
 ('helpful', 0.5637730360031128),
 ('corteous', 0.5395034551620483),
 ('pleasant', 0.5393297672271729),
 ('personable', 0.5384308695793152),
 ('punctual', 0.5360861420631409)]

[('friendly', 0.7223407030105591),
 ('polite', 0.6985869407653809),
 ('efficient', 0.6931324601173401),
 ('helpful', 0.6785531044006348),
 ('punctual', 0.6426445841789246),
 ('prompt', 0.6317688822746277),
 ('accommodating', 0.6264864802360535),
 ('attentive', 0.6263617873191833),
 ('cordial', 0.6258475184440613),
 ('pleasant', 0.6174027323722839)]

In [323]:
# (4)
model.wv.most_similar(positive=['he', 'thoughtful'], negative=['she'], topn=10)
model.wv.most_similar(positive=['she', 'thoughtful'], negative=['he'], topn=10)

[('compassionate', 0.644517183303833),
 ('considerate', 0.6329843997955322),
 ('humble', 0.6264907717704773),
 ('precise', 0.622470498085022),
 ('conscientious', 0.6113719344139099),
 ('forthright', 0.5942790508270264),
 ('perceptive', 0.5912184119224548),
 ('methodical', 0.5905348658561707),
 ('intelligent', 0.584755003452301),
 ('empathetic', 0.5834020972251892)]

[('insightful', 0.7054070830345154),
 ('empathetic', 0.7039657831192017),
 ('compassionate', 0.6779045462608337),
 ('intuitive', 0.6384857892990112),
 ('nonjudgmental', 0.6344080567359924),
 ('conscientious', 0.632402777671814),
 ('attentive', 0.6247828602790833),
 ('considerate', 0.6242361664772034),
 ('thorough', 0.6112649440765381),
 ('observant', 0.610867440700531)]

In [321]:
# (5)
model.wv.most_similar(positive=['he', 'polite'], negative=['she'], topn=10)
model.wv.most_similar(positive=['she', 'polite'], negative=['he'], topn=10)

[('courteous', 0.6953533291816711),
 ('cordial', 0.6845788359642029),
 ('friendly', 0.6820337772369385),
 ('helpful', 0.5947322845458984),
 ('accommodating', 0.5925295352935791),
 ('nice', 0.5891480445861816),
 ('personable', 0.5729230046272278),
 ('efficient', 0.5690980553627014),
 ('welcoming', 0.5574564337730408),
 ('pleasant', 0.5551237463951111)]

[('friendly', 0.7520256042480469),
 ('helpful', 0.7163360118865967),
 ('courteous', 0.7095066905021667),
 ('efficient', 0.6730093955993652),
 ('cordial', 0.6663464903831482),
 ('punctual', 0.651955246925354),
 ('welcoming', 0.6444542407989502),
 ('sweet', 0.6435736417770386),
 ('pleasant', 0.6395499110221863),
 ('accommodating', 0.6335143446922302)]

#### attribute words (adj) technical (6-10)

In [324]:
# (6)
model.wv.most_similar(positive=['he', 'professional'], negative=['she'], topn=10)
model.wv.most_similar(positive=['she', 'professional'], negative=['he'], topn=10)

[('humble', 0.570516049861908),
 ('polite', 0.557039201259613),
 ('personable', 0.5469394326210022),
 ('respectful', 0.5403343439102173),
 ('efficient', 0.539884626865387),
 ('considerate', 0.5393913984298706),
 ('knowledgeable', 0.5271739959716797),
 ('affable', 0.5208578109741211),
 ('courteous', 0.5169792771339417),
 ('cordial', 0.5133152604103088)]

[('efficient', 0.6268216967582703),
 ('attentive', 0.6003647446632385),
 ('warm', 0.5986876487731934),
 ('respectful', 0.5975770354270935),
 ('personable', 0.5875162482261658),
 ('knowledgeable', 0.5750784873962402),
 ('approachable', 0.5695705413818359),
 ('helpful', 0.5691096186637878),
 ('compassionate', 0.5613157153129578),
 ('considerate', 0.557100772857666)]

In [325]:
# (7)
model.wv.most_similar(positive=['he', 'effective'], negative=['she'], topn=10)
model.wv.most_similar(positive=['she', 'effective'], negative=['he'], topn=10)

[('efficacious', 0.5925412178039551),
 ('conservative', 0.5665525197982788),
 ('beneficial', 0.5654409527778625),
 ('prolotherapy', 0.5450719594955444),
 ('innovative', 0.5323076248168945),
 ('noninvasive', 0.5256170034408569),
 ('progressive', 0.5247855186462402),
 ('agressive', 0.5162532925605774),
 ('precise', 0.5111181735992432),
 ('affordable', 0.5066229104995728)]

[('economical', 0.5453357696533203),
 ('insightful', 0.5375237464904785),
 ('practical', 0.5225343704223633),
 ('proactive', 0.5217616558074951),
 ('openminded', 0.5201067924499512),
 ('intuitive', 0.5199506878852844),
 ('comprehensive', 0.5152804851531982),
 ('antiag', 0.5131298899650574),
 ('reliable', 0.5104416012763977),
 ('assertive', 0.5080427527427673)]

In [326]:
# (8)
model.wv.most_similar(positive=['he', 'competent'], negative=['she'], topn=10)
model.wv.most_similar(positive=['she', 'competent'], negative=['he'], topn=10)

[('skilled', 0.6980673670768738),
 ('capable', 0.6786777377128601),
 ('proficient', 0.6432988047599792),
 ('talented', 0.6312669515609741),
 ('likeable', 0.6094322800636292),
 ('experienced', 0.603441059589386),
 ('humble', 0.6018375158309937),
 ('skillful', 0.5979371666908264),
 ('intelligent', 0.5959039330482483),
 ('knowledgeable', 0.5694893598556519)]

[('capable', 0.6236226558685303),
 ('knowledgeable', 0.6198654174804688),
 ('intelligent', 0.6132165789604187),
 ('reliable', 0.6031516790390015),
 ('proficient', 0.5963271260261536),
 ('experienced', 0.5938717722892761),
 ('empathetic', 0.5903016924858093),
 ('conscientious', 0.5896420478820801),
 ('approachable', 0.5860242247581482),
 ('compassionate', 0.5844358801841736)]

In [327]:
# (9)
model.wv.most_similar(positive=['he', 'skilled'], negative=['she'], topn=10)
model.wv.most_similar(positive=['she', 'skilled'], negative=['he'], topn=10)

[('skillful', 0.6912488341331482),
 ('talented', 0.6832833290100098),
 ('gifted', 0.6398066878318787),
 ('brilliant', 0.5831794738769531),
 ('proficient', 0.5742428302764893),
 ('competent', 0.5738006830215454),
 ('humble', 0.5737603902816772),
 ('capable', 0.5659710168838501),
 ('experienced', 0.5465304255485535),
 ('masterful', 0.5369408130645752)]

[('competent', 0.6718350648880005),
 ('experienced', 0.6145303249359131),
 ('proficient', 0.6063436269760132),
 ('skillful', 0.5931771397590637),
 ('capable', 0.5892971754074097),
 ('talented', 0.5890049338340759),
 ('qualified', 0.586457371711731),
 ('intelligent', 0.5837894678115845),
 ('gifted', 0.5480527877807617),
 ('compassionate', 0.544803261756897)]

In [328]:
# (10)
model.wv.most_similar(positive=['he', 'knowledgeable'], negative=['she'], topn=10)
model.wv.most_similar(positive=['she', 'knowledgeable'], negative=['he'], topn=10)

[('personable', 0.6437782645225525),
 ('intelligent', 0.6355828642845154),
 ('humble', 0.5993810296058655),
 ('competent', 0.5951390862464905),
 ('likeable', 0.5922819375991821),
 ('approachable', 0.5755761861801147),
 ('conscientious', 0.572158694267273),
 ('compassionate', 0.5716752409934998),
 ('smart', 0.5642468929290771),
 ('informative', 0.5628517270088196)]

[('personable', 0.6683598756790161),
 ('thorough', 0.639936625957489),
 ('approachable', 0.6396913528442383),
 ('intelligent', 0.6352032423019409),
 ('attentive', 0.6290909051895142),
 ('compassionate', 0.6201356649398804),
 ('empathetic', 0.6155948638916016),
 ('thourough', 0.6096585392951965),
 ('thoughtful', 0.6072746515274048),
 ('conscientious', 0.6059531569480896)]

### Pattern 2: analogy 'his' + (noun) - 'her' = ?  |  converse 'her' + (noun) - 'his' = ?
- attribute_words_n_1 = {'smiling','warmth','empathy','manner',__'patience'__}
- attribute_words_n_2 = {__'technique'__,'methodology',__'execution'__,'competence','result'}

#### attribute words (noun) interpersonal (11-15)

In [329]:
# (11)
model.wv.most_similar(positive=['his', 'smiling'], negative=['her'], topn=10)
# converse
model.wv.most_similar(positive=['her', 'smiling'], negative=['his'], topn=10)

[('cheerful', 0.538101077079773),
 ('smiley', 0.5111837983131409),
 ('spotless', 0.49080124497413635),
 ('cheery', 0.48590290546417236),
 ('immaculate', 0.46739456057548523),
 ('cordial', 0.46713095903396606),
 ('welcoming', 0.46486133337020874),
 ('mehis', 0.45859459042549133),
 ('groovy', 0.45831525325775146),
 ('friendlydr', 0.45525670051574707)]

[('hygienist', 0.48793694376945496),
 ('lady', 0.4849381446838379),
 ('girl', 0.4846162796020508),
 ('smile', 0.47751131653785706),
 ('greeting', 0.4659428298473358),
 ('smiley', 0.4599030613899231),
 ('wolud', 0.45589035749435425),
 ('ma', 0.4465061128139496),
 ('np', 0.4403764307498932),
 ('alexa', 0.43450212478637695)]

In [330]:
# (12)
model.wv.most_similar(positive=['his', 'warmth'], negative=['her'], topn=10)
model.wv.most_similar(positive=['her', 'warmth'], negative=['his'], topn=10)

[('gentleness', 0.6774899363517761),
 ('sincerity', 0.6729522347450256),
 ('humility', 0.665016233921051),
 ('professionalism', 0.6452186107635498),
 ('kindness', 0.6297380328178406),
 ('friendliness', 0.6288626194000244),
 ('perfectionism', 0.6247813701629639),
 ('thoughtfulness', 0.6242542862892151),
 ('generosity', 0.6197987794876099),
 ('politeness', 0.6110007762908936)]

[('coldness', 0.504183292388916),
 ('sensitivity', 0.5024848580360413),
 ('compassion', 0.4906674027442932),
 ('sincerity', 0.48857590556144714),
 ('empathy', 0.48008283972740173),
 ('gentleness', 0.47522401809692383),
 ('intelligence', 0.47333455085754395),
 ('personability', 0.4602939188480377),
 ('she', 0.45958322286605835),
 ('genuineness', 0.45948153734207153)]

In [331]:
# (13)
model.wv.most_similar(positive=['his', 'empathy'], negative=['her'], topn=10)
model.wv.most_similar(positive=['her', 'empathy'], negative=['his'], topn=10)

[('compassion', 0.6661914587020874),
 ('humility', 0.6226228475570679),
 ('integrity', 0.6125585436820984),
 ('humanity', 0.5870400071144104),
 ('sincerity', 0.5741028785705566),
 ('politeness', 0.5623422265052795),
 ('display', 0.5529748797416687),
 ('generosity', 0.5511346459388733),
 ('tact', 0.5500410795211792),
 ('gentleness', 0.5438867211341858)]

[('sympathy', 0.6213557720184326),
 ('compassion', 0.6207326650619507),
 ('tact', 0.5672049522399902),
 ('emotion', 0.5317968130111694),
 ('sensitivity', 0.5317386984825134),
 ('patience', 0.5086492896080017),
 ('thereof', 0.48057225346565247),
 ('she', 0.46320682764053345),
 ('disdain', 0.461967408657074),
 ('impatience', 0.45820391178131104)]

In [332]:
# (14)
model.wv.most_similar(positive=['his', 'manner'], negative=['her'], topn=10)
model.wv.most_similar(positive=['her', 'manner'], negative=['his'], topn=10)

[('mannerism', 0.6578498482704163),
 ('mannerfriendly', 0.6566104292869568),
 ('mannerkind', 0.6321370005607605),
 ('mannerthat', 0.61961430311203),
 ('mannereven', 0.6079037189483643),
 ('mannerreally', 0.6077778339385986),
 ('manneralthough', 0.6061023473739624),
 ('mannner', 0.6047405004501343),
 ('mannermy', 0.6007585525512695),
 ('mannersmy', 0.5940633416175842)]

[('mannergood', 0.5459598302841187),
 ('mannerspent', 0.5416707992553711),
 ('mannercar', 0.5355691313743591),
 ('nanner', 0.5312108397483826),
 ('mannerspend', 0.5300453305244446),
 ('mannerwas', 0.5298323035240173),
 ('mannersh', 0.5294790267944336),
 ('mannerwhat', 0.5286840796470642),
 ('msnner', 0.5267499089241028),
 ('mannerif', 0.5226318836212158)]

In [333]:
# (15)
model.wv.most_similar(positive=['his', 'patience'], negative=['her'], topn=10)
model.wv.most_similar(positive=['her', 'patience'], negative=['his'], topn=10)

[('humility', 0.6150791645050049),
 ('kindness', 0.5952591896057129),
 ('generosity', 0.590259313583374),
 ('gentleness', 0.589596152305603),
 ('compassion', 0.5854233503341675),
 ('sincerity', 0.5794406533241272),
 ('humanity', 0.5671722292900085),
 ('thoughtfulness', 0.5571594834327698),
 ('professionalism', 0.551396369934082),
 ('integrity', 0.5442808270454407)]

[('compassion', 0.5383747220039368),
 ('empathy', 0.508453369140625),
 ('patientsshe', 0.5064065456390381),
 ('sensitivity', 0.47917377948760986),
 ('sympathy', 0.45107731223106384),
 ('finesse', 0.44842568039894104),
 ('patien', 0.44740718603134155),
 ('patient', 0.4457676410675049),
 ('she', 0.4427374005317688),
 ('tact', 0.4299698770046234)]

#### attribute words (noun) technical (16-20)

In [334]:
# (16)
model.wv.most_similar(positive=['his', 'technique'], negative=['her'], topn=10)
model.wv.most_similar(positive=['her', 'technique'], negative=['his'], topn=10)

[('precision', 0.596350371837616),
 ('technology', 0.5933947563171387),
 ('innovative', 0.5781473517417908),
 ('surgical', 0.5646595358848572),
 ('artistry', 0.554171085357666),
 ('execution', 0.5484579205513),
 ('stateofthe', 0.5413083434104919),
 ('cuttingedge', 0.5402004718780518),
 ('methodology', 0.5383870005607605),
 ('aesthetic', 0.5366421341896057)]

[('method', 0.5395215749740601),
 ('product', 0.5057425498962402),
 ('technology', 0.5006154775619507),
 ('supplement', 0.4911513924598694),
 ('antiaging', 0.48409947752952576),
 ('tool', 0.48200881481170654),
 ('hrt', 0.48063504695892334),
 ('nutrient', 0.4795064926147461),
 ('nutritional', 0.4759272336959839),
 ('bioidentical', 0.46592050790786743)]

In [335]:
# (17)
model.wv.most_similar(positive=['his', 'methodology'], negative=['her'], topn=10)
model.wv.most_similar(positive=['her', 'methodology'], negative=['his'], topn=10)

[('scientific', 0.598362147808075),
 ('execution', 0.5828267931938171),
 ('innovative', 0.5814200639724731),
 ('evidencebased', 0.5761497020721436),
 ('perfectionism', 0.5650762915611267),
 ('technical', 0.5649383068084717),
 ('technique', 0.562258243560791),
 ('mastery', 0.5545235276222229),
 ('cuttingedge', 0.5538672804832458),
 ('creative', 0.5520240068435669)]

[('bio', 0.5253489017486572),
 ('commonsense', 0.5180771946907043),
 ('hrt', 0.515362560749054),
 ('strategy', 0.5109862089157104),
 ('naturopathic', 0.5105907917022705),
 ('nutrient', 0.5025222897529602),
 ('method', 0.49993187189102173),
 ('medicinal', 0.4990527033805847),
 ('theory', 0.4951913356781006),
 ('nutritional', 0.4948401153087616)]

In [336]:
# (18)
model.wv.most_similar(positive=['his', 'execution'], negative=['her'], topn=10)
model.wv.most_similar(positive=['her', 'execution'], negative=['his'], topn=10)

[('precision', 0.6262564063072205),
 ('surgical', 0.5908267498016357),
 ('perfectionism', 0.5809765458106995),
 ('artful', 0.5383291840553284),
 ('artistry', 0.5294864773750305),
 ('surgerical', 0.5241363048553467),
 ('performing', 0.5226128697395325),
 ('masterful', 0.5169772505760193),
 ('craftsmanship', 0.5073372721672058),
 ('craftsman', 0.5072494149208069)]

[('treatmentsurgery', 0.48946690559387207),
 ('timeline', 0.48132041096687317),
 ('treatman', 0.47466427087783813),
 ('actioni', 0.47223854064941406),
 ('preconception', 0.4674174189567566),
 ('adeboyejo', 0.46161097288131714),
 ('implementation', 0.46078506112098694),
 ('grounding', 0.45687708258628845),
 ('conception', 0.45634305477142334),
 ('checkoff', 0.4543534517288208)]

In [337]:
# (19)
model.wv.most_similar(positive=['his', 'competence'], negative=['her'], topn=10)
model.wv.most_similar(positive=['her', 'competence'], negative=['his'], topn=10)

[('perfectionism', 0.689668595790863),
 ('integrity', 0.6522009372711182),
 ('proficiency', 0.6430216431617737),
 ('humility', 0.6423042416572571),
 ('unquestionable', 0.6386390924453735),
 ('professionalism', 0.6379348039627075),
 ('sincerity', 0.6368511915206909),
 ('competency', 0.6350669264793396),
 ('commendable', 0.6312151551246643),
 ('intellect', 0.6278290152549744)]

[('competency', 0.6551920175552368),
 ('capability', 0.5680730938911438),
 ('intelligence', 0.5631552338600159),
 ('unwavering', 0.5498874187469482),
 ('aptitude', 0.5461034178733826),
 ('qualification', 0.5214747190475464),
 ('warmth', 0.5200161933898926),
 ('acuman', 0.5159662961959839),
 ('judgment', 0.5119194388389587),
 ('intellect', 0.5075143575668335)]

In [338]:
# (20)
model.wv.most_similar(positive=['his', 'result'], negative=['her'], topn=10)
model.wv.most_similar(positive=['her', 'result'], negative=['his'], topn=10)

[('outcome', 0.578392505645752),
 ('simulation', 0.5075244903564453),
 ('resultshis', 0.4985257089138031),
 ('precision', 0.4697610139846802),
 ('execution', 0.46634620428085327),
 ('sevinor', 0.4556583762168884),
 ('surgical', 0.41367825865745544),
 ('eisbach', 0.4049486219882965),
 ('resultsfrom', 0.4044643044471741),
 ('aftercare', 0.39804190397262573)]

[('papsmear', 0.4748949110507965),
 ('mammogram', 0.4732517600059509),
 ('resultsafter', 0.45518192648887634),
 ('pap', 0.45111632347106934),
 ('ultrasonic', 0.4494585692882538),
 ('gynecologist', 0.4443858861923218),
 ('sonogram', 0.4304776191711426),
 ('gynocologist', 0.4301804006099701),
 ('she', 0.42046189308166504),
 ('labcorp', 0.41114500164985657)]

## Method 2: vector similarities calculated by model
- __Compute cosine similarity between two words__
- E.g. similarity('woman', 'man')=0.585, similarity('woman', 'woman')=1
- E.g. similarity('she','professional') - similarity('he','professional') < 0  
==>  male physicians are more likely to be associated with 'professional' in patients' comments.

### Calcluate set_diff for each attribute word set
- If __set_diff(set) < 0__
==> this set of words are more likely to be associated with __female physicians__ in patients' comments.


In [17]:
# calculate the difference of mean similarity between each attribute word and two gender word sets
def set_diff(att_set):
    set_diff=[]
    for att_word in att_set:
        similarity_to_man_set = [model.wv.similarity(att_word, gender_word) for gender_word in man_set]
        similarity_to_woman_set = [model.wv.similarity(att_word, gender_word) for gender_word in woman_set]
        avg_sml_man = np.mean(similarity_to_man_set)  # mean similarity between one word and gender set
        avg_sml_woman = np.mean(similarity_to_woman_set)

        word_diff = avg_sml_man - avg_sml_woman
        set_diff.append(word_diff)
    return set_diff

In [18]:
# the difference of similarities between att_set and two gender sets
diff_1=set_diff(att_1)
diff_2=set_diff(att_2)

print(np.mean(diff_1), np.mean(diff_2))
diff_1[:5]
diff_2[:5]

0.0038619021278460965 0.032239971865480076


[-0.0711082213800811,
 0.07043214527122314,
 0.024743018098339405,
 0.09308269720223544,
 0.032063571352673015]

[0.08528353645234951,
 0.06925593402858699,
 0.07401946560683216,
 -0.018143258594640648,
 -0.008346695276604449]

### (1) One-sample T-test for each attribute word set
- Null hypothesis: mean = 0
- Alternative hypothesis: mean < 0 or mean > 0

In [287]:
true_mu = 0
onesample_results = scipy.stats.ttest_1samp(diff_1, true_mu)
onesample_results

Ttest_1sampResult(statistic=0.8484450529385318, pvalue=0.3988545625704858)

In [288]:
true_mu = 0
onesample_results = scipy.stats.ttest_1samp(diff_2, true_mu)
onesample_results

Ttest_1sampResult(statistic=5.196832089632375, pvalue=1.9114296946649937e-06)

### (2) Two-sample T-test for attribute word sets
- mean_1 (interpersonal) and mean_2 (technical) might be naturally positive (more frequently associated with male physicians)
- Null hypothesis: mean_1 = mean_2
- Alternative hypothesis: mean_1 < mean_2 or  mean_1 > mean_2

In [289]:
twosample_results = scipy.stats.ttest_ind(diff_1, diff_2)
twosample_results

Ttest_indResult(statistic=-3.2848436687609817, pvalue=0.001277532226016919)

- Here p-value < 0.05, so we have highly significant evidence against H0, which means technical words rather than interpersonal words are more likely to be associated with male physicians.

## Method 0: Word frequency in Male/Female reviews

In [299]:
# select comments that is clearly for Male or Female doctors
female_review=clr_review[(clr_review.classify=='F')]
male_review=clr_review[(clr_review.classify=='M')]

print(female_review.shape)
print(male_review.shape)
female_review.head()

(422865, 10)
(1130498, 10)


Unnamed: 0,review_id,href,comments,name,name_gender,content_he,content_she,sp_positions,classify,gender_conflict
0,23507540,/dentists/Dr_Aarika_Anderson_Elter,Great results Dr. Anderson explained in detai...,aarika,F,0,0,0,F,0
79,23651485,/dentists/Dr_Abbey_Onan,Dr Onan is an excellent dentist in the Facult...,abbey,F,0,1,0,F,0
114,24141330,/dentists/Dr_Abbey_Lamanna,Crooked... I had a scheduling conflict for my ...,abbey,F,0,1,0,F,0
115,23789205,/dentists/Dr_Abbey_Lamanna,Beware I had a scheduling conflict for my clea...,abbey,F,0,1,0,F,0
152,29363831,/dentists/Dr_Abigail_Tubio,They are awesome. I must say for years I let m...,abigail,F,0,1,0,F,0


### Pre-process male/female datasets

In [300]:
# save these datasets as CSV files (20 seconds)
female_review.to_csv("./data/female_review_df.csv", index=False)
male_review.to_csv("./data/male_review_df.csv", index=False)

In [301]:
# lemmatize and fitler documents; deal with punctuation(30mins + 40mins)
female_s=female_review['comments'].fillna('').astype(str).tolist()
male_s=male_review['comments'].fillna('').astype(str).tolist()

f_terms_list=[]
m_terms_list=[]

words_to_keep = set(['he','his','him','she','her','I','my','me','mine'])
def lemma(x):
    if x.text in words_to_keep: 
        return x.text
    elif x.text not in words_to_keep:
        return x.lemma_

for doc in tqdm_notebook(female_s):
    doc_cleared = textacy.preprocess_text(doc, lowercase=False, no_punct=True)
    doc_new = nlp(doc_cleared)
    tokens=[lemma(x) for x in doc_new]  #to keep pronouns
    f_terms_list.append(tokens)
    
for doc in tqdm_notebook(male_s):
    doc_cleared = textacy.preprocess_text(doc, lowercase=False, no_punct=True)
    doc_new = nlp(doc_cleared)
    tokens=[lemma(x) for x in doc_new]  #to keep pronouns
    m_terms_list.append(tokens)

print(len(f_terms_list), len(m_terms_list))





422865 1130498


In [305]:
# spellcheck the documents and correct spellings (35 seconds)
for line in tqdm_notebook(f_terms_list):
    for i,word in enumerate(line):
        if word in words_to_filter:
            line[i]=words_to_filter[word]

for line in tqdm_notebook(m_terms_list):
    for i,word in enumerate(line):
        if word in words_to_filter:
            line[i]=words_to_filter[word]







In [306]:
InteractiveShell.ast_node_interactivity = "none"  # speed this process

In [307]:
# save the corpus as a txt file (40 seconds)
with open("./data/female_terms_list.txt","w") as thefile:
    for item in tqdm_notebook(f_terms_list):
      thefile.write("%s\n" % item)




In [308]:
# save the corpus as a txt file (40 seconds)
with open("./data/male_terms_list.txt","w") as thefile:
    for item in tqdm_notebook(m_terms_list):
      thefile.write("%s\n" % item)




In [309]:
InteractiveShell.ast_node_interactivity = "all" # resume the output option

### Build models and count word frequency

In [315]:
# train word2vec on each corpus (20 mins)
f_model = gensim.models.Word2Vec(sentences=f_terms_list, size=100, window=7, min_count=5, workers=4)
m_model = gensim.models.Word2Vec(sentences=m_terms_list, size=100, window=7, min_count=5, workers=4)

# to save a trained model
f_model.save("./data/model_f_clr_comments")
m_model.save("./data/model_m_clr_comments")

In [4]:
# load the trained model
f_model = gensim.models.Word2Vec.load("./data/model_f_clr_comments")
m_model = gensim.models.Word2Vec.load("./data/model_m_clr_comments")

In [120]:
# only keep words existing in both female model and male model
def set_filter_2(att_set):
    new_set=[]
    del_word=[]
    for word in att_set:
        try:
            if f_model[word].all() and m_model[word].all():
                new_set.append(word)
        except KeyError:
            del_word.append(word)
    print(del_word)
    return new_set

In [108]:
# count how many times the word presents in each model
def word_counter_f(word):
    vocab_obj_f = f_model.wv.vocab[str(word)]
    word_freq_f = vocab_obj_f.count
    return word_freq_f

def word_counter_m(word):
    vocab_obj_m = m_model.wv.vocab[str(word)]
    word_freq_m = vocab_obj_m.count
    return word_freq_m

In [130]:
# for interpersonal attribute words
tmp1=set_filter_2(Vocabulary['interaction'])
tmp2=set_filter_2(Vocabulary['interpersonal'].dropna())
tmp={'interpersonal': tmp1 + tmp2 }
freq_att_1=pd.DataFrame(tmp, columns=['interpersonal','count_f','percent_f','count_m','percent_m'])

freq_att_1['count_f']=freq_att_1['interpersonal'].map(word_counter_f)
freq_att_1['count_m']=freq_att_1['interpersonal'].map(word_counter_m)
freq_att_1['percent_f']=round(freq_att_1['count_f']/422865,4) #female_review.shape[0],4)
freq_att_1['percent_m']=round(freq_att_1['count_m']/1130498,4) #m_len,4)

freq_att_1 .tail()

['interplay', 'mutuality']
['hearty']


Unnamed: 0,interpersonal,count_f,percent_f,count_m,percent_m
75,gracious,234,0.0006,632,0.0006
76,thoughtful,2557,0.006,5913,0.0052
77,listening,47204,0.1116,101235,0.0895
78,considerate,1830,0.0043,5446,0.0048
79,conscientious,635,0.0015,1540,0.0014


In [129]:
# for technical attribute words
tmp1=set_filter_2(Vocabulary['technique'].dropna())
tmp2=set_filter_2(Vocabulary['technical'].dropna())
tmp={'technical': tmp1 + tmp2 }
freq_att_2=pd.DataFrame(tmp, columns=['technical','count_f','percent_f','count_m','percent_m'])

freq_att_2['count_f']=freq_att_2['technical'].map(word_counter_f)
freq_att_2['count_m']=freq_att_2['technical'].map(word_counter_m)
freq_att_2['percent_f']=round(freq_att_2['count_f']/422865,4) #female_review.shape[0],4)
freq_att_2['percent_m']=round(freq_att_2['count_m']/1130498,4) #m_len,4)
freq_att_2.tail()

[]
[]


Unnamed: 0,technical,count_f,percent_f,count_m,percent_m
68,observant,125,0.0003,250,0.0002
69,perceptive,205,0.0005,330,0.0003
70,effective,1438,0.0034,3642,0.0032
71,innovative,100,0.0002,502,0.0004
72,conservative,705,0.0017,3333,0.0029


### Wilcoxon Signed-Rank Test on word frequencies in male/female model
- For attribute words, we want to know whether their frequencies in female and male models are from populations having the same distribution
- H0: difference between the pairs follows a symmetric distribution around zero
- H1: difference between the pairs does not follow a symmetric distribution around zero.

In [134]:
freq_att_1.tail()

Unnamed: 0,interpersonal,count_f,percent_f,count_m,percent_m
75,gracious,234,0.0006,632,0.0006
76,thoughtful,2557,0.006,5913,0.0052
77,listening,47204,0.1116,101235,0.0895
78,considerate,1830,0.0043,5446,0.0048
79,conscientious,635,0.0015,1540,0.0014


In [137]:
# for interpersonal attribute words
wilcoxon_result_1=scipy.stats.wilcoxon(freq_att_1.percent_f, freq_att_1.percent_m, 
                                       zero_method='pratt', correction=False)
wilcoxon_result_1

print('mean_freq_female:', np.mean(freq_att_1.percent_f), '. mean_freq_male:', np.mean(freq_att_1.percent_m))

WilcoxonResult(statistic=903.0, pvalue=0.0005837025471812024)

mean_freq_female: 0.011743749999999999 . mean_freq_male: 0.010929999999999994


- Since p-value < 0.05, so we have highly significant evidence against H0, which means interpersonal words are more frequently used to assess female physicians than male ones.

In [138]:
# for technical attribute words
wilcoxon_result_2=scipy.stats.wilcoxon(freq_att_2.percent_f, freq_att_2.percent_m, 
                                       zero_method='pratt', correction=False)
wilcoxon_result_2

print('mean_freq_female:', np.mean(freq_att_2.percent_f), '. mean_freq_male:', np.mean(freq_att_2.percent_m))

WilcoxonResult(statistic=800.0, pvalue=0.002473639383286504)

mean_freq_female: 0.009252054794520546 . mean_freq_male: 0.009797260273972597


- Since p-value < 0.05, so we have highly significant evidence against H0, which means technical words are more frequently used to assess male physicians than female ones.