In [1]:
import spacy
import textacy
import gensim
import os
import numpy as np
import pandas as pd

In [9]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Import raw data

In [3]:
review_panel = pd.read_csv('./data/reviews_panel_t8.csv')
review_panel.head()

Unnamed: 0,Id,review_id,href,overall_rating,Ease_of_Appointment,Promptness,Courteous_Staff,Accurate_Diagnosis,Bedside_Manner,Spends_Time_with_Me,...,Advanced_Technology,Caring_Manner,Pain_Minimized,Satisfaction,date,reviewer,title,content,helpful_vote,crawl_date
0,12949966,23507540,/dentists/Dr_Aarika_Anderson_Elter,5,4.0,,,,,,...,,5.0,5.0,,2012-11-16,,Great results,Dr. Anderson explained in detail my options fo...,,2016-09-06 20:54:01
1,12949967,27184431,/dentists/Dr_Aamir_Wahab,5,5.0,,,,,,...,5.0,5.0,5.0,5.0,2015-03-15,Brad s.,implant,Had an implant done and it was painless. I cou...,,2016-09-06 20:54:10
2,12949968,26307282,/dentists/Dr_Aanal_Parikh,1,1.0,,,,,,...,1.0,1.0,1.0,1.0,2014-10-12,Bill johnson,Warning ..would not see this dentist,Do not go to this dentist office they scam peo...,,2016-09-06 20:54:13
3,12949969,28904504,/dentists/Dr_Aaron_Aguilar,5,5.0,,,,,,...,5.0,5.0,5.0,5.0,2015-06-11,,,Very thoughtful Dr. Communicates/bedside mann...,,2016-09-06 20:54:14
4,12949970,28380953,/dentists/Dr_Aaron_D_Larsen,4,3.0,,,,,,...,4.0,4.0,4.0,5.0,2015-03-31,,Dr larsen great others no,Would continue to see Dr. Larsen however his ...,,2016-09-06 20:54:18


In [5]:
review_panel.shape

(1796204, 23)

In [6]:
review_panel_text = review_panel['title'].fillna('').astype(str) + " " + review_panel['content'].fillna('')
sample_s = review_panel_text.tolist()

# Pre-process via spaCy

In [33]:
nlp = spacy.load('en')

In [8]:
doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion. he his him')

for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
          token.shape_, token.is_alpha, token.is_stop)

Apple apple PROPN NNP nsubj Xxxxx True False
is be VERB VBZ aux xx True True
looking look VERB VBG ROOT xxxx True False
at at ADP IN prep xx True True
buying buy VERB VBG pcomp xxxx True False
U.K. u.k. PROPN NNP compound X.X. False False
startup startup NOUN NN dobj xxxx True False
for for ADP IN prep xxx True True
$ $ SYM $ quantmod $ False False
1 1 NUM CD compound d False False
billion billion NUM CD pobj xxxx True False
. . PUNCT . punct . False False
he -PRON- PRON PRP nsubj xx True True
his -PRON- ADJ PRP$ ROOT xxx True True
him -PRON- PRON PRP dobj xxx True True


In [35]:
words_to_keep= set(['he','his','him', 'she','her', 'I','my','me','mine'])
def lemma(x):
    if x.text in words_to_keep: 
        return x.text
    elif x.text not in words_to_keep:
        return x.lemma_
    
    
a_doc = nlp("He said that you didn't buy apples for her. But I bought mine.")  # feed in string or documents
print([lemma(x) for x in a_doc])   # lemmatize words

['-PRON-', 'say', 'that', '-PRON-', 'do', 'not', 'buy', 'apple', 'for', 'her', '.', 'but', 'I', 'buy', 'mine', '.']


In [10]:
# lemmatize and fitler documents; deal with punctuation and typo (2h 30mins)
terms_list=[]
words_to_filter = set(['dr', 'doctor', 'oh'])
words_to_keep = set(['he','his','him','she','her','I','my','me','mine'])
def lemma(x):
    if x.text in words_to_keep: 
        return x.text
    elif x.text not in words_to_keep:
        return x.lemma_

for doc in sample_s:
    doc_cleared = textacy.preprocess_text(doc, lowercase=False, no_punct=True, transliterate=True)
    doc_new = nlp(doc_cleared)
    tokens=[lemma(x) for x in doc_new]  #to keep pronouns
    tokens_filtered = [y for y in tokens if y not in words_to_filter]
    terms_list.append(tokens_filtered)

print(terms_list[:5])

[['great',
  'result',
  'anderson',
  'explain',
  'in',
  'detail',
  'my',
  'option',
  'for',
  'the',
  'repair',
  'on',
  'my',
  'tooth',
  'in',
  'the',
  'end',
  'I',
  'receive',
  'great',
  'result'],
 ['implant',
  'have',
  'an',
  'implant',
  'do',
  'and',
  '-PRON-',
  'be',
  'painless',
  'I',
  'could',
  'not',
  'afford',
  '-PRON-',
  'so',
  'he',
  'put',
  'me',
  'on',
  'a',
  'payment',
  'plan',
  'I',
  'be',
  'pleased',
  'with',
  'the',
  'service',
  'great'],
  'would',
  'not',
  'see',
  'this',
  'dentist',
  'do',
  'not',
  'go',
  'to',
  'this',
  'dentist',
  'office',
  '-PRON-',
  'scam',
  'people',
  'and',
  'do',
  'not',
  'pay',
  'there',
  'bill',
  '-PRON-',
  'snow',
  'plow',
  'for',
  '-PRON-',
  'and',
  'when',
  'the',
  'season',
  'be',
  'over',
  '-PRON-',
  'would',
  'not',
  'pay',
  '-PRON-',
  'this',
  'be',
  'just',
  'to',
  'let',
  '-PRON-',
  'know',
  'what',
  'type',
  'of',
  'dentist',
  '-PRON-',


In [20]:
# save the corpus as a txt file (2 min)
with open("./data/terms_list.txt","w") as thefile:
    for item in terms_list:
      thefile.write("%s\n" % item)

In [55]:
file= open('./data/terms_list.txt','r')
terms=file.read()
print(terms[:1143])

['great', 'result', 'anderson', 'explain', 'in', 'detail', 'my', 'option', 'for', 'the', 'repair', 'on', 'my', 'tooth', 'in', 'the', 'end', 'I', 'receive', 'great', 'result']
['implant', 'have', 'an', 'implant', 'do', 'and', '-PRON-', 'be', 'painless', 'I', 'could', 'not', 'afford', '-PRON-', 'so', 'he', 'put', 'me', 'on', 'a', 'payment', 'plan', 'I', 'be', 'pleased', 'with', 'the', 'service', 'great']
['very', 'thoughtful', 'communicatesbedside', 'manner', 'great']
['larsen', 'great', 'other', 'no', 'would', 'continue', 'to', 'see', 'larsen', 'however', 'his', 'front', 'desk', 'secretary', 'be', 'something', 'else', '-PRON-', 'sadden', 'me', 'to', 'have', 'to', 'give', 'my', 'money', 'to', 'someone', 'else']



# Build model via word2vec

In [37]:
# read in corpus
sts_terms_list = gensim.models.word2vec.LineSentence('terms_list.txt')

In [30]:
# train word2vec on the corpus (10 mins)
model = gensim.models.Word2Vec(sentences=terms_list, size=100, window=7, min_count=5, workers=4)
# to save a trained model
model.save("./data/model_doc_comments1")

In [3]:
model.wv['I']  # numpy vector of a word

array([ 0.8985168 ,  0.21133737, -1.0174987 ,  0.23950161,  0.6535615 ,
       -2.1597543 , -0.96719617, -1.2723322 ,  0.6952619 ,  0.52317786,
        3.8957522 , -4.2797956 , -3.5484123 ,  2.9083505 , -2.0390408 ,
        1.4697789 ,  2.7010856 , -2.316967  ,  1.357116  ,  2.9058228 ,
       -1.5485703 , -2.5820878 ,  1.5781274 , -4.7464647 ,  2.5480347 ,
       -1.0955589 , -4.0962763 ,  2.5019717 , -1.9741727 ,  2.3681483 ,
        2.2892256 , -1.7966137 ,  2.341933  ,  1.9638792 ,  1.0589308 ,
       -3.6643832 ,  0.8503319 ,  2.3559475 ,  0.8861464 ,  2.4579482 ,
        0.09558363, -1.3544432 , -1.3810408 ,  3.4680386 , -2.2576418 ,
       -1.45918   ,  1.1770133 ,  0.47717357,  0.82000923,  1.227406  ,
       -3.9137058 , -0.06107468, -0.597307  , -2.9509046 , -1.9264892 ,
       -0.3778871 , -1.1883132 ,  2.1589031 , -0.23421006,  0.4014843 ,
       -1.801502  , -2.567079  , -1.50274   , -1.3262734 ,  1.2228625 ,
       -3.5384681 ,  2.4423347 ,  1.5887536 ,  1.7367889 ,  0.56

In [5]:
model['I']

  """Entry point for launching an IPython kernel.


array([ 0.8985168 ,  0.21133737, -1.0174987 ,  0.23950161,  0.6535615 ,
       -2.1597543 , -0.96719617, -1.2723322 ,  0.6952619 ,  0.52317786,
        3.8957522 , -4.2797956 , -3.5484123 ,  2.9083505 , -2.0390408 ,
        1.4697789 ,  2.7010856 , -2.316967  ,  1.357116  ,  2.9058228 ,
       -1.5485703 , -2.5820878 ,  1.5781274 , -4.7464647 ,  2.5480347 ,
       -1.0955589 , -4.0962763 ,  2.5019717 , -1.9741727 ,  2.3681483 ,
        2.2892256 , -1.7966137 ,  2.341933  ,  1.9638792 ,  1.0589308 ,
       -3.6643832 ,  0.8503319 ,  2.3559475 ,  0.8861464 ,  2.4579482 ,
        0.09558363, -1.3544432 , -1.3810408 ,  3.4680386 , -2.2576418 ,
       -1.45918   ,  1.1770133 ,  0.47717357,  0.82000923,  1.227406  ,
       -3.9137058 , -0.06107468, -0.597307  , -2.9509046 , -1.9264892 ,
       -0.3778871 , -1.1883132 ,  2.1589031 , -0.23421006,  0.4014843 ,
       -1.801502  , -2.567079  , -1.50274   , -1.3262734 ,  1.2228625 ,
       -3.5384681 ,  2.4423347 ,  1.5887536 ,  1.7367889 ,  0.56

# Measure the stereotype bias
- via comparison of comments on female and male physicians

In [2]:
# load the trained model
model = gensim.models.Word2Vec.load("./data/model_doc_comments1")

## Method 1: word analogies generated by model
- __analogy__ 'he' + (adj.) - 'she' = ? |  __converse__ 'she' + (adj.) - 'he' = ?
- __analogy__ 'his' + (noun) - 'her' = ? |  __converse__ 'her' + (noun) - 'his' = ?
- __To be noticed: Results heavily relies on the choice of keyword pairs__

### Part 1: analogy  'he' + (adj.) - 'she' = ?  |  converse 'she' + (adj.) - 'he' = ?
- keywords_adj_1 = {'professional','efficient','competent'}
- keywords_adj_2 = {'nice',__'friendly','polite'__}

In [12]:
# he+(adj.)-she
model.wv.most_similar(positive=['he', 'professional'], negative=['she'], topn=10)
# she+(adj.)-he
model.wv.most_similar(positive=['she', 'professional'], negative=['he'], topn=10)

[('proffesional', 0.6564347743988037),
 ('proffessional', 0.6406091451644897),
 ('personable', 0.6229191422462463),
 ('professionalhe', 0.6222789883613586),
 ('knowledgeable', 0.6085402369499207),
 ('humble', 0.608481228351593),
 ('professionali', 0.6083875894546509),
 ('competent', 0.6081501841545105),
 ('efficient', 0.6004852056503296),
 ('knowledgable', 0.5969569087028503)]

[('efficient', 0.6449151039123535),
 ('helpful', 0.6382687091827393),
 ('polite', 0.6301308274269104),
 ('proffesional', 0.623944878578186),
 ('attentive', 0.6222675442695618),
 ('respectful', 0.6202351450920105),
 ('warm', 0.6143782138824463),
 ('friendly', 0.6115367412567139),
 ('personable', 0.599821925163269),
 ('supportive', 0.5963608026504517)]

In [11]:
# he+(adj.)-she
model.wv.most_similar(positive=['he', 'effective'], negative=['she'], topn=10)
# she+(adj.)-he
model.wv.most_similar(positive=['she', 'effective'], negative=['he'], topn=10)

[('beneficial', 0.6459837555885315),
 ('conservative', 0.6394050121307373),
 ('innovative', 0.5897181034088135),
 ('economical', 0.5809915661811829),
 ('noninvasive', 0.572291910648346),
 ('prudent', 0.5711985230445862),
 ('prolotherapy', 0.5664175152778625),
 ('inexpensive', 0.5597268342971802),
 ('exacting', 0.5417249202728271),
 ('controversial', 0.5396667122840881)]

[('agreeable', 0.5745129585266113),
 ('economical', 0.5587084293365479),
 ('pragmatic', 0.54994797706604),
 ('beneficial', 0.5498460531234741),
 ('insightful', 0.5408653020858765),
 ('therapeutic', 0.5386779308319092),
 ('proactive', 0.5321661829948425),
 ('efficacious', 0.5267099738121033),
 ('openminded', 0.5205910801887512),
 ('astute', 0.5198585987091064)]

In [14]:
# he+(adj.)-she
model.wv.most_similar(positive=['he', 'competent'], negative=['she'], topn=10)
# she+(adj.)-he
model.wv.most_similar(positive=['she', 'competent'], negative=['he'], topn=10)

[('skilled', 0.7663887143135071),
 ('capable', 0.7054327726364136),
 ('proficient', 0.7037885189056396),
 ('competant', 0.6754753589630127),
 ('talented', 0.673223078250885),
 ('experienced', 0.6638932228088379),
 ('skillful', 0.656440019607544),
 ('qualified', 0.6559319496154785),
 ('intelligent', 0.6537479758262634),
 ('knowledgeable', 0.6486888527870178)]

[('competant', 0.7423862218856812),
 ('capable', 0.6668056845664978),
 ('knowlegeable', 0.6394909620285034),
 ('proficient', 0.6360024213790894),
 ('knowledgable', 0.6337969303131104),
 ('knowledgeable', 0.6328122019767761),
 ('welltrain', 0.6277335286140442),
 ('experienced', 0.6254478096961975),
 ('knowlegable', 0.6198614835739136),
 ('engaged', 0.6171719431877136)]

In [13]:
# he+(adj.)-she
model.wv.most_similar(positive=['he', 'nice'], negative=['she'], topn=10)
# she+(adj.)-he
model.wv.most_similar(positive=['she', 'nice'], negative=['he'], topn=10)

[('friendly', 0.6206178665161133),
 ('polite', 0.6170006394386292),
 ('likable', 0.6024885177612305),
 ('pleasant', 0.5870307683944702),
 ('friendy', 0.5755089521408081),
 ('likeable', 0.5656896829605103),
 ('pleasent', 0.5615097284317017),
 ('plesant', 0.5488872528076172),
 ('cool', 0.5443068742752075),
 ('charming', 0.543339192867279)]

[('sweet', 0.8070353865623474),
 ('lovely', 0.6870039701461792),
 ('friendly', 0.68165123462677),
 ('polite', 0.6418354511260986),
 ('pleasant', 0.623091995716095),
 ('pleasent', 0.6188926696777344),
 ('welcoming', 0.6035218834877014),
 ('unfriendly', 0.6007041335105896),
 ('snotty', 0.5943301320075989),
 ('plesant', 0.5931156277656555)]

In [10]:
# he+(adj.)-she
model.wv.most_similar(positive=['he', 'friendly'], negative=['she'], topn=10)
# she+(adj.)-he
model.wv.most_similar(positive=['she', 'friendly'], negative=['he'], topn=10)

[('polite', 0.780290961265564),
 ('courteous', 0.7779586911201477),
 ('pleasant', 0.6985913515090942),
 ('personable', 0.6834683418273926),
 ('cordial', 0.6830505132675171),
 ('efficient', 0.6762321591377258),
 ('curteous', 0.6745314598083496),
 ('nice', 0.6646460294723511),
 ('curtious', 0.6569429039955139),
 ('courtious', 0.6483271718025208)]

[('polite', 0.7677793502807617),
 ('helpful', 0.7214754223823547),
 ('courteous', 0.7051953673362732),
 ('pleasant', 0.7012017965316772),
 ('sweet', 0.6971908807754517),
 ('welcoming', 0.6816627979278564),
 ('accomodat', 0.6815884113311768),
 ('cheerful', 0.6797686219215393),
 ('accommodating', 0.6655685901641846),
 ('cordial', 0.6591864824295044)]

In [15]:
# he+(adj.)-she
model.wv.most_similar(positive=['he', 'polite'], negative=['she'], topn=10)
# she+(adj.)-he
model.wv.most_similar(positive=['she', 'polite'], negative=['he'], topn=10)

[('courteous', 0.777746319770813),
 ('friendly', 0.74969482421875),
 ('cordial', 0.7161503434181213),
 ('curteous', 0.7138438820838928),
 ('courtious', 0.6966575384140015),
 ('curtious', 0.6917126178741455),
 ('personable', 0.6851114630699158),
 ('efficient', 0.6846919059753418),
 ('helpful', 0.6555166244506836),
 ('helpfull', 0.650011420249939)]

[('friendly', 0.796964704990387),
 ('helpful', 0.7659982442855835),
 ('courteous', 0.7271215319633484),
 ('cordial', 0.7114821076393127),
 ('sweet', 0.7110629081726074),
 ('curtious', 0.706183135509491),
 ('curteous', 0.6957941055297852),
 ('accomodat', 0.6941994428634644),
 ('accommodating', 0.6795791983604431),
 ('courtious', 0.671298086643219)]

### Part 2: analogy 'his' + (noun) - 'her' = ?  |  converse 'her' + (noun) - 'his' = ?
- keywords_n_1 = {'technique','methodology','execution'}
- keywords_n_2 = {'smile','grin',__'warm'__}

In [16]:
# his+(noun)-her
model.wv.most_similar(positive=['his', 'technique'], negative=['her'], topn=10)
# her+(noun)-his
model.wv.most_similar(positive=['her', 'technique'], negative=['his'], topn=10)

[('technology', 0.60670006275177),
 ('cuttingedge', 0.5887786746025085),
 ('innovative', 0.5801389813423157),
 ('precision', 0.570105791091919),
 ('surgical', 0.5657679438591003),
 ('aesthetic', 0.5644391179084778),
 ('artistry', 0.5632181763648987),
 ('execution', 0.5563507676124573),
 ('technical', 0.5525886416435242),
 ('methodology', 0.544705867767334)]

[('technic', 0.5587255358695984),
 ('method', 0.5398416519165039),
 ('technology', 0.5327882766723633),
 ('tool', 0.5315686464309692),
 ('instrument', 0.46823790669441223),
 ('methodology', 0.4624430537223816),
 ('advancement', 0.46089014410972595),
 ('brava', 0.4550757110118866),
 ('vaseline', 0.4540991187095642),
 ('hrt', 0.45188942551612854)]

In [18]:
# his+(noun)-her
model.wv.most_similar(positive=['his', 'methodology'], negative=['her'], topn=10)
# her+(noun)-his
model.wv.most_similar(positive=['her', 'methodology'], negative=['his'], topn=10)

[('scientific', 0.6270068883895874),
 ('cuttingedge', 0.6237863302230835),
 ('innovative', 0.6234255433082581),
 ('execution', 0.6092877984046936),
 ('evidencebased', 0.5992555022239685),
 ('procedural', 0.5958627462387085),
 ('technique', 0.5914133191108704),
 ('mastery', 0.5861795544624329),
 ('pharmacology', 0.583117663860321),
 ('technical', 0.5739743113517761)]

[('method', 0.5421013236045837),
 ('therapeutic', 0.535122275352478),
 ('strategy', 0.5350595712661743),
 ('approach', 0.5251684188842773),
 ('commonsense', 0.5161302089691162),
 ('logic', 0.5129414796829224),
 ('theory', 0.5114564299583435),
 ('naturopathic', 0.5072319507598877),
 ('regimen', 0.5017905831336975),
 ('homeopathy', 0.4981175661087036)]

In [19]:
# his+(noun)-her
model.wv.most_similar(positive=['his', 'execution'], negative=['her'], topn=10)
# her+(noun)-his
model.wv.most_similar(positive=['her', 'execution'], negative=['his'], topn=10)

[('precision', 0.6059291958808899),
 ('surgical', 0.5837683081626892),
 ('procedural', 0.5648508071899414),
 ('execute', 0.5634226202964783),
 ('bydons', 0.5609303116798401),
 ('perfectionism', 0.5564645528793335),
 ('technical', 0.5530896782875061),
 ('deductive', 0.5486582517623901),
 ('artistry', 0.5484542846679688),
 ('vickerys', 0.5405755043029785)]

[('implementation', 0.5497423410415649),
 ('roadmap', 0.4897775948047638),
 ('calculated', 0.4576040208339691),
 ('actioni', 0.44992777705192566),
 ('hulka', 0.44863227009773254),
 ('therapuetic', 0.44706565141677856),
 ('conception', 0.44655200839042664),
 ('gingivitis', 0.44056543707847595),
 ('wholebody', 0.43626290559768677),
 ('dimeanor', 0.4339507818222046)]

In [17]:
# his+(noun)-her
model.wv.most_similar(positive=['his', 'smile'], negative=['her'], topn=10)
# her+(noun)-his
model.wv.most_similar(positive=['her', 'smile'], negative=['his'], topn=10)

[('cheerful', 0.5321331024169922),
 ('smiling', 0.5262094736099243),
 ('handshake', 0.5197967290878296),
 ('smiley', 0.493060439825058),
 ('cheery', 0.479512095451355),
 ('poise', 0.4728557765483856),
 ('smilei', 0.4726208746433258),
 ('smileand', 0.470217764377594),
 ('grin', 0.46686071157455444),
 ('welcome', 0.4646042287349701)]

[('hug', 0.5498396158218384),
 ('handshake', 0.51720130443573),
 ('pantie', 0.5140137672424316),
 ('scowl', 0.5068621635437012),
 ('grin', 0.48976388573646545),
 ('shoelace', 0.48190537095069885),
 ('girl', 0.47115427255630493),
 ('smirk', 0.46735459566116333),
 ('blanket', 0.4635048806667328),
 ('lollipop', 0.4630123972892761)]

In [20]:
# his+(noun)-her
model.wv.most_similar(positive=['his', 'grin'], negative=['her'], topn=10)
# her+(noun)-his
model.wv.most_similar(positive=['her', 'grin'], negative=['his'], topn=10)

[('lets', 0.4965308606624603),
 ('wink', 0.48335111141204834),
 ('bluntness', 0.47686275839805603),
 ('scowl', 0.46949297189712524),
 ('laughter', 0.46827754378318787),
 ('heas', 0.4680287837982178),
 ('smirk', 0.45693308115005493),
 ('smile', 0.45322272181510925),
 ('expression', 0.4438091516494751),
 ('poker', 0.4429737329483032)]

[('smirk', 0.6086892485618591),
 ('scowl', 0.60369873046875),
 ('pantie', 0.5709898471832275),
 ('she', 0.5605303645133972),
 ('screaming', 0.5567564368247986),
 ('tongue', 0.5254310965538025),
 ('zit', 0.5244307518005371),
 ('flashlight', 0.5176804661750793),
 ('blister', 0.5069534778594971),
 ('pant', 0.5042190551757812)]

In [21]:
# his+(noun)-her
model.wv.most_similar(positive=['his', 'warm'], negative=['her'], topn=10)
# her+(noun)-his
model.wv.most_similar(positive=['her', 'warm'], negative=['his'], topn=10)

[('professional', 0.6197702288627625),
 ('charming', 0.6069566011428833),
 ('efficient', 0.6004388332366943),
 ('cordial', 0.5991696119308472),
 ('delightful', 0.5822018384933472),
 ('congenial', 0.582179069519043),
 ('cheerful', 0.5816338658332825),
 ('friendly', 0.5798388123512268),
 ('respectful', 0.5742918252944946),
 ('charismatic', 0.5739561915397644)]

[('nurturing', 0.5317784547805786),
 ('sweet', 0.51926189661026),
 ('nurture', 0.5070083737373352),
 ('nonjudgemental', 0.4679553210735321),
 ('loving', 0.4642377495765686),
 ('cheery', 0.4581487774848938),
 ('bubbly', 0.4564359486103058),
 ('relatable', 0.4518079161643982),
 ('cheerful', 0.4480541944503784),
 ('warming', 0.4464818835258484)]

## Method 2: vector similarities calculated by model
- __Compute cosine similarity between two words__
- E.g. Similarity('woman', 'man')=0.585, similarity('woman', 'woman')=1
- similarity('she','professional') - similarity('he','professional') < 0  
==>  male physicians are more likely to be associated with 'professional' in patients' comments.

In [3]:
print(model.wv.similarity('she','professional'))
print(model.wv.similarity('he','professional'))

print(model.wv.similarity('she','professional')-model.wv.similarity('he','professional'))

-0.20230704662320764
-0.15150283317531602
-0.05080421344789163


In [30]:
print(model.wv.similarity('she','professional')-model.wv.similarity('he','professional'))
print(model.wv.similarity('she','efficient')-model.wv.similarity('he','efficient'))
print(model.wv.similarity('she','helpful')-model.wv.similarity('he','helpful'))
print('-------------')
print(model.wv.similarity('she','friendly')-model.wv.similarity('he','friendly'))
print(model.wv.similarity('she','polite')-model.wv.similarity('he','polite'))
print(model.wv.similarity('she','nice')-model.wv.similarity('he','nice'))

-0.05080421344789163
0.0005303721403622508
0.07739653979201196
-------------
0.04022910658313149
0.01828594020133216
0.006341864591320655


In [31]:
print(model.wv.similarity('her','technique')-model.wv.similarity('his','technique'))
print(model.wv.similarity('her','methodology')-model.wv.similarity('his','methodology'))
print(model.wv.similarity('her','execution')-model.wv.similarity('his','execution'))
print('-------------')
print(model.wv.similarity('her','smile')-model.wv.similarity('his','smile'))
print(model.wv.similarity('her','handshake')-model.wv.similarity('his','handshake'))
print(model.wv.similarity('her','grin')-model.wv.similarity('his','grin'))

-0.16854096134784646
-0.1182542978886263
-0.33514627863349655
-------------
0.04540064853195455
0.015162475385465113
0.03154003349371577


#### By comparison, we found that female physicians are relatively more likely to be associate with comforting behaviors while male physicians are more frequently to be judged according to their professional standards.

### Build sets of target words & attribute words

In [None]:
TW_adj_1= set(['technique','methodology','execution'])
TW_adj_2= set(['friendly','polite','nice'])
TW_n_1= set(['technique','methodology','methodology'])
TW_n_2= set(['smile','handshake','grin'])

In [None]:
AW_1= set(['she','her','female'])
AW_2= set(['he','his','male'])