In [13]:
# In this part of the course, we explore pretrained word embedding models.
# We will start in the browser – please go to
# https://projector.tensorflow.org

In [14]:
# The Gensim library provides a convenient way to load and use different models and corpora
# You can find more information here: https://radimrehurek.com/gensim/downloader.html
# And here is a Jupyter notebook with examples: https://radimrehurek.com/gensim/auto_examples/howtos/run_downloader_api.html

import gensim.downloader as api
import json

# Query the downloader's catalogue, then pretty-print only its "models"
# entry as indented JSON so the available pretrained models can be browsed.
info = api.info()
models_as_json = json.dumps(info["models"], indent=4)
print(models_as_json)

{
    "fasttext-wiki-news-subwords-300": {
        "num_records": 999999,
        "file_size": 1005007116,
        "base_dataset": "Wikipedia 2017, UMBC webbase corpus and statmt.org news dataset (16B tokens)",
        "reader_code": "https://github.com/RaRe-Technologies/gensim-data/releases/download/fasttext-wiki-news-subwords-300/__init__.py",
        "license": "https://creativecommons.org/licenses/by-sa/3.0/",
        "parameters": {
            "dimension": 300
        },
        "description": "1 million word vectors trained on Wikipedia 2017, UMBC webbase corpus and statmt.org news dataset (16B tokens).",
        "read_more": [
            "https://fasttext.cc/docs/en/english-vectors.html",
            "https://arxiv.org/abs/1712.09405",
            "https://arxiv.org/abs/1607.01759"
        ],
        "checksum": "de2bb3a20c46ce65c9c131e1ad9a77af",
        "file_name": "fasttext-wiki-news-subwords-300.gz",
        "parts": 1
    },
    "conceptnet-numberbatch-17-06-300": {
    

In [15]:
# Download and load a small pretrained model. Judging by its name, these are
# 50-dimensional GloVe vectors built from Wikipedia and Gigaword text.
MODEL_NAME = "glove-wiki-gigaword-50"
model = api.load(MODEL_NAME)

In [16]:
# Nearest neighbors of a single word: the query word is the only "positive"
# term, and the result is a list of (word, similarity) pairs.
model.most_similar(positive=["linguistics"])

[('anthropology', 0.8712018132209778),
 ('philology', 0.8677397966384888),
 ('sociology', 0.8010218143463135),
 ('comparative', 0.7935947775840759),
 ('psychology', 0.7921764850616455),
 ('mathematics', 0.7858846187591553),
 ('phonology', 0.7803564667701721),
 ('biology', 0.7800031900405884),
 ('literatures', 0.7772234678268433),
 ('musicology', 0.7711768746376038)]

In [17]:
# The closest words (ranked by cosine similarity) tend to be semantically
# related to the query word; some of them are potential synonyms.
model.most_similar(positive=["attack"])

[('attacks', 0.9274438619613647),
 ('bombing', 0.8695275783538818),
 ('suicide', 0.8600562214851379),
 ('raid', 0.8567200303077698),
 ('bomb', 0.8250046968460083),
 ('ambush', 0.8242325186729431),
 ('killing', 0.8198404312133789),
 ('deadly', 0.8161443471908569),
 ('strikes', 0.8123002052307129),
 ('militants', 0.8120480179786682)]

In [18]:
# most_similar also accepts a `topn` parameter that sets how many neighbors
# are returned - here we ask for 25.
model.most_similar(positive=["transphobia"], topn=25)

[('homophobia', 0.7731683850288391),
 ('heterosexism', 0.729769766330719),
 ('ariano', 0.7098562121391296),
 ('zoophilia', 0.696968138217926),
 ('anti-asian', 0.6925331354141235),
 ('mudan', 0.6768403053283691),
 ('eling', 0.675630509853363),
 ('covetousness', 0.6748610138893127),
 ('classism', 0.66963791847229),
 ('casteism', 0.668304443359375),
 ('anti-black', 0.6663869619369507),
 ('gender-based', 0.6573017239570618),
 ('misogyny', 0.6561839580535889),
 ('heteronormativity', 0.6560376882553101),
 ('cyberbullying', 0.6537344455718994),
 ('polyarthritis', 0.6513605713844299),
 ('storyland', 0.6510611772537231),
 ('discriminations', 0.6508241891860962),
 ('untouchability', 0.6493785381317139),
 ('cut-throat', 0.6465195417404175),
 ('anti-militarism', 0.6464475393295288),
 ('anti-muslim', 0.6445224285125732),
 ('zero-one', 0.6417361497879028),
 ('nasaf', 0.6404805183410645),
 ('speciesism', 0.6389020681381226)]

In [19]:
# Now take some time to explore the model and look at word neighbourhoods that might be interesting for your research

# ...

In [20]:
# Word vectors support arithmetic. Here we look for the words closest to
#     king - man + woman
model.most_similar(
    positive=["king", "woman"],  # terms that are added
    negative=["man"],            # term that is subtracted
)

[('queen', 0.8523604273796082),
 ('throne', 0.7664334177970886),
 ('prince', 0.759214460849762),
 ('daughter', 0.7473882436752319),
 ('elizabeth', 0.7460219860076904),
 ('princess', 0.7424570322036743),
 ('kingdom', 0.7337412238121033),
 ('monarch', 0.7214491367340088),
 ('eldest', 0.7184861898422241),
 ('widow', 0.7099431157112122)]

In [21]:
# Same idea with capitals: paris - france + germany
# (the query words are written in lowercase).
model.most_similar(
    positive=["paris", "germany"],
    negative=["france"],
)

[('berlin', 0.9203965663909912),
 ('frankfurt', 0.8201637268066406),
 ('vienna', 0.8182449340820312),
 ('munich', 0.8152028918266296),
 ('hamburg', 0.7986699342727661),
 ('stockholm', 0.7764841914176941),
 ('budapest', 0.7678731083869934),
 ('warsaw', 0.7668997645378113),
 ('prague', 0.7664732336997986),
 ('amsterdam', 0.7555989623069763)]

In [22]:
# The results are not always what we expect: paris - france + brazil
# returns pieces of multi-word city names rather than Brazil's capital.
model.most_similar(
    positive=["paris", "brazil"],
    negative=["france"],
)

[('janeiro', 0.8591146469116211),
 ('aires', 0.8547869324684143),
 ('paulo', 0.8382591605186462),
 ('buenos', 0.8382505178451538),
 ('sao', 0.7907378673553467),
 ('rio', 0.7878146767616272),
 ('lisbon', 0.7390437126159668),
 ('lima', 0.7380942106246948),
 ('madrid', 0.7324861288070679),
 ('brazilian', 0.7284553647041321)]

In [23]:
# There is a famous paper, "Man is to Computer Programmer as Woman is to
# Homemaker?" (Bolukbasi et al. 2016). This model does not reproduce that
# exact analogy, but let's see what programmer - man + woman gives us.
model.most_similar(
    positive=["programmer", "woman"],
    negative=["man"],
)

[('prodigy', 0.6731116771697998),
 ('therapist', 0.6414065361022949),
 ('psychotherapist', 0.6201493740081787),
 ('technician', 0.6191534996032715),
 ('programmers', 0.59874427318573),
 ('educator', 0.5932906866073608),
 ('psychologist', 0.5907979607582092),
 ('installer', 0.5891358256340027),
 ('acrobat', 0.5890737771987915),
 ('indian-american', 0.5828384160995483)]

In [24]:
# The reverse direction for comparison: programmer - woman + man
model.most_similar(
    positive=["programmer", "man"],
    negative=["woman"],
)

[('programmers', 0.6660891175270081),
 ('software', 0.6582856774330139),
 ('computer', 0.6441952586174011),
 ('setup', 0.6373560428619385),
 ('animator', 0.6243171095848083),
 ('backup', 0.6232407689094543),
 ('innovator', 0.6204433441162109),
 ('entrepreneur', 0.6109629273414612),
 ('smart', 0.6065289974212646),
 ('developer', 0.6034530997276306)]

In [25]:
# "Woman is to nurse as man is to ...?" - i.e. nurse - woman + man.
# The top-ranked answer is "doctor".
model.most_similar(
    positive=["man", "nurse"],
    negative=["woman"],
)

[('doctor', 0.8087943196296692),
 ('trained', 0.709814190864563),
 ('officer', 0.7070510983467102),
 ('surgeon', 0.7042073011398315),
 ('psychiatrist', 0.7012713551521301),
 ('teacher', 0.6995468139648438),
 ('physician', 0.6971623301506042),
 ('boy', 0.6922944784164429),
 ('young', 0.6911004185676575),
 ('firefighter', 0.6885278820991516)]

In [26]:
# Shifting a sport along the gender direction: basketball - woman + man
model.most_similar(
    positive=["man", "basketball"],
    negative=["woman"],
)

[('football', 0.8626790046691895),
 ('baseball', 0.8460296988487244),
 ('nba', 0.8321873545646667),
 ('hockey', 0.8035606145858765),
 ('cowboys', 0.7964950203895569),
 ('nfl', 0.7951563000679016),
 ('league', 0.7933791875839233),
 ('team', 0.78618323802948),
 ('soccer', 0.7773200869560242),
 ('coach', 0.7707177400588989)]

In [27]:
# And the opposite shift: basketball - man + woman
model.most_similar(
    positive=["woman", "basketball"],
    negative=["man"],
)

[('volleyball', 0.7890125513076782),
 ('softball', 0.779455840587616),
 ('hockey', 0.7546179890632629),
 ('varsity', 0.7487680912017822),
 ('collegiate', 0.7479637861251831),
 ('gymnastics', 0.7399787306785583),
 ('ncaa', 0.735418975353241),
 ('athletics', 0.7342723608016968),
 ('lacrosse', 0.7327582836151123),
 ('junior', 0.7303614616394043)]