# Trying out German word vectors from spaCy and other sources

Results as of 31-05-2019.

In [None]:
import spacy
# Load the 'de_core_news_md' spaCy model; the following cells use `nlp` to
# obtain tokens and their word vectors.
nlp = spacy.load('de_core_news_md')

## Checking out the word vector for 'Katze'

In [11]:
tokens = nlp("Katze")
for token in tokens:
    # One line per token: text, whether it has a vector, the vector norm and
    # the out-of-vocabulary flag.
    print(*(getattr(token, name) for name in ("text", "has_vector", "vector_norm", "is_oov")))

Katze True 1.5040343 False


In [12]:
# Vector of the single token in the "Katze" doc, indexed explicitly rather than
# through the loop variable left over from the previous cell.
tokens[0].vector

array([ 7.94400e-02,  6.50390e-02,  3.84020e-02,  5.71960e-02,
       -1.07483e-01,  9.44500e-02,  1.26056e-01, -3.67150e-02,
       -5.19510e-02,  5.70890e-02, -3.57100e-03, -1.54680e-01,
       -1.79570e-01,  1.22324e-01,  2.95920e-02, -1.52795e-01,
        1.75330e-02,  2.72260e-02, -3.26880e-02, -5.72130e-02,
        7.61460e-02, -1.15061e-01, -1.18281e-01,  6.27390e-02,
        5.73380e-02,  8.77280e-02, -4.46090e-02, -7.46910e-02,
        6.35170e-02,  6.42840e-02,  5.79190e-02, -1.95661e-01,
       -3.66710e-02,  1.02721e-01, -1.22134e-01,  1.80000e-03,
        5.61740e-02,  6.45820e-02, -1.47540e-02, -1.47828e-01,
       -3.81200e-02, -1.64022e-01, -6.24050e-02,  5.61530e-02,
        1.64387e-01,  6.80460e-02,  1.80591e-01, -2.91230e-02,
        4.31600e-03,  1.40180e-02,  4.37030e-02, -2.29150e-02,
       -4.17530e-02, -7.53750e-02,  6.99130e-02, -6.91130e-02,
        1.00700e-03,  2.34009e-01, -7.96400e-02,  7.47110e-02,
        2.84470e-02,  5.21340e-02,  8.57650e-02, -5.567

## Let's compare some vectors

In [22]:
# Four German words in one doc, so that each token can be compared with every other.
tokens = nlp("Mann Frau Banane Apfel")

In [23]:
for token1 in tokens:
    for token2 in tokens:
        # Skip the self-comparison; both orderings of every pair are printed.
        if token1 == token2:
            continue
        print(token1.text, token2.text, token1.similarity(token2))

Mann Frau 0.49876732
Mann Banane 0.40122947
Mann Apfel 0.51279485
Frau Mann 0.49876732
Frau Banane 0.42747173
Frau Apfel 0.5439015
Banane Mann 0.40122947
Banane Frau 0.42747173
Banane Apfel 0.55466294
Apfel Mann 0.51279485
Apfel Frau 0.5439015
Apfel Banane 0.55466294


According to this model, 'Frau' is more similar to 'Apfel' (0.54) than to 'Mann' (0.50) — so German women seem to be more similar to apples than to men.

## What's the most similar word?

In [24]:
def most_similar(word, topn=20, min_prob=-15):
    """Return the vocabulary entries most similar to ``word``.

    Candidates are taken from ``word.vocab`` and kept only when they
    * have the same ``is_lower`` flag as ``word``,
    * have a ``prob`` of at least ``min_prob``, and
    * have a vector (``has_vector``).

    The query itself is not excluded, so when it passes the filters it is
    part of the result.

    Args:
        word: Lexeme-like object exposing ``vocab``, ``is_lower`` and
            ``similarity``.
        topn: Maximum number of entries to return (default 20). ``None``
            returns every candidate.
        min_prob: Lower bound on a candidate's ``prob`` (default -15).

    Returns:
        A list of at most ``topn`` vocabulary entries, most similar first.
    """
    candidates = [
        entry
        for entry in word.vocab
        if entry.is_lower == word.is_lower
        and entry.prob >= min_prob
        and entry.has_vector
    ]
    # sorted() is stable, so ties keep their vocabulary order.
    ranked = sorted(candidates, key=word.similarity, reverse=True)
    return ranked[:topn]

In [25]:
# Query with lower-case 'hund'. most_similar only keeps entries whose is_lower
# flag matches the query's, so the candidates are restricted accordingly.
# NOTE(review): German nouns are capitalised — confirm whether 'Hund' was the
# intended query.
[w.lower_ for w in most_similar(nlp.vocab[u'hund'])]

['hund',
 'engel',
 'tur',
 'nech',
 'nis',
 'nat',
 'denne',
 'enn',
 'ene',
 'hände',
 'neh',
 'flo',
 'nä',
 'hy',
 'mord',
 'hunden',
 'igen',
 'dig',
 'jau',
 'ju']

In [26]:
# Query with capitalised 'Apfel'; the results are displayed lower-cased via `lower_`.
[w.lower_ for w in most_similar(nlp.vocab[u'Apfel'])]

['apfel',
 'mops',
 'kohl',
 'kuss',
 'chr',
 'schwanz',
 'krankheit',
 'schleim',
 'elefanten',
 'esel',
 'trottel',
 'knoblauch',
 'koffer',
 'schleier',
 'mantel',
 'schlitten',
 'dreck',
 'geruch',
 'freude',
 'werth']

## Is it any better in English?

In [29]:
# spacy is already imported in the first code cell; the re-import is harmless.
import spacy
# Rebinds `nlp` to the 'en_core_web_md' model: from here on `nlp` no longer
# refers to the German model loaded earlier.
nlp = spacy.load('en_core_web_md')

In [30]:
# Same query as for 'hund' above, now against the vocabulary of the English model.
[w.lower_ for w in most_similar(nlp.vocab[u'dog'])]

['dog',
 'kennel',
 'canine',
 'hound',
 'canines',
 'dogs',
 'puppy',
 'poodle',
 'terrier',
 'husky',
 'greyhound',
 'retriever',
 'pet',
 'grooming',
 'feline',
 'cat',
 'puppies',
 'pitbulls',
 'huskies',
 'pitbull']

In [31]:
# English counterpart of the 'Apfel' query.
[w.lower_ for w in most_similar(nlp.vocab[u'apple'])]

['apple',
 'blackberry',
 'apples',
 'pears',
 'iphone',
 'fruit',
 'fig',
 'strawberry',
 'popsicle',
 'icecream',
 'ipad',
 'kiwi',
 'grapefruit',
 'mango',
 'pineapple',
 'cider',
 'mead',
 'ipod',
 'cranberry',
 'pomegranate']

In [34]:
# English counterparts of "Mann Frau Banane Apfel" for the pairwise comparison below.
tokens = nlp("man woman banana apple")

In [35]:
for token1 in tokens:
    for token2 in tokens:
        # Skip the self-comparison; both orderings of every pair are printed.
        if token1 == token2:
            continue
        print(token1.text, token2.text, token1.similarity(token2))

man woman 0.7401745
man banana 0.2426803
man apple 0.20974894
woman man 0.7401745
woman banana 0.2260633
woman apple 0.19461377
banana man 0.2426803
banana woman 0.2260633
banana apple 0.5831845
apple man 0.20974894
apple woman 0.19461377
apple banana 0.5831845


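That's more like it: 'man'/'woman' and 'banana'/'apple' are each clearly more similar to one another than to the words of the other pair.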

## Conclusion: German language model in spaCy

Doesn't seem very impressive compared to the English one. Are there any better models available?

# Trying out German word vectors from https://devmount.github.io/GermanWordEmbeddings/

In [36]:
from gensim.models.keyedvectors import KeyedVectors

In [37]:
# Load word vectors in binary word2vec format from the local file 'german.model'.
# NOTE(review): presumably this file was downloaded from the site named in the
# section heading — confirm and document the exact file/version.
word_vectors_model = KeyedVectors.load_word2vec_format('german.model', binary=True)

In [43]:
# Look up the vector stored for 'Katze' (the whole array is displayed).
word_vectors_model["Katze"]

array([ 0.01952714,  0.0331448 ,  0.05852811, -0.06965609, -0.31470928,
        0.39255175, -0.14503856, -0.02480443, -0.29768062,  0.17916209,
        0.00299389,  0.16924381,  0.02153021,  0.05605159, -0.38303784,
        0.22865692, -0.28690276,  0.16789638,  0.16018285,  0.0944666 ,
       -0.13439137, -0.07034101,  0.36826703, -0.20224169,  0.11887279,
       -0.11946199, -0.07052661, -0.05839675,  0.05703048,  0.1431304 ,
       -0.12603366,  0.15745148, -0.04769883, -0.04310802,  0.0388437 ,
        0.09252485, -0.13361232,  0.05377932,  0.09278485,  0.22191218,
       -0.29280597,  0.22656001, -0.0546908 ,  0.11347145,  0.0254037 ,
       -0.12268334,  0.45216   ,  0.12394026,  0.03888717,  0.04726605,
        0.15844122,  0.09545826, -0.17165002,  0.0130187 , -0.24803649,
       -0.14646243, -0.05550057,  0.05950546, -0.15945338,  0.08621194,
        0.34104365, -0.09479828, -0.10784826, -0.20649078, -0.03533724,
        0.7011522 ,  0.00796406,  0.31761488,  0.26239318,  0.22

In [42]:
# Look up the vector stored for 'Inkasso' (the whole array is displayed).
word_vectors_model["Inkasso"]

array([ 3.52544963e-01,  2.56488323e-01,  2.62146950e-01,  1.96612179e-01,
        2.39362642e-02, -2.29121178e-01,  1.81114152e-02,  1.67492196e-01,
       -1.84928089e-01, -5.80801070e-02,  1.28630862e-01,  8.61339197e-02,
       -2.29549274e-01,  4.30914432e-01, -3.94249484e-02,  1.52189806e-01,
       -4.90663648e-01,  3.00991652e-03,  6.13174260e-01,  2.92929467e-02,
       -1.00033090e-01,  1.50265153e-02,  8.77532586e-02, -2.19525263e-01,
        1.44876376e-01,  5.37484651e-04, -5.65597266e-02, -8.94784629e-02,
       -2.10487004e-02, -1.09548368e-01,  2.18051791e-01,  1.63616657e-01,
       -2.17007458e-01,  8.87971297e-02,  3.89541477e-01, -2.07743734e-01,
       -5.93353361e-02, -2.76203193e-02, -1.05253190e-01,  4.87749279e-02,
        1.81017578e-01,  2.98786044e-01, -8.35407991e-03,  1.04766123e-01,
       -2.53370441e-02,  1.65802881e-01, -1.18738793e-01,  4.38623518e-01,
        2.48694092e-01,  2.32866913e-01,  5.86935505e-03,  1.22913845e-01,
        7.34427795e-02, -

In [48]:
words = ["Mann", "Frau", "Apfel", "Banane"]
for token1 in words:
    for token2 in words:
        # Skip identical words; both orderings of every pair are printed.
        if token1 == token2:
            continue
        print(token1, token2, word_vectors_model.similarity(token1, token2))

Mann Frau 0.8095551
Mann Apfel 0.30760363
Mann Banane 0.28847867
Frau Mann 0.8095551
Frau Apfel 0.26595286
Frau Banane 0.31406382
Apfel Mann 0.30760363
Apfel Frau 0.26595286
Apfel Banane 0.6604762
Banane Mann 0.28847867
Banane Frau 0.31406382
Banane Apfel 0.6604762


## What's the most similar word?

In [53]:
# 20 nearest neighbours of 'Hund', returned as (word, similarity) pairs.
word_vectors_model.most_similar("Hund", topn=20)

[('Katze', 0.8130383491516113),
 ('Vierbeiner', 0.807488203048706),
 ('Tier', 0.7901308536529541),
 ('Dackel', 0.7848663330078125),
 ('Frauchen', 0.7831077575683594),
 ('Schaeferhund', 0.7816431522369385),
 ('Herrchen', 0.7813007831573486),
 ('Huendin', 0.7660676836967468),
 ('Gassi_gehen', 0.7625339031219482),
 ('Mischlingshund', 0.7615839242935181),
 ('Hunde', 0.7571041584014893),
 ('Jagdhund', 0.7460682988166809),
 ('Mischling', 0.7409281134605408),
 ('Hunden', 0.7333001494407654),
 ('ausgebuexten', 0.7294281125068665),
 ('Gassigehen', 0.7249583601951599),
 ('Hamster', 0.7240623235702515),
 ('bellend', 0.7238216996192932),
 ('Papagei', 0.7225003242492676),
 ('Cockerspaniel', 0.7203546166419983)]

In [54]:
# 20 nearest neighbours of 'Apfel', returned as (word, similarity) pairs.
word_vectors_model.most_similar("Apfel", topn=20)

[('Kuerbis', 0.7526000738143921),
 ('Gurke', 0.7188601493835449),
 ('Zitrone', 0.7182591557502747),
 ('Wurst', 0.7072211503982544),
 ('Kartoffel', 0.7019344568252563),
 ('Kohlrabi', 0.6948261260986328),
 ('Erdbeere', 0.6946195363998413),
 ('Aepfeln', 0.6938279271125793),
 ('Speck', 0.688808798789978),
 ('Himbeeren', 0.687086820602417),
 ('Spitzkohl', 0.6857706308364868),
 ('Birne', 0.6840704083442688),
 ('Tomate', 0.6834304332733154),
 ('Pfirsich', 0.683165431022644),
 ('Bohnen', 0.6814666986465454),
 ('Moehre', 0.6810375452041626),
 ('Aepfel', 0.6807883977890015),
 ('Kraut', 0.6775155067443848),
 ('Wassermelone', 0.6771656274795532),
 ('Paprika', 0.6768082976341248)]

## Some math with word vectors

### Koenig - Mann + Frau

In [56]:
# Analogy query "Koenig - Mann + Frau": words listed in `positive` are added,
# words listed in `negative` are subtracted (see the heading above).
# Umlauts are written out ('Koenig'), matching the keys seen in the outputs.
positive = ["Frau", "Koenig"]
negative = ["Mann"]
word_vectors_model.most_similar(positive=positive, negative=negative, topn=5)

[('Koenigin', 0.7524535655975342),
 ('Prinzessin', 0.71452796459198),
 ('Prinz', 0.6881615519523621),
 ('Jungschuetzenkoenigin', 0.6740391254425049),
 ('Majestaet', 0.659064769744873)]

### Putin - Russland + Deutschland

In [61]:
# Analogy query "Putin - Russland + Deutschland": `positive` words are added,
# `negative` words are subtracted.
positive = ["Putin", "Deutschland"]
negative = ["Russland"]
word_vectors_model.most_similar(positive=positive, negative=negative, topn=5)

[('Bundeskanzlerin', 0.6233305931091309),
 ('Kanzlerin_Merkel', 0.6167038679122925),
 ('Angela_Merkel', 0.6104297637939453),
 ('Kanzlerin', 0.6102859377861023),
 ('Merkel', 0.5968186855316162)]

# Trying out German word vectors from fastText

In [1]:
# gensim.models.wrappers.FastText.load_fasttext_format is deprecated, and the
# `wrappers` package no longer exists in gensim 4. load_facebook_vectors reads
# the same Facebook-format .bin file and returns just the word vectors, which
# is all the cells below need (item lookup and most_similar).
# NOTE: rebinds `word_vectors_model`, replacing the word2vec vectors loaded earlier.
from gensim.models.fasttext import load_facebook_vectors
word_vectors_model = load_facebook_vectors('cc.de.300.bin')

In [3]:
# Look up the fastText vector for 'Katze' (the whole array is displayed).
word_vectors_model["Katze"]

array([ 0.00705658,  0.03665943, -0.04904251,  0.03038029, -0.05216099,
        0.07476059,  0.04375604, -0.0576677 , -0.00598645,  0.00954182,
       -0.02619467,  0.03105956, -0.00193281,  0.04072258,  0.05409295,
        0.01081038,  0.02409723,  0.03717486, -0.12060858, -0.04739867,
        0.008459  , -0.03755536,  0.03592665,  0.01946772, -0.09275625,
        0.0206445 , -0.00370327,  0.08105088, -0.02166642, -0.06563529,
       -0.0382905 , -0.02920803,  0.01238743,  0.01038295,  0.10285275,
        0.07828199, -0.05679911, -0.00513924,  0.07510853, -0.04531677,
        0.03413609,  0.11717834, -0.02007757, -0.01422268, -0.01368631,
        0.03565939, -0.11995326, -0.0227239 ,  0.02291417, -0.02815491,
       -0.02096579, -0.01028913, -0.01171383,  0.05634379, -0.01442095,
       -0.02910704, -0.03573475, -0.06545392,  0.00390892, -0.02880078,
       -0.01523009,  0.01006212,  0.03027238,  0.14800395, -0.05113512,
       -0.01366111,  0.0897688 , -0.04535784,  0.03976589,  0.01

In [4]:
# 20 nearest neighbours of 'Hund' in the fastText model, as (word, similarity) pairs.
word_vectors_model.most_similar("Hund", topn=20)

[('Vierbeiner', 0.8140769004821777),
 ('Hunde', 0.792273998260498),
 ('Hunden', 0.7635980844497681),
 ('Hund.', 0.7381591796875),
 ('Herrchen', 0.7317228317260742),
 ('Katze', 0.7219496369361877),
 ('Familienhund', 0.7209774851799011),
 ('Nachbarshund', 0.7175627946853638),
 ('Nachbarhund', 0.716743528842926),
 ('Hundes', 0.7121168971061707),
 ('Junghund', 0.7093837261199951),
 ('Welpe', 0.7068514823913574),
 ('Dackel', 0.7014358043670654),
 ('Hundefreund', 0.6981956958770752),
 ('hund', 0.6962772607803345),
 ('Hundekumpel', 0.6933536529541016),
 ('Hundebesitzer', 0.6889547109603882),
 ('Frauchen', 0.6885875463485718),
 ('Einzelhund', 0.684305727481842),
 ('Hundchen', 0.6801247000694275)]

In [6]:
# Same "Koenig - Mann + Frau" analogy query as in the previous section, now
# against the fastText vectors.
# NOTE(review): the spelling 'Koenig' (umlaut written out) is reused from the
# previous model — confirm it is the right key for this model too.
positive = ["Frau", "Koenig"]
negative = ["Mann"]
word_vectors_model.most_similar(positive=positive, negative=negative, topn=5)

[('Koenigin', 0.52174973487854),
 ('Koenigs', 0.5118163228034973),
 ('Alexandra', 0.5115703344345093),
 ('Ursula', 0.5099889039993286),
 ('Christa', 0.504902720451355)]

In [7]:
# Same "Putin - Russland + Deutschland" analogy query, now against the fastText vectors.
positive = ["Putin", "Deutschland"]
negative = ["Russland"]
word_vectors_model.most_similar(positive=positive, negative=negative, topn=5)

[('Merkel', 0.6383850574493408),
 ('Obama', 0.5623136758804321),
 ('Erdogan', 0.559893012046814),
 ('Kanzlerin', 0.5595794320106506),
 ('Gauck', 0.5471944808959961)]

# Conclusion
The default German word-vector model from spaCy is not very convincing. Word vectors from
https://devmount.github.io/GermanWordEmbeddings/ or from https://fasttext.cc/ seem more reliable. For quick experiments the first one is more suitable, since the fastText model is extremely large (7 GB).