### Loading and prerequesites

In [22]:
import spacy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy
import statsmodels.api as sm
import gensim
import re
from pprint import pprint
from time import time
from collections import defaultdict
from gensim import corpora
from gensim.test.utils import datapath
from gensim import utils
import gensim.models
from gensim.models.phrases import Phrases, Phraser
import multiprocessing
from gensim.models import Word2Vec

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import nltk

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
%matplotlib inline

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

from matplotlib import rcParams
rcParams.update({'figure.autolayout': True})

cores = multiprocessing.cpu_count()

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [23]:
VARIANT = "FULL" # "SMALL", "MEDIUM" , "FULL"

if VARIANT == "SMALL":
    df = pd.read_pickle("../Data/df_full.pkl")
    df = df.head(30)
elif VARIANT == "MEDIUM":
    df = pd.read_pickle("../Data/df_full.pkl")
    df = df.head(500)
else:
    df = pd.read_pickle("../Data/df_full.pkl")

In [24]:
# Load stopwords from txt file using utf-8 encoding
with open("../Data/stopwords-de.txt", "r", encoding="utf-8") as file:
    stoplist = file.read().split("\n")

# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('german')

# Go through stop_words and swap ä,ö,ü with a,o,u
for i, word in enumerate(stop_words):
    word = word.replace("ä", "a")
    word = word.replace("ö", "o")
    word = word.replace("ü", "u")
    stop_words[i] = word

### Gensim LDA Topic Modelling

In [25]:
# Convert to list
data = df.text.values.tolist()

# Remove Emails
data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]

# Remove new line characters
data = [re.sub('\s+', ' ', sent) for sent in data]

# Remove distracting single quotes
data = [re.sub("\'", "", sent) for sent in data]

pprint(data[:1])

['Wenn Großbritannien Ende März den Ausstieg aus der EU wie geplant einleitet, '
 'ist ein Ausschluss aus dem Binnenmarkt wahrscheinlich. Dann drohen hohe '
 'Zölle auf Exporte aus dem Königreich. London (AP) - Ist es Kabeljau oder '
 'Schellfisch? Fragen dieser Art werden für Großbritannien künftig von großer '
 'Bedeutung sein, falls das Land den zollfreien EU-Binnenmarkt verlässt und '
 'sich stattdessen den Regeln der Welthandelsorganisation (WTO) unterwirft. Im '
 'Zuge eines solchen «harten Brexits» wäre London mit EU-Zöllen auf '
 'schätzungsweise 15 000 Waren konfrontiert. Einige Exporte wie etwa '
 'Medikamente würden zwar nicht darunter fallen, die große Mehrheit allerdings '
 'schon.Die Höhe der Zölle wird von einer Vielzahl komplexer Faktoren '
 'abhängen. So mögen zwar Schellfisch und Kabeljau in den Küchen vieler '
 'britischer «Fish And Chips»-Restaurants direkt nebeneinander liegen. Doch '
 'von der EU werden die beiden Fischsorten unterschiedlich klassifiziert: '
 'Kab

In [26]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

print(data_words[:1])

[['wenn', 'großbritannien', 'ende', 'marz', 'den', 'ausstieg', 'aus', 'der', 'eu', 'wie', 'geplant', 'einleitet', 'ist', 'ein', 'ausschluss', 'aus', 'dem', 'binnenmarkt', 'wahrscheinlich', 'dann', 'drohen', 'hohe', 'zolle', 'auf', 'exporte', 'aus', 'dem', 'konigreich', 'london', 'ap', 'ist', 'es', 'kabeljau', 'oder', 'schellfisch', 'fragen', 'dieser', 'art', 'werden', 'fur', 'großbritannien', 'kunftig', 'von', 'großer', 'bedeutung', 'sein', 'falls', 'das', 'land', 'den', 'zollfreien', 'eu', 'binnenmarkt', 'verlasst', 'und', 'sich', 'stattdessen', 'den', 'regeln', 'der', 'wto', 'unterwirft', 'im', 'zuge', 'eines', 'solchen', 'harten', 'brexits', 'ware', 'london', 'mit', 'eu', 'zollen', 'auf', 'schatzungsweise', 'waren', 'konfrontiert', 'einige', 'exporte', 'wie', 'etwa', 'medikamente', 'wurden', 'zwar', 'nicht', 'darunter', 'fallen', 'die', 'große', 'mehrheit', 'allerdings', 'schon', 'die', 'hohe', 'der', 'zolle', 'wird', 'von', 'einer', 'vielzahl', 'komplexer', 'faktoren', 'abhangen', 

In [27]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

2024-02-16 13:27:51,435 : INFO : collecting all words and their counts
2024-02-16 13:27:51,435 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2024-02-16 13:27:53,196 : INFO : collected 1078777 token types (unigram + bigrams) from a corpus of 2476957 words and 4365 sentences
2024-02-16 13:27:53,196 : INFO : merged Phrases<1078777 vocab, min_count=5, threshold=100, max_vocab_size=40000000>
2024-02-16 13:27:53,196 : INFO : Phrases lifecycle event {'msg': 'built Phrases<1078777 vocab, min_count=5, threshold=100, max_vocab_size=40000000> in 1.76s', 'datetime': '2024-02-16T13:27:53.196036', 'gensim': '4.3.2', 'python': '3.10.10 (tags/v3.10.10:aad5f6a, Feb  7 2023, 17:20:36) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'created'}
2024-02-16 13:27:53,218 : INFO : collecting all words and their counts
2024-02-16 13:27:53,218 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2024-02-16 13:27:57,665 : INFO : collected 1

['wenn', 'großbritannien', 'ende', 'marz', 'den', 'ausstieg', 'aus', 'der', 'eu', 'wie', 'geplant', 'einleitet', 'ist', 'ein', 'ausschluss', 'aus', 'dem', 'binnenmarkt', 'wahrscheinlich', 'dann', 'drohen', 'hohe', 'zolle', 'auf', 'exporte', 'aus', 'dem', 'konigreich', 'london', 'ap', 'ist', 'es', 'kabeljau', 'oder', 'schellfisch', 'fragen', 'dieser', 'art', 'werden', 'fur', 'großbritannien', 'kunftig', 'von', 'großer', 'bedeutung', 'sein', 'falls', 'das', 'land', 'den', 'zollfreien', 'eu', 'binnenmarkt', 'verlasst', 'und', 'sich', 'stattdessen', 'den', 'regeln', 'der', 'wto', 'unterwirft', 'im', 'zuge', 'eines', 'solchen', 'harten_brexits', 'ware', 'london', 'mit', 'eu', 'zollen', 'auf', 'schatzungsweise', 'waren', 'konfrontiert', 'einige', 'exporte', 'wie', 'etwa', 'medikamente', 'wurden', 'zwar', 'nicht', 'darunter', 'fallen', 'die', 'große', 'mehrheit', 'allerdings', 'schon', 'die', 'hohe', 'der', 'zolle', 'wird', 'von', 'einer', 'vielzahl', 'komplexer', 'faktoren', 'abhangen', 'so'

In [28]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [29]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'de' model, keeping only tagger component (for efficiency)
# python3 -m spacy download de
nlp = spacy.load('de_core_news_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

[['Ende', 'Ausstieg', 'planen', 'einleiten', 'ausschluss', 'binnenmarken', 'wahrscheinlich', 'drohen', 'hoch', 'zolle', 'exporen', 'Konigreich', 'kabeljau', 'schellfisch', 'Frage', 'Art', 'kunftig', 'groß', 'Bedeutung', 'Land', 'zollfreien', 'binnenmarken', 'verlasst', 'stattdessen', 'Regel', 'unterwerfen', 'Zug', 'harten_brexits', 'zoll', 'Schatzungsweise', 'konfrontieren', 'exporen', 'etwa', 'medikamenen', 'darunter', 'fallen', 'groß', 'Mehrheit', 'allerdings', 'schon', 'hoch', 'Zoll', 'komplex', 'Faktor', 'abhang', 'Moge', 'schellfisch', 'kabeljau', 'kuchen', 'Britischer', 'Restaurant', 'direkt', 'nebeneinander', 'liegen', 'fischsorten', 'unterschiedlich', 'klassifizieren', 'Kabeljau', 'Prozent', 'verzollen', 'schellfisch', 'dagegen', 'Prozent', 'Beispiel', 'durchschnitt', 'zoll', 'britisch', 'exporte', 'Analyse', 'Londoner', 'Thinktanks', 'etwa', 'Prozent', 'liegen', 'Experte', 'darunter', 'Brexit', 'Befurworter', 'halten', 'schlimm', 'britisch', 'Wirtschaft', 'Grund', 'brechen', '

In [30]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

2024-02-16 13:28:49,211 : INFO : adding document #0 to Dictionary<0 unique tokens: []>
2024-02-16 13:28:49,868 : INFO : built Dictionary<101620 unique tokens: ['Abstimmung', 'Absturz', 'Ahnlichkeit', 'Analyse', 'Angehort']...> from 4365 documents (total 1063843 corpus positions)
2024-02-16 13:28:49,870 : INFO : Dictionary lifecycle event {'msg': "built Dictionary<101620 unique tokens: ['Abstimmung', 'Absturz', 'Ahnlichkeit', 'Analyse', 'Angehort']...> from 4365 documents (total 1063843 corpus positions)", 'datetime': '2024-02-16T13:28:49.870067', 'gensim': '4.3.2', 'python': '3.10.10 (tags/v3.10.10:aad5f6a, Feb  7 2023, 17:20:36) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'created'}


[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 2), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 3), (13, 2), (14, 1), (15, 6), (16, 2), (17, 1), (18, 3), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 2), (27, 2), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 3), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 2), (53, 1), (54, 1), (55, 2), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 2), (62, 2), (63, 1), (64, 1), (65, 1), (66, 2), (67, 1), (68, 1), (69, 1), (70, 7), (71, 1), (72, 1), (73, 1), (74, 1), (75, 1), (76, 1), (77, 1), (78, 1), (79, 1), (80, 1), (81, 1), (82, 1), (83, 1), (84, 1), (85, 1), (86, 1), (87, 1), (88, 1), (89, 1), (90, 2), (91, 1), (92, 1), (93, 1), (94, 1), (95, 2), (96, 1), (97, 1), (98, 1), (99, 2), (100, 1), (101, 1), (102, 1), (103, 1), (104, 1), (105, 4), (106, 1), (107, 1), (108, 1), (109, 1), (110, 1)

In [31]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

2024-02-16 13:28:50,252 : INFO : using autotuned alpha, starting with [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]
2024-02-16 13:28:50,252 : INFO : using symmetric eta at 0.1
2024-02-16 13:28:50,265 : INFO : using serial LDA version on this node
2024-02-16 13:28:50,321 : INFO : running online (multi-pass) LDA training, 10 topics, 10 passes over the supplied corpus of 4365 documents, updating model once every 100 documents, evaluating perplexity every 1000 documents, iterating 50x with a convergence threshold of 0.001000
2024-02-16 13:28:50,322 : INFO : PROGRESS: pass 0, at document #100/4365
2024-02-16 13:28:50,367 : INFO : optimized alpha [0.15299448, 0.14696023, 0.1594215, 0.14274292, 0.14605698, 0.15529087, 0.15804017, 0.13538495, 0.1564618, 0.16159248]
2024-02-16 13:28:50,391 : INFO : merging changes from 100 documents into a model of 4365 documents
2024-02-16 13:28:50,426 : INFO : topic #7 (0.135): 0.009*"sagen" + 0.009*"neu" + 0.006*"Jahr" + 0.006*"Produkt" + 0.005*"Prozent

In [32]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

2024-02-16 13:29:40,873 : INFO : topic #0 (0.239): 0.022*"Kommission" + 0.017*"Eu_kommission" + 0.014*"Zulassung" + 0.013*"Verordnung" + 0.008*"eiha" + 0.008*"Neuartige_lebensmittel" + 0.006*"europaisch" + 0.006*"Cbd" + 0.005*"Hirt" + 0.005*"Kraft"
2024-02-16 13:29:40,874 : INFO : topic #1 (0.004): 0.000*"Vanhonacker" + 0.000*"Veissier" + 0.000*"Typologie" + 0.000*"Uberwiege" + 0.000*"Umsteuerunge" + 0.000*"Towards" + 0.000*"Videobotschaft" + 0.000*"Verbeke" + 0.000*"Tierwohlachtung" + 0.000*"Ventura"
2024-02-16 13:29:40,875 : INFO : topic #2 (0.521): 0.040*"fleisch" + 0.013*"Jahr" + 0.013*"sagen" + 0.011*"vegan" + 0.010*"Unternehmen" + 0.010*"essen" + 0.009*"Tier" + 0.007*"Prozent" + 0.007*"erster" + 0.007*"Produkt"
2024-02-16 13:29:40,877 : INFO : topic #3 (0.094): 0.006*"hamburg" + 0.006*"umweltschutzer" + 0.006*"Klimabilanz" + 0.005*"retortenfleisch" + 0.004*"Patty" + 0.004*"zugesetzt" + 0.003*"Buhler" + 0.003*"institut" + 0.003*"Fermentation" + 0.003*"lebenden_tieren"
2024-02-16 1

[(0,
  '0.022*"Kommission" + 0.017*"Eu_kommission" + 0.014*"Zulassung" + '
  '0.013*"Verordnung" + 0.008*"eiha" + 0.008*"Neuartige_lebensmittel" + '
  '0.006*"europaisch" + 0.006*"Cbd" + 0.005*"Hirt" + 0.005*"Kraft"'),
 (1,
  '0.000*"Vanhonacker" + 0.000*"Veissier" + 0.000*"Typologie" + '
  '0.000*"Uberwiege" + 0.000*"Umsteuerunge" + 0.000*"Towards" + '
  '0.000*"Videobotschaft" + 0.000*"Verbeke" + 0.000*"Tierwohlachtung" + '
  '0.000*"Ventura"'),
 (2,
  '0.040*"fleisch" + 0.013*"Jahr" + 0.013*"sagen" + 0.011*"vegan" + '
  '0.010*"Unternehmen" + 0.010*"essen" + 0.009*"Tier" + 0.007*"Prozent" + '
  '0.007*"erster" + 0.007*"Produkt"'),
 (3,
  '0.006*"hamburg" + 0.006*"umweltschutzer" + 0.006*"Klimabilanz" + '
  '0.005*"retortenfleisch" + 0.004*"Patty" + 0.004*"zugesetzt" + '
  '0.003*"Buhler" + 0.003*"institut" + 0.003*"Fermentation" + '
  '0.003*"lebenden_tieren"'),
 (4,
  '0.038*"insekt" + 0.016*"insekten" + 0.015*"Insekten" + 0.008*"grill" + '
  '0.008*"Insekt" + 0.007*"Futter" + 0.00

In [33]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

2024-02-16 13:29:43,538 : INFO : -10.915 per-word bound, 1931.1 perplexity estimate based on a held-out corpus of 4365 documents with 1063843 words
2024-02-16 13:29:43,569 : INFO : using ParallelWordOccurrenceAccumulator<processes=11, batch_size=64> to estimate probabilities from sliding windows



Perplexity:  -10.9152147261887


2024-02-16 13:29:50,215 : INFO : 1 batches submitted to accumulate stats from 64 documents (7204 virtual)
2024-02-16 13:29:50,230 : INFO : 2 batches submitted to accumulate stats from 128 documents (19518 virtual)
2024-02-16 13:29:50,230 : INFO : 3 batches submitted to accumulate stats from 192 documents (25311 virtual)
2024-02-16 13:29:50,236 : INFO : 4 batches submitted to accumulate stats from 256 documents (29998 virtual)
2024-02-16 13:29:50,236 : INFO : 5 batches submitted to accumulate stats from 320 documents (40478 virtual)
2024-02-16 13:29:50,236 : INFO : 6 batches submitted to accumulate stats from 384 documents (62426 virtual)
2024-02-16 13:29:50,236 : INFO : 7 batches submitted to accumulate stats from 448 documents (73812 virtual)
2024-02-16 13:29:50,251 : INFO : 8 batches submitted to accumulate stats from 512 documents (78537 virtual)
2024-02-16 13:29:50,256 : INFO : 9 batches submitted to accumulate stats from 576 documents (93155 virtual)
2024-02-16 13:29:50,257 : INFO


Coherence Score:  0.46838895760339716


In [34]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

### OLD

In [4]:
# Create a list of documents from the dataframe column "clean_text"
documents = df["clean_text"].values.tolist()

# remove common words and tokenize
texts = [
    [word for word in document.lower().split() if word not in stoplist]
    for document in documents
]

# remove words that appear only once
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

texts = [
    [token for token in text if frequency[token] > 1]
    for text in texts
]

dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# Save the dictionary and corpus for later use
dictionary.save("../Data/Gensim/dict.dict")
corpora.MmCorpus.serialize("../Data/Gensim/df_corpus.mm", corpus)

2024-02-15 11:52:58,342 : INFO : adding document #0 to Dictionary<0 unique tokens: []>
2024-02-15 11:52:58,990 : INFO : built Dictionary<56144 unique tokens: ['abgewinn', 'abhängen', 'abstimmung', 'absturz', 'analyse']...> from 4365 documents (total 1200525 corpus positions)
2024-02-15 11:52:58,991 : INFO : Dictionary lifecycle event {'msg': "built Dictionary<56144 unique tokens: ['abgewinn', 'abhängen', 'abstimmung', 'absturz', 'analyse']...> from 4365 documents (total 1200525 corpus positions)", 'datetime': '2024-02-15T11:52:58.991344', 'gensim': '4.3.2', 'python': '3.10.10 (tags/v3.10.10:aad5f6a, Feb  7 2023, 17:20:36) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'created'}
2024-02-15 11:52:59,494 : INFO : Dictionary lifecycle event {'fname_or_handle': '../Data/Gensim/dict.dict', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2024-02-15T11:52:59.494546', 'gensim': '4.3.2', 'python': '3.10.10 (tags/v3.10.10:aad5f6a,

In [5]:
# Build a word2vec model
model = gensim.models.Word2Vec(texts, window=5, min_count=10, workers=4)

for index, word in enumerate(model.wv.index_to_key):
    if index == 10:
        break
    print(f"word #{index}/{len(model.wv.index_to_key)} is {word}")

2024-02-15 11:53:00,062 : INFO : collecting all words and their counts
2024-02-15 11:53:00,062 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2024-02-15 11:53:00,189 : INFO : collected 56144 word types from a corpus of 1200525 raw words and 4365 sentences
2024-02-15 11:53:00,190 : INFO : Creating a fresh vocabulary
2024-02-15 11:53:00,224 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=10 retains 13733 unique words (24.46% of original 56144, drops 42411)', 'datetime': '2024-02-15T11:53:00.224209', 'gensim': '4.3.2', 'python': '3.10.10 (tags/v3.10.10:aad5f6a, Feb  7 2023, 17:20:36) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'prepare_vocab'}
2024-02-15 11:53:00,225 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=10 leaves 1049930 word corpus (87.46% of original 1200525, drops 150595)', 'datetime': '2024-02-15T11:53:00.224209', 'gensim': '4.3.2', 'python': '3.10.10 (tags/v3.10.10:aad5f6a, Feb  7

word #0/13733 is sagen
word #1/13733 is fleisch
word #2/13733 is geben
word #3/13733 is lebensmittel
word #4/13733 is produkt
word #5/13733 is neu
word #6/13733 is eu
word #7/13733 is prozent
word #8/13733 is gentechnisch
word #9/13733 is unternehmen


In [6]:
print("Similar words to 'fleisch':")
similar_words = model.wv.most_similar(positive=["fleisch"])
for word, similarity in similar_words:
    print(f"{word}: {similarity}")
print("-------------------")
print("Similar words to 'vegan':")
similar_words = model.wv.most_similar(positive=["vegan"])
for word, similarity in similar_words:
    print(f"{word}: {similarity}")
print("-------------------")
print("Similar words to 'Insekt':")
similar_words = model.wv.most_similar(positive=["insekt"])
for word, similarity in similar_words:
    print(f"{word}: {similarity}")
print("-------------------")
print("Similar words to 'gentechnik':")
similar_words = model.wv.most_similar(positive=["gentechnik"])
for word, similarity in similar_words:
    print(f"{word}: {similarity}")
print("-------------------")
print("Similar words to 'klima':")
similar_words = model.wv.most_similar(positive=["klima"])
for word, similarity in similar_words:
    print(f"{word}: {similarity}")
print("-------------------")

Similar words to 'fleisch':
laborfleisch: 0.7978897094726562
fleischprodukt: 0.6954471468925476
kunstfleisch: 0.673831582069397
rindfleisch: 0.6620738506317139
kulturfleisch: 0.6572024822235107
tierfrei: 0.6484807729721069
steak: 0.6460477113723755
alternative: 0.6239553093910217
tierfleisch: 0.6224168539047241
käse: 0.6182761788368225
-------------------
Similar words to 'vegan':
vegetarisch: 0.964493989944458
veggie: 0.9203389883041382
fleischlos: 0.9145782589912415
wurst: 0.8564102053642273
angebot: 0.8560646176338196
fleischersatz: 0.8454684019088745
sortiment: 0.8432948589324951
fleischersatzprodukt: 0.8373308181762695
fleischalternative: 0.8347960710525513
currywurst: 0.8343737721443176
-------------------
Similar words to 'Insekt':
insektenart: 0.8145617842674255
mehlwürmer: 0.8055780529975891
heuschrecke: 0.8024699091911316
essbar: 0.7759734392166138
nahrhaft: 0.7693979144096375
speiseplan: 0.7690139412879944
krabbeltier: 0.7619073390960693
jahrtausenden: 0.7609601616859436
meh

In [6]:
# old, from https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html#sphx-glr-auto-examples-tutorials-run-word2vec-py
from sklearn.decomposition import IncrementalPCA    # inital reduction
from sklearn.manifold import TSNE                   # final reduction
import numpy as np                                  # array handling


def reduce_dimensions(model):
    num_dimensions = 2  # final num dimensions (2D, 3D, etc)

    # extract the words & their vectors, as numpy arrays
    vectors = np.asarray(model.wv.vectors)
    labels = np.asarray(model.wv.index_to_key)  # fixed-width numpy strings

    # reduce using t-SNE
    tsne = TSNE(n_components=num_dimensions, random_state=0)
    vectors = tsne.fit_transform(vectors)

    x_vals = [v[0] for v in vectors]
    y_vals = [v[1] for v in vectors]
    return x_vals, y_vals, labels


x_vals, y_vals, labels = reduce_dimensions(model)

def plot_with_plotly(x_vals, y_vals, labels, plot_in_notebook=True):
    from plotly.offline import init_notebook_mode, iplot, plot
    import plotly.graph_objs as go

    trace = go.Scatter(x=x_vals, y=y_vals, mode='text', text=labels)
    data = [trace]

    if plot_in_notebook:
        init_notebook_mode(connected=True)
        iplot(data, filename='word-embedding-plot')
    else:
        plot(data, filename='word-embedding-plot.html')


def plot_with_matplotlib(x_vals, y_vals, labels):
    import matplotlib.pyplot as plt
    import random

    random.seed(0)

    plt.figure(figsize=(12, 12))
    plt.scatter(x_vals, y_vals)

    #
    # Label randomly subsampled 25 data points
    #
    indices = list(range(len(labels)))
    selected_indices = random.sample(indices, 25)
    for i in selected_indices:
        plt.annotate(labels[i], (x_vals[i], y_vals[i]))

try:
    get_ipython()
except Exception:
    plot_function = plot_with_matplotlib
else:
    plot_function = plot_with_plotly

plot_function(x_vals, y_vals, labels)