### Loading and prerequisites

In [13]:
import spacy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy
import statsmodels.api as sm
import gensim
import re
from pprint import pprint
from time import time
from collections import defaultdict
from gensim import corpora
from gensim.test.utils import datapath
from gensim import utils
import gensim.models
from gensim.models.phrases import Phrases, Phraser
import multiprocessing
from gensim.models import Word2Vec

import logging
# Configure logging: INFO level, each record formatted as "timestamp : level : message"
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

from matplotlib import rcParams
# Switch on matplotlib's 'figure.autolayout' setting for every figure
rcParams.update({'figure.autolayout': True})

# CPU count as reported by multiprocessing.cpu_count().
# NOTE(review): not referenced by the cells visible here (the Word2Vec call uses workers=4).
cores = multiprocessing.cpu_count()

In [14]:
# Size of the data subset to work on: "SMALL", "MEDIUM" or "FULL".
# The previous value "MEDIUMs" matched no branch and silently loaded the full
# dataset, so "FULL" keeps the same data while stating the intent explicitly.
VARIANT = "FULL"

# Number of leading rows kept per variant; None keeps the complete frame.
VARIANT_ROW_LIMITS = {"SMALL": 30, "MEDIUM": 500, "FULL": None}

# Fail loudly on a misspelled variant instead of silently using the full data.
if VARIANT not in VARIANT_ROW_LIMITS:
    raise ValueError(
        f"Unknown VARIANT {VARIANT!r}; expected one of {sorted(VARIANT_ROW_LIMITS)}"
    )

# Every variant reads the same pickle, so it is loaded once.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
df = pd.read_pickle("../Data/df_full.pkl")

row_limit = VARIANT_ROW_LIMITS[VARIANT]
if row_limit is not None:
    df = df.head(row_limit)

In [15]:
# Read the whole stopword file as UTF-8 text, then split it on newlines
with open("../Data/stopwords-de.txt", "r", encoding="utf-8") as stopword_file:
    stopword_text = stopword_file.read()
stoplist = stopword_text.split("\n")

### ALT

In [16]:
# Create a list of documents from the dataframe column "clean_text"
documents = df["clean_text"].values.tolist()

# Membership tests against a set are constant-time; testing against the
# `stoplist` list scanned the stopwords once for every single token.
stopword_set = set(stoplist)

# Lowercase each document, split it on whitespace and drop stopwords
texts = [
    [word for word in document.lower().split() if word not in stopword_set]
    for document in documents
]

# Count corpus-wide occurrences so that tokens seen only once can be removed
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

texts = [
    [token for token in text if frequency[token] > 1]
    for text in texts
]

# Build the token dictionary and convert every document to its bag-of-words form
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# Save the dictionary and corpus for later use
dictionary.save("../Data/Gensim/dict.dict")
corpora.MmCorpus.serialize("../Data/Gensim/df_corpus.mm", corpus)

2024-02-13 11:23:29,345 : INFO : adding document #0 to Dictionary<0 unique tokens: []>
2024-02-13 11:23:29,980 : INFO : built Dictionary<56144 unique tokens: ['abgewinn', 'abhängen', 'abstimmung', 'absturz', 'analyse']...> from 4365 documents (total 1200525 corpus positions)
2024-02-13 11:23:29,980 : INFO : Dictionary lifecycle event {'msg': "built Dictionary<56144 unique tokens: ['abgewinn', 'abhängen', 'abstimmung', 'absturz', 'analyse']...> from 4365 documents (total 1200525 corpus positions)", 'datetime': '2024-02-13T11:23:29.980695', 'gensim': '4.3.2', 'python': '3.10.10 (tags/v3.10.10:aad5f6a, Feb  7 2023, 17:20:36) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'created'}
2024-02-13 11:23:30,580 : INFO : Dictionary lifecycle event {'fname_or_handle': '../Data/Gensim/dict.dict', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2024-02-13T11:23:30.580588', 'gensim': '4.3.2', 'python': '3.10.10 (tags/v3.10.10:aad5f6a,

In [17]:
# Train a word2vec model on the tokenised documents
model = gensim.models.Word2Vec(texts, window=5, min_count=10, workers=4)

# Preview the first ten vocabulary entries in the model's index order
vocabulary = model.wv.index_to_key
for position, term in enumerate(vocabulary[:10]):
    print(f"word #{position}/{len(vocabulary)} is {term}")

2024-02-13 11:23:31,186 : INFO : collecting all words and their counts
2024-02-13 11:23:31,186 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2024-02-13 11:23:31,316 : INFO : collected 56144 word types from a corpus of 1200525 raw words and 4365 sentences
2024-02-13 11:23:31,316 : INFO : Creating a fresh vocabulary
2024-02-13 11:23:31,351 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=10 retains 13733 unique words (24.46% of original 56144, drops 42411)', 'datetime': '2024-02-13T11:23:31.351376', 'gensim': '4.3.2', 'python': '3.10.10 (tags/v3.10.10:aad5f6a, Feb  7 2023, 17:20:36) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'prepare_vocab'}
2024-02-13 11:23:31,351 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=10 leaves 1049930 word corpus (87.46% of original 1200525, drops 150595)', 'datetime': '2024-02-13T11:23:31.351376', 'gensim': '4.3.2', 'python': '3.10.10 (tags/v3.10.10:aad5f6a, Feb  7

word #0/13733 is sagen
word #1/13733 is fleisch
word #2/13733 is geben
word #3/13733 is lebensmittel
word #4/13733 is produkt
word #5/13733 is neu
word #6/13733 is eu
word #7/13733 is prozent
word #8/13733 is gentechnisch
word #9/13733 is unternehmen


In [28]:
def print_similar_words(keyed_vectors, query):
    """Print the words most similar to ``query`` together with their similarity.

    Args:
        keyed_vectors: Word vectors to search, e.g. ``model.wv``.
        query: Word whose nearest neighbours are printed.

    A query that is missing from the vocabulary is reported with a message
    instead of letting ``most_similar`` raise ``KeyError``. A word can be
    missing when it occurs fewer than ``min_count`` times, which is likely
    for the smaller data variants.
    """
    print(f"Similar words to '{query}':")
    if query not in keyed_vectors.key_to_index:
        print(f"'{query}' is not in the model vocabulary")
        return
    for neighbour, similarity in keyed_vectors.most_similar(positive=[query]):
        print(f"{neighbour}: {similarity}")


# Words to inspect; the printed header now always shows the exact query string
# (the header for "insekt" used to be printed as 'Insekt').
QUERY_WORDS = ["fleisch", "vegan", "insekt", "gentechnik"]

for position, query in enumerate(QUERY_WORDS):
    # Separator between result groups, not before the first one
    if position > 0:
        print("-------------------")
    print_similar_words(model.wv, query)

Similar words to 'fleisch':
laborfleisch: 0.8023892641067505
fleischprodukt: 0.7365873456001282
kunstfleisch: 0.693586528301239
steak: 0.6881596446037292
alternative: 0.679298996925354
rindfleisch: 0.674889087677002
hühnerfleisch: 0.66506028175354
schnitzel: 0.6615918278694153
flexitarier: 0.6352449655532837
kulturfleisch: 0.634269654750824
-------------------
Similar words to 'vegan':
vegetarisch: 0.9666286706924438
veggie: 0.9295666813850403
fleischlos: 0.9275266528129578
wurst: 0.8767037987709045
currywurst: 0.8545152544975281
fleischersatz: 0.8513873815536499
röbe: 0.8452140688896179
käsealternative: 0.8422332406044006
speisekart: 0.8377646207809448
angebot: 0.8373401165008545
-------------------
Similar words to 'Insekt':
heuschrecke: 0.8000679016113281
insektenart: 0.7979258298873901
mehlwürmer: 0.7908195853233337
essbar: 0.7791203260421753
speiseplan: 0.7715321183204651
nahrhaft: 0.7507984042167664
mehlwurm: 0.7502872347831726
krabbeltier: 0.7431719899177551
käfer: 0.72996354103

In [6]:
# old, from https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html#sphx-glr-auto-examples-tutorials-run-word2vec-py
from sklearn.decomposition import IncrementalPCA    # inital reduction
from sklearn.manifold import TSNE                   # final reduction
import numpy as np                                  # array handling


def reduce_dimensions(model):
    """Project the model's word vectors onto two dimensions with t-SNE.

    Args:
        model: Object exposing ``wv.vectors`` and ``wv.index_to_key``.

    Returns:
        Tuple ``(x_vals, y_vals, labels)``: two lists holding the first and
        second reduced coordinate of every vector, and a numpy array built
        from ``wv.index_to_key``.
    """
    target_dims = 2  # number of output dimensions

    # Pull vectors and their keys out of the model as numpy arrays
    word_vectors = np.asarray(model.wv.vectors)
    labels = np.asarray(model.wv.index_to_key)

    # t-SNE with a fixed random_state of 0
    reducer = TSNE(n_components=target_dims, random_state=0)
    embedded = reducer.fit_transform(word_vectors)

    x_vals = list(embedded[:, 0])
    y_vals = list(embedded[:, 1])
    return x_vals, y_vals, labels


# Reduce every word vector of the trained model to 2D plotting coordinates
x_vals, y_vals, labels = reduce_dimensions(model)

def plot_with_plotly(x_vals, y_vals, labels, plot_in_notebook=True):
    """Draw the labels as a plotly text scatter at the given coordinates.

    Args:
        x_vals: x coordinate of each label.
        y_vals: y coordinate of each label.
        labels: Text shown at each point.
        plot_in_notebook: When true, initialise notebook mode and render with
            ``iplot``; otherwise call ``plot`` with the file name
            'word-embedding-plot.html'.
    """
    import plotly.graph_objs as go
    from plotly.offline import init_notebook_mode, iplot, plot

    figure_data = [go.Scatter(x=x_vals, y=y_vals, mode='text', text=labels)]

    if not plot_in_notebook:
        plot(figure_data, filename='word-embedding-plot.html')
        return

    init_notebook_mode(connected=True)
    iplot(figure_data, filename='word-embedding-plot')


def plot_with_matplotlib(x_vals, y_vals, labels, num_labels=25):
    """Scatter-plot all points and annotate a random subsample with its label.

    Args:
        x_vals: x coordinate of each point.
        y_vals: y coordinate of each point.
        labels: Label of each point, indexed like ``x_vals`` / ``y_vals``.
        num_labels: Maximum number of points to annotate (default 25). When
            fewer points exist, all of them are annotated instead of
            ``random.sample`` raising ``ValueError``.
    """
    import random

    import matplotlib.pyplot as plt

    plt.figure(figsize=(12, 12))
    plt.scatter(x_vals, y_vals)

    # A private generator seeded with 0 picks the same points as seeding the
    # global generator did, without resetting the global random state.
    rng = random.Random(0)

    # Never ask for more samples than there are points (or for a negative count)
    sample_size = max(0, min(num_labels, len(labels)))
    for i in rng.sample(range(len(labels)), sample_size):
        plt.annotate(labels[i], (x_vals[i], y_vals[i]))

# get_ipython() can only be called successfully inside IPython/Jupyter;
# any exception raised by the call means no interactive shell is available.
try:
    get_ipython()
    running_in_ipython = True
except Exception:
    running_in_ipython = False

# Interactive plotly plot inside IPython, static matplotlib plot otherwise
plot_function = plot_with_plotly if running_in_ipython else plot_with_matplotlib

plot_function(x_vals, y_vals, labels)