### Loading and prerequisites

In [13]:
import spacy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy
import statsmodels.api as sm
import gensim
from pprint import pprint
from collections import defaultdict
from gensim import corpora
from gensim.test.utils import datapath
from gensim import utils
import gensim.models

import logging
# Emit timestamped INFO-level log records so gensim's progress messages appear in the cell output.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

from matplotlib import rcParams
# Switch on matplotlib's 'figure.autolayout' option globally for every figure created in this notebook.
rcParams.update({'figure.autolayout': True})

In [14]:
VARIANT = "MEDIUM" # "SMALL", "MEDIUM" , "FULL"

# Read the pickled dataframe once, then trim it according to VARIANT:
# "SMALL" keeps the first 30 rows, "MEDIUM" the first 500, and any other
# value (e.g. "FULL") keeps every row.
df = pd.read_pickle("../Data/df_full.pkl")
if VARIANT == "SMALL":
    df = df.head(30)
elif VARIANT == "MEDIUM":
    df = df.head(500)

In [15]:
# Load stopwords from txt file using utf-8 encoding (one stopword per line).
# Surrounding whitespace is stripped and blank lines are skipped, so a trailing
# newline or stray spaces in the file cannot produce entries such as "" or
# "und " that would never match a token.
with open("../Data/stopwords-de.txt", "r", encoding="utf-8") as file:
    stoplist = [line.strip() for line in file if line.strip()]

### Build the Gensim dictionary and bag-of-words corpus

In [16]:
# Create a list of documents from the dataframe column "clean_text"
documents = df["clean_text"].values.tolist()

# Build a set once: membership tests on a set are O(1), whereas testing against
# the raw stopword list rescans the whole list for every single token.
stopword_set = set(stoplist)

# remove common words and tokenize (lower-case, then split on whitespace)
texts = [
    [word for word in document.lower().split() if word not in stopword_set]
    for document in documents
]

# remove words that appear only once across all documents
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

texts = [
    [token for token in text if frequency[token] > 1]
    for text in texts
]

# Map every remaining token to an integer id and turn each document into
# its bag-of-words representation.
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# Save the dictionary and corpus for later use
dictionary.save("../Data/Gensim/dict.dict")
corpora.MmCorpus.serialize("../Data/Gensim/df_corpus.mm", corpus)

2024-02-12 19:40:31,106 : INFO : adding document #0 to Dictionary<0 unique tokens: []>
2024-02-12 19:40:31,179 : INFO : built Dictionary<13243 unique tokens: ['000', '122', '15', '150', '16']...> from 500 documents (total 135110 corpus positions)
2024-02-12 19:40:31,180 : INFO : Dictionary lifecycle event {'msg': "built Dictionary<13243 unique tokens: ['000', '122', '15', '150', '16']...> from 500 documents (total 135110 corpus positions)", 'datetime': '2024-02-12T19:40:31.180914', 'gensim': '4.3.2', 'python': '3.10.10 (tags/v3.10.10:aad5f6a, Feb  7 2023, 17:20:36) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'created'}
2024-02-12 19:40:31,238 : INFO : Dictionary lifecycle event {'fname_or_handle': '../Data/Gensim/dict.dict', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2024-02-12T19:40:31.238912', 'gensim': '4.3.2', 'python': '3.10.10 (tags/v3.10.10:aad5f6a, Feb  7 2023, 17:20:36) [MSC v.1929 64 bit (AMD64)]', 'pla

In [17]:
# Train a word2vec model on the tokenized documents
model = gensim.models.Word2Vec(texts, window=5, min_count=50, workers=4)

# Preview at most the first ten entries of the model's vocabulary
for index, word in enumerate(model.wv.index_to_key[:10]):
    print(f"word #{index}/{len(model.wv.index_to_key)} is {word}")

2024-02-12 19:40:31,311 : INFO : collecting all words and their counts
2024-02-12 19:40:31,311 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2024-02-12 19:40:31,332 : INFO : collected 13243 word types from a corpus of 135110 raw words and 500 sentences
2024-02-12 19:40:31,332 : INFO : Creating a fresh vocabulary
2024-02-12 19:40:31,336 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=50 retains 471 unique words (3.56% of original 13243, drops 12772)', 'datetime': '2024-02-12T19:40:31.336996', 'gensim': '4.3.2', 'python': '3.10.10 (tags/v3.10.10:aad5f6a, Feb  7 2023, 17:20:36) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'prepare_vocab'}
2024-02-12 19:40:31,337 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=50 leaves 52748 word corpus (39.04% of original 135110, drops 82362)', 'datetime': '2024-02-12T19:40:31.337424', 'gensim': '4.3.2', 'python': '3.10.10 (tags/v3.10.10:aad5f6a, Feb  7 2023, 17

word #0/471 is sagen
word #1/471 is fleisch
word #2/471 is geben
word #3/471 is produkt
word #4/471 is neu
word #5/471 is prozent
word #6/471 is lebensmittel
word #7/471 is unternehmen
word #8/471 is million
word #9/471 is stehen


In [18]:
from sklearn.decomposition import IncrementalPCA    # inital reduction
from sklearn.manifold import TSNE                   # final reduction
import numpy as np                                  # array handling


def reduce_dimensions(model, perplexity=30.0):
    """Project the model's word vectors down to two dimensions with t-SNE.

    Parameters
    ----------
    model
        Object exposing ``model.wv.vectors`` (one row per word) and
        ``model.wv.index_to_key`` (the matching words).
    perplexity : float, optional
        Upper bound for the t-SNE perplexity. scikit-learn requires the
        perplexity to be strictly smaller than the number of samples, so the
        value actually used is ``min(perplexity, n_words - 1)``. With the
        default of 30 nothing changes for vocabularies of 31 words or more.

    Returns
    -------
    tuple
        ``(x_vals, y_vals, labels)``: two lists holding the x and y
        coordinate of every word, and a numpy array with the words themselves.

    Raises
    ------
    ValueError
        If the model holds fewer than two word vectors, in which case there
        is nothing to embed.
    """
    num_dimensions = 2  # final num dimensions (this function returns x and y only)

    # extract the words & their vectors, as numpy arrays
    vectors = np.asarray(model.wv.vectors)
    labels = np.asarray(model.wv.index_to_key)  # fixed-width numpy strings

    num_words = len(vectors)
    if num_words < 2:
        raise ValueError(
            f"t-SNE needs at least 2 word vectors, got {num_words}; "
            "lower min_count or use a larger corpus"
        )

    # Clamp the perplexity so small vocabularies do not make TSNE raise
    # "perplexity must be less than n_samples".
    effective_perplexity = float(min(perplexity, num_words - 1))

    # reduce using t-SNE (fixed random_state keeps the layout reproducible)
    tsne = TSNE(
        n_components=num_dimensions,
        random_state=0,
        perplexity=effective_perplexity,
    )
    vectors = tsne.fit_transform(vectors)

    x_vals = list(vectors[:, 0])
    y_vals = list(vectors[:, 1])
    return x_vals, y_vals, labels


# Run the t-SNE projection once; the resulting coordinates and labels are passed to the plotting call further down.
x_vals, y_vals, labels = reduce_dimensions(model)

def plot_with_plotly(x_vals, y_vals, labels, plot_in_notebook=True):
    """Draw the words as a text scatter plot using plotly.

    Each label is placed at its (x, y) position. When ``plot_in_notebook`` is
    true the figure is rendered inline via ``iplot``; otherwise ``plot`` is
    called with the filename 'word-embedding-plot.html'.
    """
    from plotly.offline import init_notebook_mode, iplot, plot
    import plotly.graph_objs as go

    figure_data = [go.Scatter(x=x_vals, y=y_vals, mode='text', text=labels)]

    if not plot_in_notebook:
        plot(figure_data, filename='word-embedding-plot.html')
        return

    init_notebook_mode(connected=True)
    iplot(figure_data, filename='word-embedding-plot')


def plot_with_matplotlib(x_vals, y_vals, labels):
    """Scatter-plot all points with matplotlib and annotate a random subset.

    Up to 25 points are labelled; when fewer than 25 points exist, all of them
    are labelled instead of raising ``ValueError`` from ``random.sample``.
    The subset is drawn from a private ``random.Random(0)`` generator, so the
    selection is reproducible without reseeding the global ``random`` state.

    Parameters
    ----------
    x_vals, y_vals
        Indexable sequences with the x and y coordinate of every point.
    labels
        Indexable sequence with the text label of every point.
    """
    import matplotlib.pyplot as plt
    import random

    max_labels = 25  # annotating every point would make the figure unreadable

    plt.figure(figsize=(12, 12))
    plt.scatter(x_vals, y_vals)

    #
    # Label a randomly subsampled set of at most `max_labels` data points
    #
    rng = random.Random(0)
    indices = list(range(len(labels)))
    selected_indices = rng.sample(indices, min(max_labels, len(indices)))
    for i in selected_indices:
        plt.annotate(labels[i], (x_vals[i], y_vals[i]))

# Pick the plotting backend: plotly when `get_ipython()` can be called
# (i.e. an IPython/Jupyter session), matplotlib when calling it fails.
running_in_ipython = True
try:
    get_ipython()
except Exception:
    running_in_ipython = False

plot_function = plot_with_plotly if running_in_ipython else plot_with_matplotlib

plot_function(x_vals, y_vals, labels)