In [6]:
import re
import json
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from gensim.models import Phrases
from gensim.corpora import Dictionary
from gensim.models import LdaModel
import nltk
from gensim.utils import simple_preprocess

In [8]:
years = ['2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023'] # Hier die Jahre angeben, die geladen werden sollen (final: 2016-2023)
path = "/Users/jan/Documents/Promotion/BIBB GW/hdd/Pflege/"#Hier der Pfad zu den JSON-Dateien!
with open(f"{path}Pflege_2016.json", 'r') as file:
    output = json.load(file)
docs = [str(value) for value in output["full_text"].values()]
del output

In [9]:
tokenizer = RegexpTokenizer(r'\w+')
for idx in range(len(docs)):
    docs[idx] = docs[idx].lower()  # Convert to lowercase.
    docs[idx] = tokenizer.tokenize(docs[idx])  # Split into words.
# Remove numbers, but not words that contain numbers.
docs = [[token for token in doc if not token.isnumeric()] for doc in docs]
# Remove words that are only one character.
docs = [[token for token in doc if len(token) > 1] for doc in docs]
# Remove stop words
stop_words = set(stopwords.words('german'))
def remove_stopwords(texts):
    return [[word for word in doc if word not in stop_words] for doc in texts]
docs = remove_stopwords(docs)

In [11]:
lemmatizer = WordNetLemmatizer()
docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]

[nltk_data] Downloading package wordnet to /Users/jan/nltk_data...


In [12]:
bigram = Phrases(docs, min_count=20)
for idx in range(len(docs)):
    for token in bigram[docs[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            docs[idx].append(token)

In [13]:
# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)

# Filter out words that occur less than 20 documents, or more than 50% of the documents.
dictionary.filter_extremes(no_below=20, no_above=0.5)
# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in docs]

In [14]:
# Set training parameters.
num_topics = 10
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make an index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)

In [15]:
top_topics = model.top_topics(corpus)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

from pprint import pprint
pprint(top_topics)

Average topic coherence: -1.3367.
[([(0.018999234, 'stellenangebot'),
   (0.0155440755, 'stellenangebots'),
   (0.010431875, 'kartenausschnitt'),
   (0.010152948, 'karte'),
   (0.009697562, 'führungsverantwortung'),
   (0.0085437205, 'informationen'),
   (0.0075700497, 'stellen'),
   (0.007503077, 'tarifvertrag'),
   (0.005785843, 'teilzeit'),
   (0.0055862986, 'kontaktdaten'),
   (0.005543431, 'tätigkeit'),
   (0.0054497113, 'angaben'),
   (0.005391395, 'arbeitszeit'),
   (0.0053133913, 'beginn'),
   (0.005263227, 'stellenangebotes'),
   (0.005250528, 'westfalen'),
   (0.005247443, 'rückfragen'),
   (0.005240013, 'anwendung'),
   (0.0052229213, 'zusatzleistungen'),
   (0.005222843, 'vergrößern')],
  -0.25875888835715394),
 ([(0.02464169, 'arbeitsort'),
   (0.021868264, 'anbieter'),
   (0.020346297, 'anzeige'),
   (0.019207468, 'gigajob'),
   (0.017562607, 'mal'),
   (0.013426545, 'plz'),
   (0.013263893, 'land'),
   (0.013164123, 'stellenbezeichnung'),
   (0.012431667, 'jobbeschreibun