In [1]:
# Run in terminal or command prompt
# python3 -m spacy download en

import numpy as np
import pandas as pd
import re, nltk, spacy, gensim

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint


# Plotting tools
import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [2]:
# NLTK Stop words
import nltk; nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('german')
stop_words.extend(['Aufgaben', '(m/w)', "fur", "bitte", "gut", "uber"]) #inclide more stopwords

[nltk_data] Downloading package stopwords to C:\Users\LE HOANG
[nltk_data]     NHAN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
import xlrd
df = pd.read_excel(r"C:\Users\LE HOANG NHAN\Desktop\jobDescriptionData.xlsx")
df['title_description'] = df['title'].str.cat(df['description'],sep=" ")
df.head()

Unnamed: 0,id,hiringOrganization,group,title,description,employmentType,tags,jobLocation,spider,firstSeen,lastSeen,identifier,filename,title_description
0,297,Margitta Heinecke Existentia Personalvermittlung,Bundesagentur für Arbeit,IT Administrator (m/w) (Informatikkaufmann/-frau),Aufgaben: \n \nSystembetreuung \nImplementi...,Vollzeit; Bei der Besetzung des Stellenangebot...,Job Board,"Dresden, Sachsen, Deutschland",arbeitsagentur,2016-02-14 02:01:45,2016-02-14 02:01:45,10000-1139133361-S,arbeitsagentur_002dc952f9d2e625c6b117f64179eea...,IT Administrator (m/w) (Informatikkaufmann/-fr...
1,655,CNCN - Contract Consulting GmbH,Bundesagentur für Arbeit,Mitarbeiter (m/w) Support-Team (Anwendungssyst...,Wir suchen im Auftrag eines Kunden ab sofort !...,Vollzeit; nach Vereinbarung; Bei der Besetzung...,Job Board,"Nürnberg, Mittelfranken, Bayern, Deutschland; ...",arbeitsagentur,2017-11-19 01:08:47,2017-11-19 01:08:47,12961-217374171118020211-S,arbeitsagentur_0067b024bf6f7d27105744eaf75bd13...,Mitarbeiter (m/w) Support-Team (Anwendungssyst...
2,1569,RADAS Jobbörse & Personalvermittlung GmbH,Bundesagentur für Arbeit,Software-Entwickler / SPS-Programmierer (m/w) ...,RADAS Jobbörse & Personalvermittlung GmbH ist ...,Vollzeit; 40 Wochenstunden; nach Qualifikation...,Job Board,"97080 Würzburg, Bayern, Deutschland",arbeitsagentur,2015-06-07 00:27:23,2015-06-07 00:27:23,12254-1004998-S,arbeitsagentur_00fde52cc06730f1534783aec2a607b...,Software-Entwickler / SPS-Programmierer (m/w) ...
3,2447,M Plan GmbH,Bundesagentur für Arbeit,Software-System-Ingenieur (m/w) Elektromobilit...,M Plan steht für geballte Kompetenz und Erfahr...,Vollzeit; Bei der Besetzung des Stellenangebot...,Job Board,"68161 Mannheim, Baden-Württemberg, Deutschland",arbeitsagentur,2015-06-24 05:39:19,2015-06-24 05:39:19,12266-53947_460345-S,arbeitsagentur_01807213e8d37784dc4d26db4253718...,Software-System-Ingenieur (m/w) Elektromobilit...
4,2825,Brücke Rendsburg-Eckernförde e.V,Bundesagentur für Arbeit,Fachinformatikerin für Systemintegration (w/m)...,Wir stellen zur Verstärkung unseres IT-Teams i...,"Vollzeit, Teilzeit - flexibel; Bei der Besetzu...",Job Board,"Ahlmannstr. 2a, 24768 Rendsburg, Schleswig-Hol...",arbeitsagentur,2017-10-29 01:06:39,2017-10-29 01:06:39,10000-1157595558-S,arbeitsagentur_01bb20a016a0eac6c680b5d55872b11...,Fachinformatikerin für Systemintegration (w/m)...


In [4]:
# Convert to list
data = df.title_description.values.tolist()

# Remove special letters
data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]

# Remove new line characters
data = [re.sub('\s+', ' ', sent) for sent in data]


# Remove distracting single quotes
data = [re.sub("\'", "", sent) for sent in data]


pprint(data[:1])

['IT Administrator (m/w) (Informatikkaufmann/-frau) Aufgaben: Systembetreuung '
 'Implementierung und Instandsetzung von Hardware Unterstützung im Bereich '
 'Netzwerkadministration User Help Desk Profil: Erfolgreich abgeschlossene '
 'Ausbildung aus dem Bereich Informationstechnik Kenntnisse im Umgang mit '
 'Microsoft Betriebssystemen und Windows Server Kenntnisse im Umgang mit PC '
 'Hardware IT']


In [5]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations
# simple_preprocess: Convert a document into a list of lowercase tokens, ignoring tokens that are too short or too long.
data_words = list(sent_to_words(data))

print(data_words[:1])

[['it', 'administrator', 'frau', 'aufgaben', 'systembetreuung', 'implementierung', 'und', 'instandsetzung', 'von', 'hardware', 'unterstutzung', 'im', 'bereich', 'user', 'help', 'desk', 'profil', 'erfolgreich', 'abgeschlossene', 'ausbildung', 'aus', 'dem', 'bereich', 'kenntnisse', 'im', 'umgang', 'mit', 'microsoft', 'und', 'windows', 'server', 'kenntnisse', 'im', 'umgang', 'mit', 'pc', 'hardware', 'it']]


In [6]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  
# Phrases: Automatically detect common phrases – multi-word expressions / word n-grams – from a stream of sentences

# get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

['it', 'administrator', 'frau', 'aufgaben', 'systembetreuung', 'implementierung', 'und', 'instandsetzung', 'von', 'hardware', 'unterstutzung', 'im', 'bereich', 'user_help_desk', 'profil', 'erfolgreich_abgeschlossene', 'ausbildung', 'aus', 'dem', 'bereich', 'kenntnisse', 'im', 'umgang', 'mit', 'microsoft', 'und', 'windows_server', 'kenntnisse', 'im', 'umgang', 'mit', 'pc', 'hardware', 'it']


In [7]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

# lemmatization: achieve the root forms
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags]) #token.lemma_: root of token; token.pos_: The simple part-of-speech tag ('NOUN', 'ADJ', 'VERB', 'ADV')
    return texts_out

In [8]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'de' model, keeping only tagger component (for efficiency)       
# python3 -m spacy download de

nlp = spacy.load('de', disable=['parser', 'ner'])   #parser:Dependency Parsing;  ner: Named Entity Recognition
# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']) 

print(data_lemmatized[:1])

[['frau', 'aufgaben', 'systembetreuung', 'implementierung', 'instandsetzung', 'hardware', 'unterstutzung', 'bereich', 'desk', 'erfolgreich', 'abgeschlossen', 'ausbildung', 'bereich', 'kenntnisse', 'umgang', 'windows', 'server', 'kenntnisse', 'umgang']]


In [9]:
from gensim import corpora

class DTMcorpus(corpora.textcorpus.TextCorpus):
    def get_texts(self):
        return self.input

    def __len__(self):
        return len(self.input)

corpus = DTMcorpus(data_lemmatized)
corpus

<__main__.DTMcorpus at 0x296a9ddbc08>

In [10]:
x = pd.read_excel(r"C:\Users\LE HOANG NHAN\Desktop\jobDescriptionData.xlsx")

In [11]:
#set time_slices
df["year"] = pd.to_datetime(x["firstSeen"],errors='ignore', utc=True)

In [12]:
#set time_slices
t1 = df.loc[(df["year"].dt.year == 2014)].shape[0]
t2 = df.loc[(df["year"].dt.year == 2015)].shape[0]
t3 = df.loc[(df["year"].dt.year == 2016)].shape[0]
t4 = df.loc[(df["year"].dt.year == 2017)].shape[0]
t5 = df.loc[(df["year"].dt.year == 2018)].shape[0]

In [13]:
#set time_slices
print(t1, t2, t3, t4, t5)
time_slices = [t1, t2, t3, t4, t5]

192 431 440 539 398


In [14]:
#run Dynamic topic modeling
from gensim.models.wrappers.dtmmodel import DtmModel
model = DtmModel(r"C:\Users\LE HOANG NHAN\Desktop\dtm-win64.exe", corpus, time_slices, num_topics=10, id2word=corpus.dictionary)

In [15]:
print(model.show_topic(topicid=1, time=0, topn=10))

[(0.017440508632548722, 'entwicklung'), (0.017042294811903336, 'uber'), (0.016448153473576758, 'kunden'), (0.01551355773282574, 'software'), (0.013704024181641941, 'bieten'), (0.012353624748245943, 'aufgaben'), (0.011826611228082427, 'kenntnisse'), (0.011478081601964082, 'bereich'), (0.010834524268255605, 'informatik'), (0.010804971732470815, 'elektrotechnik')]


In [16]:
#Topic Evolution
num_topics = 10
for topic_no in range(num_topics):
    print("\nTopic", str(topic_no))
    for time in range(len(time_slices)):
        print("Time slice", str(time))
        print(model.show_topic(topic_no, time, topn=10))


Topic 0
Time slice 0
[(0.01727530431532113, 'konnen'), (0.01641896778024007, 'bewerbung'), (0.013920343163659745, 'uber'), (0.010110812444553436, 'bewerben'), (0.009221534019211266, 'bereich'), (0.008989040564077023, 'fragen'), (0.00855968159828925, 'einsatzort'), (0.00837946368755551, 'net'), (0.008142535525491304, 'bieten'), (0.008058157694397989, 'stellenprofil')]
Time slice 1
[(0.017653709639221585, 'konnen'), (0.0164996636191244, 'bewerbung'), (0.013869778591386342, 'uber'), (0.010070708855971739, 'bewerben'), (0.009340410487705524, 'bereich'), (0.00905469281879081, 'fragen'), (0.00861225698648808, 'einsatzort'), (0.008431883789133274, 'net'), (0.00815607477188598, 'bieten'), (0.008154511357078566, 'stellenprofil')]
Time slice 2
[(0.017334417314823464, 'konnen'), (0.016324671642208754, 'bewerbung'), (0.013778351653463923, 'uber'), (0.009964513960691677, 'bewerben'), (0.00936592567978068, 'bereich'), (0.008925395430153285, 'fragen'), (0.008622537029413391, 'einsatzort'), (0.008192

[(0.01727275171520844, 'team'), (0.00969428221610334, 'management'), (0.008020014573941983, 'greifswald'), (0.007333337118181403, 'as_well'), (0.006798891672262562, 'support'), (0.006604562986493966, 'university'), (0.006468159657095088, 'have'), (0.0064067677150619375, 'development'), (0.006154076187030863, 'display'), (0.006038367428642095, 'skills')]
Time slice 3
[(0.016596450080190995, 'team'), (0.009665163771569165, 'management'), (0.008208166212640875, 'greifswald'), (0.007259314366185515, 'as_well'), (0.006782326700536922, 'university'), (0.006659632629512166, 'support'), (0.006454926101547375, 'have'), (0.006258754917271955, 'development'), (0.006039056071618775, 'skills'), (0.006038400210325171, 'display')]
Time slice 4
[(0.01581747342004522, 'team'), (0.009497639683687862, 'management'), (0.008484368767226145, 'greifswald'), (0.007217236977108829, 'as_well'), (0.007023032439207653, 'university'), (0.006544955279271741, 'support'), (0.0064424545732082806, 'have'), (0.006101773

In [17]:
#Distance between documents: compare the documents across different time-frames and see how similar they are topic-wise
#considering document 0
doc = 0
print("doc = 0, model.gamma_[doc]",model.gamma_[doc])

doc = 0, model.gamma_[doc] [5.23560209e-04 5.23560209e-04 9.95287958e-01 5.23560209e-04
 5.23560209e-04 5.23560209e-04 5.23560209e-04 5.23560209e-04
 5.23560209e-04 5.23560209e-04]


In [18]:
#The distance between documents based on their topic distribution: lower, more related

from gensim.matutils import hellinger
# considering document 4 and 5
doc1 = 4
doc2 = 5
hellinger(model.gamma_[doc1], model.gamma_[doc2])

0.3890027272592329

In [19]:
import pyLDAvis

doc_topic, topic_term, doc_lengths, term_frequency, vocab = model.dtm_vis(time=0, corpus=corpus)
vis_wrapper = pyLDAvis.prepare(topic_term_dists=topic_term, doc_topic_dists=doc_topic, doc_lengths=doc_lengths, vocab=vocab, term_frequency=term_frequency)
pyLDAvis.display(vis_wrapper)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [20]:
doc_topic, topic_term, doc_lengths, term_frequency, vocab = model.dtm_vis(time=1, corpus=corpus)
vis_wrapper = pyLDAvis.prepare(topic_term_dists=topic_term, doc_topic_dists=doc_topic, doc_lengths=doc_lengths, vocab=vocab, term_frequency=term_frequency)
pyLDAvis.display(vis_wrapper)