<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#|-Preliminaries" data-toc-modified-id="|-Preliminaries-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>| Preliminaries</a></span></li><li><span><a href="#|-Cleaning" data-toc-modified-id="|-Cleaning-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>| Cleaning</a></span></li><li><span><a href="#|-Tokenization" data-toc-modified-id="|-Tokenization-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>| Tokenization</a></span></li><li><span><a href="#|-POS-Tagging" data-toc-modified-id="|-POS-Tagging-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>| POS Tagging</a></span></li><li><span><a href="#|-Language-Detection" data-toc-modified-id="|-Language-Detection-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>| Language Detection</a></span><ul class="toc-item"><li><ul class="toc-item"><li><span><a href="#Ratio-other-languages" data-toc-modified-id="Ratio-other-languages-5.0.1"><span class="toc-item-num">5.0.1&nbsp;&nbsp;</span>Ratio other languages</a></span></li></ul></li></ul></li><li><span><a href="#|-Linguistic-Processing" data-toc-modified-id="|-Linguistic-Processing-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>| Linguistic Processing</a></span><ul class="toc-item"><li><span><a href="#|-Lemmatization" data-toc-modified-id="|-Lemmatization-6.1"><span class="toc-item-num">6.1&nbsp;&nbsp;</span>| Lemmatization</a></span></li></ul></li><li><span><a href="#|-Frequencies-for-categories" data-toc-modified-id="|-Frequencies-for-categories-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>| Frequencies for categories</a></span></li><li><span><a href="#|-Vectorisierung" data-toc-modified-id="|-Vectorisierung-8"><span class="toc-item-num">8&nbsp;&nbsp;</span>| Vectorisierung</a></span><ul class="toc-item"><li><span><a href="#|-SpaCy" data-toc-modified-id="|-SpaCy-8.1"><span class="toc-item-num">8.1&nbsp;&nbsp;</span>| SpaCy</a></span></li><li><span><a href="#|-Gensim" data-toc-modified-id="|-Gensim-8.2"><span class="toc-item-num">8.2&nbsp;&nbsp;</span>| Gensim</a></span></li></ul></li></ul></div>

# | Preliminaries

In [1]:
import pickle
import pandas as pd
import numpy as np
import regex as re

# spaCy - Named Entity, Vecotorization
import spacy
nlp = spacy.load("de_core_news_lg")
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span
from spacy.tokens import Doc

# Tokenization
import nltk
from nltk.tokenize import RegexpTokenizer
nltk.download('averaged_perceptron_tagger')

# QOL
from tqdm import tqdm

# Language Detection

from lingua import Language, LanguageDetectorBuilder

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/landsiedelj/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
df = pd.read_pickle('/Users/landsiedelj/Downloads/export.pkl')
df_person = pd.read_excel('/Users/landsiedelj/Downloads/WikiPersonen_nach1900_lebend_mitGeschlecht_mitGeburtsland_mitBerufskategorien_mitKlickzahlen.xlsx')

# | Cleaning

In [3]:
df.drop(['search_engine','edge','datetime'], axis=1, inplace=True)
df[:5]

Unnamed: 0.1,Unnamed: 0,root,source,target,rank,depth,grandparent,parent,source_add,target_add
0,0,christian eriksen,christian eriksen,christian eriksen aktuell,1,0,,,christian eriksen,aktuell
1,1,christian eriksen,christian eriksen,christian eriksen ajax,2,0,,,christian eriksen,ajax
2,2,christian eriksen,christian eriksen,christian eriksen alter,3,0,,,christian eriksen,alter
3,3,christian eriksen,christian eriksen,christian eriksen aktuelle teams,4,0,,,christian eriksen,aktuelle teams
4,4,christian eriksen,christian eriksen,christian eriksen amsterdam,5,0,,,christian eriksen,amsterdam


In [4]:
# Cleaning df
dictionary = {"Unnamed: 0" : "id"}
    
df = df.copy()
df.rename(columns = dictionary, inplace=True)

# Cleaning df_person
wiki_dictionary = {"Wikipedia-Eintrag" : "wikipedia_eintrag",
                "Name, Vorname": "name_vorname",
                "alle Berufe": "alle_berufe",
                "1. Beruf": "1_beruf",
                "Seitenaufrufe Wikipedia der letzten 60 Tage": "Seitenaufrufe"}
    
df_person = df_person.copy()
df_person.rename(columns = wiki_dictionary, inplace=True)

In [5]:
# Unify terms
def clean_target_add(x):
    return str(x).replace("cv","lebenslauf").replace("vs","versus").replace("bvb","borussia dortmund").replace("cpr","herz-lungen-wiederbelebung") \
        .replace("chancellor","bundeskanzler").replace("acab","alle polizisten sind bastarde").replace("city","stadt").replace("age","alter") \
            .replace("contemporary", "zeitgemäß").replace("concert", "konzert").replace("collection", "kollection").replace("chords", "akkorde") \
                .replace("birth chart", "geburtshoroskop").replace("challenge", "herausforderung").replace("child", "kind").replace("analysis", "analyse") \
                    .replace("book", "buch").replace("car", "auto").replace("cast", "besetzung").replace("closet", "kleiderschrank")
        


df['target_add'] = df['target_add'].apply(clean_target_add)

df['root'] = df['root'].apply(lambda x: re.sub("bastian yottta","bastian yotta", str(x)))
# df['target_add'] = df['target_add'].apply(lambda x: re.sub("vs","versus", str(x)))
# df['target_add'] = df['target_add'].apply(lambda x: re.sub("bvb","borussia dortmund", str(x)))

In [6]:
def clean_wiki_eintrag(x):
    return str(x).replace("Jérôme Boateng","Jerome Boateng")

df_person['wikipedia_eintrag'] = df_person['wikipedia_eintrag'].apply(clean_wiki_eintrag)

In [7]:
# Strip whitespace
df['target_add'] = df['target_add'].str.split(',').str[-1].str.lstrip() # del whitespace
df['source_add'] = df['source_add'].str.split(',').str[-1].str.lstrip() # del whitespace
df['root'] = df['root'].str.split(',').str[-1].str.lstrip() # del whitespace

In [8]:
df[:2]

Unnamed: 0,id,root,source,target,rank,depth,grandparent,parent,source_add,target_add
0,0,christian eriksen,christian eriksen,christian eriksen aktuell,1,0,,,christian eriksen,aktuell
1,1,christian eriksen,christian eriksen,christian eriksen ajax,2,0,,,christian eriksen,ajax


### If root in excel add gender

In [9]:
df_person[:2]

Unnamed: 0,UID,wikipedia_eintrag,name_vorname,Name,Vorname,Pseudonym,Geschlecht,Herkunft,alle_berufe,1_beruf,Beruf - Oberkategorie,Geburtsdatum,Geburtsort,Geburtsland,Seitenaufrufe Wikipedia der letzten 12 Monate
0,19955,Wladimir Wladimirowitsch Putin,"Putin, Wladimir Wladimirowitsch",Putin,Wladimir Wladimirowitsch,"Putin, Vladimir Vladimirovič; Путин, Владимир ...",männlich,russisch,"Politiker, Präsident",Politiker,POL,1952-10-07 00:00:00,Sankt Petersburg (Leningrad),Russland,4782656
1,4967001,Annalena Baerbock,"Baerbock, Annalena",Baerbock,Annalena,"Baerbock, Annalena Charlotte Alma (vollständig...",weiblich,deutsch,"Politikerin (Bündnis 90/Die Grünen), MdB",Politikerin (Bündnis 90/Die Grünen),POL,1980-12-15 00:00:00,Hannover,Deutschland,4406600


In [10]:
df_person['wikipedia_eintrag'] = df_person['wikipedia_eintrag'].str.replace(r"\(.*\)","") # del everything in parenthesis
df_person = df_person.apply(lambda x: x.astype(str).str.lower())
df_person['wikipedia_eintrag'] = df_person['wikipedia_eintrag'].str.rstrip() # del whitespace

In [11]:
df_person['wikipedia_eintrag'].value_counts()[:10]

andreas müller       14
michael müller       13
klaus müller         13
thomas müller        11
peter schmidt        11
christian meyer      10
michael schneider    10
christian müller     10
thomas fischer        9
christoph müller      9
Name: wikipedia_eintrag, dtype: int64

In [12]:
# to do use df_person or only streaming? 
df['gender'] = df.root.map(df_person.set_index('wikipedia_eintrag')['Geschlecht'].to_dict())

In [21]:
df.gender.value_counts()

männlich        16623
weiblich        11886
n/a / divers      985
Name: gender, dtype: int64

In [22]:
df.loc[(df['gender'] == 'n/a / divers')]

Unnamed: 0,id,root,source,target,rank,depth,grandparent,parent,source_add,target_add,gender,tokens,person,location,language
15962,0,demi lovato,demi lovato,demi lovato anyone,1,0,,,demi lovato,anyone,n/a / divers,[anyone],,,Language.ENGLISH
15963,1,demi lovato,demi lovato,demi lovato aktuell,2,0,,,demi lovato,aktuell,n/a / divers,[aktuell],,,Language.GERMAN
15964,2,demi lovato,demi lovato,demi lovato alter,3,0,,,demi lovato,alter,n/a / divers,[alter],,,Language.ITALIAN
15965,3,demi lovato,demi lovato,demi lovato age,4,0,,,demi lovato,alter,n/a / divers,[alter],,,Language.ITALIAN
15966,4,demi lovato,demi lovato,demi lovato autotune,5,0,,,demi lovato,autotune,n/a / divers,[autotune],,,Language.ITALIAN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16449,487,demi lovato,demi lovato chords,demi lovato melon cake chords,12,1,,demi lovato,chords,melon,n/a / divers,[melon],,,Language.ENGLISH
16449,487,demi lovato,demi lovato chords,demi lovato melon cake chords,12,1,,demi lovato,chords,cake,n/a / divers,[cake],,,Language.ENGLISH
16450,488,demi lovato,demi lovato chords,demi lovato stone cold chords piano,13,1,,demi lovato,chords,stone,n/a / divers,[stone],,,Language.ENGLISH
16450,488,demi lovato,demi lovato chords,demi lovato stone cold chords piano,13,1,,demi lovato,chords,cold,n/a / divers,[cold],,,Language.ENGLISH


Names not unique -> profession not assigned -> Solution add UID in export

In [13]:
df = df.assign(target_add=df['target_add'].str.split(' ')).explode('target_add')

# | Tokenization

In [16]:
# With RegexpTokenizer nltk module ->  take only tokens from words and numbers
from tqdm import tqdm
tqdm.pandas()
tokenizer = RegexpTokenizer(r'\w+')

df["tokens"] = df.progress_apply(lambda row: tokenizer.tokenize(str(row["target_add"].lower())), axis=1)

100%|██████████| 33601/33601 [00:00<00:00, 168359.40it/s]


# Entity Recognition

In [17]:

# Enitity Recognition von Städten und Personen:

df['person'] = ''
df['location'] = ''
for row in df.itertuples(index=True, name='Pandas'):
    persons = []
    locations = []
    doc = nlp(str(row.tokens))
    for ent in doc.ents:
        persons.append(ent.text)
        locations.append(ent.label_)
    df.at[row.Index, 'person'] = persons
    df.at[row.Index, 'location'] = locations
print(df.head(2))

ValueError: Must have equal len keys and value when setting with an iterable

In [None]:
##Enitity Recognition von Städten und Personen:
#https://spacy.io/usage/linguistic-features

#! pip install https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.0.0/de_core_news_sm-3.0.0-py3-none-any.whl
#! pip install https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.0.0/de_core_news_sm-3.0.0.tar.gz
#import spacy
#nlp = spacy.load('de_core_news_sm')

'''doc = nlp('Ich, Manuel Neuer wohne in Berlin.')
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)'''

Manuel Neuer 5 17 PER
Berlin 27 33 LOC


# | Language Detection

In [18]:
languages = [Language.ENGLISH, Language.FRENCH, Language.GERMAN, Language.SPANISH, Language.TURKISH, Language.DUTCH, Language.ITALIAN, Language.POLISH, Language.RUSSIAN]
detector = LanguageDetectorBuilder.from_languages(*languages).build()
df['language'] = df['target'].progress_apply(lambda x:detector.detect_language_of(x))

100%|██████████| 33601/33601 [00:39<00:00, 852.62it/s] 


In [19]:
df.head(10)

Unnamed: 0,id,root,source,target,rank,depth,grandparent,parent,source_add,target_add,gender,tokens,person,location,language
0,0,christian eriksen,christian eriksen,christian eriksen aktuell,1,0,,,christian eriksen,aktuell,männlich,[aktuell],,,Language.GERMAN
1,1,christian eriksen,christian eriksen,christian eriksen ajax,2,0,,,christian eriksen,ajax,männlich,[ajax],,,Language.DUTCH
2,2,christian eriksen,christian eriksen,christian eriksen alter,3,0,,,christian eriksen,alter,männlich,[alter],,,Language.GERMAN
3,3,christian eriksen,christian eriksen,christian eriksen aktuelle teams,4,0,,,christian eriksen,aktuelle,männlich,[aktuelle],,,Language.GERMAN
3,3,christian eriksen,christian eriksen,christian eriksen aktuelle teams,4,0,,,christian eriksen,teams,männlich,[teams],,,Language.GERMAN
4,4,christian eriksen,christian eriksen,christian eriksen amsterdam,5,0,,,christian eriksen,amsterdam,männlich,[amsterdam],,,Language.DUTCH
5,5,christian eriksen,christian eriksen,christian eriksen aktueller verein,6,0,,,christian eriksen,aktueller,männlich,[aktueller],,,Language.GERMAN
5,5,christian eriksen,christian eriksen,christian eriksen aktueller verein,6,0,,,christian eriksen,verein,männlich,[verein],,,Language.GERMAN
6,6,christian eriksen,christian eriksen,christian eriksen accident,7,0,,,christian eriksen,accident,männlich,[accident],,,Language.ENGLISH
7,7,christian eriksen,christian eriksen,christian eriksen ajax amsterdam,8,0,,,christian eriksen,ajax,männlich,[ajax],,,Language.DUTCH


In [20]:
df.language.value_counts()

Language.GERMAN     21165
Language.ENGLISH     8394
Language.DUTCH       1374
Language.FRENCH      1181
Language.ITALIAN      733
Language.TURKISH      397
Language.SPANISH      217
Language.POLISH       140
Name: language, dtype: int64

# | Linguistic Processing

## | Lemmatization

In [None]:
from nltk.stem import WordNetLemmatizer 
nltk.download('omw-1.4')
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
df["lemmata"] = df.apply(lambda row: [lemmatizer.lemmatize(word) for word in row["tokens"]], axis=1)

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/landsiedelj/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/landsiedelj/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Lemmatization with word type from https://www.machinelearningplus.com/nlp/lemmatization-examples-python/
# Lemmatize with POS Tag
from nltk.corpus import wordnet
lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)


df["lemmata_word_type"] = df.apply(lambda row: [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in row["tokens"]], axis=1)

In [None]:
df.head()

Unnamed: 0,id,root,source,target,rank,depth,grandparent,parent,source_add,target_add,tokens,pos,person,location,language,lemmata,lemmata_word_type
0,0,christian eriksen,christian eriksen,christian eriksen aktuell,1,0,,,christian eriksen,aktuell,[aktuell],"[(aktuell, NN)]",[],[],Language.GERMAN,[aktuell],[aktuell]
1,1,christian eriksen,christian eriksen,christian eriksen ajax,2,0,,,christian eriksen,ajax,[ajax],"[(ajax, NN)]",[],[],Language.DUTCH,[ajax],[ajax]
2,2,christian eriksen,christian eriksen,christian eriksen alter,3,0,,,christian eriksen,alter,[alter],"[(alter, NN)]",[],[],Language.GERMAN,[alter],[alter]
3,3,christian eriksen,christian eriksen,christian eriksen aktuelle teams,4,0,,,christian eriksen,aktuelle teams,"[aktuelle, teams]","[(aktuelle, NNS), (teams, NNS)]",[],[],Language.GERMAN,"[aktuelle, team]","[aktuelle, team]"
4,4,christian eriksen,christian eriksen,christian eriksen amsterdam,5,0,,,christian eriksen,amsterdam,[amsterdam],"[(amsterdam, NN)]",[],[],Language.DUTCH,[amsterdam],[amsterdam]


In [None]:
# Create list of target_add -> suggestion terms

suggestion_list = df['target_add'].drop_duplicates().tolist()
print(len(suggestion_list))

suggestion_list[:10]

4201


['aktuell',
 'ajax',
 'alter',
 'aktuelle teams',
 'amsterdam',
 'aktueller verein',
 'accident',
 'ajax amsterdam',
 'all cards',
 'attack']

In [None]:
textfile = open('/Users/landsiedelj/Downloads/suggestion_list.txt', 'w', encoding='utf-8')
for element in suggestion_list:
    textfile.write(element + '\n')
textfile.close()

# | Vectorisierung

## | SpaCy

In [None]:
# Testing Vectorization with a subset
df_2 = df.copy()[:5000]
token_list = df_2.tokens
# Creating Single Terms
token_list = [a for b in token_list for a in b]
token_list = token_list[:5000]

In [None]:
# Creating Vector array for our subset
doc = list(nlp.pipe(token_list, disable=['parser', 'tagger', 'ner']))
vectors = [term.vector for term in doc]