Creation of output files for further analysis

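Requirements:

- pandas == 1.4.1
- also imported below (versions not pinned): numpy, regex, spaCy with the `de_core_news_lg` model, NLTK, tqdm, lingua, pandarallel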

# | Preliminaries

In [None]:
import pandas as pd
import pickle
import numpy as np
import regex as re

# spaCy - Named Entity, Vectorization
import spacy
nlp = spacy.load("de_core_news_lg")
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span
from spacy.tokens import Doc

# Tokenization
import nltk
from nltk.tokenize import RegexpTokenizer
nltk.download('averaged_perceptron_tagger')

# QOL
from tqdm import tqdm

# Language Detection

from lingua import Language, LanguageDetectorBuilder

import warnings
warnings.filterwarnings('ignore')


from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)


In [None]:
# Load the two inputs: the pickled export (judging by its file name, already de-duplicated)
# and the Wikipedia person spreadsheet.
# NOTE(review): both paths are absolute and user-specific, so this cell only runs unchanged
# on a machine with the same directory layout — consider a configurable data directory.
# NOTE(review): read_pickle can execute arbitrary code while loading; only open trusted files.
df = pd.read_pickle('/Users/landsiedelj/Downloads/export_no_duplicates.pickle')
df_person = pd.read_excel('/Users/landsiedelj/Downloads/WikiPersonen.xlsx')

# | Cleaning

In [None]:
# Report (rows, columns) of both inputs, export first.
for frame in (df, df_person):
    print(frame.shape)

In [None]:
# Frequency tables for the search engine and the depth column.
for column in ('search_engine', 'depth'):
    print(df[column].value_counts())

In [None]:
# Cleaning df_person
# Original spreadsheet headers -> snake_case column names used in the rest of the notebook.
wiki_dictionary = {
    "Wikipedia-Eintrag": "wikipedia_eintrag",
    "Name, Vorname": "name_vorname",
    "alle Berufe": "alle_berufe",
    "1. Beruf": "1_beruf",
    "Beruf - Oberkategorie": "beruf_oberkategorie",
    "Seitenaufrufe Wikipedia der letzten 12 Monate": "seitenaufrufe",
}

# rename() hands back a renamed copy, so the frame read from disk is not modified.
df_person = df_person.rename(columns=wiki_dictionary)

In [None]:
df_person.head(2)

In [None]:
df.head(2)

In [None]:
# Unify terms
# English / abbreviated suggestion terms and the German wording they are unified to.
_TARGET_ADD_TRANSLATIONS = {
    "cv": "lebenslauf",
    "vs": "versus",
    "bvb": "borussia dortmund",
    "cpr": "herz-lungen-wiederbelebung",
    "chancellor": "bundeskanzler",
    "acab": "alle polizisten sind bastarde",
    "city": "stadt",
    "age": "alter",
    "contemporary": "zeitgemäß",
    "concert": "konzert",
    "collection": "kollektion",  # was misspelled "kollection"
    "chords": "akkorde",
    "birth chart": "geburtshoroskop",
    "challenge": "herausforderung",
    "child": "kind",
    "analysis": "analyse",
    "book": "buch",
    "car": "auto",
    "cast": "besetzung",
    "closet": "kleiderschrank",
}

# One alternation over all terms, longest first, that only matches when the term is not
# embedded in a longer word (no word character directly before or after it).
_TARGET_ADD_PATTERN = re.compile(
    r"(?<!\w)(?:"
    + "|".join(re.escape(term) for term in sorted(_TARGET_ADD_TRANSLATIONS, key=len, reverse=True))
    + r")(?!\w)"
)


def clean_target_add(x):
    """Unify known English/abbreviated terms in a suggestion to their German wording.

    Only whole terms are replaced. Plain substring replacement rewrote unrelated words
    (e.g. "garage" -> "garalter", "facebook" -> "facebuch", "oscar" -> "osauto").
    Matching is case-sensitive and every term is replaced in a single pass, so a
    replacement is never rewritten again by a later rule.

    Parameters
    ----------
    x : object
        Raw suggestion value; it is converted with ``str()`` first, so non-string
        values (e.g. ``None``) come back as their string representation.

    Returns
    -------
    str
        The suggestion with all whole-term matches translated.
    """
    return _TARGET_ADD_PATTERN.sub(
        lambda match: _TARGET_ADD_TRANSLATIONS[match.group(0)],
        str(x),
    )
        


# Run the term unification over every suggestion (parallelised by pandarallel).
df['target_add'] = df['target_add'].parallel_apply(clean_target_add)

# Correct the misspelled root entity "bastian yottta". The search text contains no regex
# metacharacters, so a plain substring replacement does the job.
df['root'] = df['root'].apply(lambda value: str(value).replace("bastian yottta", "bastian yotta"))

In [None]:
def clean_wiki_eintrag(x):
    """Return ``x`` as a string with "Jérôme Boateng" rewritten to the unaccented spelling."""
    text = str(x)
    return text.replace("Jérôme Boateng", "Jerome Boateng")

df_person['wikipedia_eintrag'] = df_person['wikipedia_eintrag'].apply(clean_wiki_eintrag)

In [None]:
# Cleaning "Beruf_Oberkategorie" for better readability

# Three-letter occupation category codes -> readable German labels.
di = {
    "ADE": "Adel",
    "SEC": "Sicherheitsdienste",
    "EKM": "Entertainment",
    "POL": "Politik",
    "REL": "Religion",
    "SPO": "Sport",
    "CRI": "Verbrechen",
    "ECO": "Wirtschaft",
    "WIS": "Wissen",
}

# Assign the result back to the column. Calling replace(..., inplace=True) on the selected
# column is a chained in-place operation: it modifies whatever object the selection returned,
# which is not guaranteed to be the frame's own data (and is not under copy-on-write).
df_person["beruf_oberkategorie"] = df_person["beruf_oberkategorie"].replace(di)

In [None]:
# The datetime column is not used afterwards, so continue with a frame that lacks it.
df = df.drop(columns=['datetime'])

In [None]:
# For each text column keep only the part after the last comma and remove the
# whitespace in front of it.
for column in ('target_add', 'source_add', 'root'):
    df[column] = df[column].str.split(',').str[-1].str.lstrip()

In [None]:
# Keep only the text after the last comma of each Wikipedia entry, without leading whitespace.
entry_tail = df_person['wikipedia_eintrag'].str.rsplit(',', n=1).str[-1]
df_person['wikipedia_eintrag'] = entry_tail.str.lstrip()

### Add gender if the root entity is listed in the Excel sheet

In [None]:
# Delete everything in parentheses. regex=True is passed explicitly: the pattern is a regular
# expression, and pandas >= 2.0 treats str.replace patterns as literal text by default, which
# would silently leave the entries unchanged.
df_person['wikipedia_eintrag'] = df_person['wikipedia_eintrag'].str.replace(r'\(.*\)', '', regex=True)
# Convert every column to lower-case strings.
# NOTE(review): this also turns UID and any missing values into strings (e.g. "nan"); the
# UID-based lookups that follow only match if df.UID holds the same string form — confirm.
df_person = df_person.apply(lambda x: x.astype(str).str.lower())
# Remove the trailing whitespace left behind where a parenthesised suffix was deleted.
df_person['wikipedia_eintrag'] = df_person['wikipedia_eintrag'].str.rstrip()

In [None]:
# Look up each row's UID in the Wikipedia person data and store the matching gender in a
# new column; UIDs without a match end up as missing values.
uid_to_gender = df_person.set_index('UID')['Geschlecht'].to_dict()
df['gender'] = df['UID'].map(uid_to_gender)

In [None]:
# Distribution of the mapped gender values, followed by the number of rows without one.
gender_column = df['gender']
print(gender_column.value_counts())
print(gender_column.isnull().sum())

### Add occupation if the root entity is listed in the Excel sheet

In [None]:
# Same UID lookup as for gender, this time for the occupation category.
uid_to_occupation = df_person.set_index('UID')['beruf_oberkategorie'].to_dict()
df['occupation'] = df['UID'].map(uid_to_occupation)

In [None]:
# Split each suggestion at single spaces and give every resulting word its own row;
# the other columns (and the index label) are repeated for each word.
df['target_add'] = df['target_add'].str.split(' ')
df = df.explode('target_add')

# Tokenization

In [None]:
# With RegexpTokenizer nltk module ->  take only tokens from words and numbers
from tqdm import tqdm
tqdm.pandas()  # registers progress_apply on pandas objects
tokenizer = RegexpTokenizer(r'\w+')

# Apply on the column instead of row-wise over the whole frame: only target_add is needed,
# so building a Series per row was pure overhead.
# str() is applied *before* lower(); wrapped around the result of lower() it protected
# nothing, and any non-string value would have raised AttributeError.
df["tokens"] = df["target_add"].progress_apply(lambda text: tokenizer.tokenize(str(text).lower()))

# | Language Detection

In [None]:
# Build a detector restricted to nine candidate languages and label every value of the
# `target` column with the detected language (parallelised by pandarallel).
languages = [
    Language.ENGLISH,
    Language.FRENCH,
    Language.GERMAN,
    Language.SPANISH,
    Language.TURKISH,
    Language.DUTCH,
    Language.ITALIAN,
    Language.POLISH,
    Language.RUSSIAN,
]
detector = LanguageDetectorBuilder.from_languages(*languages).build()
df['language'] = df['target'].parallel_apply(lambda text: detector.detect_language_of(text))

In [None]:
df.head(2)

In [None]:
df.language.value_counts()

In [None]:
# Keep only the rows detected as German. The explicit copy makes `df` an independent frame,
# so the column assignments that follow do not write to a slice of the unfiltered data.
df = df[df['language'] == Language.GERMAN].copy()

# | Linguistic Processing

## | Lemmatization

In [None]:
from nltk.stem import WordNetLemmatizer
# Corpora the WordNet lemmatizer needs at runtime.
for resource in ('omw-1.4', 'wordnet'):
    nltk.download(resource)
lemmatizer = WordNetLemmatizer()


def _lemmatize_row(row):
    """Lemmatize every token of one row without passing an explicit POS tag."""
    return [lemmatizer.lemmatize(word) for word in row["tokens"]]


# NOTE(review): WordNet is an English lexical resource, while the rows kept at this point
# were detected as German — confirm that this lemmatizer is the intended one.
df["lemmata"] = df.apply(_lemmatize_row, axis=1)

In [None]:
# Lemmatization with word type from https://www.machinelearningplus.com/nlp/lemmatization-examples-python/
# Lemmatize with POS Tag
from nltk.corpus import wordnet
lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(word):
    """Tag a single word with nltk and translate the tag for WordNetLemmatizer.

    Only the first letter of the tag is inspected (upper-cased): J -> adjective,
    N -> noun, V -> verb, R -> adverb. Every other tag falls back to noun.
    """
    tagged = nltk.pos_tag([word])
    first_tag = tagged[0][1]
    initial = first_tag[0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and all unrecognised tags map to noun.
    return wordnet.NOUN


# Lemmatize every token of each row, passing the POS class returned by get_wordnet_pos for
# that token; rows are processed in parallel by pandarallel.
df["lemmata_word_type"] = df.parallel_apply(lambda row: [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in row["tokens"]], axis=1)

# Output

In [None]:
# Collect every distinct target_add value (the suggestion terms) in order of first appearance.
unique_terms = df['target_add'].drop_duplicates()
suggestion_list = unique_terms.tolist()

print(len(suggestion_list))

# Preview of the first ten terms.
suggestion_list[:10]

In [None]:
# Write one suggestion term per line (UTF-8). The with-block closes the file even if a write
# fails part-way through, which the manual open()/close() pair did not guarantee.
with open('/Users/landsiedelj/Downloads/suggestion_list.txt', 'w', encoding='utf-8') as textfile:
    textfile.writelines(element + '\n' for element in suggestion_list)

In [None]:
# Export the frame as JSON Lines (one JSON object per row); the relative path resolves
# against the current working directory.
df.to_json('export_incl_gender_cleaned_occupation_NEU.jsonl', lines=True, orient='records')

In [None]:
# Also persist the frame as a pickle; the relative path resolves against the current
# working directory.
df.to_pickle('export_incl_gender_cleaned_occupation_NEU.pkl')

# The file is stored here: https://th-koeln.sciebo.de/s/M3q60FEe0i5bduT

# | Vectorization

## | SpaCy

In [None]:
# Testing Vectorization with a subset
SUBSET_SIZE = 5000  # number of rows, and afterwards of single terms, used for the trial run

# Slice first and copy only the slice; copying the whole frame just to keep its first rows
# duplicated all data for nothing.
df_2 = df.iloc[:SUBSET_SIZE].copy()

# Creating Single Terms: flatten the per-row token lists and cap the result.
token_list = [token for tokens in df_2.tokens for token in tokens][:SUBSET_SIZE]

In [None]:
# Run every term of the subset through the spaCy pipeline (parser, tagger and NER switched
# off) and collect the vector of each resulting document.
disabled_components = ['parser', 'tagger', 'ner']
doc = [parsed for parsed in nlp.pipe(token_list, disable=disabled_components)]

vectors = []
for parsed in doc:
    vectors.append(parsed.vector)