# Nouns from Wiktionary

https://github.com/gambolputty/german-nouns

In [None]:
import pandas as pd

# Raise pandas' display limits to 500 columns and 5000 rows.
for option_name, limit in (('display.max_columns', 500),
                           ('display.max_rows', 5000)):
    pd.set_option(option_name, limit)


In [None]:
# Load the noun table from nouns.csv in the working directory.
# NOTE(review): low_memory=False turns off pandas' chunked parsing, presumably
# to avoid mixed-dtype columns — confirm against the CSV.
df = pd.read_csv('nouns.csv', low_memory=False)
# Preview ten randomly chosen rows.
df.sample(10)

In [None]:
# Show the 'nominativ plural' column.
df['nominativ plural']

In [None]:
# Show the rows whose lemma is exactly 'Verkauf'.
df[df['lemma'] == 'Verkauf']

In [None]:
# Uncomment to list every column together with its positional index:
# for i, col in enumerate(df.columns):
#    print(i, col)
    
# Keep every column from position 16 onward.
# NOTE(review): presumably these are the inflected-form columns; the offset
# depends on the CSV layout — confirm with the commented-out listing above.
cols = df.columns[16:]
cols

In [None]:
# Replace missing values in the selected columns with empty strings.
df[cols] = df[cols].fillna('')
# Keep only the rows whose part of speech is 'Substantiv'.
df = df[df['pos'] == 'Substantiv']

In [None]:
import re

# Tokens containing a comma, hyphen, whitespace, period or digit are rejected.
_SPECIAL_CHARS = re.compile(r'[,\-\s\.0-9]')
# An uppercase ASCII letter after the first character marks an acronym-like token.
_INNER_UPPERCASE = re.compile(r'[A-Z]')


def _is_plain_token(token):
    """Return True if *token* is a string without specials or inner capitals.

    Non-string cells (e.g. NaN from missing CSV values) are rejected instead
    of raising a TypeError inside the regex search.
    """
    return (
        isinstance(token, str)
        and _SPECIAL_CHARS.search(token) is None
        # Searching from position 1 is equivalent to searching token[1:].
        and _INNER_UPPERCASE.search(token, 1) is None
    )


# Maps every accepted surface form (the lemma itself and its inflected forms)
# to its lemma.
nouns = {}

for _, row in df.iterrows():
    lemma = row['lemma']

    # skip tokens with specials and acronyms
    if not _is_plain_token(lemma):
        continue

    nouns[lemma] = lemma

    # add flex forms; single-character and empty cells are ignored
    for col in cols:
        flex = row[col]
        if _is_plain_token(flex) and len(flex) > 1:
            nouns[flex] = lemma

nouns

In [None]:
# Spot check: which lemma is stored for 'Haus'?
nouns['Haus']

In [None]:
# df = orig_df.copy()

In [None]:
# Show the rows whose lemma is exactly 'All'.
df[df['lemma'] == 'All']

In [None]:
# Count how often each part-of-speech value occurs.
df['pos'].value_counts()

# Correction of spaCy's German Lemmas

In [None]:
from spacy.lang.de.lemmatizer import LOOKUP
import sys

In [None]:
import json  # local import: quotes keys/values as valid string literals

# Write spaCy's German lookup table to lemmatizer_de.py, replacing the lemma of
# capitalised words with the noun lemma collected in `nouns` where the rule
# below applies. The `with` block closes the file even if writing fails.
with open("lemmatizer_de.py", "w", encoding='utf-8') as file:
    file.write("""# coding: utf8
from __future__ import unicode_literals

LOOKUP_DELTA = {\n""")

    for word, lemma in LOOKUP.items():
        try:
            # Override only capitalised words that spaCy maps to a lowercase
            # lemma, that have no lowercase entry of their own, and that are
            # present in the noun table.
            if word[0].isupper() and lemma.islower() and word.lower() not in LOOKUP and word in nouns:
                out_lemma = nouns[word]
            else:
                # All other entries are copied through unchanged.
                out_lemma = lemma

            # json.dumps escapes embedded quotes and backslashes so the
            # generated module stays importable; plain words are written
            # exactly as before ("word": "lemma",).
            file.write(f'    {json.dumps(word, ensure_ascii=False)}: '
                       f'{json.dumps(out_lemma, ensure_ascii=False)},\n')

        except Exception as e:
            # Report the offending entry and stop; the dict literal is still
            # terminated below.
            print(e)
            print(f'    "{word}": "{lemma}"')
            break

    file.write("}\n")
    # The test cell imports `LOOKUP` from this module, so expose the table
    # under that name as well (LOOKUP_DELTA is kept for existing users).
    file.write("\nLOOKUP = LOOKUP_DELTA\n")

In [None]:
# Scratch check: take the last two characters of a sample word.
word = "Banden"
word[-2:]

In [None]:
"Banden"[:-1] in LOOKUP

In [None]:
"Bande" in LOOKUP

# Test

In [None]:
import spacy

# Load the German pipeline via the 'de' shortcut name.
# NOTE(review): shortcut names are presumably only available on spaCy v2 —
# confirm before running against a newer install.
nlp = spacy.load('de')

In [None]:
from IPython.core.display import display, HTML
from tabulate import tabulate

def print_nlp(doc, include_punct=False):
    """Display one HTML table row of attributes per token of a spaCy doc.

    Punctuation tokens are left out unless ``include_punct`` is true.
    """
    column_names = ['text', 'lemma_', 'pos_', 'tag_', 'dep_',
                    'is_punct', 'is_alpha', 'is_stop', 'ent_type', 'ent_iob']

    table_rows = [
        (tok.text, tok.lemma_,
         tok.pos_, tok.tag_, tok.dep_,
         tok.is_punct, tok.is_alpha, tok.is_stop,
         tok.ent_type_, tok.ent_iob_)
        for tok in doc
        if include_punct or not tok.is_punct
    ]

    # tabulate produces the HTML markup; display() renders it in Jupyter
    html_table = tabulate(table_rows, headers=column_names, tablefmt='html')
    display(HTML(html_table))
    

In [None]:
from lemmatizer_de import LOOKUP as my_LOOKUP

# Swap the loaded pipeline's lemmatizer lookup table for the one imported
# from the generated lemmatizer_de module.
nlp.vocab.morphology.lemmatizer.lookup_table = my_LOOKUP




In [None]:
# Look up the lemma stored for 'Baum' in the LOOKUP table imported from spaCy.
LOOKUP['Baum']

In [None]:
def print_nlp(doc):
    """Print the non-punctuation tokens of *doc* on one line as text/lemma/POS."""
    parts = []
    for tok in doc:
        if tok.is_punct:
            continue
        parts.append(f"{tok}/{tok.lemma_}/{tok.pos_}")
    print(" ".join(parts))

# One sample sentence per line.
texts = """Dieser Gärtner wohnt im Haus.
"""

for text in texts.split("\n"):
    # The trailing newline yields an empty last element; skip blank lines so
    # no empty doc is parsed and printed.
    if not text.strip():
        continue
    doc = nlp(text)
    print_nlp(doc)

# spaCy 2.2

In [2]:
import sys

# Give this directory (presumably a local spaCy checkout) top priority on the
# import path, then list every search-path entry on its own line.
sys.path.insert(0, r'C:\Users\Jens\Documents\Development\github\jsalbr\spaCy')

print(*sys.path, sep="\n")

C:\Users\Jens\Documents\Development\github\jsalbr\spaCy
C:\Users\Jens\Documents\Development\github\jsalbr\spacy-lemmatizer-de-fix
C:\Users\Jens\Anaconda3\python37.zip
C:\Users\Jens\Anaconda3\DLLs
C:\Users\Jens\Anaconda3\lib
C:\Users\Jens\Anaconda3

C:\Users\Jens\AppData\Roaming\Python\Python37\site-packages
C:\Users\Jens\Anaconda3\lib\site-packages
C:\Users\Jens\Anaconda3\lib\site-packages\pandas_profiling-2.3.0-py3.7.egg
C:\Users\Jens\Anaconda3\lib\site-packages\confuse-1.0.0-py3.7.egg
C:\Users\Jens\Anaconda3\lib\site-packages\phik-0.9.8-py3.7.egg
C:\Users\Jens\Anaconda3\lib\site-packages\htmlmin-0.1.12-py3.7.egg
C:\Users\Jens\Anaconda3\lib\site-packages\missingno-0.4.2-py3.7.egg
C:\Users\Jens\Anaconda3\lib\site-packages\pytest_pylint-0.14.1-py3.7.egg
C:\Users\Jens\Anaconda3\lib\site-packages\defusedxml-0.6.0-py3.7.egg
C:\Users\Jens\Anaconda3\lib\site-packages\typed_ast-1.4.0-py3.7-win-amd64.egg
C:\Users\Jens\Anaconda3\lib\site-packages\win32
C:\Users\Jens\Anaconda3\lib\site-packages\

In [3]:
import spacy

ModuleNotFoundError: No module named 'spacy.symbols'

In [None]:
# Show which file the imported spacy package was loaded from.
spacy.__file__