# General approach
- Make all terms lowercase
- Identify German nouns with a list and convert these to upper case


## Step 1: Read Excel and convert everything in the 'object' column to  lower case


In [1]:
import pandas as pd

df = pd.read_excel('ELSST_R3_DE.xlsx')
print(len(df.index))

count = range(1,len(df.index))

for i in count:
    old = df['object'][i]
    new = old.lower()
    df = df.replace(old,new)

df.to_excel('forELSSTGroup\\GER_ELSST_lowercase.xlsx', sheet_name='Sheet 1')


241


## Step 2 - Compile a list of nouns and verbs
I used the csv-list offered here: https://pypi.org/project/german-nouns/. It was compiled from WiktionaryDE. The list had a number of issues and I have done a lot of trimming to it to improve it based on the results I received. Among the issues were very short nouns, names of cities, etc. The outcome of this is a text file with each noun (including different forms) on an individual line. 
I ended up compiling a list of verbs too ensure that verbs weren't capizalized, but the results are sketchy. 
 

## Step 3 - Capitalize all single or first words in a field


In [2]:
import openpyxl

wb = openpyxl.load_workbook("forELSSTGroup\\GER_ELSST_lowercase.xlsx")
ws = wb["Sheet 1"]

for r in range(1,ws.max_row+1):
    label = ws.cell(r,3).value
    terms = ws.cell(r,4).value
    ws.cell(r,4).value = ws.cell(r,4).value.replace(terms[0], terms[0].capitalize(),1)

wb.save("forELSSTGroup\\GER_ELSST_firstword.xlsx")



## Step 4 - Capitalize all words in the list of nouns and NOT in the list of verbs


In [3]:
import openpyxl
import re

with open('wortliste/neue_nomenliste.txt', 'r', encoding="utf8") as f:
    nomen = [line.strip() for line in f]

with open('wortliste/neue_verben.txt', 'r', encoding="utf8") as f:
    verben = [line.strip() for line in f]    # to do replace ö,ü,ä,

wb = openpyxl.load_workbook("forELSSTGroup\\GER_ELSST_firstword.xlsx")
ws = wb["Sheet 1"]

for wort in nomen:
    for r in range(1,ws.max_row+1):            
        terms = ws.cell(r,4).value
        label = ws.cell(r,3).value
        for i in terms.split(' '):
            try: 
                item = re.search(r"\b[a-zäöüÄÖÜß]+\b",i).group()
            except AttributeError:
                continue
            if item == wort.lower() and item not in verben:
                ws.cell(r,4).value = re.sub(r"\b"+item, item.capitalize(),ws.cell(r,4).value)
                #print(f"replaced {item}")
wb.save('forELSSTGroup\\GER_ELSST_targetfile1.xlsx')

## Step 5 - Capitalize all words which end in typical noun suffixes (heit, keit, etc.)

In [6]:
import openpyxl
import re

wb = openpyxl.load_workbook("forELSSTGroup\\GER_ELSST_targetfile1.xlsx")
ws = wb["Sheet 1"]

heit = ["ing","heit","heiten", "keit","keiten", "ung", "ungen" ,"nis", "nisse","nissen", "schaft", "schaften", "tum","tümer"]

for endung in heit:
    endungen = endung + r"\b"
    for r in range(1,ws.max_row+1):
    #for r in range(1,50):
        terms = ws.cell(r,4).value
        for item in terms.split(' '):
            if not re.search('\\W',item):
                if item != item.capitalize():
                    #print(item)
                    match = re.search(endungen, item)
                    if match:
                        #print(item)
                        ws.cell(r,4).value = re.sub(item, item.capitalize(),ws.cell(r,4).value)
                        #print(ws.cell(r,4).value)
wb.save('forELSSTGroup\\GER_ELSST_targetfile2.xlsx') 


In [15]:
import openpyxl
import re
import nltk
from HanTa import HanoverTagger as ht

tagger = ht.HanoverTagger('morphmodel_ger.pgz')
terme = []
nomen = []

wb = openpyxl.load_workbook("forELSSTGroup\\GER_ELSST_targetfile2.xlsx")
ws = wb["Sheet 1"]

for r in range(1,ws.max_row+1):            
    terms = ws.cell(r,4).value
    for i in terms.split(' '):
        terme.append(i)

for item in terme:
    tokens = nltk.word_tokenize(item, language='german')
    tags = tagger.tag_sent(tokens)
    #print(tags)

    for i in tags:
        if 'NN' in i:
            nomen.append(i[0])

for wort in nomen:
    for r in range(1,ws.max_row+1):            
        terms = ws.cell(r,4).value
        for i in terms.split(' '):
            if i[:-1] == wort.lower():
                print(i)
                ws.cell(r,4).value = re.sub(i, i.capitalize(),ws.cell(r,4).value)
            elif i == wort.lower():
                print(i)
                ws.cell(r,4).value = re.sub(i, i.capitalize(),ws.cell(r,4).value)
wb.save('forELSSTGroup\\GER_ELSST_targetfile3.xlsx')


leistungsteste
leistungsteste
leistungsteste
leistungsteste
leistungstests
leistungstests
leistungstests
leistungstests
leistungsfeststellungsteste
leistungsstands
erfolgsmessgrösse.
rundwanderungen.
waehlen
waehlen
wahlverhalten
wahlverhalten
augenoptik
kosmetikdienste
