In [1]:
import lxml
from lxml import etree
import spacy
import pandas as pd

In [2]:
# Parse the transcribed XML file into an lxml ElementTree.
# NOTE(review): absolute, machine-specific path — consider a configurable relative path.
tree = etree.parse("C:/Users/jO/Desktop/DH_MA_Arbeit/coding/vwdazb/transkribiert_vwdazb.xml")

In [3]:
# Prefix map for XPath queries: "t" stands for the TEI namespace.
ns = {"t":'http://www.tei-c.org/ns/1.0'}
# XML namespace in Clark notation, e.g. f"{xml}id" addresses the xml:id attribute.
# Not referenced by the cells visible in this notebook section.
xml = "{http://www.w3.org/XML/1998/namespace}"

# POS-Tagging: spaCy

German Language Models: https://spacy.io/models/de

## Modell: "de_core_news_lg"

In [4]:
# One-time setup: uncomment to download the model if it is not installed yet.
#spacy.cli.download("de_core_news_lg")

In [5]:
# Load the large German pipeline with the NER component disabled;
# the tagging function below only reads token.text and token.pos_.
nlp = spacy.load("de_core_news_lg", disable=["ner"])

In [6]:
# Collect every <lg> (line group) element of the document, at any depth.
linegroups = tree.xpath("//t:lg", namespaces=ns)

In [7]:
# Short identifier of the poem; it becomes part of the output file names.
poem = "vwdazb"

In [8]:
def postagging_lg(linegroups, nlp, poem):
    """POS-tag every text-bearing child of the given line groups and save the result.

    For each child element of each line group, the element's text is
    whitespace-normalised, passed to ``nlp`` and replaced in place by a
    space-separated string of ``token_/POS`` pairs. The modified document is
    written to ``output_text+pos_{poem}_lg.xml``, the flat token/POS table to
    ``output_text+pos_{poem}_lg.csv``, and the first ten table rows are printed.

    Args:
        linegroups: Iterable of lxml elements (the ``<lg>`` elements). Their
            children are modified in place.
        nlp: Callable that turns a string into an iterable of tokens exposing
            ``.text`` and ``.pos_`` (e.g. a loaded spaCy pipeline).
        poem: Identifier inserted into both output file names.

    Returns:
        None. The results are the two files and the printed preview.

    Note:
        Only each child's ``.text`` is read and replaced; text inside nested
        elements or in tails is left untouched. Children without text, or with
        whitespace-only text, are skipped and keep their original content.
    """
    pos_info = []  # one {'text', 'pos'} record per token, for the CSV
    document = None  # ElementTree that owns the processed elements

    for lg in linegroups:
        if document is None:
            # Write back the document the elements actually belong to instead
            # of relying on whatever a notebook-level `tree` currently holds.
            document = lg.getroottree()

        for line in lg:
            if not line.text:
                continue  # element carries no text of its own

            # split() without arguments collapses every whitespace run,
            # including "\n" and "\r", so no separate replace() is needed.
            line_text = " ".join(line.text.split())
            if not line_text:
                continue  # whitespace-only text: nothing to tag, keep as is

            tagged_tokens = []  # "token_/POS" strings for the XML
            for token in nlp(line_text):
                tagged_tokens.append(f"{token.text}_/{token.pos_}")
                pos_info.append({
                    'text': token.text,
                    'pos': token.pos_})

            # Assign inside the per-line loop so that every line receives its
            # own tagged string (not only the last child of the line group).
            line.text = " ".join(tagged_tokens)

    if document is None:
        # No line groups were given, so nothing was modified; fall back to the
        # notebook-level tree to still produce the XML output file.
        document = tree

    document.write(f"output_text+pos_{poem}_lg.xml", encoding="utf-8", xml_declaration=True)

    pos_info_df = pd.DataFrame(pos_info)
    pos_info_df.to_csv(f"output_text+pos_{poem}_lg.csv", index=False)
    print(pos_info_df.head(10))
    

In [9]:
# Run the tagging with the de_core_news_lg pipeline; the function writes the
# XML and CSV output files and prints the first ten token rows.
postagging_lg(linegroups, nlp, poem)

        text    pos
0        der    DET
1     Körper   NOUN
2       will    AUX
3       sich   PRON
4         ja    ADV
5  schließen   VERB
6          ,  PUNCT
7      rufst   VERB
8         du   PRON
9          ,  PUNCT


## Modell: de_dep_news_trf

In [10]:
# One-time setup: uncomment to download the model if it is not installed yet.
#spacy.cli.download("de_dep_news_trf")

In [11]:
# Re-parse the source file so this model starts from the untagged document;
# the previous tagging run modified element text of its tree in place.
# NOTE(review): absolute, machine-specific path — consider a configurable relative path.
tree = etree.parse("C:/Users/jO/Desktop/DH_MA_Arbeit/coding/vwdazb/transkribiert_vwdazb.xml")

In [12]:
# Load the de_dep_news_trf pipeline; no `disable` argument is passed here,
# unlike the de_core_news_lg cell.
nlp = spacy.load("de_dep_news_trf")

In [13]:
# Select the <lg> (line group) elements of the freshly parsed tree.
linegroups = tree.xpath("//t:lg", namespaces=ns)

In [14]:
# Short identifier of the poem; it becomes part of the output file names.
poem = "vwdazb"

In [15]:
def postagging_trf(linegroups, nlp, poem):
    """POS-tag every text-bearing child of the given line groups and save the result.

    For each child element of each line group, the element's text is
    whitespace-normalised, passed to ``nlp`` and replaced in place by a
    space-separated string of ``token_/POS`` pairs. The modified document is
    written to ``output_text+pos_{poem}_trf.xml``, the flat token/POS table to
    ``output_text+pos_{poem}_trf.csv``, and the first ten table rows are printed.

    Args:
        linegroups: Iterable of lxml elements (the ``<lg>`` elements). Their
            children are modified in place.
        nlp: Callable that turns a string into an iterable of tokens exposing
            ``.text`` and ``.pos_`` (e.g. a loaded spaCy pipeline).
        poem: Identifier inserted into both output file names.

    Returns:
        None. The results are the two files and the printed preview.

    Note:
        Only each child's ``.text`` is read and replaced; text inside nested
        elements or in tails is left untouched. Children without text, or with
        whitespace-only text, are skipped and keep their original content.
    """
    pos_info = []  # one {'text', 'pos'} record per token, for the CSV
    document = None  # ElementTree that owns the processed elements

    for lg in linegroups:
        if document is None:
            # Write back the document the elements actually belong to instead
            # of relying on whatever a notebook-level `tree` currently holds.
            document = lg.getroottree()

        for line in lg:
            if not line.text:
                continue  # element carries no text of its own

            # split() without arguments collapses every whitespace run,
            # including "\n" and "\r", so no separate replace() is needed.
            line_text = " ".join(line.text.split())
            if not line_text:
                continue  # whitespace-only text: nothing to tag, keep as is

            tagged_tokens = []  # "token_/POS" strings for the XML
            for token in nlp(line_text):
                tagged_tokens.append(f"{token.text}_/{token.pos_}")
                pos_info.append({
                    'text': token.text,
                    'pos': token.pos_})

            # Assign inside the per-line loop so that every line receives its
            # own tagged string (not only the last child of the line group).
            line.text = " ".join(tagged_tokens)

    if document is None:
        # No line groups were given, so nothing was modified; fall back to the
        # notebook-level tree to still produce the XML output file.
        document = tree

    document.write(f"output_text+pos_{poem}_trf.xml", encoding="utf-8", xml_declaration=True)

    pos_info_df = pd.DataFrame(pos_info)
    pos_info_df.to_csv(f"output_text+pos_{poem}_trf.csv", index=False)
    print(pos_info_df.head(10))
    

In [16]:
# Run the tagging with the de_dep_news_trf pipeline; the function writes the
# XML and CSV output files and prints the first ten token rows.
postagging_trf(linegroups, nlp, poem)

        text    pos
0        der    DET
1     Körper   NOUN
2       will    AUX
3       sich   PRON
4         ja    ADV
5  schließen   VERB
6          ,  PUNCT
7      rufst   VERB
8         du   PRON
9          ,  PUNCT


## Modell: spacy_udpipe "de-hdt"

In [17]:
import spacy_udpipe

In [18]:
# One-time setup: uncomment to download the model if it is not installed yet.
#spacy_udpipe.download("de-hdt")

In [19]:
# Load the "de-hdt" model through spacy_udpipe; the tagging function below
# calls it on a string and reads .text and .pos_ from the resulting tokens.
nlp = spacy_udpipe.load("de-hdt")

In [20]:
# Re-parse the source file so this model starts from the untagged document;
# the previous tagging run modified element text of its tree in place.
# NOTE(review): absolute, machine-specific path — consider a configurable relative path.
tree = etree.parse("C:/Users/jO/Desktop/DH_MA_Arbeit/coding/vwdazb/transkribiert_vwdazb.xml")

In [21]:
# Select the <lg> (line group) elements of the freshly parsed tree, reusing the
# shared TEI prefix map `ns` so all three model sections run the same query.
linegroups = tree.xpath("//t:lg", namespaces=ns)

In [22]:
# Short identifier of the poem; it becomes part of the output file names.
poem = "vwdazb"

In [23]:
def postagging_udp(linegroups, nlp, poem):
    """POS-tag every text-bearing child of the given line groups and save the result.

    For each child element of each line group, the element's text is
    whitespace-normalised, passed to ``nlp`` and replaced in place by a
    space-separated string of ``token_/POS`` pairs. The modified document is
    written to ``output_text+pos_{poem}_udp.xml``, the flat token/POS table to
    ``output_text+pos_{poem}_udp.csv``, and the first ten table rows are printed.

    Args:
        linegroups: Iterable of lxml elements (the ``<lg>`` elements). Their
            children are modified in place.
        nlp: Callable that turns a string into an iterable of tokens exposing
            ``.text`` and ``.pos_`` (e.g. a loaded spacy_udpipe model).
        poem: Identifier inserted into both output file names.

    Returns:
        None. The results are the two files and the printed preview.

    Note:
        Only each child's ``.text`` is read and replaced; text inside nested
        elements or in tails is left untouched. Children without text, or with
        whitespace-only text, are skipped and keep their original content.
    """
    pos_info = []  # one {'text', 'pos'} record per token, for the CSV
    document = None  # ElementTree that owns the processed elements

    for lg in linegroups:
        if document is None:
            # Write back the document the elements actually belong to instead
            # of relying on whatever a notebook-level `tree` currently holds.
            document = lg.getroottree()

        for line in lg:
            if not line.text:
                continue  # element carries no text of its own

            # split() without arguments collapses every whitespace run,
            # including "\n" and "\r", so no separate replace() is needed.
            line_text = " ".join(line.text.split())
            if not line_text:
                continue  # whitespace-only text: nothing to tag, keep as is

            tagged_tokens = []  # "token_/POS" strings for the XML
            for token in nlp(line_text):
                tagged_tokens.append(f"{token.text}_/{token.pos_}")
                pos_info.append({
                    'text': token.text,
                    'pos': token.pos_})

            # Assign inside the per-line loop so that every line receives its
            # own tagged string (not only the last child of the line group).
            line.text = " ".join(tagged_tokens)

    if document is None:
        # No line groups were given, so nothing was modified; fall back to the
        # notebook-level tree to still produce the XML output file.
        document = tree

    document.write(f"output_text+pos_{poem}_udp.xml", encoding="utf-8", xml_declaration=True)

    pos_info_df = pd.DataFrame(pos_info)
    pos_info_df.to_csv(f"output_text+pos_{poem}_udp.csv", index=False)
    print(pos_info_df.head(10))

In [24]:
# Run the tagging with the spacy_udpipe "de-hdt" model; the function writes the
# XML and CSV output files and prints the first ten token rows.
postagging_udp(linegroups, nlp, poem)

        text    pos
0        der    DET
1     Körper   NOUN
2       will    AUX
3       sich   PRON
4         ja    ADV
5  schließen   VERB
6          ,  PUNCT
7      rufst    ADV
8         du   PRON
9          ,  PUNCT
