# Subjektinversion im Spanischen
## Eine Korpusstudie

In [None]:
import pyconll
import csv 

In [1]:
import pandas as pd
import numpy as np

### Kandidatensätze extrahieren

In [4]:
# Corpus: load the CoNLL-U file with pyconll
# (the file name suggests the AnCora UD dev split — verify)
sent_file = pyconll.load_from_file('es_ancora-ud-dev.conllu')

In [51]:
# Extract candidate sentences from the corpus

def _governor_id(token):
    """Return the id of the token that governs *token*.

    The first key of the ``deps`` mapping is used when that mapping is
    non-empty. When it is empty (no enhanced dependencies annotated),
    the basic ``head`` attribute is used instead of failing.
    """
    if token.deps:
        return next(iter(token.deps))
    return token.head


def get_vs_sents(sents, prodrop=True):
    """Collect sentences that are candidates for verb-subject (VS) order.

    For each sentence, the root token is taken as the verb if its UPOS is
    ``VERB`` or ``AUX`` (if several tokens qualify, the last one wins), and
    all tokens with the relation ``nsubj`` are collected as subjects.

    Args:
        sents: Iterable of sentences. Each sentence is iterable over tokens
            exposing ``id``, ``form``, ``upos``, ``deprel``, ``deps`` and
            ``head``, and has a ``text`` attribute.
        prodrop: If true, also report sentences in which no ``nsubj`` token
            is governed by the root verb (null-subject candidates).

    Returns:
        A list of ``(n, pair, text)`` tuples, where ``n`` is the position of
        the sentence in ``sents`` and ``text`` is ``sentence.text``.
        For a VS candidate ``pair`` is ``(subject form, verb form)``; for a
        null-subject candidate it is ``(verb form, "")``.
        Sentences without a verbal root are never reported.
    """
    vs_sentences = []

    for n, sentence in enumerate(sents):
        subjects = []
        verb = None
        for token in sentence:
            if token.deprel == "nsubj":
                subjects.append(token)
            if token.deprel == "root" and token.upos in ("VERB", "AUX"):
                verb = token

        # Without a verbal root there is nothing to compare positions against.
        if not verb:
            continue

        governors = [_governor_id(subj) for subj in subjects]

        # First subject that depends on the root verb and follows it linearly.
        vs_pair = None
        for subj, governor in zip(subjects, governors):
            if governor == verb.id and int(subj.id) > int(verb.id):
                vs_pair = (subj.form, verb.form)
                break

        if vs_pair:
            vs_sentences.append((n, vs_pair, sentence.text))
        elif prodrop and verb.id not in governors:
            # No subject is attached to the root verb (this includes the
            # case of no subject at all): null-subject candidate.
            vs_sentences.append((n, (verb.form, ""), sentence.text))

    return vs_sentences


    

In [52]:
# Extract the candidate sentences from the loaded corpus
# (prodrop is left at its default, so null-subject candidates are included)
vs_sents = get_vs_sents(sent_file)

In [53]:
# Number of extracted candidate sentences
len(vs_sents)

441

In [58]:
# Structure of the annotation in the corpus: print one sentence in CoNLL-U form
# NOTE(review): `train` is not defined in any visible cell (the corpus above is
# loaded as `sent_file`) — confirm where `train` comes from.
print(train[125].conll())

# sent_id = 3LB-CAST-d2-6-s12
# text = Muy entero debe de encontrarse para verter sobre el asfalto unas energías a las que deberá apelar en los momentos importantes que aún quedan por desvelar sus rudos secretos (Valdezcaray, Covadonga, Avila y Segovia).
# orig_file_sentence 002#26
1	Muy	mucho	ADV	rg	_	5	advmod	5:advmod	_
2	entero	entero	ADJ	aq0ms0	Gender=Masc|Number=Sing	5	xcomp	5:xcomp	ArgTem=arg2:atr
3	debe	deber	AUX	vmip3s0	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin	5	aux	5:aux	_
4	de	de	ADP	sps00	_	2	case	2:case	_
5-6	encontrarse	_	_	_	_	_	_	_	_
5	encontrar	encontrar	VERB	vmn0000	VerbForm=Inf	0	root	0:root	_
6	se	él	PRON	_	Case=Acc|Person=3|PrepCase=Npr|PronType=Prs|Reflex=Yes	5	expl:pv	5:expl:pv	_
7	para	para	ADP	sps00	_	8	mark	8:mark	_
8	verter	verter	VERB	vmn0000	VerbForm=Inf	2	advcl	2:advcl	ArgTem=argM:adv
9	sobre	sobre	ADP	sps00	_	11	case	11:case	_
10	el	el	DET	da0ms0	Definite=Def|Gender=Masc|Number=Sing|PronType=Art	11	det	11:det	_
11	asfalto	asfalto	NOUN	ncms000	

In [55]:
# Save the candidate sentences as a CSV file
# newline='' hands line-ending control to the csv module; without it, platforms
# that translate '\n' on write end up with an empty line after every row.
# NOTE(review): no explicit encoding is given, so the platform default is used —
# confirm this is what the downstream (hand-labeling) step expects.
with open('output/vs_sents.csv', 'w', newline='') as out:
    csv_out = csv.writer(out)
    csv_out.writerow(['num', 'pairs', 'text'])
    csv_out.writerows(vs_sents)

            

In [None]:
# Save all sentences as a CSV file (running index and sentence text)
# newline='' hands line-ending control to the csv module; without it, platforms
# that translate '\n' on write end up with an empty line after every row.
# NOTE(review): `train` is not defined in any visible cell — confirm its origin.
with open('output/all_sents.csv', 'w', newline='') as out:
    csv_out = csv.writer(out)
    csv_out.writerow(['n', 'text'])
    csv_out.writerows((n, row.text) for n, row in enumerate(train))

### Subjekttyp-Tabelle

In [33]:
# Read the hand-labeled Excel file (first sheet)
df_labeled = pd.read_excel("output/vs_sents_labeled.xlsx", sheet_name=0)

In [22]:
# Show the hand-labeled table
display(df_labeled)

Unnamed: 0,num,pairs,text,prelim,type,prop_cat,verb_type,noun_type,comment
0,2,"('entró', '')",Desde entonces entró en silencio absoluto.,sv,,,,,
1,4,"('ministro', 'suelen')",Por su boca suelen hablar de vez en cuando tan...,vs,2.0,UEI,ue,dpd,
2,5,"('sabe', '')",Por la Cancillería se sabe que la gira preside...,0,,,,,impersonell
3,13,"('Hurtado', 'declaró')","""El pueblo puede estar seguro de que aquí no e...",vs,1.0,QI,t,dpd,
4,14,"('Añadió', '')",Añadió que desconoce la procedencia de los rum...,sv,,,,,
...,...,...,...,...,...,...,...,...,...
436,1643,"('dijo', '')",La ley da para mucho y la voluntad del pueblo ...,vs,1.0,QI,t,n,
437,1645,"('dijo', '')","""Hemos hecho un acuerdo porque creemos que es ...",vs,1.0,QI,t,n,
438,1648,"('Arzalluz', 'dijo')","""En el PNV tenemos derecho a pensar de él lo q...",vs,1.0,QI,t,dpd,
439,1651,"('Citó', '')",Citó un texto del presidente en el diario Nuev...,sv,,,,,


In [23]:
# Number of VS sentences (rows whose "prelim" label is "vs")
len(df_labeled[df_labeled["prelim"]=="vs"])

211

In [25]:
# Number of type-1 sentences (VS rows with "type" equal to 1)
len(df_labeled[(df_labeled["prelim"]=="vs") & (df_labeled["type"]==1)])

115

In [26]:
# Number of quotative-inversion sentences (type-1 VS rows with "prop_cat" equal to "QI")
len(df_labeled[(df_labeled["prelim"]=="vs") & (df_labeled["type"]==1) & (df_labeled["prop_cat"]=="QI")])

106

In [27]:
# Number of type-2 sentences (VS rows with "type" equal to 2)
len(df_labeled[(df_labeled["prelim"]=="vs") & (df_labeled["type"]==2)])

96

In [28]:
# Number of VS sentences per category
# The categories are sorted so that the printed order is the same on every run;
# iterating a plain set of strings has no stable order across interpreter sessions.
# Missing values are dropped before sorting.
# NOTE(review): sorting assumes all category labels are mutually comparable (e.g. all strings).
for c in sorted({cat for cat in df_labeled["prop_cat"] if not pd.isna(cat)}):
    print(c)
    print(len(df_labeled[(df_labeled["prelim"]=="vs") & (df_labeled["prop_cat"]==c)]))
        

UAI
47
TI
14
NF
2
UEI
23
F
3
QI
106
LI
9
WH
5
FF
1
QF
1


In [83]:
# Table with the absolute count and percentage share of subject types per inversion type

# Distinct non-missing subject-type labels in sorted order, followed by a "total" row
noun_types = sorted({label for label in df_labeled["noun_type"] if not pd.isna(label)})
noun_types.append("total")

index = pd.MultiIndex.from_tuples([
    ("Art von Subjekt", ""),
    ("Typ 1", "absolut"),
    ("Typ 1", "%"),
    ("Typ 2", "absolut"),
    ("Typ 2", "%"),
])
type_df = pd.DataFrame(columns=index)
type_df["Art von Subjekt", ""] = noun_types
for inv_type in (1, 2):
    # All rows labeled with this inversion type; their number is the "total" entry
    rows_of_type = df_labeled[df_labeled["type"] == inv_type]
    total = len(rows_of_type)
    counts = [
        len(rows_of_type[rows_of_type["noun_type"] == label])
        for label in noun_types[:-1]  # every label except the trailing "total"
    ]
    counts.append(total)
    column = f"Typ {inv_type}"
    type_df[column, "absolut"] = counts
    # Share of each count (and of the total itself) relative to the total, in percent
    type_df[column, "%"] = [(count / total) * 100 for count in counts]
type_df = type_df.round(2)
display(type_df)

Unnamed: 0_level_0,Art von Subjekt,Typ 1,Typ 1,Typ 2,Typ 2
Unnamed: 0_level_1,Unnamed: 1_level_1,absolut,%,absolut,%
0,dpb,1,0.87,5,5.21
1,dpd,74,64.35,55,57.29
2,dpid,7,6.09,24,25.0
3,n,33,28.7,0,0.0
4,pd,0,0.0,3,3.12
5,pid,0,0.0,2,2.08
6,s,0,0.0,7,7.29
7,total,115,100.0,96,100.0


In [84]:
# Replace the abbreviations with full names
# NOTE(review): the list is assigned by position, so it presumably has to match
# the row order of type_df (sorted abbreviations followed by "total") — confirm
# whenever the set of labels in the data changes.
noun_types_full = ["bloße Nomen", "DPs definit", "DPs indefinit", "Nullsubjekte", 
                   "Pronomen definit", "Pronomen indefinit", "sonstige", "Summe"]
type_df["Art von Subjekt", ""] = noun_types_full

In [86]:
# Save the table as a CSV file
type_df.to_csv("output/type_df.csv")

In [87]:
# LaTeX table
# The caption is a raw string: "\l" is not a valid Python escape sequence, so the
# non-raw literal only works by accident and triggers a warning on newer Pythons.
# The resulting string value is unchanged.
print(type_df.to_latex(index=False, caption=r"\label{nttable}Arten von Subjekten nach Inversionstyp"))

\begin{table}
\centering
\caption{\label{nttable}Arten von Subjekten nach Inversionstyp}
\begin{tabular}{lrrrr}
\toprule
    Art von Subjekt & \multicolumn{2}{l}{Typ 1} & \multicolumn{2}{l}{Typ 2} \\
                    & absolut &       \% & absolut &       \% \\
\midrule
        bloße Nomen &       1 &    0.87 &       5 &    5.21 \\
        DPs definit &      74 &   64.35 &      55 &   57.29 \\
      DPs indefinit &       7 &    6.09 &      24 &   25.00 \\
       Nullsubjekte &      33 &   28.70 &       0 &    0.00 \\
   Pronomen definit &       0 &    0.00 &       3 &    3.12 \\
 Pronomen indefinit &       0 &    0.00 &       2 &    2.08 \\
           sonstige &       0 &    0.00 &       7 &    7.29 \\
              Summe &     115 &  100.00 &      96 &  100.00 \\
\bottomrule
\end{tabular}
\end{table}

