In [13]:
import pandas as pd
from collections import Counter

In [14]:
devfile = "Enriched_AnCora_conllu_train_docs.conllu" # Obtained from AINA PROJECT dataset 'ANCORA_ca v2' (https://zenodo.org/records/5509997)

In [15]:
df = pd.read_csv(devfile, sep='\t', header=None, names=['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC'])

In [16]:
# The dataset has rows with some metadata regarding each tokenized sentence in the corpus, then rows with eack processed token and the pattern repeats for the next sentence
df.head()

Unnamed: 0,ID,FORM,LEMMA,UPOS,XPOS,FEATS,HEAD,DEPREL,DEPS,MISC
0,# newdoc id = train_0,,,,,,,,,
1,# sent_id = train-s1,,,,,,,,,
2,# text = El Tribunal Suprem (TS) ha confirmat ...,,,,,,,,,
3,# orig_file_sentence 001#1,,,,,,,,,
4,1,El,el,DET,DET,Definite=Def|Gender=Masc|Number=Sing|PronType=Art,2.0,det,_,O


In [17]:
df.shape

(419201, 10)

In [18]:
# Remove the rows that contain NaN values (metadata rows)
df.dropna(axis=0, inplace=True, ignore_index=True)

In [19]:
print(df.shape)
df.head()

(379028, 10)


Unnamed: 0,ID,FORM,LEMMA,UPOS,XPOS,FEATS,HEAD,DEPREL,DEPS,MISC
0,1,El,el,DET,DET,Definite=Def|Gender=Masc|Number=Sing|PronType=Art,2,det,_,O
1,2,Tribunal,Tribunal,PROPN,PROPN,_,8,nsubj,_,B-ORG
2,3,Suprem,Suprem,PROPN,PROPN,_,2,flat,_,I-ORG
3,4,(,(,PUNCT,PUNCT,PunctSide=Ini|PunctType=Brck,5,punct,_,SpaceAfter=No|O
4,5,TS,TS,PROPN,PROPN,_,2,flat,_,SpaceAfter=No|B-ORG


In [20]:
# Filter for verbs, we are only interested in these
verbs_df = df[df['UPOS'] == 'VERB']

In [21]:
print(verbs_df.shape)
verbs_df.head()

(30044, 10)


Unnamed: 0,ID,FORM,LEMMA,UPOS,XPOS,FEATS,HEAD,DEPREL,DEPS,MISC
7,8,confirmat,confirmar,VERB,VERB,Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part,0,root,_,O
34,34,beneficiat,beneficiar,VERB,VERB,Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part,10,acl,_,O
39,39,desenvolupaven,desenvolupar,VERB,VERB,Mood=Ind|Number=Plur|Person=3|Tense=Imp|VerbFo...,36,acl,_,O
53,52,oferir,oferir,VERB,VERB,VerbForm=Inf,43,acl,_,SpaceAfter=No|O
65,8,tingut,tenir,VERB,VERB,Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part,2,acl,_,O


In [22]:
# Group and count repeated features in the list of verbs
v_flex = Counter(verbs_df['FEATS'])

In [23]:
# Show the 20 most common verbal features
v_flex.most_common(20)

[('VerbForm=Inf', 10822),
 ('Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin', 5854),
 ('Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part', 4671),
 ('Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin', 1923),
 ('Mood=Ind|Number=Sing|Person=3|Tense=Fut|VerbForm=Fin', 1509),
 ('Mood=Ind|Number=Sing|Person=3|Tense=Imp|VerbForm=Fin', 844),
 ('VerbForm=Ger', 816),
 ('Mood=Sub|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin', 729),
 ('Mood=Ind|Number=Plur|Person=3|Tense=Fut|VerbForm=Fin', 602),
 ('Mood=Sub|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin', 427),
 ('Mood=Ind|Number=Plur|Person=3|Tense=Imp|VerbForm=Fin', 296),
 ('Mood=Ind|Number=Plur|Person=1|Tense=Pres|VerbForm=Fin', 258),
 ('Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin', 234),
 ('Mood=Sub|Number=Sing|Person=3|Tense=Imp|VerbForm=Fin', 194),
 ('Gender=Fem|Number=Sing|Tense=Past|VerbForm=Part', 167),
 ('Mood=Cnd|Number=Sing|Person=3|VerbForm=Fin', 147),
 ('Gender=Masc|Number=Plur|Tense=Past|VerbForm=Part', 111),
 ('