In [1]:
# imports
from conllu import parse
import json
import pandas as pd

In [2]:
# load the raw CoNLL-U text of the training file
with open("fi_tdt-ud-train.conllu", "r", encoding="utf-8") as conllu_file:
    data = conllu_file.read()

# parse the raw text into sentences made of tokens
sentences = parse(data)

# maps a JSON-serialised feature set -> [occurrence count, first example form (lower-cased)]
feats_freqs = {}

# tally every token tagged VERB by its feature set
for sentence in sentences:
    for token in sentence:
        # only verbs are of interest here
        if token["upos"] != "VERB":
            continue
        # dicts are not hashable, so the feature set is serialised to use it as a key
        feats_string = json.dumps(token["feats"])
        entry = feats_freqs.get(feats_string)
        if entry is None:
            # first time this feature set is seen: keep the token as its example
            feats_freqs[feats_string] = [1, token["form"].lower()]
        else:
            entry[0] += 1

print(f"There are only {len(feats_freqs)} different forms in all the verbs in this data set.")

There are only 408 different forms in all the verbs in this data set.


In [3]:
# tabulate the tallies (one row per feature set) ordered from most to least frequent,
# then show the 20 most frequent verb forms
feats_freqs_df = (
    pd.DataFrame.from_dict(
        feats_freqs,
        orient="index",
        columns=["Frequencies", "Example"],
    )
    .sort_values(by="Frequencies", ascending=False)
)
feats_freqs_df.head(20)

Unnamed: 0,Frequencies,Example
"{""Mood"": ""Ind"", ""Number"": ""Sing"", ""Person"": ""3"", ""Tense"": ""Past"", ""VerbForm"": ""Fin"", ""Voice"": ""Act""}",2801,vei
"{""Mood"": ""Ind"", ""Number"": ""Sing"", ""Person"": ""3"", ""Tense"": ""Pres"", ""VerbForm"": ""Fin"", ""Voice"": ""Act""}",2517,avaa
"{""InfForm"": ""1"", ""Number"": ""Sing"", ""VerbForm"": ""Inf"", ""Voice"": ""Act""}",2460,katsoa
"{""Case"": ""Nom"", ""Number"": ""Sing"", ""PartForm"": ""Past"", ""VerbForm"": ""Part"", ""Voice"": ""Act""}",1256,noussut
"{""Mood"": ""Ind"", ""Tense"": ""Pres"", ""VerbForm"": ""Fin"", ""Voice"": ""Pass""}",1030,lavastetaan
"{""Mood"": ""Ind"", ""Number"": ""Plur"", ""Person"": ""3"", ""Tense"": ""Pres"", ""VerbForm"": ""Fin"", ""Voice"": ""Act""}",820,hidastavat
"{""Case"": ""Ill"", ""InfForm"": ""3"", ""Number"": ""Sing"", ""VerbForm"": ""Inf"", ""Voice"": ""Act""}",727,kävelemään
"{""Mood"": ""Ind"", ""Number"": ""Sing"", ""Person"": ""1"", ""Tense"": ""Past"", ""VerbForm"": ""Fin"", ""Voice"": ""Act""}",661,näin
"{""Case"": ""Nom"", ""Number"": ""Sing"", ""PartForm"": ""Past"", ""VerbForm"": ""Part"", ""Voice"": ""Pass""}",648,restauroitu
"{""Connegative"": ""Yes"", ""Mood"": ""Ind"", ""Tense"": ""Pres"", ""VerbForm"": ""Fin""}",586,tule
