**NOTE:** I haven't found any library to generate inflected forms from the lemma for Catalan. The best I could do was to use the vocabulary stored into a SpaCy model.

In [1]:
import pandas as pd
import spacy

In [2]:
# Get data fom file
df = pd.read_csv("Catalan_Stems.txt", sep=";")

In [3]:
# The dataset contains the keywords in the lexicon, the stems and POS tag
df.head()

Unnamed: 0,Lemma_Catalan,Stem_Catalan,POS_Catalan
0,amagar,amag,VERB
1,amic,amic,NOUN
2,amic,amig,NOUN
3,arxivar,arxiv,VERB
4,blocar,bloc,VERB


In [4]:
# We load a Catalan language model from SpaCy
nlp = spacy.load("ca_core_news_md")

In [5]:
# Searches each of the stems of the lexicon in the vocabulary of the model
# Then lemmatises all the forms found and keeps only those forms whose lemmas and POS tag correspond to the keywords in the lexicon
# Saves the lemma, inflected form and morphological features in a tuple (one tuple for each inflected form)
# Saves the tuples into a new list to create a DF later
def get_inflections(stem, lemma, pos):
    # Using regex, find all forms that have the stem in the model's vocabulary
    forms = []
    for w in nlp.vocab.strings:
        if w.startswith(stem):
            #print(w)
            forms.append(w)
    # Lemmatise and filter the forms to keep only the ones that have the lemma as indicated in the input
    # We save form, lemma, POS and inflection info
    lemmas = []
    for w in forms:
        doc = nlp(w)
        if doc[0].pos_ == pos and doc[0].lemma_ == lemma:
            #print(doc[0].text, doc[0].lemma_, doc[0].pos_, doc[0].morph)
            lemmas.append((doc[0].lemma_, doc[0].text, doc[0].pos_, doc[0].morph))
    # Remove duplicates before returning the list
    return list(set(lemmas))

In [7]:
# Example with the keyword 'comentar' and stem 'coment'
infl = get_inflections('coment', 'comentar', 'VERB')
infl[:5]

[('comentar',
  'comentava',
  'VERB',
  Mood=Ind|Number=Sing|Person=3|Tense=Imp|VerbForm=Fin),
 ('comentar',
  'comentaven',
  'VERB',
  Mood=Ind|Number=Plur|Person=3|Tense=Imp|VerbForm=Fin),
 ('comentar',
  'comentaràs',
  'VERB',
  Mood=Ind|Number=Plur|Person=1|Tense=Pres|VerbForm=Fin),
 ('comentar',
  'comentaré',
  'VERB',
  Mood=Ind|Number=Sing|Person=1|Tense=Fut|VerbForm=Fin),
 ('comentar',
  'comentarem',
  'VERB',
  Mood=Ind|Number=Plur|Person=1|Tense=Fut|VerbForm=Fin)]

In [8]:
# Test with a row of the DF
infl = []

for index, row in df.iterrows():
    if index < 3:
        print(row['Stem_Catalan'], row['Lemma_Catalan'], row['POS_Catalan'])
        tmp = get_inflections(row['Stem_Catalan'], row['Lemma_Catalan'], row['POS_Catalan'])
        print(f'Index: {index} - Value: {len(tmp)}')

amag amagar VERB
Index: 0 - Value: 19
amic amic NOUN
Index: 1 - Value: 2
amig amic NOUN
Index: 2 - Value: 2


In [None]:
# Apply inflection function to all rows of the DF - returns none because it fills the list infl_list
infl_list = []
df.apply(lambda row: infl_list.extend(get_inflections(row['Stem_Catalan'], row['Lemma_Catalan'], row['POS_Catalan'])), axis=1)

0     None
1     None
2     None
3     None
4     None
      ... 
56    None
57    None
58    None
59    None
60    None
Length: 61, dtype: object

In [115]:
infl_list

[('amagar',
  'amagaren',
  'VERB',
  Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin),
 ('amagar',
  'amagaré',
  'VERB',
  Mood=Ind|Number=Sing|Person=1|Tense=Fut|VerbForm=Fin),
 ('amagar',
  'amagava',
  'VERB',
  Mood=Ind|Number=Sing|Person=3|Tense=Imp|VerbForm=Fin),
 ('amagar',
  'amagarien',
  'VERB',
  Mood=Ind|Number=Plur|Person=3|Tense=Imp|VerbForm=Fin),
 ('amagar',
  'amagarà',
  'VERB',
  Mood=Ind|Number=Sing|Person=3|Tense=Fut|VerbForm=Fin),
 ('amagar',
  'amagaran',
  'VERB',
  Mood=Ind|Number=Plur|Person=3|Tense=Fut|VerbForm=Fin),
 ('amagar',
  'amaga',
  'VERB',
  Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin),
 ('amagar', 'amagant', 'VERB', VerbForm=Ger),
 ('amagar',
  'amagam',
  'VERB',
  Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin),
 ('amagar',
  'amagàvem',
  'VERB',
  Mood=Ind|Number=Plur|Person=1|Tense=Imp|VerbForm=Fin),
 ('amagar',
  'amag',
  'VERB',
  Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin),
 ('amagar',
  'amague',
  'VERB

In [None]:
len(infl_list)

491

In [117]:
infl_list[100]

('comentari', 'comentari', 'NOUN', Gender=Masc|Number=Sing)

In [None]:
# Remove duplicate tuples
m = list(set(infl_list))
len(m)

457

In [119]:
m

[('respondre', 'respondré', 'VERB', VerbForm=Inf),
 ('blocar',
  'blocava',
  'VERB',
  Mood=Ind|Number=Sing|Person=3|Tense=Imp|VerbForm=Fin),
 ('silenciar',
  'silenciïn',
  'VERB',
  Mood=Sub|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin),
 ('restringir',
  'restringeixen',
  'VERB',
  Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin),
 ('publicar',
  'publicaria',
  'VERB',
  Mood=Cnd|Number=Sing|Person=3|VerbForm=Fin),
 ('denunciar', 'denunciar', 'VERB', VerbForm=Inf),
 ('arxivar', 'arxivar', 'VERB', VerbForm=Inf),
 ('denunciar', 'denunciant', 'VERB', VerbForm=Ger),
 ('seguir',
  'seguirem',
  'VERB',
  Mood=Ind|Number=Plur|Person=1|Tense=Fut|VerbForm=Fin),
 ('compartir',
  'comparteixo',
  'VERB',
  Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin),
 ('publicar',
  'publico',
  'VERB',
  Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin),
 ('silenciar',
  'silenciem',
  'VERB',
  Mood=Ind|Number=Plur|Person=1|Tense=Pres|VerbForm=Fin),
 ('resposta', 'resposta', 'NOUN',

In [None]:
# Create a DF from the list of tuples created previously
infl_df = pd.DataFrame(m, columns=['Lemma', 'Wordform', 'POS', 'Full_POS']).sort_values(by=['Lemma', 'Wordform'], axis=0, inplace=False).reset_index(drop=True)

In [166]:
infl_df[infl_df['Full_POS'].isna()]

Unnamed: 0,Lemma,Wordform,POS,Full_POS


In [139]:
infl_df.head()

Unnamed: 0,Lemma,Wordform,POS,Full_POS
0,amagar,amag,VERB,"(Mood=Ind, Number=Sing, Person=3, Tense=Pres, ..."
1,amagar,amaga,VERB,"(Mood=Ind, Number=Sing, Person=3, Tense=Pres, ..."
2,amagar,amagam,VERB,"(Mood=Ind, Number=Sing, Person=3, Tense=Pres, ..."
3,amagar,amagant,VERB,(VerbForm=Ger)
4,amagar,amagar,VERB,(VerbForm=Inf)


In [None]:
# Save DF to csv file
infl_df.to_csv('inflection_table_cat.csv', sep=';')