In [1]:
import pandas as pd
import numpy as np
from random import sample
from tqdm import tqdm

In [2]:
with open("../../words_preprocess/filtered_nouns.txt") as file:
    lines = file.readlines()

lines = list(map(lambda x: x.split(), lines))

In [3]:
df = pd.DataFrame(lines, columns=["inflected form", "base form", "tags"])
df.head(10)

Unnamed: 0,inflected form,base form,tags
0,абажур,абажур,noun:inanim:m:v_naz
1,абажура,абажур,noun:inanim:m:v_rod
2,абажурові,абажур,noun:inanim:m:v_dav
3,абажуру,абажур,noun:inanim:m:v_dav
4,абажур,абажур,noun:inanim:m:v_zna
5,абажуром,абажур,noun:inanim:m:v_oru
6,абажурі,абажур,noun:inanim:m:v_mis
7,абажурові,абажур,noun:inanim:m:v_mis
8,абажуру,абажур,noun:inanim:m:v_mis
9,абажуре,абажур,noun:inanim:m:v_kly


In [4]:
df.shape

(1519157, 3)

In [5]:
def process_group(row):
    return row["tags"].split(":")[1]

def process_number(row):
    tags = row["tags"].split(":")[1:]

    #check number
    if "p" in tags:
        return "2"
    return "1"

def process_gender(row):
    tags = row["tags"].split(":")[1:]

    #check gender
    if "m" in tags:
        return "m"
    elif "f" in tags:
        return "f"
    elif "n" in tags:
        return "n"

def process_case(row):
    tags = row["tags"].split(":")[1:]

    #check case
    if "v_naz" in tags:
        return "N"
    elif "v_rod" in tags:
        return "G"
    elif "v_dav" in tags:
        return "D"
    elif "v_zna" in tags:
        return "A"
    elif "v_oru" in tags:
        return "I"
    elif "v_mis" in tags:
        return "L"
    elif "v_kly" in tags:
        return "V"

In [6]:
df["group"] = df.apply(process_group, axis=1)
df["number"] = df.apply(process_number, axis=1)
df["gender"] = df.apply(process_gender, axis=1)
df["case"] = df.apply(process_case, axis=1)

In [7]:
df["number"].value_counts()

number
1    970421
2    548736
Name: count, dtype: int64

In [8]:
df["gender"].value_counts()

gender
m    450789
f    336907
n    182725
Name: count, dtype: int64

In [9]:
df["case"].value_counts()

case
L    291828
D    235895
A    214290
G    204539
V    190872
N    190868
I    190865
Name: count, dtype: int64

In [10]:
def fix_gender(row):
    if row["gender"] in ("m", "f", "n"):
        return row["gender"]
    else:
        base = row["base form"]
        return df[df["base form"] == base ].iloc[0]["gender"]

In [13]:
sample_nouns = sample(list(df["base form"].unique()), 115608)

In [14]:
df.head()

Unnamed: 0,inflected form,base form,tags,group,number,gender,case
0,абажур,абажур,noun:inanim:m:v_naz,inanim,1,m,N
1,абажура,абажур,noun:inanim:m:v_rod,inanim,1,m,G
2,абажурові,абажур,noun:inanim:m:v_dav,inanim,1,m,D
3,абажуру,абажур,noun:inanim:m:v_dav,inanim,1,m,D
4,абажур,абажур,noun:inanim:m:v_zna,inanim,1,m,A


In [15]:
gb = df.groupby(["base form"])

In [16]:
sample_nouns[10]

'остріха'

In [17]:
type(gb.get_group(sample_nouns[10]))

pandas.core.frame.DataFrame

In [18]:
df_train = pd.DataFrame(columns=['inflected form', 'base form', 'tags', 'number', 'gender'])
df_test = pd.DataFrame(columns=['inflected form', 'base form', 'tags', 'number', 'gender'])

for i in tqdm(range(len(sample_nouns[:100000]))):
    noun = sample_nouns[i]
    # frames = [df_train, df[df["base form"] == noun]]
    df_train = pd.concat([df_train, gb.get_group(noun)])

for i in tqdm(range(len(sample_nouns[100000:]))):
    noun = sample_nouns[15000 + i]
    df_test = pd.concat([df_test, gb.get_group(noun)])

100%|██████████| 100000/100000 [2:09:20<00:00, 12.89it/s] 
100%|██████████| 15608/15608 [03:12<00:00, 81.13it/s] 


In [19]:
df_train.reset_index(inplace=True)
df_test.reset_index(inplace=True)

df_train.drop(columns=["index", "tags"], inplace=True)
df_test.drop(columns=["index", "tags"], inplace=True)


In [20]:
df_train.fillna(method='ffill', inplace=True)
df_test.fillna(method='ffill', inplace=True)

In [21]:
df_train["concat form"] = df_train["base form"] + df_train["number"] + df_train["case"]
df_test["concat form"] = df_test["base form"] + df_test["number"] + df_test["case"]
df_train.head()

Unnamed: 0,inflected form,base form,number,gender,group,case,concat form
0,емаль,емаль,1,f,inanim,N,емаль1N
1,емалі,емаль,1,f,inanim,G,емаль1G
2,емалі,емаль,1,f,inanim,D,емаль1D
3,емаль,емаль,1,f,inanim,A,емаль1A
4,емаллю,емаль,1,f,inanim,I,емаль1I


In [22]:
df_train.to_csv("train.csv", index=False)
df_test.to_csv("test.csv", index=False)