In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from catboost import CatBoostClassifier, Pool

In [3]:
train = pd.read_csv('D:/kaggle_data/Silero/train.csv')
test = pd.read_csv('D:/kaggle_data/Silero/test.csv')
submission = pd.read_csv('D:/kaggle_data/Silero/sample_submission.csv')

In [4]:
train.sample(10)

Unnamed: 0,id,word,stress,num_syllables,lemma
29861,29861,отложенные,2,5,откладывать
18898,18898,часах,2,2,часы
45882,45882,неизвестная,3,5,неизвестный
27727,27727,святыню,2,3,святыня
50314,50314,выводит,2,3,выводить
12181,12181,дешевую,2,4,дешевый
26192,26192,шкур,1,1,шкура
11228,11228,галантно,2,3,галантно
38276,38276,алании,2,4,алания
24666,24666,смертными,1,3,смертный


In [5]:
train.word.nunique(), train.lemma.nunique()

(63438, 23543)

Как видно из полученных выше значений, количество уникальных значений в столбце *'word'* больше чем в 2,5 раза

In [6]:
train[train['lemma'] == 'черт']

Unnamed: 0,id,word,stress,num_syllables,lemma
6,6,чёрта,1,2,черт
1628,1628,чертей,2,2,черт
5819,5819,чертям,2,2,черт
10062,10062,черти,1,2,черт
15964,15964,чертом,1,2,черт
56737,56737,чёрту,1,2,черт


In [7]:
train[:10]

Unnamed: 0,id,word,stress,num_syllables,lemma
0,0,румяной,2,3,румяный
1,1,цифрами,1,3,цифра
2,2,слугами,1,3,слуга
3,3,выбирает,3,4,выбирать
4,4,управдом,3,3,управдом
5,5,купюру,2,3,купюра
6,6,чёрта,1,2,черт
7,7,культурной,2,3,культурный
8,8,мэрии,1,3,мэрия
9,9,маркерные,2,4,маркерный


In [8]:
'Max syllables = {}'.format(train['num_syllables'].max())

'Max syllables = 6'

In [9]:
def syllably_div(word, return_syllables=False):
    """divide your word on syllables"""
    syllab_list = []
    check_word = list(word)
    syllab = []
    counter = 0 
    vowels = ['а', 'у', 'о', 'ы', 'э', 'я', 'ю', 'ё', 'и', 'е']
    for num, letter in enumerate(check_word):    
        if letter in vowels and check_word not in vowels:
            syllab.append(letter)
            #print(syllab, counter)
            counter += 1
            syllab_list.append(syllab)
            syllab = []
        elif letter in vowels and check_word in vowels:
            syllab.append(letter)
            #print(syllab)
            counter += 1
            syllab_list.append(syllab)
            syllab = []
        else:
            syllab.append(letter)
            #print(syllab)
    if return_syllables==True:
        return np.array([syllab_list, counter], dtype='object')
    return counter

In [10]:
syllably_div(train['word'][7], return_syllables=True)[0]

[['к', 'у'], ['л', 'ь', 'т', 'у'], ['р', 'н', 'о']]

In [11]:
(train['word'].apply(syllably_div) == train['num_syllables']).sum()/len(train['num_syllables'])

1.0

In [12]:
syllables_data = np.array([syllably_div(i, return_syllables=True) for i in train['word']])
syllables_data_test = np.array([syllably_div(i, return_syllables=True) for i in test['word']])

In [13]:
for i,x in enumerate(syllables_data[:, 0]):  
    while len(syllables_data[:, 0][i]) < 6:
        syllables_data[:, 0][i].append([])


In [14]:
for i,x in enumerate(syllables_data_test[:, 0]):  
    while len(syllables_data_test[:, 0][i]) < 6:
        syllables_data_test[:, 0][i].append([])

In [15]:
def syllab_dataframe(data):
    one_syllab = []
    two_syllab = []
    three_syllab = []
    four_syllab = []
    five_syllab = []
    six_syllab = []

    for i in data:
        one_syllab.append(''.join(i[0]))
        two_syllab.append(''.join(i[1]))
        three_syllab.append(''.join(i[2])) 
        four_syllab.append(''.join(i[3]))
        five_syllab.append(''.join(i[4])) 
        six_syllab.append(''.join(i[5]))

    syllables_df = pd.DataFrame(np.array([one_syllab, two_syllab, three_syllab, four_syllab, five_syllab, six_syllab], dtype='object'))
    syllables_df = syllables_df.transpose()
    syllables_df.columns =['one_syllab', 'two_syllab', 'three_syllab', 'four_syllab', 'five_syllab', 'six_syllab']
    return syllables_df

In [16]:
syllables_df_train = syllab_dataframe(syllables_data[:, 0])
syllables_df_test = syllab_dataframe(syllables_data_test[:, 0])

In [17]:
X_train = pd.concat([train, syllables_df_train], axis=1)
X_test = pd.concat([test, syllables_df_test], axis=1)

In [18]:
y_train = X_train['stress']
X_train.drop(['stress', 'id'], axis=1, inplace=True)
X_test.drop(['id'], axis=1, inplace=True)

In [19]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, train_size=0.8, 
                                                      test_size=0.2, random_state=2022)

In [20]:
X_train

Unnamed: 0,word,num_syllables,lemma,one_syllab,two_syllab,three_syllab,four_syllab,five_syllab,six_syllab
56344,поддержим,3,поддерживать,по,дде,ржи,,,
39602,бомбежки,3,бомбежка,бо,мбе,жки,,,
774,заканчивал,4,заканчивать,за,ка,нчи,ва,,
10448,открываться,4,открываться,о,ткры,ва,ться,,
1836,расправой,3,расправа,ра,спра,во,,,
...,...,...,...,...,...,...,...,...,...
46769,инфраструктурных,5,инфраструктурный,и,нфра,стру,кту,рны,
37488,сказание,4,сказание,ска,за,ни,е,,
16557,разыгрался,4,разыгрываться,ра,зы,гра,лся,,
1244,ядерную,4,ядерный,я,де,рну,ю,,


In [21]:
X_test

Unnamed: 0,word,num_syllables,lemma,one_syllab,two_syllab,three_syllab,four_syllab,five_syllab,six_syllab
0,эпилепсия,5,эпилепсия,э,пи,ле,пси,я,
1,относящейся,5,относиться,о,тно,ся,ще,йся,
2,размышлениями,6,размышление,ра,змы,шле,ни,я,ми
3,модемы,3,модем,мо,де,мы,,,
4,солнц,1,солнце,со,,,,,
...,...,...,...,...,...,...,...,...,...
29955,донбасса,3,донбасс,до,нба,сса,,,
29956,обложка,3,обложка,о,бло,жка,,,
29957,правителя,4,правитель,пра,ви,те,ля,,
29958,шерстяной,3,шерстяной,ше,рстя,но,,,


In [22]:
cat_features = [0, 2, 3, 4, 5, 6, 7, 8]

In [23]:
train_data = Pool(data=X_train, label=y_train, cat_features=cat_features)

eval_dataset = Pool(data=X_valid,
                    label=y_valid,
                    cat_features=cat_features)
                    

model = CatBoostClassifier(iterations=2000,
                           random_seed=2022,
                           loss_function='MultiClass',
                           task_type="GPU",
                           devices='0:1'
                           )

# Fit model
model.fit(train_data)
# Get predicted classes
preds_class = model.predict(eval_dataset)
# Get predicted probabilities for each class
preds_proba = model.predict_proba(eval_dataset)
# Get predicted RawFormulaVal
preds_raw = model.predict(eval_dataset,
                          prediction_type='RawFormulaVal')

Learning rate set to 0.081177
0:	learn: 1.6416481	total: 13.6ms	remaining: 27.2s
1:	learn: 1.5289994	total: 27.7ms	remaining: 27.7s
2:	learn: 1.4386313	total: 40.2ms	remaining: 26.8s
3:	learn: 1.3660374	total: 54.4ms	remaining: 27.1s
4:	learn: 1.3018128	total: 67.7ms	remaining: 27s
5:	learn: 1.2467087	total: 80.5ms	remaining: 26.8s
6:	learn: 1.2001994	total: 93ms	remaining: 26.5s
7:	learn: 1.1587796	total: 106ms	remaining: 26.3s
8:	learn: 1.1087649	total: 117ms	remaining: 26s
9:	learn: 1.0669331	total: 129ms	remaining: 25.7s
10:	learn: 1.0298806	total: 141ms	remaining: 25.5s
11:	learn: 0.9983317	total: 155ms	remaining: 25.7s
12:	learn: 0.9703311	total: 168ms	remaining: 25.7s
13:	learn: 0.9447198	total: 181ms	remaining: 25.7s
14:	learn: 0.9232482	total: 195ms	remaining: 25.8s
15:	learn: 0.9028745	total: 215ms	remaining: 26.7s
16:	learn: 0.8839856	total: 228ms	remaining: 26.6s
17:	learn: 0.8671102	total: 240ms	remaining: 26.4s
18:	learn: 0.8520912	total: 259ms	remaining: 27s
19:	learn: 0

In [24]:
accuracy_score(y_valid, preds_class[:,0])

0.7916141235813366

In [25]:
predict = model.predict(X_test)

In [26]:
predict_series = pd.DataFrame(predict[:,0], columns=['stress'])

In [27]:
final_subm = pd.DataFrame(test['id'])

In [28]:
final_subm = pd.concat([final_subm, predict_series], axis=1)

In [29]:
final_subm

Unnamed: 0,id,stress
0,0,3
1,1,3
2,2,3
3,3,2
4,4,1
...,...,...
29955,29955,2
29956,29956,2
29957,29957,2
29958,29958,3


In [30]:
final_subm.to_csv('./stress_subm.csv', index=False)