In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from catboost import CatBoostClassifier, Pool

In [3]:
train = pd.read_csv('D:/kaggle_data/Silero/train.csv')
test = pd.read_csv('D:/kaggle_data/Silero/test.csv')
submission = pd.read_csv('D:/kaggle_data/Silero/sample_submission.csv')

In [4]:
train.sample(10)

Unnamed: 0,id,word,stress,num_syllables,lemma
22819,22819,ругает,2,3,ругать
57781,57781,полезным,2,3,полезный
61494,61494,приехало,2,4,приезжать
60712,60712,спешащих,2,3,спешить
4135,4135,греющий,1,3,греть
37765,37765,выполнит,1,3,выполнять
38060,38060,поселков,2,3,поселок
59689,59689,догадка,2,3,догадка
16477,16477,содержании,3,5,содержание
13328,13328,максатиха,2,4,максатиха


In [5]:
train.word.nunique(), train.lemma.nunique()

(63438, 23543)

Как видно из полученных выше значений, количество уникальных значений в столбце *'word'* больше чем в 2,5 раза

In [6]:
train[train['lemma'] == 'черт']

Unnamed: 0,id,word,stress,num_syllables,lemma
6,6,чёрта,1,2,черт
1628,1628,чертей,2,2,черт
5819,5819,чертям,2,2,черт
10062,10062,черти,1,2,черт
15964,15964,чертом,1,2,черт
56737,56737,чёрту,1,2,черт


In [7]:
train[:10]

Unnamed: 0,id,word,stress,num_syllables,lemma
0,0,румяной,2,3,румяный
1,1,цифрами,1,3,цифра
2,2,слугами,1,3,слуга
3,3,выбирает,3,4,выбирать
4,4,управдом,3,3,управдом
5,5,купюру,2,3,купюра
6,6,чёрта,1,2,черт
7,7,культурной,2,3,культурный
8,8,мэрии,1,3,мэрия
9,9,маркерные,2,4,маркерный


In [8]:
'Max syllables = {}'.format(train['num_syllables'].max())

'Max syllables = 6'

In [9]:
def syllably_div(word, return_syllables=False):
    """divide your word on syllables"""
    syllab_list = []
    check_word = list(word)
    syllab = []
    counter = 0 
    vowels = ['а', 'у', 'о', 'ы', 'э', 'я', 'ю', 'ё', 'и', 'е']
    #print(word)
    for num, letter in enumerate(check_word):    
        if letter in vowels:
            syllab.append(letter)
            #print(syllab, counter)
            counter += 1
            syllab_list.append(syllab)
            syllab = []
        elif letter in ['й', 'ь', 'ъ']:
            if counter > 0:
                syllab_list[counter-1].append(letter)
            else:
                syllab.append(letter)
        else:
            syllab.append(letter)
            #print(syllab)
    if return_syllables==True:
        return np.array([syllab_list, counter], dtype='object')
    return counter

In [10]:
syllably_div('ньюсмейкеры', return_syllables=True)[0]

[['н', 'ь', 'ю'], ['с', 'м', 'е', 'й'], ['к', 'е'], ['р', 'ы']]

In [11]:
(train['word'].apply(syllably_div) == train['num_syllables']).sum()/len(train['num_syllables'])

1.0

In [12]:
syllables_data = np.array([syllably_div(i, return_syllables=True) for i in train['word']])
syllables_data_test = np.array([syllably_div(i, return_syllables=True) for i in test['word']])

In [13]:
for i,x in enumerate(syllables_data[:, 0]):  
    while len(syllables_data[:, 0][i]) < 6:
        syllables_data[:, 0][i].append([])


In [14]:
for i,x in enumerate(syllables_data_test[:, 0]):  
    while len(syllables_data_test[:, 0][i]) < 6:
        syllables_data_test[:, 0][i].append([])

In [15]:
def syllab_dataframe(data):
    one_syllab = []
    two_syllab = []
    three_syllab = []
    four_syllab = []
    five_syllab = []
    six_syllab = []

    for i in data:
        one_syllab.append(''.join(i[0]))
        two_syllab.append(''.join(i[1]))
        three_syllab.append(''.join(i[2])) 
        four_syllab.append(''.join(i[3]))
        five_syllab.append(''.join(i[4])) 
        six_syllab.append(''.join(i[5]))

    syllables_df = pd.DataFrame(np.array([one_syllab, two_syllab, three_syllab, four_syllab, five_syllab, six_syllab], dtype='object'))
    syllables_df = syllables_df.transpose()
    syllables_df.columns =['one_syllab', 'two_syllab', 'three_syllab', 'four_syllab', 'five_syllab', 'six_syllab']
    return syllables_df

In [17]:
syllables_df_train = syllab_dataframe(syllables_data[:, 0])
syllables_df_test = syllab_dataframe(syllables_data_test[:, 0])

In [18]:
X_train = pd.concat([train, syllables_df_train], axis=1)
X_test = pd.concat([test, syllables_df_test], axis=1)

In [19]:
y_train = X_train['stress']
# also drop 'word' because we divided words into syllables and add them to data
X_train.drop(['stress', 'id', 'word'], axis=1, inplace=True) 
X_test.drop(['id', 'word'], axis=1, inplace=True)

In [20]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, train_size=0.8, 
                                                      test_size=0.2, random_state=2022)

In [21]:
X_train

Unnamed: 0,num_syllables,lemma,one_syllab,two_syllab,three_syllab,four_syllab,five_syllab,six_syllab
56344,3,поддерживать,по,дде,ржи,,,
39602,3,бомбежка,бо,мбе,жки,,,
774,4,заканчивать,за,ка,нчи,ва,,
10448,4,открываться,о,ткры,ваь,тся,,
1836,3,расправа,ра,спра,вой,,,
...,...,...,...,...,...,...,...,...
46769,5,инфраструктурный,и,нфра,стру,кту,рны,
37488,4,сказание,ска,за,ни,е,,
16557,4,разыгрываться,ра,зы,гра,лся,,
1244,4,ядерный,я,де,рну,ю,,


In [22]:
X_test

Unnamed: 0,num_syllables,lemma,one_syllab,two_syllab,three_syllab,four_syllab,five_syllab,six_syllab
0,5,эпилепсия,э,пи,ле,пси,я,
1,5,относиться,о,тно,ся,щей,ся,
2,6,размышление,ра,змы,шле,ни,я,ми
3,3,модем,мо,де,мы,,,
4,1,солнце,со,,,,,
...,...,...,...,...,...,...,...,...
29955,3,донбасс,до,нба,сса,,,
29956,3,обложка,о,бло,жка,,,
29957,4,правитель,пра,ви,те,ля,,
29958,3,шерстяной,ше,рстя,ной,,,


In [23]:
#cat_features = [0, 2, 3, 4, 5, 6, 7, 8]
cat_features = [1, 2, 3, 4, 5, 6, 7]

In [24]:
train_data = Pool(data=X_train, label=y_train, cat_features=cat_features)

eval_dataset = Pool(data=X_valid,
                    label=y_valid,
                    cat_features=cat_features)
                    

model = CatBoostClassifier(iterations=2000,
                           #learning_rate=0.1,
                           depth=6,
                           random_seed=2022,
                           loss_function='MultiClass',
                           task_type="GPU",
                           devices='0:1'
                           )

# Fit model
model.fit(train_data, use_best_model=True, eval_set=eval_dataset)
# Get predicted classes
preds_class = model.predict(eval_dataset)
# Get predicted probabilities for each class
preds_proba = model.predict_proba(eval_dataset)
# Get predicted RawFormulaVal
preds_raw = model.predict(eval_dataset,
                          prediction_type='RawFormulaVal')

Learning rate set to 0.115486
0:	learn: 1.6018087	test: 1.6007089	best: 1.6007089 (0)	total: 14.5ms	remaining: 28.9s
1:	learn: 1.4629923	test: 1.4595970	best: 1.4595970 (1)	total: 32.5ms	remaining: 32.5s
2:	learn: 1.3565257	test: 1.3518821	best: 1.3518821 (2)	total: 46.3ms	remaining: 30.8s
3:	learn: 1.2691870	test: 1.2634953	best: 1.2634953 (3)	total: 60.3ms	remaining: 30.1s
4:	learn: 1.1970129	test: 1.1903933	best: 1.1903933 (4)	total: 73.4ms	remaining: 29.3s
5:	learn: 1.1389943	test: 1.1316617	best: 1.1316617 (5)	total: 86.4ms	remaining: 28.7s
6:	learn: 1.0910494	test: 1.0832319	best: 1.0832319 (6)	total: 99.3ms	remaining: 28.3s
7:	learn: 1.0518866	test: 1.0435348	best: 1.0435348 (7)	total: 112ms	remaining: 27.8s
8:	learn: 1.0178167	test: 1.0088040	best: 1.0088040 (8)	total: 124ms	remaining: 27.5s
9:	learn: 0.9875924	test: 0.9784865	best: 0.9784865 (9)	total: 137ms	remaining: 27.3s
10:	learn: 0.9506780	test: 0.9353459	best: 0.9353459 (10)	total: 149ms	remaining: 27s
11:	learn: 0.9204

In [25]:
accuracy_score(y_valid, preds_class[:,0]) 
# drop 'word' - 0.792953972257251 
# with 'word' in data - 0.7911412358133669

0.7877522068095839

In [26]:
predict = model.predict(X_test)

In [27]:
predict_series = pd.DataFrame(predict[:,0], columns=['stress'])

In [28]:
final_subm = pd.DataFrame(test['id'])

In [29]:
final_subm = pd.concat([final_subm, predict_series], axis=1)

In [30]:
final_subm

Unnamed: 0,id,stress
0,0,3
1,1,3
2,2,3
3,3,2
4,4,1
...,...,...
29955,29955,2
29956,29956,2
29957,29957,2
29958,29958,3


In [31]:
final_subm.to_csv('./stress_subm.csv', index=False)