In [423]:
import numpy as np
import pandas as pd

In [424]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from catboost import CatBoostClassifier, Pool

In [425]:
train = pd.read_csv('D:/kaggle_data/Silero/train.csv')
test = pd.read_csv('D:/kaggle_data/Silero/test.csv')
submission = pd.read_csv('D:/kaggle_data/Silero/sample_submission.csv')

In [426]:
train.sample(10)

Unnamed: 0,id,word,stress,num_syllables,lemma
41193,41193,боровиха,3,4,боровиха
32024,32024,смекалка,2,3,смекалка
29535,29535,приеду,2,3,приезжать
2028,2028,любопытного,3,5,любопытный
33683,33683,штукатуркой,3,4,штукатурка
26190,26190,паранойи,3,4,паранойя
20433,20433,вселенский,2,3,вселенский
48736,48736,кальяна,2,3,кальян
59628,59628,сектору,1,3,сектор
53728,53728,палки,1,2,палка


In [427]:
train.word.nunique(), train.lemma.nunique()

(63438, 23543)

Как видно из полученных выше значений, количество уникальных значений в столбце *'word'* больше чем в 2,5 раза

In [428]:
train[train['lemma'] == 'черт']

Unnamed: 0,id,word,stress,num_syllables,lemma
6,6,чёрта,1,2,черт
1628,1628,чертей,2,2,черт
5819,5819,чертям,2,2,черт
10062,10062,черти,1,2,черт
15964,15964,чертом,1,2,черт
56737,56737,чёрту,1,2,черт


In [429]:
train[:10]

Unnamed: 0,id,word,stress,num_syllables,lemma
0,0,румяной,2,3,румяный
1,1,цифрами,1,3,цифра
2,2,слугами,1,3,слуга
3,3,выбирает,3,4,выбирать
4,4,управдом,3,3,управдом
5,5,купюру,2,3,купюра
6,6,чёрта,1,2,черт
7,7,культурной,2,3,культурный
8,8,мэрии,1,3,мэрия
9,9,маркерные,2,4,маркерный


In [430]:
'Max syllables = {}'.format(train['num_syllables'].max())

'Max syllables = 6'

In [431]:
def syllably_div(word, return_syllables=False):
    """divide your word on syllables"""
    syllab_list = []
    check_word = list(word)
    syllab = []
    counter = 0 
    vowels = ['а', 'у', 'о', 'ы', 'э', 'я', 'ю', 'ё', 'и', 'е']
    #print(word)
    for num, letter in enumerate(check_word):    
        if letter in vowels:
            syllab.append(letter)
            #print(syllab, counter)
            counter += 1
            syllab_list.append(syllab)
            syllab = []
        # elif letter in ['й', 'ь', 'ъ']:
        #     if counter > 0:
        #         syllab_list[counter-1].append(letter)
        #     else:
        #        syllab.append(letter)
        else:
            syllab.append(letter)
            #print(syllab)
    if return_syllables==True:
        return np.array([syllab_list, counter], dtype='object')
    return counter

In [432]:
syllably_div('ньюсмейкеры', return_syllables=True)[0]

[['н', 'ь', 'ю'], ['с', 'м', 'е'], ['й', 'к', 'е'], ['р', 'ы']]

In [433]:
(train['word'].apply(syllably_div) == train['num_syllables']).sum()/len(train['num_syllables'])

1.0

In [434]:
syllables_data = np.array([syllably_div(i, return_syllables=True) for i in train['word']])
syllables_data_test = np.array([syllably_div(i, return_syllables=True) for i in test['word']])

In [435]:
for i,x in enumerate(syllables_data[:, 0]):  
    while len(syllables_data[:, 0][i]) < 6:
        syllables_data[:, 0][i].append([])


In [436]:
for i,x in enumerate(syllables_data_test[:, 0]):  
    while len(syllables_data_test[:, 0][i]) < 6:
        syllables_data_test[:, 0][i].append([])

In [437]:
def syllab_dataframe(data):
    one_syllab = []
    two_syllab = []
    three_syllab = []
    four_syllab = []
    five_syllab = []
    six_syllab = []

    for i in data:
        one_syllab.append(''.join(i[0]))
        two_syllab.append(''.join(i[1]))
        three_syllab.append(''.join(i[2])) 
        four_syllab.append(''.join(i[3]))
        five_syllab.append(''.join(i[4])) 
        six_syllab.append(''.join(i[5]))

    syllables_df = pd.DataFrame(np.array([one_syllab, two_syllab, three_syllab, four_syllab, five_syllab, six_syllab], dtype='object'))
    syllables_df = syllables_df.transpose()
    syllables_df.columns =['one_syllab', 'two_syllab', 'three_syllab', 'four_syllab', 'five_syllab', 'six_syllab']
    return syllables_df

In [438]:
syllables_df_train = syllab_dataframe(syllables_data[:, 0])
syllables_df_test = syllab_dataframe(syllables_data_test[:, 0])

In [439]:
# some functions for adding features

def word_ending(word, lemma):
    check_word = list(word)
    check_lemma = list(lemma)
    diff = abs(len(check_lemma) - len(check_word))
    if diff != 0:
        return ''.join(check_word[-diff:])
    else:
        return ''

def unique_letters(word, lemma):
    set_word = set(word)
    set_lemma = set(lemma)
    unique = []
    for i in set_word:
        if i not in set_lemma:
            unique.append(i)
    if len(unique) != 0:
        return ''.join(unique)
    else:
        return ''    

In [440]:
X_train = pd.concat([train, syllables_df_train], axis=1)
X_test = pd.concat([test, syllables_df_test], axis=1)
X_train['ending of word'] = X_train.apply(lambda x: word_ending(x.word, x.lemma), axis=1)
X_test['ending of word'] = X_test.apply(lambda x: word_ending(x.word, x.lemma), axis=1)
X_train['unique letters'] = X_train.apply(lambda x: unique_letters(x.word, x.lemma), axis=1)
X_test['unique letters'] = X_test.apply(lambda x: unique_letters(x.word, x.lemma), axis=1)


In [441]:
y_train = X_train['stress']
# also drop 'word' because we divided words into syllables and add them to data
X_train.drop(['stress', 'id', 'word'], axis=1, inplace=True) 
X_test.drop(['id', 'word'], axis=1, inplace=True)

In [442]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, train_size=0.8, 
                                                      test_size=0.2, random_state=2022)

In [443]:
X_train

Unnamed: 0,num_syllables,lemma,one_syllab,two_syllab,three_syllab,four_syllab,five_syllab,six_syllab,ending of word,unique letters
56344,3,поддерживать,по,дде,ржи,,,,жим,м
39602,3,бомбежка,бо,мбе,жки,,,,,и
774,4,заканчивать,за,ка,нчи,ва,,,л,л
10448,4,открываться,о,ткры,ва,ться,,,,
1836,3,расправа,ра,спра,во,,,,й,ой
...,...,...,...,...,...,...,...,...,...,...
46769,5,инфраструктурный,и,нфра,стру,кту,рны,,,х
37488,4,сказание,ска,за,ни,е,,,,
16557,4,разыгрываться,ра,зы,гра,лся,,,лся,л
1244,4,ядерный,я,де,рну,ю,,,,ую


In [444]:
X_test

Unnamed: 0,num_syllables,lemma,one_syllab,two_syllab,three_syllab,four_syllab,five_syllab,six_syllab,ending of word,unique letters
0,5,эпилепсия,э,пи,ле,пси,я,,,
1,5,относиться,о,тно,ся,ще,йся,,я,ейщ
2,6,размышление,ра,змы,шле,ни,я,ми,ми,я
3,3,модем,мо,де,мы,,,,ы,ы
4,1,солнце,со,,,,,,ц,
...,...,...,...,...,...,...,...,...,...,...
29955,3,донбасс,до,нба,сса,,,,а,
29956,3,обложка,о,бло,жка,,,,,
29957,4,правитель,пра,ви,те,ля,,,,я
29958,3,шерстяной,ше,рстя,но,,,,,


In [445]:
#cat_features = [0, 2, 3, 4, 5, 6, 7, 8]
cat_features = [1, 2, 3, 4, 5, 6, 7, 8, 9]

In [446]:
train_data = Pool(data=X_train, label=y_train, cat_features=cat_features)

eval_dataset = Pool(data=X_valid,
                    label=y_valid,
                    cat_features=cat_features)
                    

model = CatBoostClassifier(iterations=2000,
                           #learning_rate=0.1,
                           #depth=6,
                           random_seed=2022,
                           loss_function='MultiClass',
                           #eval_metric='Accuracy',
                           task_type="GPU",
                           devices='0:1'
                           )

# Fit model
model.fit(train_data, use_best_model=True, eval_set=eval_dataset)
# Get predicted classes
preds_class = model.predict(eval_dataset)
# Get predicted probabilities for each class
preds_proba = model.predict_proba(eval_dataset)
# Get predicted RawFormulaVal
preds_raw = model.predict(eval_dataset,
                          prediction_type='RawFormulaVal')

Learning rate set to 0.115486
0:	learn: 1.5859106	test: 1.5841283	best: 1.5841283 (0)	total: 16.1ms	remaining: 32.2s
1:	learn: 1.4448953	test: 1.4409931	best: 1.4409931 (1)	total: 30.6ms	remaining: 30.5s
2:	learn: 1.3433173	test: 1.3378882	best: 1.3378882 (2)	total: 44.7ms	remaining: 29.7s
3:	learn: 1.2630720	test: 1.2567976	best: 1.2567976 (3)	total: 59.9ms	remaining: 29.9s
4:	learn: 1.1958286	test: 1.1885742	best: 1.1885742 (4)	total: 76.7ms	remaining: 30.6s
5:	learn: 1.1407600	test: 1.1325398	best: 1.1325398 (5)	total: 92.6ms	remaining: 30.8s
6:	learn: 1.0950791	test: 1.0865767	best: 1.0865767 (6)	total: 110ms	remaining: 31.3s
7:	learn: 1.0554774	test: 1.0467822	best: 1.0467822 (7)	total: 125ms	remaining: 31.1s
8:	learn: 1.0225472	test: 1.0134850	best: 1.0134850 (8)	total: 140ms	remaining: 30.9s
9:	learn: 0.9916215	test: 0.9822021	best: 0.9822021 (9)	total: 157ms	remaining: 31.2s
10:	learn: 0.9487760	test: 0.9318006	best: 0.9318006 (10)	total: 172ms	remaining: 31.1s
11:	learn: 0.914

In [447]:
accuracy_score(y_valid, preds_class[:,0]) 
# drop 'word' - 0.792953972257251 
# with 'word' in data - 0.7911412358133669
# add ending of word - 0.7991803278688525
# add unique letters - 0.8034363177805801 (best in leaderbord)
# back 'word' + all features - 0.8102931904161412

0.8034363177805801

In [448]:
predict = model.predict(X_test)

In [449]:
predict_series = pd.DataFrame(predict[:,0], columns=['stress'])

In [450]:
final_subm = pd.DataFrame(test['id'])

In [451]:
final_subm = pd.concat([final_subm, predict_series], axis=1)

In [452]:
final_subm

Unnamed: 0,id,stress
0,0,3
1,1,3
2,2,3
3,3,2
4,4,1
...,...,...
29955,29955,2
29956,29956,2
29957,29957,2
29958,29958,3


In [453]:
final_subm.to_csv('./stress_subm.csv', index=False)