In [1]:
# import libraries 
import numpy as np
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from catboost import CatBoostClassifier, Pool

In [3]:
# load train and test data
train = pd.read_csv('D:/kaggle_data/Silero/train.csv')
test = pd.read_csv('D:/kaggle_data/Silero/test.csv')
submission = pd.read_csv('D:/kaggle_data/Silero/sample_submission.csv')

In [4]:
# look on the data
train.sample(10)

Unnamed: 0,id,word,stress,num_syllables,lemma
58532,58532,диссертационный,5,6,диссертационный
28058,28058,тревожный,2,3,тревожный
32565,32565,нержавеющая,3,6,нержавеющий
31902,31902,ругали,2,3,ругать
55980,55980,тираж,2,2,тираж
39542,39542,соломонович,3,5,соломонович
17804,17804,входящих,2,3,входить
23033,23033,аккредитивы,4,5,аккредитив
47124,47124,судимости,2,4,судимость
56020,56020,фронтовик,3,3,фронтовик


In [5]:
train.word.nunique(), train.lemma.nunique()

(63438, 23543)

Как видно из полученных выше значений, количество уникальных значений в столбце *'word'* больше чем в 2,5 раза

In [6]:
# look at the different forms of the word
train[train['lemma'] == 'черт']

Unnamed: 0,id,word,stress,num_syllables,lemma
6,6,чёрта,1,2,черт
1628,1628,чертей,2,2,черт
5819,5819,чертям,2,2,черт
10062,10062,черти,1,2,черт
15964,15964,чертом,1,2,черт
56737,56737,чёрту,1,2,черт


In [7]:
# what is maximum syllables in the world
'Max syllables = {}'.format(train['num_syllables'].max())

'Max syllables = 6'

In [8]:
# function for divide the word into syllables
def syllably_div(word, return_syllables=False):
    """divide your word on syllables"""
    syllab_list = []
    check_word = list(word)
    syllab = []
    counter = 0 
    vowels = ['а', 'у', 'о', 'ы', 'э', 'я', 'ю', 'ё', 'и', 'е']
    #print(word)
    for num, letter in enumerate(check_word):    
        if letter in vowels:
            syllab.append(letter)
            #print(syllab, counter)
            counter += 1
            syllab_list.append(syllab)
            syllab = []
        # elif letter in ['й', 'ь', 'ъ']:
        #     if counter > 0:
        #         syllab_list[counter-1].append(letter)
        #     else:
        #        syllab.append(letter)
        else:
            syllab.append(letter)
            #print(syllab)
    if return_syllables==True:
        return np.array([syllab_list, counter], dtype='object')
    return counter

In [9]:
# see how the function works
syllably_div('ньюсмейкеры', return_syllables=True)[0]

[['н', 'ь', 'ю'], ['с', 'м', 'е'], ['й', 'к', 'е'], ['р', 'ы']]

In [10]:
# check correctly dividing into syllables
(train['word'].apply(syllably_div) == train['num_syllables']).sum()/len(train['num_syllables'])

1.0

In [11]:
# divide train and test data words into syllables
syllables_data = np.array([syllably_div(i, return_syllables=True) for i in train['word']])
syllables_data_test = np.array([syllably_div(i, return_syllables=True) for i in test['word']])

In [12]:
# add empty list in each word that to equal length of all words
for i,x in enumerate(syllables_data[:, 0]):  
    while len(syllables_data[:, 0][i]) < 6:
        syllables_data[:, 0][i].append([])

In [13]:
for i,x in enumerate(syllables_data_test[:, 0]):  
    while len(syllables_data_test[:, 0][i]) < 6:
        syllables_data_test[:, 0][i].append([])

In [14]:
# function for creating dataframe with columns of the syllables 
def syllab_dataframe(data):
    one_syllab = []
    two_syllab = []
    three_syllab = []
    four_syllab = []
    five_syllab = []
    six_syllab = []

    for i in data:
        one_syllab.append(''.join(i[0]))
        two_syllab.append(''.join(i[1]))
        three_syllab.append(''.join(i[2])) 
        four_syllab.append(''.join(i[3]))
        five_syllab.append(''.join(i[4])) 
        six_syllab.append(''.join(i[5]))

    syllables_df = pd.DataFrame(np.array([one_syllab, two_syllab, three_syllab, four_syllab, five_syllab, six_syllab], dtype='object'))
    syllables_df = syllables_df.transpose()
    syllables_df.columns =['one_syllab', 'two_syllab', 'three_syllab', 'four_syllab', 'five_syllab', 'six_syllab']
    return syllables_df

In [15]:
# from syllables create dataframe with columns of the syllables 
syllables_df_train = syllab_dataframe(syllables_data[:, 0])
syllables_df_test = syllab_dataframe(syllables_data_test[:, 0])

In [16]:
# some functions for adding features: end of each word and unique letter in each word

def word_ending(word, lemma):
    check_word = list(word)
    check_lemma = list(lemma)
    diff = abs(len(check_lemma) - len(check_word))
    if diff != 0:
        return ''.join(check_word[-diff:])
    else:
        return ''

def unique_letters(word, lemma):
    set_word = set(word)
    set_lemma = set(lemma)
    unique = []
    for i in set_word:
        if i not in set_lemma:
            unique.append(i)
    if len(unique) != 0:
        return ''.join(unique)
    else:
        return ''    

In [17]:
# concatenate the created data with train and test dataframe and add some features by functions
X_train = pd.concat([train, syllables_df_train], axis=1)
X_test = pd.concat([test, syllables_df_test], axis=1)
X_train['ending of word'] = X_train.apply(lambda x: word_ending(x.word, x.lemma), axis=1)
X_test['ending of word'] = X_test.apply(lambda x: word_ending(x.word, x.lemma), axis=1)
X_train['unique letters'] = X_train.apply(lambda x: unique_letters(x.word, x.lemma), axis=1)
X_test['unique letters'] = X_test.apply(lambda x: unique_letters(x.word, x.lemma), axis=1)

In [18]:
y_train = X_train['stress']
# also drop 'word' because we divided words into syllables and add them to data
X_train.drop(['stress', 'id', 'word'], axis=1, inplace=True) 
X_test.drop(['id', 'word'], axis=1, inplace=True)

In [19]:
# divide training dataset into training and validation dataset
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, train_size=0.8, 
                                                      test_size=0.2, random_state=2022)

In [20]:
# look at the data
X_train.head(5)

Unnamed: 0,num_syllables,lemma,one_syllab,two_syllab,three_syllab,four_syllab,five_syllab,six_syllab,ending of word,unique letters
56344,3,поддерживать,по,дде,ржи,,,,жим,м
39602,3,бомбежка,бо,мбе,жки,,,,,и
774,4,заканчивать,за,ка,нчи,ва,,,л,л
10448,4,открываться,о,ткры,ва,ться,,,,
1836,3,расправа,ра,спра,во,,,,й,йо


In [21]:
X_test.head(5)

Unnamed: 0,num_syllables,lemma,one_syllab,two_syllab,three_syllab,four_syllab,five_syllab,six_syllab,ending of word,unique letters
0,5,эпилепсия,э,пи,ле,пси,я,,,
1,5,относиться,о,тно,ся,ще,йся,,я,йще
2,6,размышление,ра,змы,шле,ни,я,ми,ми,я
3,3,модем,мо,де,мы,,,,ы,ы
4,1,солнце,со,,,,,,ц,


In [22]:
# point which features are categorical
cat_features = [X_train.columns.to_list().index(cname) for cname in X_test.columns if
                    X_test[cname].dtype == "object"]
print(cat_features)

[1, 2, 3, 4, 5, 6, 7, 8, 9]


In [23]:
# create training and evaluation dataset by Pool and
# use the classifier by CatBoost with some hyperparameters
train_data = Pool(data=X_train, label=y_train, cat_features=cat_features)

eval_dataset = Pool(data=X_valid,
                    label=y_valid,
                    cat_features=cat_features)
                    

model = CatBoostClassifier(iterations=3000,
                           random_seed=2022,
                           depth=8, 
                           #learning_rate=0.03,
                           #l2_leaf_reg=7,
                           loss_function='MultiClass',
                           eval_metric='Accuracy',
                           task_type="GPU",
                           devices='0:1'
                           )

# Fit model
model.fit(train_data, use_best_model=True, eval_set=eval_dataset, verbose=300)
          
# Get predicted classes
preds_class = model.predict(eval_dataset)

Learning rate set to 0.096734
0:	learn: 0.6025419	test: 0.6067938	best: 0.6067938 (0)	total: 33.8ms	remaining: 1m 41s
300:	learn: 0.8013793	test: 0.7879887	best: 0.7879887 (300)	total: 12.7s	remaining: 1m 54s
600:	learn: 0.8444729	test: 0.7993380	best: 0.8005202 (593)	total: 24.8s	remaining: 1m 38s
900:	learn: 0.8757635	test: 0.8037516	best: 0.8045397 (882)	total: 36.4s	remaining: 1m 24s
1200:	learn: 0.8992315	test: 0.8049338	best: 0.8057219 (1034)	total: 47s	remaining: 1m 10s
1500:	learn: 0.9183251	test: 0.8074559	best: 0.8080076 (1432)	total: 58.4s	remaining: 58.3s
1800:	learn: 0.9333990	test: 0.8084016	best: 0.8087957 (1788)	total: 1m 11s	remaining: 47.3s
2100:	learn: 0.9452611	test: 0.8084805	best: 0.8093474 (2039)	total: 1m 23s	remaining: 35.5s
2400:	learn: 0.9547783	test: 0.8071406	best: 0.8093474 (2039)	total: 1m 34s	remaining: 23.5s
2700:	learn: 0.9618719	test: 0.8068253	best: 0.8093474 (2039)	total: 1m 44s	remaining: 11.6s
2999:	learn: 0.9682562	test: 0.8072194	best: 0.8093474

In [24]:
# check accuracy
accuracy_score(y_valid, preds_class[:,0]) 
# drop 'word' - 0.792953972257251 
# with 'word' in data - 0.7911412358133669
# add ending of word - 0.7991803278688525
# add unique letters - 0.8034363177805801
# back 'word' + all features - 0.8102931904161412
# tuning hyperparameters CatBoost - 0.810608448928121 (best in leaderbord)

0.8093474148802018

In [25]:
# predict stresses of the words on the test data 
predict = model.predict(X_test)

In [26]:
# create dataset with predict stresses of words
predict_series = pd.DataFrame(predict[:,0], columns=['stress'])

In [27]:
# create dataset with id from test data
final_subm = pd.DataFrame(test['id'])

In [28]:
# concatenate id and stresses of words 
final_subm = pd.concat([final_subm, predict_series], axis=1)

In [29]:
# look on the submission dataset
final_subm.tail(5)

Unnamed: 0,id,stress
29955,29955,2
29956,29956,2
29957,29957,2
29958,29958,3
29959,29959,4


In [30]:
# create .csv file
final_subm.to_csv('./stress_subm.csv', index=False)