<a href="https://colab.research.google.com/github/ko74dev/rosstat/blob/main/rosstat.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Загрузим нужные библиотеки

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from tensorflow.keras import utils
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.metrics import Recall 

## Загрузим датасет и рассмотрим его поближе

In [3]:
# Colab-only step: mount Google Drive at /content/drive so that the dataset
# CSV files stored under MyDrive can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Read the training table from the mounted Drive and display (rows, columns).
train_csv_path = "/content/drive/MyDrive/train_dataset_train.csv"
df = pd.read_csv(train_csv_path)
df.shape

(658064, 3)

In [5]:
# Column names, non-null counts and dtypes of the training frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 658064 entries, 0 to 658063
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   id      658064 non-null  int64 
 1   name    658064 non-null  object
 2   groups  658064 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 15.1+ MB


## Предобработка тренировочного датасета
Выделим целевую переменную

In [6]:
# One-hot encode the target: one indicator column per distinct value of
# `groups`; the column labels are the original group ids.
# The dtype is pinned explicitly because the default differs between pandas
# versions (bool since pandas 2.0, uint8 before); a float dtype keeps the
# targets numeric regardless of the installed version.
y = pd.get_dummies(df['groups'], dtype='float32')

Проведём токенизацию текста и дополним последовательности до максимальной длины

In [7]:
# Maximum number of words (passed to the tokenizer and, later, to the
# embedding layer as its vocabulary size).
num_words = 500000
# Maximum sequence length, used when padding the token sequences.
max_len = 20

# Fit the vocabulary on the `name` column of the training frame.
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(df['name'])

# Texts -> lists of token ids -> matrix padded to `max_len` columns.
sequences = tokenizer.texts_to_sequences(df['name'])
x = pad_sequences(sequences, maxlen=max_len)

## Создание и тренировка модели

In [8]:
# Size of the output layer is taken from the one-hot target frame instead of
# being hard-coded, so the head always matches the number of columns in `y`.
num_classes = y.shape[1]

# Embedding -> spatial dropout -> two stacked LSTMs -> one sigmoid score per class.
model = Sequential([
    Embedding(num_words, 192, input_length=max_len),
    SpatialDropout1D(0.5),
    # return_sequences=True so the second LSTM receives the whole sequence
    # rather than only the last step.
    LSTM(64, return_sequences=True, dropout=.2),
    LSTM(32),
    Dense(num_classes, activation='sigmoid'),
])

In [9]:
# Adam optimizer, binary cross-entropy loss, recall as the tracked metric.
# NOTE(review): the targets are one-hot columns from get_dummies, so softmax +
# categorical_crossentropy would be the conventional pairing — confirm before
# changing, since the reported metrics were produced with this setup.
compile_kwargs = dict(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[Recall()],
)
model.compile(**compile_kwargs)

In [10]:
# Hold out 10% of the rows for validation; the fixed random_state keeps the
# split the same on every run.
split = train_test_split(x, y, test_size=0.1, random_state=42)
X_train, X_test, y_train, y_test = split

In [11]:
# Renumber the target frames 0..n-1 so that their index values coincide with
# row positions in X_train / X_test (the training loop uses the index values
# of y_train to select rows of X_train).
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

In [12]:
# Train in rounds with the dominant class undersampled: every round uses all
# rows outside MAJORITY_CLASS plus one disjoint 1/N_ROUNDS slice of the rows
# inside it, so each majority-class row is used in exactly one round.
MAJORITY_CLASS = 10        # label of the one-hot column that is subsampled
N_ROUNDS = 10              # number of rounds == stride of the majority slices
N_TWO_EPOCH_ROUNDS = 5     # the first rounds train for 2 epochs, the rest for 1
SHUFFLE_SEED = 42          # seed for the per-round row order

# Row positions (not index labels), so X_train and y_train are always
# addressed consistently whatever y_train's index happens to be.
majority_flag = y_train[MAJORITY_CLASS].to_numpy()
minority_pos = np.flatnonzero(majority_flag == 0)
majority_pos = np.flatnonzero(majority_flag == 1)

# Local seeded generator: the row order of every round is reproducible and
# does not depend on (or disturb) NumPy's global random state.
rng = np.random.default_rng(SHUFFLE_SEED)

for i in range(N_ROUNDS):
    ids = np.concatenate([minority_pos, majority_pos[i::N_ROUNDS]])
    rng.shuffle(ids)
    model.fit(X_train[ids], y_train.iloc[ids],
              validation_data=(X_test, y_test),
              epochs=2 if i < N_TWO_EPOCH_ROUNDS else 1,
              batch_size=512)

Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2


## Анализ работы модели на валидационных данных

In [13]:
# Score the held-out split and print per-class precision / recall / F1.
y_pred = model.predict(X_test)

# Column position -> original group id, taken from the one-hot column labels.
class_labels = y.columns.to_numpy()
true_groups = pd.Series(class_labels[y_test.values.argmax(axis=1)])
pred_groups = pd.Series(class_labels[y_pred.argmax(axis=1)])

print(classification_report(true_groups, pred_groups))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      2150
           1       0.95      0.98      0.96      1715
           2       0.96      0.94      0.95       371
           3       0.98      0.99      0.99      2137
           4       0.98      1.00      0.99      2775
           6       0.92      0.98      0.95       953
           7       0.90      0.97      0.94      1218
           9       0.99      0.99      0.99      4223
          10       1.00      0.99      0.99     50265

    accuracy                           0.99     65807
   macro avg       0.96      0.98      0.97     65807
weighted avg       0.99      0.99      0.99     65807



## Подготовка тестового датасета

In [14]:
# Read the test table from the mounted Drive.
test_csv_path = "/content/drive/MyDrive/test_dataset_test.csv"
test = pd.read_csv(test_csv_path)

In [15]:
# Convert the test names to token-id sequences with the tokenizer that was
# fitted on the training names (it is not re-fitted here).
test_sequences = tokenizer.texts_to_sequences(test['name'])

In [16]:
# Pad the test sequences with the same max_len as the training data.
# Note: lower-case `x_test` is the test-file input; upper-case `X_test` is
# the validation part of the train/test split.
x_test = pad_sequences(test_sequences, maxlen=max_len)

In [17]:
# Model outputs for the test-file rows, predicted in batches of 512.
pred = model.predict(x_test, batch_size=512)

In [18]:
# Take the highest-scoring output column for every row and translate its
# position back to the original group id (the one-hot column labels of `y`).
group_labels = y.columns.to_numpy()
test['groups'] = group_labels[pred.argmax(axis=1)]

In [19]:
# Write the submission file with the `id` and predicted `groups` columns.
# index=False (the documented boolean form) keeps the row index out of the CSV.
test[['id', 'groups']].to_csv('sub.csv', index=False)