# Подготовка окружения

## Установка пакетов и импорт зависимостей

In [None]:
!pip install datasets
!pip install pandas
!pip install faiss-gpu
!pip install faiss-cpu
!pip install flask
!pip install pyngrok
!pip install pytest
!pip install plotly
!pip install optuna

Collecting datasets
  Downloading datasets-2.17.1-py3-none-any.whl (536 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.7/536.7 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, datasets
Successfully installed datasets-2.17.1 dill-0.3.8 multiprocess-0.70.16
Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m20.7 MB/s[0m eta 

In [None]:
import sys
import random
from multiprocessing import cpu_count
from getpass import getpass

import logging

from typing import Tuple
from google.colab import drive
from joblib import dump, load

import faiss
import torch
from transformers import AutoTokenizer

## Настройка окружения

In [None]:
# Mount Google Drive at /content/drive; the project workspace paths used below live under it.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the pretrained "distilbert-base-uncased" tokenizer and persist it with joblib
# at the location given by constants.TOKENIZER_PATH.
# NOTE(review): `constants` is first assigned further down, in the cell that calls
# clone_pull_github_src(); on a fresh kernel this cell raises NameError unless that cell runs first.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
dump(tokenizer, constants.TOKENIZER_PATH)

## Клонирование исходных кодов проекта

In [None]:
def clone_pull_github_src(pull: bool = True):
    """
    Клонирует или обновляет репозиторий GitHub в локальный каталог для последующей работы.

    Parameters:
    pull (bool): Указывает, следует ли выполнять pull для существующего репозитория. Если True, выполняется git pull.
                 Если False, репозиторий клонируется в указанный каталог.

    Returns:
    Constants: Экземпляр класса Constants, содержащий константы проекта.
    """
    WORKSPACE_PATH = '/content/drive/MyDrive/docs/keepForever/mipt/nlp/hw1_4sem/'
    WORKSPACE_TMP = WORKSPACE_PATH + '/tmp/'
    GIT_HUB_PROJECT_PATH = WORKSPACE_PATH + 'code/'

    token = getpass('Введите GitHub token: ')
    repo_url = 'https://github.com/km-mipt-nlp-gen/hw1.git'
    repo_url_with_token = repo_url.replace('https://', f'https://{token}@')

    os.chdir(GIT_HUB_PROJECT_PATH)

    if pull:
        !git pull origin main
    else:
        !git clone {repo_url_with_token} "$GIT_HUB_PROJECT_PATH"

    del token

    sys.path.append(f"{GIT_HUB_PROJECT_PATH}/web_app/src/")
    from constants_module import Constants

    return Constants()

# Update the local checkout (default pull=True) and keep the returned Constants instance.
constants = clone_pull_github_src()
# These project imports rely on the sys.path entry added by the call above.
from constants_module import Constants
from chat_util_module import ChatUtil

Введите GitHub token: ··········
remote: Enumerating objects: 8, done.[K
remote: Counting objects: 100% (8/8), done.[K
remote: Compressing objects: 100% (5/5), done.[K
remote: Total 5 (delta 1), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (5/5), 74.47 KiB | 990.00 KiB/s, done.
From https://github.com/km-mipt-nlp-gen/hw1
 * branch            main       -> FETCH_HEAD
   438840f..10e324f  main       -> origin/main
Updating 438840f..10e324f
Fast-forward
 ml/notebook/features_preprocessing_notebook.ipynb | 9415 [32m+++++++++++++++++++++++++++++++++++++++++[m
 1 file changed, 9415 insertions(+)
 create mode 100644 ml/notebook/features_preprocessing_notebook.ipynb


# Тренировка моделей и создание эмбеддингов

## Обновление модулей

In [None]:
def reload_modules(constants: Constants) -> Tuple[Constants, ChatUtil]:
    """
    Reload the project's modules so that code changes are picked up without restarting the runtime.

    Besides returning fresh objects, the function rebinds the project classes imported below
    (chat service classes, training pipelines, models) at module level, so that the cells
    following this one can use them by name.

    Parameters:
    constants (Constants): Current Constants instance; only its GIT_HUB_PROJECT_PATH is read.

    Returns:
    Tuple[Constants, ChatUtil]: A new Constants instance and a ChatUtil constructed with
    ``logging.DEBUG`` and that new instance.
    """
    # Without this declaration the `from ... import ...` statements below would bind
    # function-local names only, leaving the classes undefined for the rest of the notebook.
    global Constants, ChatUtil, ChatServiceAccelerator, ChatRepository, ChatService, ChatController
    global SiameseBiEncoderTrainingPipeline, CrossEncoderTrainingPipeline, SiameseBiEncoder, CrossEncoder

    import importlib
    import sys

    # Make both source roots importable; repeated calls must not keep growing sys.path.
    for sub_dir in ("web_app/src/", "ml/src/train/"):
        src_path = f"{constants.GIT_HUB_PROJECT_PATH}/{sub_dir}"
        if src_path not in sys.path:
            sys.path.append(src_path)

    # Import everything first, then reload in this exact order.
    # NOTE(review): modules that import from constants_module / chat_util_module are reloaded
    # before those two here; if stale references show up, consider reloading dependencies first.
    module_names = (
        "chat_service_accelerator_module",
        "chat_repository_module",
        "chat_service_module",
        "chat_controller_module",
        "chat_util_module",
        "constants_module",
        "models_zoo_module",
        "siamese_bi_encoder_training_pipeline_module",
        "cross_encoder_training_pipeline_module",
    )
    loaded_modules = [importlib.import_module(module_name) for module_name in module_names]
    for loaded_module in loaded_modules:
        importlib.reload(loaded_module)

    # Re-import the classes from the freshly reloaded modules (bound globally, see above).
    from constants_module import Constants
    from chat_util_module import ChatUtil
    from chat_service_accelerator_module import ChatServiceAccelerator
    from chat_repository_module import ChatRepository
    from chat_service_module import ChatService
    from chat_controller_module import ChatController
    from siamese_bi_encoder_training_pipeline_module import SiameseBiEncoderTrainingPipeline
    from cross_encoder_training_pipeline_module import CrossEncoderTrainingPipeline
    from models_zoo_module import SiameseBiEncoder
    from models_zoo_module import CrossEncoder

    constants = Constants()

    return constants, ChatUtil(logging.DEBUG, constants)

# Refresh the project modules and rebuild the shared `constants` / `chat_util` objects.
constants, chat_util = reload_modules(constants)
# Load the preprocessed dataset (joblib file) that is passed to the training pipelines below.
preprocessed_data = load(constants.PROCESSED_QA_PATH)

DEVICE: cuda:0
Число процессов для использования: 12


## Тренировка моделей

### Тренировка Siamese Bi-Encoder

#### Кросс-валидационный поиск в пространстве гиперпараметров Siamese Bi-Encoder и тренировка на основе лучших параметров

In [None]:
# Hyper-parameter search for the Siamese bi-encoder (4 trials, 1 epoch each, validation
# every 32 steps); the result is indexed by parameter name in the next cell.
best_params = SiameseBiEncoderTrainingPipeline(
    preprocessed_data,
    constants,
    chat_util,
).do_hyperparam_search(
    SiameseBiEncoder,
    n_trials=4,
    n_epochs=1,
    val_interval=32,
)

INFO:chat_util_module:Старт поиска в пространстве гиперпараметров..
[I 2024-02-25 04:52:30,855] A new study created in memory with name: no-name-6ff33eb1-a768-422a-8052-79d9e802a5f7
INFO:chat_util_module:Поиск гиперпараметров: кросс-валидация для набора параметров "opt_learning_rate"=1.0029518072924353e-05; "scheduler_type"=linear
INFO:chat_util_module:Поиск гиперпараметров: кросс-валидация - обучение на KFold 1
INFO:chat_util_module:Training step     0/602, loss =  0.674
INFO:chat_util_module:Validation step     0/602, val_loss =  0.696
INFO:chat_util_module:Training step    32/602, loss =  0.678
INFO:chat_util_module:Validation step    32/602, val_loss =  0.676
INFO:chat_util_module:Training step    64/602, loss =  0.347
INFO:chat_util_module:Validation step    64/602, val_loss =  0.314
INFO:chat_util_module:Training step    96/602, loss =  0.136
INFO:chat_util_module:Validation step    96/602, val_loss =  0.055
INFO:chat_util_module:Training step   128/602, loss =  0.014
INFO:chat_u

INFO:chat_util_module:Поиск гиперпараметров: кросс-валидация - обучение на KFold 2
INFO:chat_util_module:Training step     0/602, loss =  0.695
INFO:chat_util_module:Validation step     0/602, val_loss =  0.690
INFO:chat_util_module:Training step    32/602, loss =  0.658
INFO:chat_util_module:Validation step    32/602, val_loss =  0.673
INFO:chat_util_module:Training step    64/602, loss =  0.370
INFO:chat_util_module:Validation step    64/602, val_loss =  0.311
INFO:chat_util_module:Training step    96/602, loss =  0.033
INFO:chat_util_module:Validation step    96/602, val_loss =  0.052
INFO:chat_util_module:Training step   128/602, loss =  0.012
INFO:chat_util_module:Validation step   128/602, val_loss =  0.033
INFO:chat_util_module:Training step   160/602, loss =  0.011
INFO:chat_util_module:Validation step   160/602, val_loss =  0.029
INFO:chat_util_module:Training step   192/602, loss =  0.009
INFO:chat_util_module:Validation step   192/602, val_loss =  0.026
INFO:chat_util_module

[I 2024-02-25 05:05:01,329] Trial 0 finished with value: 0.020892692084228497 and parameters: {'opt_learning_rate': 1.0029518072924353e-05, 'scheduler_type': 'linear'}. Best is trial 0 with value: 0.020892692084228497.
INFO:chat_util_module:Поиск гиперпараметров: кросс-валидация для набора параметров "opt_learning_rate"=7.390685482893155e-06; "scheduler_type"=linear
INFO:chat_util_module:Поиск гиперпараметров: кросс-валидация - обучение на KFold 1
INFO:chat_util_module:Training step     0/602, loss =  0.700
INFO:chat_util_module:Validation step     0/602, val_loss =  0.694
INFO:chat_util_module:Training step    32/602, loss =  0.683
INFO:chat_util_module:Validation step    32/602, val_loss =  0.685
INFO:chat_util_module:Training step    64/602, loss =  0.583
INFO:chat_util_module:Validation step    64/602, val_loss =  0.587
INFO:chat_util_module:Training step    96/602, loss =  0.087
INFO:chat_util_module:Validation step    96/602, val_loss =  0.109
INFO:chat_util_module:Training step 

INFO:chat_util_module:Поиск гиперпараметров: кросс-валидация - обучение на KFold 2
INFO:chat_util_module:Training step     0/602, loss =  0.725
INFO:chat_util_module:Validation step     0/602, val_loss =  0.705
INFO:chat_util_module:Training step    32/602, loss =  0.702
INFO:chat_util_module:Validation step    32/602, val_loss =  0.690
INFO:chat_util_module:Training step    64/602, loss =  0.605
INFO:chat_util_module:Validation step    64/602, val_loss =  0.606
INFO:chat_util_module:Training step    96/602, loss =  0.119
INFO:chat_util_module:Validation step    96/602, val_loss =  0.123
INFO:chat_util_module:Training step   128/602, loss =  0.025
INFO:chat_util_module:Validation step   128/602, val_loss =  0.061
INFO:chat_util_module:Training step   160/602, loss =  0.018
INFO:chat_util_module:Validation step   160/602, val_loss =  0.053
INFO:chat_util_module:Training step   192/602, loss =  0.019
INFO:chat_util_module:Validation step   192/602, val_loss =  0.048
INFO:chat_util_module

[I 2024-02-25 05:17:31,892] Trial 1 finished with value: 0.024327071058715496 and parameters: {'opt_learning_rate': 7.390685482893155e-06, 'scheduler_type': 'linear'}. Best is trial 0 with value: 0.020892692084228497.
INFO:chat_util_module:Поиск гиперпараметров: кросс-валидация для набора параметров "opt_learning_rate"=1.4634158668829792e-05; "scheduler_type"=cosine
INFO:chat_util_module:Поиск гиперпараметров: кросс-валидация - обучение на KFold 1
INFO:chat_util_module:Training step     0/602, loss =  0.707
INFO:chat_util_module:Validation step     0/602, val_loss =  0.704
INFO:chat_util_module:Training step    32/602, loss =  0.645
INFO:chat_util_module:Validation step    32/602, val_loss =  0.673
INFO:chat_util_module:Training step    64/602, loss =  0.114
INFO:chat_util_module:Validation step    64/602, val_loss =  0.115
INFO:chat_util_module:Training step    96/602, loss =  0.028
INFO:chat_util_module:Validation step    96/602, val_loss =  0.040
INFO:chat_util_module:Training step 

INFO:chat_util_module:Поиск гиперпараметров: кросс-валидация - обучение на KFold 2
INFO:chat_util_module:Training step     0/602, loss =  0.716
INFO:chat_util_module:Validation step     0/602, val_loss =  0.695
INFO:chat_util_module:Training step    32/602, loss =  0.713
INFO:chat_util_module:Validation step    32/602, val_loss =  0.667
INFO:chat_util_module:Training step    64/602, loss =  0.256
INFO:chat_util_module:Validation step    64/602, val_loss =  0.130
INFO:chat_util_module:Training step    96/602, loss =  0.047
INFO:chat_util_module:Validation step    96/602, val_loss =  0.079
INFO:chat_util_module:Training step   128/602, loss =  0.011
INFO:chat_util_module:Validation step   128/602, val_loss =  0.054
INFO:chat_util_module:Training step   160/602, loss =  0.035
INFO:chat_util_module:Validation step   160/602, val_loss =  0.024
INFO:chat_util_module:Training step   192/602, loss =  0.006
INFO:chat_util_module:Validation step   192/602, val_loss =  0.027
INFO:chat_util_module

[I 2024-02-25 05:30:00,405] Trial 2 finished with value: 0.02106863793289866 and parameters: {'opt_learning_rate': 1.4634158668829792e-05, 'scheduler_type': 'cosine'}. Best is trial 0 with value: 0.020892692084228497.
INFO:chat_util_module:Поиск гиперпараметров: кросс-валидация для набора параметров "opt_learning_rate"=5.338949588464445e-06; "scheduler_type"=linear
INFO:chat_util_module:Поиск гиперпараметров: кросс-валидация - обучение на KFold 1
INFO:chat_util_module:Training step     0/602, loss =  0.695
INFO:chat_util_module:Validation step     0/602, val_loss =  0.692
INFO:chat_util_module:Training step    32/602, loss =  0.673
INFO:chat_util_module:Validation step    32/602, val_loss =  0.676
INFO:chat_util_module:Training step    64/602, loss =  0.595
INFO:chat_util_module:Validation step    64/602, val_loss =  0.568
INFO:chat_util_module:Training step    96/602, loss =  0.127
INFO:chat_util_module:Validation step    96/602, val_loss =  0.138
INFO:chat_util_module:Training step  

INFO:chat_util_module:Поиск гиперпараметров: кросс-валидация - обучение на KFold 2
INFO:chat_util_module:Training step     0/602, loss =  0.696
INFO:chat_util_module:Validation step     0/602, val_loss =  0.704
INFO:chat_util_module:Training step    32/602, loss =  0.694
INFO:chat_util_module:Validation step    32/602, val_loss =  0.693
INFO:chat_util_module:Training step    64/602, loss =  0.673
INFO:chat_util_module:Validation step    64/602, val_loss =  0.667
INFO:chat_util_module:Training step    96/602, loss =  0.474
INFO:chat_util_module:Validation step    96/602, val_loss =  0.468
INFO:chat_util_module:Training step   128/602, loss =  0.076
INFO:chat_util_module:Validation step   128/602, val_loss =  0.102
INFO:chat_util_module:Training step   160/602, loss =  0.161
INFO:chat_util_module:Validation step   160/602, val_loss =  0.052
INFO:chat_util_module:Training step   192/602, loss =  0.020
INFO:chat_util_module:Validation step   192/602, val_loss =  0.043
INFO:chat_util_module

[I 2024-02-25 05:42:29,623] Trial 3 finished with value: 0.02589993287427846 and parameters: {'opt_learning_rate': 5.338949588464445e-06, 'scheduler_type': 'linear'}. Best is trial 0 with value: 0.020892692084228497.
INFO:chat_util_module:Поиск в пространстве гиперпараметров завершен.
INFO:chat_util_module:Лучший Learning Rate: 1.0029518072924353e-05
INFO:chat_util_module:Лучший Scheduler Type: linear


In [None]:
# Pull the two tuned values out of the search result for the final training run.
user_opt_lr = best_params['opt_learning_rate']
scheduler_type = best_params['scheduler_type']

In [None]:
# Final Siamese bi-encoder training with the learning rate and scheduler type chosen above.
# The call yields the trained model plus two loss histories.
(
    bi_encoder_model,
    all_train_batch_losses,
    all_mean_val_losses_per_val_interval,
) = SiameseBiEncoderTrainingPipeline(preprocessed_data, constants, chat_util).train(
    val_interval=32,
    n_epochs=1,
    user_opt_lr=user_opt_lr,
    user_scheduler_type=scheduler_type,
)

INFO:chat_util_module:Установлены данные гиперпараметров. Начать обучение модели..
INFO:chat_util_module:Training step     0/963, loss =  0.695
INFO:chat_util_module:Validation step     0/963, val_loss =  0.692
INFO:chat_util_module:Training step    32/963, loss =  0.659
INFO:chat_util_module:Validation step    32/963, val_loss =  0.683
INFO:chat_util_module:Training step    64/963, loss =  0.544
INFO:chat_util_module:Validation step    64/963, val_loss =  0.552
INFO:chat_util_module:Training step    96/963, loss =  0.036
INFO:chat_util_module:Validation step    96/963, val_loss =  0.066
INFO:chat_util_module:Training step   128/963, loss =  0.009
INFO:chat_util_module:Validation step   128/963, val_loss =  0.055
INFO:chat_util_module:Training step   160/963, loss =  0.057
INFO:chat_util_module:Validation step   160/963, val_loss =  0.031
INFO:chat_util_module:Training step   192/963, loss =  0.009
INFO:chat_util_module:Validation step   192/963, val_loss =  0.025
INFO:chat_util_module

#### Сохранение Siamese-Bi-Encoder в файловую систему

In [None]:
# Persist only the bi-encoder's state_dict at the path given by constants.BI_ENCODER_MODEL_PATH_AUX.
torch.save(bi_encoder_model.state_dict(), constants.BI_ENCODER_MODEL_PATH_AUX)

### Тренировка Cross-Encoder

In [None]:
# Build the cross-encoder training pipeline from the same data, constants and chat_util objects.
cross_encoder_pipeline = CrossEncoderTrainingPipeline(preprocessed_data, constants, chat_util)

In [None]:
# Train the cross-encoder, validating every 32 steps.
# NOTE(review): the two loss-history names are the same ones assigned by the Siamese bi-encoder
# training cell, so the bi-encoder histories are no longer reachable after this cell runs.
cross_encoder_model, all_train_batch_losses, all_mean_val_losses_per_val_interval = cross_encoder_pipeline.train(val_interval=32)

INFO:chat_util_module:Training step     0/963, loss =  0.572
INFO:chat_util_module:Validation step     0/963, val_loss =  0.823
INFO:chat_util_module:Training step    32/963, loss =  0.198
INFO:chat_util_module:Validation step    32/963, val_loss =  0.238
INFO:chat_util_module:Training step    64/963, loss =  0.164
INFO:chat_util_module:Validation step    64/963, val_loss =  0.159
INFO:chat_util_module:Training step    96/963, loss =  0.087
INFO:chat_util_module:Validation step    96/963, val_loss =  0.067
INFO:chat_util_module:Training step   128/963, loss =  0.008
INFO:chat_util_module:Validation step   128/963, val_loss =  0.029
INFO:chat_util_module:Training step   160/963, loss =  0.015
INFO:chat_util_module:Validation step   160/963, val_loss =  0.014
INFO:chat_util_module:Training step   192/963, loss =  0.010
INFO:chat_util_module:Validation step   192/963, val_loss =  0.009
INFO:chat_util_module:Training step   224/963, loss =  0.011
INFO:chat_util_module:Validation step   224

#### Сохранение Cross-Encoder в файловую систему

In [None]:
# Persist only the cross-encoder's state_dict at the path given by constants.CROSS_ENCODER_MODEL_PATH_AUX.
torch.save(cross_encoder_model.state_dict(), constants.CROSS_ENCODER_MODEL_PATH_AUX)

## Создание эмбеддингов и индексов на основе обученных моделей (сохранение в файловую систему)

In [None]:
# Load the target character's preprocessed question/answer data (joblib files).
target_char_questions_and_answers = load(constants.TARGET_CHAR_PROCESSED_QA_PATH)
target_char_answers = load(constants.TARGET_CHAR_PROCESSED_ANSWERS_PATH)

# The accelerator receives both trained models together with the loaded data.
chat_service_accelerator = ChatServiceAccelerator(
    bi_encoder_model,
    cross_encoder_model,
    target_char_questions_and_answers,
    target_char_answers,
    constants,
    chat_util,
)

# Embeddings for the QA pairs and for the answers; each call is given its own output path.
training_data_embeddings = chat_service_accelerator.preprocess_training_data_embeddings(
    target_char_questions_and_answers,
    path=constants.TARGET_CHAR_QA_PAIRS_EMBEDDINGS_PATH_AUX,
)
chat_service_accelerator.preprocess_answers_embeddings(
    target_char_answers,
    path=constants.TARGET_CHAR_ANSWERS_EMBEDDINGS_PATH_AUX,
)

# Two FAISS indexes over the QA-pair embeddings; the value of the last call is the cell's output.
chat_service_accelerator.create_faiss_index(
    training_data_embeddings,
    gpu_index=constants.GPU_FAISS_INDEX,
    path=constants.TARGET_CHAR_QA_PAIRS_FAISS_INDEX_PATH_AUX,
)
chat_service_accelerator.create_faiss_psa_index(
    training_data_embeddings,
    gpu_index=constants.GPU_FAISS_INDEX,
    path=constants.TARGET_CHAR_QA_PAIRS_FAISS_PSA_INDEX_PATH_AUX,
)

<faiss.swigfaiss_avx2.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x7b9e220467c0> >