# Инициализация

In [None]:
# Mount Google Drive at /content/drive; the CSV and checkpoint paths used
# later in this file all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Notebook shell command: quietly (-q) install the listed packages with pip.
!pip install -q transformers sentencepiece sentence-transformers catboost simpletransformers

In [3]:
import pandas as pd

import torch
import transformers
from torch.utils.data import Dataset, DataLoader
from torch import cuda
from tokenizers import BertWordPieceTokenizer
from transformers import AutoModel, AutoTokenizer

from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

# Обучение

In [4]:
# Load the training CSV from the mounted Drive, then discard its `id` column.
df = pd.read_csv("/content/drive/MyDrive/Sar_hacks_ai/train_dataset_train_new.csv")
df = df.drop(columns=['id'])

In [5]:
# Hyperparameters shared by the cells below.
MAX_LEN = 256            # token length every example is padded/truncated to
TRAIN_BATCH_SIZE = 16    # batch size of the training DataLoader
VALID_BATCH_SIZE = 16    # batch size of the inference DataLoader
EPOCHS = 3               # number of passes over the training set
LEARNING_RATE = 1e-5     # learning rate handed to the optimizer

# Tokenizer of the same pretrained checkpoint the DeBert model loads.
tokenizer = AutoTokenizer.from_pretrained(
    'MoritzLaurer/mDeBERTa-v3-base-mnli-xnli'
)

In [6]:
class Triage(Dataset):
    """Dataset that tokenizes the ``name`` column and pairs it with ``groups``.

    This is the dataset consumed by a ``DataLoader`` (not a loader itself):
    each item is one dataframe row converted to fixed-length tensors.

    Args:
        dataframe: Table with a ``name`` column (text) and a ``groups``
            column (integer class label). Rows are looked up by index label,
            so the frame should carry a 0..n-1 index.
        tokenizer: Object exposing ``encode_plus`` that returns a mapping
            with ``input_ids`` and ``attention_mask``.
        max_len: Length every encoded sequence is padded/truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        """Return ``ids``, ``mask`` and ``targets`` tensors for row ``index``."""
        # Collapse any run of whitespace (including newlines) to one space.
        title = " ".join(str(self.data['name'][index]).split())
        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # `padding='max_length'` replaces the deprecated
            # `pad_to_max_length=True` and pads to `max_length` the same way.
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
        )

        return {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'targets': torch.tensor(self.data['groups'][index], dtype=torch.long),
        }

    def __len__(self):
        """Return the number of rows captured at construction time."""
        return self.len

In [7]:
# Fraction of the dataframe used for training; 1 keeps every row.
train_size = 1

# Draw the rows in a seeded random order and rebuild a 0..n-1 index so the
# dataset can look rows up by position-like labels.
train_dataset = (
    df.sample(frac=train_size, random_state=200)
    .reset_index(drop=True)
)

training_set = Triage(
    dataframe=train_dataset,
    tokenizer=tokenizer,
    max_len=MAX_LEN,
)

In [8]:
# DataLoader settings for training. `shuffle` is enabled so the examples are
# re-ordered at the start of every epoch instead of being replayed in the
# exact same batches each time.
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)

In [9]:
def calcuate_accu(big_idx, targets):
    """Count the positions where the predicted index equals the target.

    Args:
        big_idx: Tensor of predicted class indices.
        targets: Tensor of true class indices, comparable with ``big_idx``.

    Returns:
        The number of matching positions as a Python number.
    """
    hits = big_idx == targets
    return hits.sum().item()

In [10]:
def train(epoch):
    """Run one training epoch over ``training_loader`` and print metrics.

    Uses the module-level ``model``, ``training_loader``, ``device``,
    ``loss_function`` and ``optimizer``.

    Args:
        epoch: Epoch number; only used in the end-of-epoch message.

    Returns:
        None. Running and final loss/accuracy are printed.
    """
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    for step, data in tqdm(enumerate(training_loader, 0)):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.long)

        # Clear gradients left over from the previous step before the new
        # forward/backward pass.
        optimizer.zero_grad()
        outputs = model(ids, mask)
        loss = loss_function(outputs, targets)
        # Back-propagate and apply the update; without these two calls the
        # weights never change and the loop only evaluates the model.
        loss.backward()
        optimizer.step()

        tr_loss += loss.item()
        # Predicted class = index of the largest score in each row.
        big_idx = outputs.detach().argmax(dim=1)
        n_correct += calcuate_accu(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples += targets.size(0)

        # Report running averages every 100 batches (including the first).
        if step % 100 == 0:
            loss_step = tr_loss / nb_tr_steps
            accu_step = (n_correct * 100) / nb_tr_examples
            print(f"Training Loss per 100 steps: {loss_step}")
            print(f"Training Accuracy per 100 steps: {accu_step}")

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss / nb_tr_steps
    epoch_accu = (n_correct * 100) / nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return

In [9]:

class DeBert(torch.nn.Module):
    """Pretrained encoder followed by a two-layer classification head.

    The head takes the encoder's hidden state at the first token position,
    applies ``pre_classifier`` -> ReLU -> dropout -> ``classifier`` and
    returns the raw class scores.

    Args:
        model_name: Checkpoint passed to ``AutoModel.from_pretrained``.
        num_classes: Number of output classes (defaults to 11).
        dropout: Dropout probability applied inside the head.
    """

    def __init__(self,
                 model_name='MoritzLaurer/mDeBERTa-v3-base-mnli-xnli',
                 num_classes=11,
                 dropout=0.3):
        super().__init__()
        self.deberta = AutoModel.from_pretrained(model_name)
        # Size the head from the encoder's config rather than hard-coding
        # 768, so other checkpoints can be used.
        hidden_size = self.deberta.config.hidden_size
        self.pre_classifier = torch.nn.Linear(hidden_size, hidden_size)
        self.dropout = torch.nn.Dropout(dropout)
        self.classifier = torch.nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return class scores with one row per input sequence.

        Args:
            input_ids: Token id tensor forwarded to the encoder.
            attention_mask: Attention mask tensor forwarded to the encoder.
        """
        encoder_out = self.deberta(input_ids=input_ids,
                                   attention_mask=attention_mask)
        hidden_state = encoder_out[0]
        # Use the hidden state at the first token position as the summary
        # of the whole sequence.
        pooled = hidden_state[:, 0]
        pooled = self.pre_classifier(pooled)
        # Functional ReLU: no need to build a new module on every call.
        pooled = torch.relu(pooled)
        pooled = self.dropout(pooled)
        return self.classifier(pooled)

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU, then
# build the classifier and move its parameters to that device.
device = 'cuda' if cuda.is_available() else 'cpu'
model = DeBert()
model.to(device)

In [13]:
# Adam over all model parameters, paired with a cross-entropy loss applied
# to the model's raw class scores.
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
loss_function = torch.nn.CrossEntropyLoss()

In [None]:
# Run the training routine once per epoch, EPOCHS times in total.
for epoch in range (EPOCHS):
    train(epoch)

In [None]:
# Destination file for the trained weights (relative to the working dir).
output_model_file = 'deBert.bin'

model_to_save = model

# torch.save requires the destination as its second argument; the original
# call omitted it and raised a TypeError before anything was written.
torch.save(model_to_save.state_dict(), output_model_file)

print('Данные сохранены')

# Предсказание

In [7]:
# Load the saved state dict onto the CPU first so this also works on a
# runtime without CUDA; the model is moved to the chosen device afterwards.
data = torch.load("/content/drive/MyDrive/Sar_hacks_ai/debert.bin",
                  map_location='cpu')

# The checkpoint stores the dense head layer under `pooler.dense.*`, while
# DeBert names that layer `pre_classifier`; rename the keys so that
# load_state_dict finds them.
data['pre_classifier.weight'] = data.pop('pooler.dense.weight')
data['pre_classifier.bias'] = data.pop('pooler.dense.bias')

In [None]:
# Rebuild the classifier, fill it with the remapped checkpoint weights held
# in `data`, and move it to the GPU when one is available (CPU otherwise).
device = 'cuda' if cuda.is_available() else 'cpu'
model = DeBert()
model.load_state_dict(data)
model.to(device)

In [12]:
# Read the test CSV, drop rows containing missing values and rebuild a
# 0..n-1 index for the dataset's row lookups.
test = (
    pd.read_csv("/content/drive/MyDrive/Sar_hacks_ai/test_dataset_test_new.csv")
    .dropna()
    .reset_index(drop=True)
)
# Triage reads a `groups` value for every row; test rows get a constant 0
# as a placeholder.
test['groups'] = 0

testing_set = Triage(test, tokenizer, MAX_LEN)
# Keep the original row order so predictions line up with the input rows.
test_params = dict(
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,
    num_workers=0,
)
testing_loader = DataLoader(testing_set, **test_params)


In [13]:
def valid(model, testing_loader):
    """Predict a class index for every example served by ``testing_loader``.

    Uses the module-level ``device``.

    Args:
        model: Module called as ``model(ids, mask)`` that returns one row
            of class scores per example.
        testing_loader: Iterable of batches, each a mapping holding
            ``'ids'`` and ``'mask'`` tensors.

    Returns:
        A list with one 1-D tensor of predicted class indices per batch.
    """
    model.eval()
    all_preds = []
    with torch.no_grad():
        for _, data in tqdm(enumerate(testing_loader, 0)):
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            # No squeeze() here: it would collapse a final batch holding a
            # single example to 1-D and make the dim=1 reduction fail.
            outputs = model(ids, mask)
            all_preds.append(outputs.argmax(dim=1))
    return all_preds

In [None]:
# Run inference, then flatten the per-batch prediction tensors into a single
# list of plain Python integers, in batch order.
all_preds = valid(model, testing_loader)
flat_list = [
    label
    for batch_preds in all_preds
    for label in batch_preds.tolist()
]

In [None]:
# Re-read the test CSV with the same dropna() filter used for inference so
# the remaining rows match the predictions one-to-one, keep only `id`,
# attach the predicted `groups` and write the submission file.
test_dataset = pd.read_csv(
    "/content/drive/MyDrive/Sar_hacks_ai/test_dataset_test_new.csv"
).dropna()
test_dataset = test_dataset[['id']].assign(groups=flat_list)
test_dataset.to_csv('sample_solution.csv', index=False)