In [1]:
import torch

if torch.cuda.is_available():       
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
Device name: Tesla T4


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!pip install sentencepiece 
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.97
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.2-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m55.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m108.7 MB/s[0m eta [36m0:00:00

In [4]:
import torch
import torch.nn as nn
import os
import matplotlib.pyplot as plt
from copy import deepcopy
import torch.optim as optim
import random
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from torch.cuda.amp import autocast, GradScaler
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup

os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [5]:
# !unzip RuCoS.zip -d  /content

## Loading the dataset

In [5]:
import json
import os, re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [6]:
lst = ['"', "'", '«', '»', '/', ')', '(']

In [7]:
def prepeare_text(text, lst):
  for i in lst:
    text = text.replace(i, ' ')
  text = ' '.join(text.split())
  text = text.replace(".", ". ")
  text = text.replace(",", ", ")
  text = text.replace("!", "! ")
  text = text.replace("?", "? ")

  return text

def get_X_y_for_bert(data_json_file):
    idx, X_a, X_b, y = [], [], [], []
    with open(data_json_file, 'r') as json_file:
        json_list = list(json_file)
        for json_str in json_list:
            item = json.loads(json_str)
            text = item['passage']['text']
            text1 = item['passage']['text']
            new_text = []
            for i in text.split():
                if '@' not in i:
                    new_text.append(i)
            text = ' '.join(new_text)
            text = prepeare_text(text, lst)
            entities = item['passage']['entities']
            for query in item['qas']:
                ques = query['query']
                text = prepeare_text(ques, lst)
                for i in range(len(entities)):
                    entities[i]['text'] = text1[entities[i]['start']: entities[i]['end']]
                for i in entities:
                    if i in query['answers']:
                        y.append(1)
                        X_a.append(text)
                        X_b.append(ques.replace('@placeholder', i['text']))
                        idx.append(0)
                    else:
                        y.append(0)
                        X_a.append(text)
                        X_b.append(ques.replace('@placeholder', i['text']))
                        idx.append(0)
                    if len((X_a[-1] + ' ' + X_b[-1]).split()) >= 300:
                        del y[-1]
                        del X_a[-1]
                        del X_b[-1]
                        del idx[-1]
    return idx, X_a, X_b,  y

In [8]:
idx_train, X_train_a, X_train_b, y_train = get_X_y_for_bert('/content/drive/MyDrive/NTI/train.jsonl')
idx_val, X_test_a, X_test_b, y_test = get_X_y_for_bert('/content/drive/MyDrive/NTI/val.jsonl')

In [None]:
X_train_a

In [10]:
df_train = pd.DataFrame({
    'id': idx_train,
    'sentence1': X_train_a,
    'sentence2': X_train_b,
    'label':y_train
})

print(df_train.head())

df_val = pd.DataFrame({
    'id': idx_val,
    'sentence1': X_test_a,
    'sentence2': X_test_b,
    'label': y_test
})

print(df_val.head())

   id                                          sentence1  \
0   0  Кроме того,  серьезным вызовом для России стан...   
1   0  Кроме того,  серьезным вызовом для России стан...   
2   0  Кроме того,  серьезным вызовом для России стан...   
3   0  Кроме того,  серьезным вызовом для России стан...   
4   0  Кроме того,  серьезным вызовом для России стан...   

                                           sentence2  label  
0  Кроме того, серьезным вызовом для России стано...      0  
1  Кроме того, серьезным вызовом для России стано...      0  
2  Кроме того, серьезным вызовом для России стано...      0  
3  Кроме того, серьезным вызовом для России стано...      0  
4  Кроме того, серьезным вызовом для России стано...      0  
   id                                          sentence1  \
0   0  В него вошли @placeholder,  Россия,  Украина и...   
1   0  В него вошли @placeholder,  Россия,  Украина и...   
2   0  В него вошли @placeholder,  Россия,  Украина и...   
3   0  В него вошли @placeh

In [11]:
df_train['sentence1'] = df_train['sentence1'].str.lower()
df_train['sentence2'] = df_train['sentence2'].str.lower()
df_val['sentence2']   = df_val['sentence2'].str.lower()
df_val['sentence1']   = df_val['sentence1'].str.lower()

In [12]:
df_train = df_train.drop_duplicates(subset=['sentence2'])
df_val = df_val.drop_duplicates(subset=['sentence2'])

In [13]:
df_train = pd.DataFrame({
    'id': df_train['id'].values.tolist(),
    'sentence1': df_train['sentence1'].values.tolist(),
    'sentence2': df_train['sentence2'].values.tolist(),
    'label':df_train['label'].values.tolist()
})

print(df_train.head())

df_val = pd.DataFrame({
    'id': df_val['id'].values.tolist(),
    'sentence1': df_val['sentence1'].values.tolist(),
    'sentence2': df_val['sentence2'].values.tolist(),
    'label': df_val['label'].values.tolist()
})

   id                                          sentence1  \
0   0  кроме того,  серьезным вызовом для россии стан...   
1   0  кроме того,  серьезным вызовом для россии стан...   
2   0  кроме того,  серьезным вызовом для россии стан...   
3   0  кроме того,  серьезным вызовом для россии стан...   
4   0  кроме того,  серьезным вызовом для россии стан...   

                                           sentence2  label  
0  кроме того, серьезным вызовом для россии стано...      0  
1  кроме того, серьезным вызовом для россии стано...      0  
2  кроме того, серьезным вызовом для россии стано...      0  
3  кроме того, серьезным вызовом для россии стано...      0  
4  кроме того, серьезным вызовом для россии стано...      0  


## Classes and functions

In [14]:
class CustomDataset(Dataset):

    def __init__(self, data, maxlen, with_labels=True, bert_model = ""):

        self.data = data 
        self.tokenizer = AutoTokenizer.from_pretrained(bert_model)  

        self.maxlen = maxlen
        self.with_labels = with_labels 

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        sent1 = str(self.data.loc[index, 'sentence1'])
        sent2 = str(self.data.loc[index, 'sentence2'])
        encoded_pair = self.tokenizer(sent1, sent2, 
                                      padding='max_length', 
                                      truncation=True, 
                                      max_length=self.maxlen,  
                                      return_tensors='pt', 
                                      return_token_type_ids=True) 
        
        token_ids = encoded_pair['input_ids'].squeeze(0) 
        attn_masks = encoded_pair['attention_mask'].squeeze(0)  
        token_type_ids = encoded_pair['token_type_ids'].squeeze(0)  

        if self.with_labels:  
            label = self.data.loc[index, 'label']
            idx = self.data.loc[index, 'id']
            return token_ids, attn_masks, token_type_ids, label, idx  
        else:
            return token_ids, attn_masks, token_type_ids, idx

In [15]:
class SentencePairClassifier(nn.Module):

    def __init__(self, bert_model="DeepPavlov/rubert-base-cased-sentence", freeze_bert=False):
        super(SentencePairClassifier, self).__init__()
        self.bert = AutoModel.from_pretrained(bert_model)
        if freeze_bert:
            for p in self.bert_layer.parameters():
                p.requires_grad = False

        D_in, H, D_out = 1024, 512, 1

        self.classifier = nn.Sequential(
            nn.Linear(D_in, H),
            nn.ReLU(),
            nn.Linear(H, D_out)
        )

    @autocast()
    def forward(self, input_ids, attn_masks, token_type_ids):
     

        outputs = self.bert(input_ids, attn_masks, token_type_ids)
        
  
        last_hidden_state_cls = outputs[0][:, 0, :]
        
        logits = self.classifier(last_hidden_state_cls)

        return logits

In [16]:
import torch

In [17]:
def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    

In [18]:
from sklearn.metrics import f1_score

In [19]:
def evaluate_loss(net, device, criterion, dataloader):
    net.eval()

    mean_loss = 0
    count = 0
    mean_acc = 0

    fin_targets=[]
    fin_outputs=[]

    with torch.no_grad():
        for it, (seq, attn_masks, token_type_ids, labels, _) in enumerate(tqdm(dataloader)):
            seq, attn_masks, token_type_ids, labels = \
                seq.to(device), attn_masks.to(device), token_type_ids.to(device), labels.to(device)
            logits = net(seq, attn_masks, token_type_ids)
            fin_targets.extend(labels.float().cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(logits.float()).cpu().detach().numpy().tolist())
            count += 1
    
    lst_out = []
    for i in range(len(fin_targets)):
      ans = 0
      if fin_outputs[i][0] >= 0.5:
        ans = 1
      lst_out.append(ans)
      fin_targets[i] = int(fin_targets[i])

    return f1_score(fin_targets, lst_out, average='binary')

In [20]:
print("Creation of the models' folder...")
!mkdir /content/drive/MyDrive/models

Creation of the models' folder...
mkdir: cannot create directory ‘/content/drive/MyDrive/models’: File exists


In [21]:
def train_bert(net, criterion, opti, lr, lr_scheduler, train_loader, val_loader, epochs, iters_to_accumulate, path_to_save):

    best_loss = -1
    best_ep = 1
    nb_iterations = len(train_loader)
    print_every = nb_iterations // 5 
    iters = []
    train_losses = []
    val_losses = []

    scaler = GradScaler()

    for ep in range(epochs):

        net.train()
        running_loss = 0.0
        for it, (seq, attn_masks, token_type_ids, labels, _) in enumerate(tqdm(train_loader)):

            seq, attn_masks, token_type_ids, labels = \
                seq.to(device), attn_masks.to(device), token_type_ids.to(device), labels.to(device)
    
            with autocast():
                logits = net(seq, attn_masks, token_type_ids)

                loss = criterion(logits.squeeze(-1), labels.float())
                loss = loss / iters_to_accumulate 

            scaler.scale(loss).backward()

            if (it + 1) % iters_to_accumulate == 0:
                scaler.step(opti)
                scaler.update()
                lr_scheduler.step()
                opti.zero_grad()


            running_loss += loss.item()

            if (it + 1) % print_every == 0:  # Print training loss information
                print()
                print("Iteration {}/{} of epoch {} complete. Loss : {} "
                      .format(it+1, nb_iterations, ep+1, running_loss / print_every))

                running_loss = 0.0


        val_acc = evaluate_loss(net, device, criterion, val_loader)  # Compute validation loss
        print()
        print("Epoch {} complete! Validation ACC : {}".format(ep+1, val_acc))

        if val_acc > best_loss:
            print("Best validation ACC improved from {} to {}".format(best_loss, val_acc))
            print()
            net_copy = deepcopy(net)  # save a copy of the model
            best_loss = val_acc
            best_ep = ep + 1

            path_to_model='{}/{}_val_loss_{}_ep_{}.pt'.format(path_to_save, bert_model.replace('/', 'r'), round(best_loss, 3), best_ep)
            torch.save(net_copy.state_dict(), path_to_model)
            print("The model has been saved in {}".format(path_to_model))

    del loss
    torch.cuda.empty_cache()

## Parameters

In [22]:
#ЗАПУСК

In [43]:
from multipledispatch import dispatch

class Forest:
  @dispatch(str, int)
  def __init__(self, bert_model, count_models):
    self.bert_model = bert_model

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    self.models = [SentencePairClassifier(self.bert_model, freeze_bert=freeze_bert).to(device) for i in range(count_models)]

  @dispatch(str, list)
  def __init__(self, bert_model, model_paths_):
    self.bert_model = bert_model

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    self.models = [SentencePairClassifier(self.bert_model, freeze_bert=freeze_bert).to(device) for i in range(len(model_paths_))]
    
    for net, path_to_model in zip(self.models, model_paths_):
      net.load_state_dict(torch.load(path_to_model))


  def train(self, df_train, df_val,  freeze_bert = False, maxlen = 315, bs = 32, iters_to_accumulate = 2, lr = 0.000005, epochs = 3):

    for index, net in enumerate(self.models):
      train = df_train.sample(int(df_train.shape[0] * 0.5)).reset_index(drop = True)
      print("Reading training data...")
      train_set = CustomDataset(train, maxlen, True, bert_model)
      print("Reading validation data...")
      val_set      = CustomDataset(df_val, maxlen, True, bert_model)
      train_loader = DataLoader(train_set, batch_size=bs, num_workers=5)
      val_loader   = DataLoader(val_set, batch_size=bs, num_workers=5)


      criterion = nn.BCEWithLogitsLoss()
      opti = AdamW(net.parameters(), lr=lr, weight_decay=1e-2)
      num_warmup_steps = 0 # The number of steps for the warmup phase.
      num_training_steps = epochs * len(train_loader)  # The total number of training steps
      t_total = (len(train_loader) // iters_to_accumulate) * epochs  # Necessary to take into account Gradient accumulation
      lr_scheduler = get_linear_schedule_with_warmup(optimizer=opti, num_warmup_steps=num_warmup_steps, num_training_steps=t_total)

      path = "/content/drive/MyDrive/models/model{}/".format(2)

      if not os.path.exists(path):
          os.makedirs(path)
      
      train_bert(net, criterion, opti, lr, lr_scheduler, train_loader, val_loader, epochs, iters_to_accumulate, path)
  
  def get_probs_from_logits(self, logits):

    probs = torch.sigmoid(logits.unsqueeze(-1))
    return probs.detach().cpu().numpy()

  def predict(self, net, device, data, with_labels=True, result_file="results/output.txt"):

    net.eval()
    probs_all = np.zeros(data.shape)
    idx_all = []

    dataset = CustomDataset(data, maxlen, self.bert_model)
    dataloader = DataLoader(dataset, batch_size=bs, num_workers=5)
    
    with torch.no_grad():
      for index, net in enumerate(self.models):
          for seq, attn_masks, token_type_ids, labels, idx in tqdm(dataloader):
              seq, attn_masks, token_type_ids = seq.to(device), attn_masks.to(device), token_type_ids.to(device)
              logits = net(seq, attn_masks, token_type_ids)
              probs = [i[0] for i in torch.sigmoid(logits.float()).cpu().detach().numpy().tolist()]
              
              idx_all += idx
              probs_all[index] += probs
    
    
    return idx_all, probs_all.mean(axis=1)


In [41]:
bert_model = "sismetanin/xlm_roberta_large-ru-sentiment-rureviews"
freeze_bert = False  
maxlen = 315 
bs = 2
iters_to_accumulate = 2  
lr = 0.000005  
epochs = 4 

In [None]:
forest = Forest(bert_model, 3)
forest.train(df_train, df_val)

In [None]:
#С СОХРАНЕНИЯ

## Prediction

In [None]:
def get_X_y_for_bert_test(data_json_file):
    idx, X_a, X_b,  y = [], [], [], []
    dct = {}
    with open(data_json_file, 'r') as json_file:
        json_list = list(json_file)
        for json_str in json_list:
            item = json.loads(json_str)
            text = item['passage']['text']
            text1 = item['passage']['text']
            new_text = []
            for i in text.split():
                if '@' not in i:
                    new_text.append(i)
            text = ' '.join(new_text)
            for i in lst:
                text = text.replace(i, ' ')
            text = ' '.join(text.split())
            entities = item['passage']['entities']
            if len(item['qas']) > 1:
                print(1)
            for query in item['qas']:
                ques = query['query']
                for i in lst:
                    ques = ques.replace(i, ' ')
                ques = ' '.join(ques.split())
                for i in range(len(entities)):
                    entities[i]['text'] = text1[entities[i]['start']: entities[i]['end']]
                c = 0
                for i in entities:
                    c += 1
                    y.append(1)
                    X_a.append(text)
                    X_b.append(ques.replace('@placeholder', i['text']))
                    idx.append(str(item['idx']) + '_' + str(c))
                    if len((X_a[-1] + ' ' + X_b[-1]).split()) >= 300:
                        del y[-1]
                        del X_a[-1]
                        del X_b[-1]
                        del idx[-1]
                        y.append(1)
                        X_a.append('Привет')
                        X_b.append('Пока')
                        idx.append(str(item['idx']) + '_' + str(c))
                        print(2)
                    dct[idx[-1]] = i
    return idx, X_a, X_b, dct

In [None]:
idx_t, X_t_a, X_t_b, dct_ans = get_X_y_for_bert_test('RuCoS/rucos_test.jsonl')

In [None]:
df_test = pd.DataFrame({
    'id': idx_t,
    'sentence1': X_t_a,
    'sentence2': X_t_b,
    'label': [0 for i in range(len(X_t_a))]
})

In [None]:
df_test['sentence1'] = df_test['sentence1'].str.lower()
df_test['sentence2'] = df_test['sentence2'].str.lower()

In [None]:
path_to_model = '/content/drive/MyDrive/models/sismetaninrxlm_roberta_large-ru-sentiment-rureviews_lr_5e-06_val_loss_0.70084_ep_4.pt'  

path_to_output_file = 'results/output.txt'

print("Reading test data...")
test_set = CustomDataset(df_val, maxlen, bert_model)
test_loader = DataLoader(test_set, batch_size=bs, num_workers=5)

model = SentencePairClassifier(bert_model)
if torch.cuda.device_count() > 1:  
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    model = nn.DataParallel(model)

print()
print("Loading the weights of the model...")
model.load_state_dict(torch.load(path_to_model))
model.to(device)

print("Predicting on test data...")
idx_all, proba_all = test_prediction(net=model, device=device, dataloader=test_loader, with_labels=True,  # set the with_labels parameter to False if your want to get predictions on a dataset without labels
                result_file=path_to_output_file)
print()
print("Predictions are available in : {}".format(path_to_output_file))

In [None]:
dfc1 = {'proba': proba_all, 'labels': df_val['label'].values}
df2 = pd.DataFrame(dfc1)

In [None]:
df2.to_csv('train_xlm_300_450.csv') # Сохранение предикта на валидации

In [None]:
dct1 = {'id1': [], 'id2': [], 'proba': []}
for i in range(len(idx_all)):
    t = idx_all[i].split('_')
    dct1['id1'].append(t[0])
    dct1['id2'].append(t)
    dct1['proba'].append(proba_all[i])

In [None]:
dct1

In [None]:
lst_ans = []
df1 = pd.DataFrame(dct1)
for i in df1['id1'].unique():
    df2 = df1[df1['id1'] == i]
    a = {}
    c = 0
    for j in range(len(df2)):
        if c < df2['proba'].values[j]:
            c = df2['proba'].values[j]
            a['text'] = dct_ans[df2['id2'].values[j]]['text']
    a['idx'] = int(i)
    lst_ans.append(a)

In [None]:
import json
import numpy as np

class NpEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        elif isinstance(obj, np.floating):
            return float(obj)
        elif isinstance(obj, np.ndarray):
            return obj.tolist()
        else:
            return super(NpEncoder, self).default(obj)

In [None]:
import json

with open('output_02_03_v1.jsonl', 'w') as outfile:
    for entry in lst_ans:
        json.dump(entry, outfile,  cls=NpEncoder)
        outfile.write('\n')

In [None]:
###################################################

In [None]:
# ОБЪЕДИНЕНИЕ МОДЕЛЕЙ

In [None]:
from sklearn.linear_model import LogisticRegression
import sklearn

In [None]:
train = pd.read_csv('/content/drive/MyDrive/train_v1.csv')
trainX, testX, trainY, testY = sklearn.model_selection.train_test_split(train[train.columns[1:-1]].values, train[train.columns[-1]].values, test_size=0.3, shuffle=True, random_state=42)

In [None]:
test = pd.read_csv('/content/drive/MyDrive/test_v1.csv')

In [None]:
res = {}
res["proba"] = (test["proba_xlm_mix_1"].values + test["proba_xlm_mix_2"].values + test["proba_xlm_mix_3"].values)/ 3 * 0.75 + (test["proba_xlm_1"].values + test["proba_xlm_2"].values + test["proba_xlm_3"].values)/ 3 * 0.25
res["id1"] = df1["id1"].values
res["id2"] = df1["id2"].values
res = pd.DataFrame(res)
res

In [None]:
lst_ans = []
df1 = res
for i in df1['id1'].unique():
    df2 = df1[df1['id1'] == i]
    a = {}
    c = 0
    for j in range(len(df2)):
        if c < df2['proba'].values[j]:
            c = df2['proba'].values[j]
            a['text'] = dct_ans[df2['id2'].values[j]]
    a['idx'] = int(i)
    lst_ans.append(a)

In [None]:
import json

with open('all_mean_median.jsonl', 'w') as outfile:
    for entry in lst_ans:
        json.dump(entry, outfile,  cls=NpEncoder)
        outfile.write('\n')