In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold
import shutil

from torch.utils.data import DataLoader, Dataset
import datasets, transformers

from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Set WANDB_DISABLED before any Trainer is created; intended to keep the
# Weights & Biases integration from trying to log these inference runs.
os.environ['WANDB_DISABLED'] = 'true'

# Test data preprocessing

In [None]:
class InferDataset(Dataset):
    """Inference dataset that tokenizes (input, target) text pairs on access.

    Args:
        df: DataFrame with an ``input`` and a ``target`` column; both columns
            are converted to ``str`` arrays when the dataset is built.
        tokenizer: Optional callable invoked as
            ``tokenizer(input_text, target_text)`` that returns a mapping of
            model inputs. When omitted, the module-level ``tokenizer`` is
            looked up every time an item is fetched, so it may be assigned
            (or reassigned) after the dataset has been constructed.
    """

    def __init__(self, df, tokenizer=None):
        self.inputs = df['input'].values.astype(str)
        self.targets = df['target'].values.astype(str)
        # None means "use whatever the global `tokenizer` is at fetch time".
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of text pairs."""
        return len(self.inputs)

    def __getitem__(self, item):
        """Tokenize the pair at position ``item`` and return it as a plain dict."""
        # Prefer the tokenizer bound to this dataset; otherwise fall back to the
        # module-level one, resolved here (not in __init__) on purpose.
        encode = self.tokenizer if self.tokenizer is not None else tokenizer
        return dict(encode(self.inputs[item], self.targets[item]))

In [None]:
# Load the competition test set and the CPC code -> title lookup table.
test_df = pd.read_csv('/kaggle/input/us-patent-phrase-to-phrase-matching/test.csv')
titles = pd.read_csv('/kaggle/input/cpc-codes/titles.csv')

# Attach the CPC title that corresponds to each row's `context` code.
# - how='left' keeps every test row: the default inner join would silently
#   drop rows whose code is absent from `titles`, leaving ids out of the
#   submission.
# - validate='many_to_one' raises if a code occurs more than once in `titles`,
#   which would otherwise duplicate test rows.
test_df = test_df.merge(
    titles,
    how='left',
    left_on='context',
    right_on='code',
    validate='many_to_one',
)

# First half of the text pair fed to the tokenizer: "<title>[SEP]<anchor>".
# Rows without a matching title use an empty title instead of becoming NaN.
test_df['input'] = test_df['title'].fillna('') + '[SEP]' + test_df['anchor']

In [None]:
# Input layout: title + [SEP] + anchor + [SEP] + target + [SEP]

In [None]:
# Pair layout: sentence1 + [SEP] + sentence2 + [END]


# 1. deberta_v3_large_5_folds

In [None]:
class CFG_DEBERTA:
    """Settings for the DeBERTa-v3-large checkpoints used in the ensemble."""

    # --- Locations ---------------------------------------------------------
    # Competition data directory.
    input_path = '/kaggle/input/us-patent-phrase-to-phrase-matching/'
    # Parent directory of the per-fold checkpoints (`deberta_{fold}/`).
    model_path = '/kaggle/input/deberta-v3-large/deberta/'
    # Extra checkpoint that is stored outside `model_path`.
    model_4_path = '/kaggle/input/deberta-v3-large/deberta_4/'

    # --- Ensemble size -----------------------------------------------------
    # Number of checkpoints read from `model_path` (folds 0 .. num_fold - 1).
    num_fold = 4

    # --- Hyperparameters ---------------------------------------------------
    # Not referenced by the inference cells; presumably the training settings,
    # kept here for reference.
    learning_rate = 8.270853307579581e-06
    weight_decay = 1e-2
    epochs = 1
    batch_size = 10

In [None]:
# Every DeBERTa checkpoint in the ensemble, in evaluation order: one directory
# per fold under `model_path`, followed by the extra checkpoint that is stored
# separately at `model_4_path`.
deberta_checkpoints = [
    CFG_DEBERTA.model_path + f'deberta_{fold}/'
    for fold in range(CFG_DEBERTA.num_fold)
]
deberta_checkpoints.append(CFG_DEBERTA.model_4_path)

# One flat score vector per checkpoint.
deberta_checkpoint_predictions = []

for checkpoint in deberta_checkpoints:
    test_dataset = InferDataset(test_df)
    # `tokenizer` must stay a module-level name: InferDataset.__getitem__ reads
    # it when items are fetched, so each checkpoint's tokenizer has to be bound
    # here before `predict` runs.
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1)
    trainer = Trainer(
        model,
        tokenizer=tokenizer,
    )

    outputs = trainer.predict(test_dataset)
    # Flatten the model output to a 1-D vector of scores.
    prediction = outputs.predictions.reshape(-1)
    deberta_checkpoint_predictions.append(prediction)

# Element-wise mean across checkpoints: the DeBERTa score used for blending.
predictions_deberta = np.mean(deberta_checkpoint_predictions, axis=0)

# 2. electra_base_5_folds

In [None]:
class CFG_ELECTRA:
    """Settings for the ELECTRA fold checkpoints used in the ensemble."""

    # --- Locations ---------------------------------------------------------
    # Competition data directory.
    input_path = '/kaggle/input/us-patent-phrase-to-phrase-matching/'
    # Parent directory of the per-fold checkpoints (`electra_{fold}/`).
    model_path = '/kaggle/input/electra-patent-5-folds/electra/'

    # --- Ensemble size -----------------------------------------------------
    # Number of fold checkpoints to load (folds 0 .. num_fold - 1).
    num_fold = 5

    # --- Hyperparameters ---------------------------------------------------
    # Not referenced by the inference cells; presumably the training settings,
    # kept here for reference.
    learning_rate = 2e-05
    weight_decay = 1e-2
    epochs = 10
    batch_size = 32

In [None]:
# Score the test set with every ELECTRA fold checkpoint, then average.
predictions_electra = []

for fold in range(CFG_ELECTRA.num_fold):
    # Each fold's checkpoint lives in its own sub-directory of `model_path`.
    fold_dir = CFG_ELECTRA.model_path + f'electra_{fold}/'

    # `tokenizer` has to remain a module-level name: InferDataset.__getitem__
    # reads it when items are fetched, not when the dataset is constructed.
    tokenizer = AutoTokenizer.from_pretrained(fold_dir)
    model = AutoModelForSequenceClassification.from_pretrained(fold_dir, num_labels=1)
    test_dataset = InferDataset(test_df)

    trainer = Trainer(model, tokenizer=tokenizer)
    outputs = trainer.predict(test_dataset)

    # Flatten the model output to a 1-D vector of scores.
    prediction = outputs.predictions.reshape(-1)
    predictions_electra.append(prediction)

# Collapse the per-fold score vectors into their element-wise mean.
predictions_electra = np.asarray(predictions_electra).mean(axis=0)

# 3. bert_patent_5_folds

In [None]:
class CFG_BERT_PATENT:
    """Settings for the BERT-for-patents fold checkpoints used in the ensemble."""

    # --- Locations ---------------------------------------------------------
    # Competition data directory.
    input_path = '/kaggle/input/us-patent-phrase-to-phrase-matching/'
    # Parent directory of the per-fold checkpoints (`bert_patent_{fold}/`).
    model_path = '/kaggle/input/bert-patent-5-folds/'

    # --- Ensemble size -----------------------------------------------------
    # Number of fold checkpoints to load (folds 0 .. num_fold - 1).
    num_fold = 5

    # --- Hyperparameters ---------------------------------------------------
    # Not referenced by the inference cells; presumably the training settings,
    # kept here for reference.
    learning_rate = 2e-05
    weight_decay = 1e-2
    epochs = 5
    batch_size = 32

In [None]:
# Score the test set with every BERT-for-patents fold checkpoint, then average.
predictions_bert_patent = []

for fold in range(CFG_BERT_PATENT.num_fold):
    checkpoint = CFG_BERT_PATENT.model_path + f'bert_patent_{fold}/'

    test_dataset = InferDataset(test_df)
    # Bound at module level on purpose: InferDataset.__getitem__ reads the
    # global `tokenizer` while `predict` iterates over the dataset.
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(
        checkpoint,
        num_labels=1,
    )

    trainer = Trainer(model=model, tokenizer=tokenizer)
    outputs = trainer.predict(test_dataset)

    # Flatten the model output to a 1-D vector of scores.
    prediction = outputs.predictions.reshape(-1)
    predictions_bert_patent.append(prediction)

# Element-wise mean over the per-fold score vectors.
predictions_bert_patent = np.mean(np.asarray(predictions_bert_patent), axis=0)

# Submission

In [None]:
# Blend weights applied to the scaled predictions, in order:
# w1 -> DeBERTa, w2 -> ELECTRA, w3 -> BERT-for-patents.
w1, w2, w3 = 0.6, 0.2, 0.2

In [None]:
from sklearn.preprocessing import MinMaxScaler

# A single scaler instance is re-fitted on each model's scores in turn, so every
# model is rescaled to [0, 1] using its own min and max.
MMscaler = MinMaxScaler()

# MinMaxScaler expects a 2-D column, hence the reshape to (-1, 1) and back.
pred1_mm, pred2_mm, pred3_mm = (
    MMscaler.fit_transform(model_scores.reshape(-1, 1)).reshape(-1)
    for model_scores in (
        predictions_deberta,
        predictions_electra,
        predictions_bert_patent,
    )
)

# Weighted blend of the three rescaled score vectors.
final_predictions = w1 * pred1_mm + w2 * pred2_mm + w3 * pred3_mm

In [None]:
# Pair each test id with its blended score.
submission_columns = {
    'id': test_df['id'],
    'score': final_predictions,
}
submission = datasets.Dataset.from_dict(submission_columns)

# Write the competition submission file without an index column.
submission.to_csv('submission.csv', index=False)

In [None]:
# Show the submission dataset as this cell's output.
submission