In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [5]:
import os
import gc
import math
import time
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')
from tqdm import tqdm
import re

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import StratifiedKFold,StratifiedGroupKFold,GroupKFold
from sklearn.metrics import log_loss,f1_score

from transformers import AutoModel, AutoConfig, AutoTokenizer, AdamW, DataCollatorWithPadding
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
import catboost as ctb

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [6]:
INPUT_DIR = '/content/drive/MyDrive/Competitions/Signate/Student Cup 2022/input/'
OUTPUT_DIR = '/content/drive/MyDrive/Competitions/Signate/Student Cup 2022/output/'
OUTPUT_SUB_DIR = os.path.join(OUTPUT_DIR,'Submission')
OUTPUT_MODEL_DIR1 = os.path.join(OUTPUT_DIR,'Model/DeBERTa-large/')  #0.7455 → 0.7717
OUTPUT_MODEL_DIR2 = os.path.join(OUTPUT_DIR,'Model/DeBERTa-base[ver2]/')  #0.7327　→ 0.7565
OUTPUT_MODEL_DIR3 = os.path.join(OUTPUT_DIR,'Model/RoBERTa-large/')  #0.7385　→ 0.7519
OUTPUT_MODEL_DIR4 = os.path.join(OUTPUT_DIR,'Model/DeBERTa-v3-large/')  #0.7301　→ 0.7506
OUTPUT_MODEL_DIR5 = os.path.join(OUTPUT_DIR,'Model/DeBERTa-v3-large[ver2]meanpooling/') #0.7525　→ 0.7571

In [7]:
class CFG1:
    num_workers=2
    path=OUTPUT_MODEL_DIR1
    config_path=OUTPUT_MODEL_DIR1+'config.pth'
    model="microsoft/deberta-large"
    batch_size=32
    dropout=0.2
    target_size=4
    max_len=1024
    seed=42
    n_fold=5
    trn_fold=[0, 1, 2, 3, 4]
    gradient_checkpointing=True
    freezing=True

In [8]:
class CFG2:
    num_workers=2
    path=OUTPUT_MODEL_DIR2
    config_path=OUTPUT_MODEL_DIR2+'config.pth'
    model="microsoft/deberta-v3-base"
    batch_size=16
    dropout=0.1
    target_size=4
    max_len=1024
    seed=42
    n_fold=5
    trn_fold=[0, 1, 2, 3, 4]
    gradient_checkpointing=True
    freezing=True

In [9]:
class CFG3:
    num_workers=2
    path=OUTPUT_MODEL_DIR3
    config_path=OUTPUT_MODEL_DIR3+'config.pth'
    model="roberta-large"
    batch_size=32
    dropout=0.2
    target_size=4
    max_len=128
    seed=42
    n_fold=5
    trn_fold=[0, 1, 2, 3, 4]
    gradient_checkpointing=True
    freezing=True

In [10]:
class CFG4:
    num_workers=2
    path=OUTPUT_MODEL_DIR4
    config_path=OUTPUT_MODEL_DIR4+'config.pth'
    model="microsoft/deberta-v3-large"
    batch_size=32
    dropout=0.2
    target_size=4
    max_len=1024
    seed=42
    n_fold=5
    trn_fold=[0, 1, 2, 3, 4]
    gradient_checkpointing=True
    freezing=True

In [11]:
class CFG5:
    num_workers=2
    path=OUTPUT_MODEL_DIR5
    config_path=OUTPUT_MODEL_DIR5+'config.pth'
    model="microsoft/deberta-v3-large"
    batch_size=32
    dropout=0.2
    target_size=4
    max_len=1024
    seed=42
    n_fold=5
    trn_fold=[0, 1, 2, 3, 4]
    gradient_checkpointing=True
    freezing=True

In [12]:
# Loss Func
def criterion(outputs, labels):
    return nn.CrossEntropyLoss()(outputs, labels)

def softmax(z):
    assert len(z.shape) == 2
    s = np.max(z, axis=1)
    s = s[:, np.newaxis] # necessary step to do broadcasting
    e_x = np.exp(z - s)
    div = np.sum(e_x, axis=1)
    div = div[:, np.newaxis] # dito
    return e_x / div
"""
def get_score(y_true, y_pred):
    y_pred = softmax(y_pred)
    score = log_loss(y_true, y_pred)
    return round(score, 5)
"""
def get_score(outputs, labels):
    outputs = F.softmax(torch.tensor(outputs)).numpy()
    return f1_score(np.argmax(outputs,axis=1),labels ,average='macro')

def get_logger(filename=OUTPUT_DIR+'train'):
    from logging import getLogger, INFO, FileHandler, Formatter, StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=CFG1.seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=42)

In [13]:
oof_df = pd.DataFrame()
oof_df1 = pd.read_pickle(OUTPUT_MODEL_DIR1+'oof_df.pkl')
oof_df2 = pd.read_pickle(OUTPUT_MODEL_DIR2+'oof_df.pkl')
oof_df3 = pd.read_pickle(OUTPUT_MODEL_DIR3+'oof_df.pkl')
oof_df4 = pd.read_pickle(OUTPUT_MODEL_DIR4+'oof_df.pkl')
oof_df5 = pd.read_pickle(OUTPUT_MODEL_DIR5+'oof_df.pkl')
labels = oof_df1['label'].values
labels2 = oof_df2['label'].values
preds1 = oof_df1[['Data scientist','Machine learning engineer','Software engineer','Consultant']]
preds3 = oof_df3[['Data scientist','Machine learning engineer','Software engineer','Consultant']]
preds2 = oof_df2[['Consultant','Data scientist','Machine learning engineer','Software engineer']]
preds4 = oof_df4[['Data scientist','Machine learning engineer','Software engineer','Consultant']]
preds5 = oof_df5[['Data scientist','Machine learning engineer','Software engineer','Consultant']]
oof_df["Data scientist"] = (preds1["Data scientist"] + preds2["Data scientist"] + preds3["Data scientist"] + preds4["Data scientist"] + preds5["Data scientist"]) / 5
oof_df["Machine learning engineer"] = (preds1["Machine learning engineer"] + preds2["Machine learning engineer"] + preds3["Machine learning engineer"] + preds4["Machine learning engineer"] + preds5["Machine learning engineer"]) / 5
oof_df["Software engineer"] = (preds1["Software engineer"] + preds2["Software engineer"] + preds3["Software engineer"] + preds4["Software engineer"] + preds5["Software engineer"]) / 5
oof_df["Consultant"] = (preds1["Consultant"] + preds2["Consultant"] + preds3["Consultant"] + preds4["Consultant"] + preds5["Consultant"]) / 5
preds = oof_df[['Data scientist','Machine learning engineer','Software engineer','Consultant']].values.tolist()
score1 = get_score(preds1.values, labels)
score2 = get_score(preds2.values, labels2)
score3 = get_score(preds3.values, labels)
score4 = get_score(preds4.values, labels)
score5 = get_score(preds5.values, labels)
mean_score = (score1+score2+score3+score4+score5)/5
score = get_score(preds, labels)
LOGGER.info(f'Deberta-large CV Score: {score1:<.4f}')
LOGGER.info(f'Deberta-v3-baseCV Score: {score2:<.4f}')
LOGGER.info(f'Roberta-large CV Score: {score3:<.4f}')
LOGGER.info(f'Deberta-v3-large(attension) CV Score: {score4:<.4f}')
LOGGER.info(f'Roberta-v3-large(meanpool) CV Score: {score5:<.4f}')
LOGGER.info(f'CV Mean Score: {mean_score:<.4f}')
LOGGER.info(f'CV Score: {score:<.4f}')

Deberta-large CV Score: 0.7455
INFO:__main__:Deberta-large CV Score: 0.7455
Deberta-v3-baseCV Score: 0.7327
INFO:__main__:Deberta-v3-baseCV Score: 0.7327
Roberta-large CV Score: 0.7385
INFO:__main__:Roberta-large CV Score: 0.7385
Deberta-v3-large(attension) CV Score: 0.7301
INFO:__main__:Deberta-v3-large(attension) CV Score: 0.7301
Roberta-v3-large(meanpool) CV Score: 0.7525
INFO:__main__:Roberta-v3-large(meanpool) CV Score: 0.7525
CV Mean Score: 0.7399
INFO:__main__:CV Mean Score: 0.7399
CV Score: 0.7533
INFO:__main__:CV Score: 0.7533


In [14]:
oof_df1['std 1'] = oof_df1[['Data scientist','Machine learning engineer','Software engineer','Consultant']].std(axis=1)
oof_df1['mean 1'] = oof_df1[['Data scientist','Machine learning engineer','Software engineer','Consultant']].mean(axis=1)
oof_df2['std 2'] = oof_df2[['Data scientist','Machine learning engineer','Software engineer','Consultant']].std(axis=1)
oof_df2['mean 2'] = oof_df2[['Data scientist','Machine learning engineer','Software engineer','Consultant']].mean(axis=1)
oof_df3['std 3'] = oof_df3[['Data scientist','Machine learning engineer','Software engineer','Consultant']].std(axis=1)
oof_df3['mean 3'] = oof_df2[['Data scientist','Machine learning engineer','Software engineer','Consultant']].mean(axis=1)
oof_df4['std 4'] = oof_df4[['Data scientist','Machine learning engineer','Software engineer','Consultant']].std(axis=1)
oof_df4['mean 4'] = oof_df4[['Data scientist','Machine learning engineer','Software engineer','Consultant']].mean(axis=1)
oof_df5['std 5'] = oof_df5[['Data scientist','Machine learning engineer','Software engineer','Consultant']].std(axis=1)
oof_df5['mean 5'] = oof_df5[['Data scientist','Machine learning engineer','Software engineer','Consultant']].mean(axis=1)

oof_df1 = oof_df1.rename(columns={'Data scientist': 'Data scientist 1','Machine learning engineer':'Machine learning engineer 1',
                                  'Software engineer':'Software engineer 1','Consultant':'Consultant 1'})
oof_df2 = oof_df2.rename(columns={'Data scientist': 'Data scientist 2','Machine learning engineer':'Machine learning engineer 2',
                                  'Software engineer':'Software engineer 2','Consultant':'Consultant 2'})
oof_df3 = oof_df3.rename(columns={'Data scientist': 'Data scientist 3','Machine learning engineer':'Machine learning engineer 3',
                                  'Software engineer':'Software engineer 3','Consultant':'Consultant 3'})
oof_df4 = oof_df4.rename(columns={'Data scientist': 'Data scientist 4','Machine learning engineer':'Machine learning engineer 4',
                                  'Software engineer':'Software engineer 4','Consultant':'Consultant 4'})
oof_df5 = oof_df5.rename(columns={'Data scientist': 'Data scientist 5','Machine learning engineer':'Machine learning engineer 5',
                                  'Software engineer':'Software engineer 5','Consultant':'Consultant 5'})

In [15]:
stacking_train = oof_df1[['id','label','kfold','Data scientist 1','Machine learning engineer 1','Software engineer 1','Consultant 1','std 1','mean 1']].merge(oof_df2[['id','kfold','Data scientist 2','Machine learning engineer 2','Software engineer 2','Consultant 2','std 2','mean 2']],how='left',on=['id','kfold']).merge(oof_df3[['id','label','kfold','Data scientist 3','Machine learning engineer 3','Software engineer 3','Consultant 3','std 3','mean 3']],how='left',on=['id','label','kfold']).merge(oof_df4[['id','label','kfold','Data scientist 4','Machine learning engineer 4','Software engineer 4','Consultant 4','std 4','mean 4']],how='left',on=['id','label','kfold']).merge(oof_df5[['id','label','kfold','Data scientist 5','Machine learning engineer 5','Software engineer 5','Consultant 5','std 5','mean 5']],how='left',on=['id','label','kfold'])
display(stacking_train)

Unnamed: 0,id,label,kfold,Data scientist 1,Machine learning engineer 1,Software engineer 1,Consultant 1,std 1,mean 1,Data scientist 2,...,Software engineer 4,Consultant 4,std 4,mean 4,Data scientist 5,Machine learning engineer 5,Software engineer 5,Consultant 5,std 5,mean 5
0,1,2,0,-2.965860,-1.017218,7.890793,-3.641326,5.333454,0.066597,-2.773179,...,5.362940,-0.850250,3.976717,-0.242818,-2.555282,-2.009481,4.370277,-1.767629,3.257240,-0.490529
1,9,0,0,6.925399,-3.056653,-5.294641,1.608724,5.413475,0.045707,5.992557,...,-3.066729,0.438705,3.431118,0.148691,4.466621,-1.450690,-2.954009,-0.027843,3.203220,0.008520
2,10,1,0,0.208691,7.635777,-1.905769,-5.799308,5.645285,0.034848,-0.239947,...,-0.121819,-3.344715,3.060674,-0.063436,-1.460583,5.359279,-0.740972,-5.097662,4.337801,-0.484984
3,26,0,0,3.692573,6.822127,-3.615892,-5.866379,5.983285,0.258107,3.016727,...,-0.643374,-3.675896,3.094977,-0.043446,4.484289,1.420338,-1.549800,-3.985067,3.668545,0.092440
4,32,1,0,2.262520,5.702750,-2.445959,-5.642027,5.015037,-0.030679,3.374876,...,-1.026131,-4.107147,3.478127,-0.105858,-0.069221,2.483360,0.301805,-4.304681,2.837970,-0.397184
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1511,1499,3,4,2.378140,-4.249498,-4.891499,6.624092,5.523113,-0.034691,-1.000058,...,-2.068501,7.273594,4.925186,0.080474,0.229901,-4.581410,-2.374878,7.053916,5.046885,0.081882
1512,1500,3,4,2.794290,-3.991971,-4.152143,5.879009,5.015872,0.132296,4.696843,...,-1.963520,5.614141,4.224563,0.006614,2.911459,-3.948030,-2.946266,4.470585,4.190094,0.121937
1513,1509,2,4,-4.527265,-0.770007,6.463882,-2.509835,4.786057,-0.335806,-2.623826,...,7.332036,-0.497775,4.730177,0.407091,-3.080327,-2.951421,7.119303,-0.792462,4.812830,0.073773
1514,1511,0,4,7.434820,-3.601152,-5.101388,0.958970,5.632193,-0.077188,5.443227,...,-3.402658,-0.489366,4.459239,0.036450,5.797778,-1.438778,-3.207971,-0.745921,3.936613,0.101277


In [16]:
def freeze(module):
    """
    Freezes module's parameters.
    """
    
    for parameter in module.parameters():
        parameter.requires_grad = False
        
def get_freezed_parameters(module):
    """
    Returns names of freezed parameters of the given module.
    """
    
    freezed_parameters = []
    for name, parameter in module.named_parameters():
        if not parameter.requires_grad:
            freezed_parameters.append(name)
            
    return freezed_parameters

def set_embedding_parameters_bits(embeddings_path, optim_bits=32):
    """
    https://github.com/huggingface/transformers/issues/14819#issuecomment-1003427930
    """
    
    embedding_types = ("word", "position", "token_type")
    for embedding_type in embedding_types:
        attr_name = f"{embedding_type}_embeddings"
        
        if hasattr(embeddings_path, attr_name): 
            bnb.optim.GlobalOptimManager.get_instance().register_module_override(
                getattr(embeddings_path, attr_name), 'weight', {'optim_bits': optim_bits}
            )

In [17]:
test = pd.read_csv(os.path.join(INPUT_DIR, 'test.csv'))
submission_df = pd.read_csv(os.path.join(INPUT_DIR, 'submit_sample.csv'),header=None)

def remove_tag(x):
    p = re.compile(r"<[^>]*?>")
    return p.sub('',x)

def cleaning(texts):
    clean_texts = []
    for text in texts:
        # htmlタグを削除
        text = remove_tag(text)
        #アルファベット以外をスペースに置き換え
        #clean_punc = re.sub(r'[^a-zA-Z]', ' ', text)
        clean_texts.append(text)
    return clean_texts



from text_unidecode import unidecode
from typing import Dict, List, Tuple
import codecs

def replace_encoding_with_utf8(error: UnicodeError) -> Tuple[bytes, int]:
    return error.object[error.start : error.end].encode("utf-8"), error.end


def replace_decoding_with_cp1252(error: UnicodeError) -> Tuple[str, int]:
    return error.object[error.start : error.end].decode("cp1252"), error.end

# Register the encoding and decoding error handlers for `utf-8` and `cp1252`.
codecs.register_error("replace_encoding_with_utf8", replace_encoding_with_utf8)
codecs.register_error("replace_decoding_with_cp1252", replace_decoding_with_cp1252)

def resolve_encodings_and_normalize(text: str) -> str:
    """Resolve the encoding problems and normalize the abnormal characters."""
    text = (
        text.encode("raw_unicode_escape")
        .decode("utf-8", errors="replace_decoding_with_cp1252")
        .encode("cp1252", errors="replace_encoding_with_utf8")
        .decode("utf-8", errors="replace_decoding_with_cp1252")
    )
    text = unidecode(text)
    return text


test['description'] = cleaning(test['description'])
test['inputs'] = test['description'].apply(lambda x : resolve_encodings_and_normalize(x))
display(test.head())

Unnamed: 0,id,description,inputs
0,1516,Building decision-making models and proposing ...,Building decision-making models and proposing ...
1,1517,Educate homeowners on the benefits of solar en...,Educate homeowners on the benefits of solar en...
2,1518,"Design, develop, document, and implement web a...","Design, develop, document, and implement web a..."
3,1519,Apply advanced technical expertise and skills ...,Apply advanced technical expertise and skills ...
4,1520,Project manage and deliver against our roadmap...,Project manage and deliver against our roadmap...


In [18]:
tokenizer1 = AutoTokenizer.from_pretrained(CFG1.model)
CFG1.tokenizer = tokenizer1
tokenizer2 = AutoTokenizer.from_pretrained(CFG2.model)
CFG2.tokenizer = tokenizer2
tokenizer3 = AutoTokenizer.from_pretrained(CFG3.model)
CFG3.tokenizer = tokenizer3
tokenizer4 = AutoTokenizer.from_pretrained(CFG4.model)
CFG4.tokenizer = tokenizer4

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [19]:
class Dataset1(Dataset):
    def __init__(self, df, tokenizer, max_length):
        self.df = df
        self.max_len = CFG1.max_len
        self.text = df['inputs'].values
        self.tokenizer = CFG1.tokenizer
        #self.targets = df['label'].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        text = self.text[index]
        inputs = tokenizer1.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length = self.max_len
        )
        samples = {
            'input_ids': inputs['input_ids'],
            'attention_mask': inputs['attention_mask'],
            #'target': self.targets[index]
        }

        if 'token_type_ids' in inputs:
            samples['token_type_ids'] = inputs['token_type_ids']
            
        return samples

In [20]:
class Dataset2(Dataset):
    def __init__(self, df, tokenizer, max_length):
        self.df = df
        self.max_len = CFG2.max_len
        self.text = df['inputs'].values
        self.tokenizer = CFG2.tokenizer
        #self.targets = df['label'].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        text = self.text[index]
        inputs = tokenizer2.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length = self.max_len
        )
        samples = {
            'input_ids': inputs['input_ids'],
            'attention_mask': inputs['attention_mask'],
            #'target': self.targets[index]
        }

        if 'token_type_ids' in inputs:
            samples['token_type_ids'] = inputs['token_type_ids']
            
        return samples

In [21]:
class Dataset3(Dataset):
    def __init__(self, df, tokenizer, max_length):
        self.df = df
        self.max_len = CFG3.max_len
        self.text = df['inputs'].values
        self.tokenizer = CFG3.tokenizer
        #self.targets = df['label'].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        text = self.text[index]
        inputs = tokenizer3.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length = self.max_len
        )
        samples = {
            'input_ids': inputs['input_ids'],
            'attention_mask': inputs['attention_mask'],
            #'target': self.targets[index]
        }

        if 'token_type_ids' in inputs:
            samples['token_type_ids'] = inputs['token_type_ids']
            
        return samples

In [22]:
class Dataset4(Dataset):
    def __init__(self, df, tokenizer, max_length):
        self.df = df
        self.max_len = CFG4.max_len
        self.text = df['inputs'].values
        self.tokenizer = CFG4.tokenizer
        #self.targets = df['label'].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        text = self.text[index]
        inputs = tokenizer4.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length = self.max_len
        )
        samples = {
            'input_ids': inputs['input_ids'],
            'attention_mask': inputs['attention_mask'],
            #'target': self.targets[index]
        }

        if 'token_type_ids' in inputs:
            samples['token_type_ids'] = inputs['token_type_ids']
            
        return samples

In [23]:
class Collate:
    def __init__(self, tokenizer, isTrain=True):
        self.tokenizer = tokenizer
        self.isTrain = isTrain
        # self.args = args

    def __call__(self, batch):
        output = dict()
        output["input_ids"] = [sample["input_ids"] for sample in batch]
        output["attention_mask"] = [sample["attention_mask"] for sample in batch]
        if self.isTrain:
            output["target"] = [sample["target"] for sample in batch]

        # calculate max token length of this batch
        batch_max = max([len(ids) for ids in output["input_ids"]])

        # add padding
        if self.tokenizer.padding_side == "right":
            output["input_ids"] = [s + (batch_max - len(s)) * [self.tokenizer.pad_token_id] for s in output["input_ids"]]
            output["attention_mask"] = [s + (batch_max - len(s)) * [0] for s in output["attention_mask"]]
        else:
            output["input_ids"] = [(batch_max - len(s)) * [self.tokenizer.pad_token_id] + s for s in output["input_ids"]]
            output["attention_mask"] = [(batch_max - len(s)) * [0] + s for s in output["attention_mask"]]

        # convert to tensors
        output["input_ids"] = torch.tensor(output["input_ids"], dtype=torch.long)
        output["attention_mask"] = torch.tensor(output["attention_mask"], dtype=torch.long)
        if self.isTrain:
            output["target"] = torch.tensor(output["target"], dtype=torch.long)

        return output
    
collate_fn1 = Collate(CFG1.tokenizer, isTrain=False)
collate_fn2 = Collate(CFG2.tokenizer, isTrain=False)
collate_fn3 = Collate(CFG3.tokenizer, isTrain=False)
collate_fn4 = Collate(CFG4.tokenizer, isTrain=False)
collate_fn5 = Collate(CFG4.tokenizer, isTrain=False)

In [24]:
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9) #
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings

In [25]:
class CustomModel1(nn.Module):
    def __init__(self, model_name):
        super(CustomModel1, self).__init__()
        # Header (fast or normal)
        self.model = AutoModel.from_pretrained(model_name)
        
        # Gradient_checkpointing
        if CFG1.gradient_checkpointing:
            (self.model).gradient_checkpointing_enable()
        
        # Freezing
        if CFG1.freezing:
            # freezing embeddings and first 2 layers of encoder
            freeze((self.model).embeddings)
            freeze((self.model).encoder.layer[:2])
            CFG1.after_freezed_parameters = filter(lambda parameter: parameter.requires_grad, (self.model).parameters())
        
        self.config = AutoConfig.from_pretrained(model_name)
        self.drop = nn.Dropout(p=CFG1.dropout)
        #self.pooler = MeanPooling()
        self.dropout1 = nn.Dropout(0.1)
        self.dropout2 = nn.Dropout(0.2)
        self.dropout3 = nn.Dropout(0.3)
        self.dropout4 = nn.Dropout(0.4)
        self.dropout5 = nn.Dropout(0.5)
        self.output = nn.Sequential( nn.Linear(self.config.hidden_size, CFG1.target_size) )

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def forward(self, ids, mask):        
        output = self.model(input_ids=ids, 
                         attention_mask=mask,
                         output_hidden_states=False)
        output = output[0][:, 0, :]
        logits1 = self.output(self.dropout1(output))
        logits2 = self.output(self.dropout2(output))
        logits3 = self.output(self.dropout3(output))
        logits4 = self.output(self.dropout4(output))
        logits5 = self.output(self.dropout5(output))
        outputs = (logits1 + logits2 + logits3 + logits4 + logits5) / 5
        return outputs

In [26]:
class CustomModel2(nn.Module):
    def __init__(self, model_name):
        super(CustomModel2, self).__init__()
        # Header (fast or normal)
        self.model = AutoModel.from_pretrained(model_name)
        
        # Gradient_checkpointing
        if CFG2.gradient_checkpointing:
            (self.model).gradient_checkpointing_enable()
        
        # Freezing
        if CFG2.freezing:
            # freezing embeddings and first 2 layers of encoder
            freeze((self.model).embeddings)
            freeze((self.model).encoder.layer[:2])
            CFG2.after_freezed_parameters = filter(lambda parameter: parameter.requires_grad, (self.model).parameters())
        
        self.config = AutoConfig.from_pretrained(model_name)
        self.drop = nn.Dropout(p=CFG2.dropout)
        self.pooler = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, CFG2.target_size)
        
    def forward(self, ids, mask):        
        out = self.model(input_ids=ids, 
                         attention_mask=mask,
                         output_hidden_states=False)
        out = self.pooler(out.last_hidden_state, mask)
        out = self.drop(out)
        outputs = self.fc(out)
        return outputs

In [27]:
class CustomModel3(nn.Module):
    def __init__(self, model_name):
        super(CustomModel3, self).__init__()
        # Header (fast or normal)
        self.model = AutoModel.from_pretrained(model_name)
        
        # Gradient_checkpointing
        if CFG3.gradient_checkpointing:
            (self.model).gradient_checkpointing_enable()
        
        # Freezing
        if CFG3.freezing:
            # freezing embeddings and first 2 layers of encoder
            freeze((self.model).embeddings)
            freeze((self.model).encoder.layer[:2])
            CFG3.after_freezed_parameters = filter(lambda parameter: parameter.requires_grad, (self.model).parameters())
        
        self.config = AutoConfig.from_pretrained(model_name)
        self.fc = nn.Linear(self.config.hidden_size, CFG3.target_size)
        
    def forward(self, ids, mask):        
        out = self.model(input_ids=ids, 
                         attention_mask=mask,
                         output_hidden_states=False)
        outputs = out[0][:, 0, :]
        outputs = self.fc(outputs)
        return outputs

In [28]:
class CustomModel4(nn.Module):
    def __init__(self, model_name):
        super(CustomModel4, self).__init__()
        # Header (fast or normal)
        self.model = AutoModel.from_pretrained(model_name)
        
        # Gradient_checkpointing
        if CFG4.gradient_checkpointing:
            (self.model).gradient_checkpointing_enable()
        
        # Freezing
        if CFG4.freezing:
            # freezing embeddings and first 2 layers of encoder
            freeze((self.model).embeddings)
            freeze((self.model).encoder.layer[:2])
            CFG4.after_freezed_parameters = filter(lambda parameter: parameter.requires_grad, (self.model).parameters())
        
        self.config = AutoConfig.from_pretrained(model_name)
        self.drop = nn.Dropout(p=CFG4.dropout)
        #self.pooler = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, CFG4.target_size)
        self._init_weights(self.fc)
        self.attention = nn.Sequential(
            nn.Linear(self.config.hidden_size, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
            nn.Softmax(dim=1)
        )
        self._init_weights(self.attention)
        self.output = nn.Sequential( nn.Linear(self.config.hidden_size, CFG4.target_size) )

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, ids, mask):
        output = self.model(input_ids=ids, 
                         attention_mask=mask,
                         output_hidden_states=False)
        last_hidden_states = output[0]
        #feature = torch.mean(last_hidden_states, 1)
        weights = self.attention(last_hidden_states)
        feature = torch.sum(weights * last_hidden_states, dim=1)
        return feature


    def forward(self, ids, mask):
        feature = self.feature(ids, mask)
        output = self.fc(self.drop(feature))        
        return output

In [29]:
class CustomModel5(nn.Module):
    def __init__(self, model_name):
        super(CustomModel5, self).__init__()
        # Header (fast or normal)
        self.model = AutoModel.from_pretrained(model_name)
        
        # Gradient_checkpointing
        if CFG4.gradient_checkpointing:
            (self.model).gradient_checkpointing_enable()
        
        # Freezing
        if CFG4.freezing:
            # freezing embeddings and first 2 layers of encoder
            freeze((self.model).embeddings)
            freeze((self.model).encoder.layer[:2])
            CFG4.after_freezed_parameters = filter(lambda parameter: parameter.requires_grad, (self.model).parameters())
        
        self.config = AutoConfig.from_pretrained(model_name)
        #self.drop = nn.Dropout(p=CFG4.dropout)
        self.fc = nn.Linear(self.config.hidden_size, CFG4.target_size)
        self._init_weights(self.fc)
        self.pooler = MeanPooling()
        self.layer_norm1 = nn.LayerNorm(self.config.hidden_size)
        

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


    def forward(self, ids, mask):
        out = self.model(input_ids=ids, 
                         attention_mask=mask,
                         output_hidden_states=False)
        out = self.pooler(out.last_hidden_state, mask)
        out = self.layer_norm1(out)
        outputs = self.fc(out)        
        return outputs

In [30]:
def inference_one_epoch(model, dataloader, device):
    model.eval()
    pred = []
    model.to(device)
    for step, data in enumerate(dataloader):
        ids = data['input_ids'].to(device, dtype=torch.long)
        mask = data['attention_mask'].to(device, dtype=torch.long)
        with torch.no_grad():
            outputs = model(ids, mask)
        pred.append(outputs.to('cpu').numpy())
    pred = np.concatenate(pred)
    return pred

In [31]:
testdataset = Dataset1(test, CFG1.tokenizer, CFG1.max_len)

test_loader = DataLoader(testdataset, 
                         batch_size=CFG1.batch_size,
                         shuffle=False,
                         collate_fn = collate_fn1,
                         num_workers = CFG1.num_workers,
                         pin_memory = True,
                         drop_last = False,
                         )

predictions1 = []

for fold in CFG1.trn_fold:
    model = CustomModel1(CFG1.model)
    config_path=CFG1.config_path
    state = torch.load(CFG1.path+f"{CFG1.model.replace('/', '-')}_fold{fold}_best.pth",
                       map_location = torch.device('cpu'))
    model.load_state_dict(state['model'])

    prediction = inference_one_epoch(model, test_loader, device)
    #prediction = F.softmax(torch.tensor(prediction)).numpy().astype(float)
    predictions1.append(prediction)
    del model, state, prediction
    gc.collect()
    torch.cuda.empty_cache()

del testdataset,test_loader

Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'lm_predictions

In [32]:
sub1 = submission_df.copy()
sub1.columns = ["id","label"]


#method1
sub1_predictions = np.mean(predictions1, axis=0)
sub1['Data scientist 1'] = sub1_predictions[:, 0]
sub1['Machine learning engineer 1'] = sub1_predictions[:, 1]
sub1['Software engineer 1'] = sub1_predictions[:, 2]
sub1['Consultant 1'] = sub1_predictions[:, 3]
sub1["std 1"] = sub1[['Data scientist 1','Machine learning engineer 1','Software engineer 1','Consultant 1']].std(axis=1)
sub1["mean 1"] = sub1[['Data scientist 1','Machine learning engineer 1','Software engineer 1','Consultant 1']].mean(axis=1)

display(sub1)

Unnamed: 0,id,label,Data scientist 1,Machine learning engineer 1,Software engineer 1,Consultant 1,std 1,mean 1
0,1516,1,6.685815,-0.781564,-4.555237,-1.050914,4.730869,0.074525
1,1517,1,-0.630478,-3.923366,-2.001289,6.555913,4.574402,0.000195
2,1518,1,-3.591125,-2.077838,6.774759,-1.719068,4.689425,-0.153318
3,1519,1,-1.075718,-4.926772,-2.082120,8.101068,5.638934,0.004115
4,1520,1,1.750711,-0.921016,1.564643,-2.328322,1.981697,0.016504
...,...,...,...,...,...,...,...,...
1512,3028,1,-1.058913,-1.431018,3.122421,-1.188190,2.179696,-0.138925
1513,3029,1,7.246647,-2.578345,-4.135920,-0.358376,5.046070,0.043501
1514,3030,1,-3.504260,-1.852924,6.179841,-1.525572,4.324636,-0.175729
1515,3031,1,4.794982,-4.459991,-3.940049,3.310294,4.807761,-0.073691


In [33]:
testdataset = Dataset3(test, CFG3.tokenizer, CFG3.max_len)

test_loader = DataLoader(testdataset, 
                         batch_size=CFG3.batch_size,
                         shuffle=False,
                         collate_fn = collate_fn3,
                         num_workers = CFG3.num_workers,
                         pin_memory = True,
                         drop_last = False,
                         )

predictions3 = []

for fold in CFG3.trn_fold:
    model = CustomModel3(CFG3.model)
    config_path=CFG3.config_path
    state = torch.load(CFG3.path+f"{CFG3.model.replace('/', '-')}_fold{fold}_best.pth",
                       map_location = torch.device('cpu'))
    model.load_state_dict(state['model'])

    prediction = inference_one_epoch(model, test_loader, device)
    #prediction = F.softmax(torch.tensor(prediction)).numpy().astype(float)
    predictions3.append(prediction)
    del model, state, prediction
    gc.collect()
    torch.cuda.empty_cache()

del testdataset,test_loader

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaM

In [34]:
sub3 = submission_df.copy()
sub3.columns = ["id","label"]


#method1
sub3_predictions = np.mean(predictions3, axis=0)
sub3['Data scientist 3'] = sub3_predictions[:, 0]
sub3['Machine learning engineer 3'] = sub3_predictions[:, 1]
sub3['Software engineer 3'] = sub3_predictions[:, 2]
sub3['Consultant 3'] = sub3_predictions[:, 3]
sub3["std 3"] = sub3[['Data scientist 3','Machine learning engineer 3','Software engineer 3','Consultant 3']].std(axis=1)
sub3["mean 3"] = sub3[['Data scientist 3','Machine learning engineer 3','Software engineer 3','Consultant 3']].mean(axis=1)

display(sub3)

Unnamed: 0,id,label,Data scientist 3,Machine learning engineer 3,Software engineer 3,Consultant 3,std 3,mean 3
0,1516,1,6.044259,-1.794004,-3.752031,-1.068430,4.277441,-0.142552
1,1517,1,-1.944885,-3.088769,-0.404268,5.969779,4.043680,0.132964
2,1518,1,-3.012674,-1.977117,6.045528,-1.233587,4.125023,-0.044462
3,1519,1,-0.599138,-3.951811,-1.738244,6.822005,4.671404,0.133203
4,1520,1,-0.544280,-1.243665,4.054093,-2.812925,2.950504,-0.136694
...,...,...,...,...,...,...,...,...
1512,3028,1,-2.184433,-2.806317,4.975789,-0.311162,3.534396,-0.081531
1513,3029,1,6.613635,-2.103014,-3.325536,-1.455415,4.521135,-0.067582
1514,3030,1,-2.555261,-1.915734,5.903517,-1.648849,3.989900,-0.054082
1515,3031,1,3.723150,-4.764701,-2.551706,3.618307,4.326947,0.006263


In [35]:
testdataset = Dataset2(test, CFG2.tokenizer, CFG2.max_len)

test_loader = DataLoader(testdataset, 
                         batch_size=CFG2.batch_size,
                         shuffle=False,
                         collate_fn = collate_fn2,
                         num_workers = CFG2.num_workers,
                         pin_memory = True,
                         drop_last = False,
                         )

predictions2 = []

for fold in CFG2.trn_fold:
    model = CustomModel2(CFG2.model)
    config_path=CFG2.config_path
    state = torch.load(CFG2.path+f"{CFG2.model.replace('/', '-')}_fold{fold}_best.pth",
                       map_location = torch.device('cpu'))
    model.load_state_dict(state['model'])

    prediction = inference_one_epoch(model, test_loader, device)
    #prediction = F.softmax(torch.tensor(prediction)).numpy().astype(float)
    predictions2.append(prediction)
    del model, state, prediction
    gc.collect()
    torch.cuda.empty_cache()

del testdataset,test_loader

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.weight', 'mask_predictions.classifier.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the m

In [36]:
sub2 = submission_df.copy()
sub2.columns = ["id","label"]


#method1
sub2_predictions = np.mean(predictions2, axis=0)
sub2['Consultant 2'] = sub2_predictions[:, 0]
sub2['Machine learning engineer 2'] = sub2_predictions[:, 2]
sub2['Software engineer 2'] = sub2_predictions[:, 3]
sub2['Data scientist 2'] = sub2_predictions[:, 1]
sub2["std 2"] = sub2[['Data scientist 2','Machine learning engineer 2','Software engineer 2','Consultant 2']].std(axis=1)
sub2["mean 2"] = sub2[['Data scientist 2','Machine learning engineer 2','Software engineer 2','Consultant 2']].mean(axis=1)

display(sub2)

Unnamed: 0,id,label,Consultant 2,Machine learning engineer 2,Software engineer 2,Data scientist 2,std 2,mean 2
0,1516,1,-1.095628,-1.072270,-3.321170,5.762257,3.939766,0.068297
1,1517,1,5.418258,-2.323590,-1.500041,-1.254337,3.584742,0.085073
2,1518,1,-0.704817,-2.258667,5.560447,-2.873216,3.862348,-0.069063
3,1519,1,6.253388,-2.861278,-2.194961,-1.014173,4.208317,0.045744
4,1520,1,-1.468158,-1.135850,3.335479,-0.809194,2.252728,-0.019431
...,...,...,...,...,...,...,...,...
1512,3028,1,-0.319308,-1.310320,2.546636,-0.914356,1.745499,0.000663
1513,3029,1,-0.890843,-1.386537,-3.261573,5.881723,3.996634,0.085693
1514,3030,1,-1.245309,-1.574746,5.309282,-2.950127,3.690946,-0.115225
1515,3031,1,4.159598,-3.362355,-2.505882,1.947549,3.590070,0.059728


In [37]:
testdataset = Dataset4(test, CFG4.tokenizer, CFG4.max_len)

test_loader = DataLoader(testdataset, 
                         batch_size=CFG4.batch_size,
                         shuffle=False,
                         collate_fn = collate_fn1,
                         num_workers = CFG4.num_workers,
                         pin_memory = True,
                         drop_last = False,
                         )

predictions4 = []

for fold in CFG4.trn_fold:
    model = CustomModel4(CFG4.model)
    config_path=CFG4.config_path
    state = torch.load(CFG4.path+f"{CFG4.model.replace('/', '-')}_fold{fold}_best.pth",
                       map_location = torch.device('cpu'))
    model.load_state_dict(state['model'])

    prediction = inference_one_epoch(model, test_loader, device)
    #prediction = F.softmax(torch.tensor(prediction)).numpy().astype(float)
    predictions4.append(prediction)
    del model, state, prediction
    gc.collect()
    torch.cuda.empty_cache()

del testdataset,test_loader

Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.weight', 'mask_predictions.classifier.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the 

In [38]:
sub4 = submission_df.copy()
sub4.columns = ["id","label"]


#method1
sub4_predictions = np.mean(predictions4, axis=0)
sub4['Data scientist 4'] = sub4_predictions[:, 0]
sub4['Machine learning engineer 4'] = sub4_predictions[:, 1]
sub4['Software engineer 4'] = sub4_predictions[:, 2]
sub4['Consultant 4'] = sub4_predictions[:, 3]
sub4["std 4"] = sub4[['Data scientist 4','Machine learning engineer 4','Software engineer 4','Consultant 4']].std(axis=1)
sub4["mean 4"] = sub4[['Data scientist 4','Machine learning engineer 4','Software engineer 4','Consultant 4']].mean(axis=1)

display(sub4)

Unnamed: 0,id,label,Data scientist 4,Machine learning engineer 4,Software engineer 4,Consultant 4,std 4,mean 4
0,1516,1,6.131341,-2.006162,-3.930109,-0.041512,4.361177,0.038390
1,1517,1,-1.435503,-3.094449,-1.120204,6.155100,4.111474,0.126236
2,1518,1,-2.967395,-2.247585,5.858779,-1.070024,4.053061,-0.106556
3,1519,1,-1.469517,-3.398634,-1.704444,7.262183,4.804017,0.172397
4,1520,1,-2.238134,-1.169403,5.061175,-1.871674,3.439162,-0.054509
...,...,...,...,...,...,...,...,...
1512,3028,1,-1.567770,-1.393590,3.450119,-0.677963,2.363190,-0.047301
1513,3029,1,6.894443,-1.280073,-3.893260,-1.689687,4.732307,0.007856
1514,3030,1,-3.276514,-1.704441,6.133006,-1.369423,4.207847,-0.054343
1515,3031,1,1.157998,-3.581794,-2.105423,5.109935,3.857034,0.145179


In [39]:
testdataset = Dataset4(test, CFG4.tokenizer, CFG4.max_len)

test_loader = DataLoader(testdataset, 
                         batch_size=CFG4.batch_size,
                         shuffle=False,
                         collate_fn = collate_fn1,
                         num_workers = CFG4.num_workers,
                         pin_memory = True,
                         drop_last = False,
                         )

predictions5 = []

for fold in CFG5.trn_fold:
    model = CustomModel5(CFG5.model)
    config_path=CFG5.config_path
    state = torch.load(CFG5.path+f"{CFG5.model.replace('/', '-')}_fold{fold}_best.pth",
                       map_location = torch.device('cpu'))
    model.load_state_dict(state['model'])

    prediction = inference_one_epoch(model, test_loader, device)
    #prediction = F.softmax(torch.tensor(prediction)).numpy().astype(float)
    predictions5.append(prediction)
    del model, state, prediction
    gc.collect()
    torch.cuda.empty_cache()

del testdataset,test_loader

Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.weight', 'mask_predictions.classifier.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the 

In [40]:
sub5 = submission_df.copy()
sub5.columns = ["id","label"]


#method1
sub5_predictions = np.mean(predictions5, axis=0)
sub5['Data scientist 5'] = sub5_predictions[:, 0]
sub5['Machine learning engineer 5'] = sub5_predictions[:, 1]
sub5['Software engineer 5'] = sub5_predictions[:, 2]
sub5['Consultant 5'] = sub5_predictions[:, 3]
sub5["std 5"] = sub5[['Data scientist 5','Machine learning engineer 5','Software engineer 5','Consultant 5']].std(axis=1)
sub5["mean 5"] = sub5[['Data scientist 5','Machine learning engineer 5','Software engineer 5','Consultant 5']].mean(axis=1)

display(sub5)

Unnamed: 0,id,label,Data scientist 5,Machine learning engineer 5,Software engineer 5,Consultant 5,std 5,mean 5
0,1516,1,6.211955,-1.987111,-4.183251,0.073621,4.473577,0.028804
1,1517,1,-1.507465,-3.639722,-1.153705,6.226192,4.305622,-0.018675
2,1518,1,-2.610141,-2.877807,6.343557,-1.112021,4.341869,-0.064103
3,1519,1,-0.695383,-4.301244,-2.004891,7.349094,5.065663,0.086894
4,1520,1,0.221690,-1.864811,2.857144,-1.610435,2.179003,-0.099103
...,...,...,...,...,...,...,...,...
1512,3028,1,-1.434959,-2.221851,3.937849,-0.632115,2.761107,-0.087769
1513,3029,1,7.104337,-1.588706,-4.403844,-0.971667,4.943975,0.035030
1514,3030,1,-2.780249,-2.071175,5.696069,-1.301909,3.920351,-0.114316
1515,3031,1,2.977597,-2.965709,-2.729434,3.217475,3.435161,0.124982


In [41]:
stacking_test = sub1.merge(sub2[['id','Data scientist 2','Machine learning engineer 2','Software engineer 2','Consultant 2','std 2','mean 2']],how="left",on=["id"]).merge(sub3,how="left",on=["id","label"]).merge(sub4,how="left",on=["id","label"]).merge(sub5,how="left",on=["id","label"])
display(stacking_test)

Unnamed: 0,id,label,Data scientist 1,Machine learning engineer 1,Software engineer 1,Consultant 1,std 1,mean 1,Data scientist 2,Machine learning engineer 2,...,Software engineer 4,Consultant 4,std 4,mean 4,Data scientist 5,Machine learning engineer 5,Software engineer 5,Consultant 5,std 5,mean 5
0,1516,1,6.685815,-0.781564,-4.555237,-1.050914,4.730869,0.074525,5.762257,-1.072270,...,-3.930109,-0.041512,4.361177,0.038390,6.211955,-1.987111,-4.183251,0.073621,4.473577,0.028804
1,1517,1,-0.630478,-3.923366,-2.001289,6.555913,4.574402,0.000195,-1.254337,-2.323590,...,-1.120204,6.155100,4.111474,0.126236,-1.507465,-3.639722,-1.153705,6.226192,4.305622,-0.018675
2,1518,1,-3.591125,-2.077838,6.774759,-1.719068,4.689425,-0.153318,-2.873216,-2.258667,...,5.858779,-1.070024,4.053061,-0.106556,-2.610141,-2.877807,6.343557,-1.112021,4.341869,-0.064103
3,1519,1,-1.075718,-4.926772,-2.082120,8.101068,5.638934,0.004115,-1.014173,-2.861278,...,-1.704444,7.262183,4.804017,0.172397,-0.695383,-4.301244,-2.004891,7.349094,5.065663,0.086894
4,1520,1,1.750711,-0.921016,1.564643,-2.328322,1.981697,0.016504,-0.809194,-1.135850,...,5.061175,-1.871674,3.439162,-0.054509,0.221690,-1.864811,2.857144,-1.610435,2.179003,-0.099103
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1512,3028,1,-1.058913,-1.431018,3.122421,-1.188190,2.179696,-0.138925,-0.914356,-1.310320,...,3.450119,-0.677963,2.363190,-0.047301,-1.434959,-2.221851,3.937849,-0.632115,2.761107,-0.087769
1513,3029,1,7.246647,-2.578345,-4.135920,-0.358376,5.046070,0.043501,5.881723,-1.386537,...,-3.893260,-1.689687,4.732307,0.007856,7.104337,-1.588706,-4.403844,-0.971667,4.943975,0.035030
1514,3030,1,-3.504260,-1.852924,6.179841,-1.525572,4.324636,-0.175729,-2.950127,-1.574746,...,6.133006,-1.369423,4.207847,-0.054343,-2.780249,-2.071175,5.696069,-1.301909,3.920351,-0.114316
1515,3031,1,4.794982,-4.459991,-3.940049,3.310294,4.807761,-0.073691,1.947549,-3.362355,...,-2.105423,5.109935,3.857034,0.145179,2.977597,-2.965709,-2.729434,3.217475,3.435161,0.124982


In [42]:
features = stacking_test.columns.to_list()
features.remove('id')
features.remove('label')

In [43]:
def get_lgbm_cv_score(oof_df, lr=0.1, n_est=100, n_leave=31):
    f1_score_ = []
    clf_ = []
    for fold in range(5):
        print("Fold :", fold)
        X_train = oof_df[oof_df.kfold!=fold][features].values
        y_train = oof_df[oof_df.kfold!=fold].label.values

        X_test = oof_df[oof_df.kfold==fold][features].values
        y_test = oof_df[oof_df.kfold==fold].label.values

        clf = lgb.LGBMClassifier(learning_rate=lr,
                                n_estimators=n_est,
                                num_leaves=n_leave,
                                boosting_type='gbdt',
                                random_state=42)
        clf.fit(X_train, y_train, eval_set=(X_test, y_test), early_stopping_rounds=100,
               verbose=500)
        
        y_pred = clf.predict(X_test)
        score = f1_score(y_pred,y_test ,average='macro')
        print(f"F1 Score : {score}")
        f1_score_.append(score)
        clf_.append(clf)
    return f1_score_, clf_

lr=0.05
lgbm_f1_score_, lgbm_clf_ = get_lgbm_cv_score(stacking_train, lr=lr, n_est=1000, n_leave=31)
print(np.array(lgbm_f1_score_).mean())

Fold : 0
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[58]	valid_0's multi_logloss: 0.549269
F1 Score : 0.7903658823855448
Fold : 1
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[96]	valid_0's multi_logloss: 0.580805
F1 Score : 0.7493826473859844
Fold : 2
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[62]	valid_0's multi_logloss: 0.621708
F1 Score : 0.6964788493267811
Fold : 3
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[63]	valid_0's multi_logloss: 0.573549
F1 Score : 0.7021752613857877
Fold : 4
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[75]	valid_0's multi_logloss: 0.556865
F1 Score : 0.7009728997914864
0.7278751080551169


In [44]:
def get_ctb_cv_score(oof_df):
    params = {
          "loss_function":"Logloss",
          "n_estimators":1000,
          "random_seed":42,
          "learning_rate":0.05,
          'task_type' : 'GPU'
    }
    f1_score_ = []
    clf_ = []
    for fold in range(5):
        print("Fold :", fold)
        X_train = oof_df[oof_df.kfold!=fold][features].values
        y_train = oof_df[oof_df.kfold!=fold].label.values

        X_test = oof_df[oof_df.kfold==fold][features].values
        y_test = oof_df[oof_df.kfold==fold].label.values

        clf = ctb.CatBoostClassifier(**params)
        clf.fit(X_train, y_train,
            eval_set=[(X_train,y_train),(X_test, y_test)],
            verbose = 500,
            early_stopping_rounds = 100)
        
        y_pred = clf.predict(X_test)
        score = f1_score(y_pred,y_test ,average='macro')
        print(f"F1 Score : {score}")
        f1_score_.append(score)
        clf_.append(clf)
    return f1_score_, clf_


ctb_f1_score_, ctb_clf_ = get_lgbm_cv_score(stacking_train)
print(np.array(ctb_f1_score_).mean())

Fold : 0
Training until validation scores don't improve for 100 rounds.
Did not meet early stopping. Best iteration is:
[29]	valid_0's multi_logloss: 0.549462
F1 Score : 0.8030796805665916
Fold : 1
Training until validation scores don't improve for 100 rounds.
Did not meet early stopping. Best iteration is:
[43]	valid_0's multi_logloss: 0.588313
F1 Score : 0.7256165164946239
Fold : 2
Training until validation scores don't improve for 100 rounds.
Did not meet early stopping. Best iteration is:
[29]	valid_0's multi_logloss: 0.632381
F1 Score : 0.6797060282493853
Fold : 3
Training until validation scores don't improve for 100 rounds.
Did not meet early stopping. Best iteration is:
[34]	valid_0's multi_logloss: 0.570637
F1 Score : 0.71499552483861
Fold : 4
Training until validation scores don't improve for 100 rounds.
Did not meet early stopping. Best iteration is:
[31]	valid_0's multi_logloss: 0.561672
F1 Score : 0.720012439667612
0.7286820379633646


In [45]:
def get_logit_cv_score(oof_df):
    f1_score_ = []
    clf_ = []
    for fold in range(5):
        print("Fold :", fold)
        X_train = oof_df[oof_df.kfold!=fold][features].values
        y_train = oof_df[oof_df.kfold!=fold].label.values

        X_test = oof_df[oof_df.kfold==fold][features].values
        y_test = oof_df[oof_df.kfold==fold].label.values

        clf = LogisticRegression(random_state=0)
        clf.fit(X_train, y_train)
        
        y_pred = clf.predict(X_test)
        score = f1_score(y_pred,y_test ,average='macro')
        print(f"F1 Score : {score}")
        f1_score_.append(score)
        clf_.append(clf)
    return f1_score_, clf_


logit_f1_score_, logit_clf_ = get_logit_cv_score(stacking_train)
print(np.array(logit_f1_score_).mean())

Fold : 0
F1 Score : 0.7284419034419035
Fold : 1
F1 Score : 0.7133859050357818
Fold : 2
F1 Score : 0.6839008487754872
Fold : 3
F1 Score : 0.684930860015057
Fold : 4
F1 Score : 0.7610791860968366
0.7143477406730132


In [46]:
def get_predicts(df_,lgb_clfs,ctb_clfs,logit_clfs):
  lgb_predict,ctb_predict,logit_predict = [],[],[]
  test_X = df_[features].values
  for fold in range(5):
    lgb_predict.append(lgb_clfs[fold].predict(test_X))
    ctb_predict.append(ctb_clfs[fold].predict(test_X))
    logit_predict.append(logit_clfs[fold].predict(test_X))

  return lgb_predict,ctb_predict,logit_predict

lgb_pred,ctb_pred,logit_pred = get_predicts(stacking_test,lgbm_clf_,ctb_clf_,logit_clf_)

In [47]:
sub = submission_df.copy()
sub.columns = ["id","label"]

for fold in range(5):
  sub[f"lgb_{fold}"] = lgb_pred[fold]
  sub[f"ctb_{fold}"] = ctb_pred[fold]
  sub[f"logit_{fold}"] = logit_pred[fold]

sub["label"] = sub.loc[:,"lgb_0":"logit_4"].mode(axis=1)[0]
sub["label"] = sub["label"].astype("int")
sub["label"] = sub["label"] + 1
sub[["id","label"]].to_csv(os.path.join(OUTPUT_SUB_DIR,"submission32_Stacking.csv"),index=False,header=False)
display(sub)

Unnamed: 0,id,label,lgb_0,ctb_0,logit_0,lgb_1,ctb_1,logit_1,lgb_2,ctb_2,logit_2,lgb_3,ctb_3,logit_3,lgb_4,ctb_4,logit_4
0,1516,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1517,4,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
2,1518,3,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
3,1519,4,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
4,1520,3,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1512,3028,3,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
1513,3029,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1514,3030,3,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
1515,3031,4,3,3,3,3,3,3,3,3,3,0,0,3,3,3,3


In [48]:
#train,testでダブっているデータを参照
dup_test_ids = [1707,2122,2291,2775,2191,1700,2304,2149,2676,2844,2144,2764,1774,2446,2736,2301,1822,1852,2070,1609,2423,2695,
                2077,2409,2233,2076,1568,3001,1662,1997, 2896,2352,2842,2321,1630,2259,2968,1551, 1673, 2168]
sub[sub["id"].isin(dup_test_ids)]  

Unnamed: 0,id,label,lgb_0,ctb_0,logit_0,lgb_1,ctb_1,logit_1,lgb_2,ctb_2,logit_2,lgb_3,ctb_3,logit_3,lgb_4,ctb_4,logit_4
35,1551,4,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
52,1568,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
93,1609,4,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
114,1630,4,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
146,1662,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
157,1673,4,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
184,1700,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
191,1707,3,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
258,1774,4,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
306,1822,4,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3


In [49]:
T = pd.DataFrame([[1707,2122,2291,2775,2191,1700,2304,2149,2676,2844,2144,2764,1774,2446,2736,2301,1822,1852,2070,1609,2423,2695,
                2077,2409,2233,2076,1568,3001,1662,1997, 2896,2352,2842,2321,1630,2259,2968,1551, 1673, 2168],
                [3,4,1,4,1,1,1,1,3,1,3,4,4,1,1,1,4,4,3,4,4,4,3,4,4,1,1,1,1,3,3,1,4,4,4,1,4,4,4,4]]).T
T.columns =["id","label"]
T = T.sort_values(by="id")
T

Unnamed: 0,id,label
37,1551,4
26,1568,1
19,1609,4
34,1630,4
28,1662,1
38,1673,4
5,1700,1
0,1707,3
12,1774,4
16,1822,4


In [51]:
sub.to_csv(os.path.join(OUTPUT_DIR,"Stacking_df.csv"),index=False)