In [None]:
# Colab-only setup: mount Google Drive so the input/output directories used
# below (all under /content/drive) are reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import os
import gc
import math
import time
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')
from tqdm import tqdm
import re

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import StratifiedKFold,StratifiedGroupKFold,GroupKFold
from sklearn.metrics import log_loss,f1_score

from transformers import AutoModel, AutoConfig, AutoTokenizer, AdamW, DataCollatorWithPadding
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Locations on the mounted Google Drive. These are absolute, Colab-specific
# paths; change them here when running elsewhere.
INPUT_DIR = '/content/drive/MyDrive/Competitions/Signate/Student Cup 2022/input/'
OUTPUT_DIR = '/content/drive/MyDrive/Competitions/Signate/Student Cup 2022/output/'
# Directory the submission CSVs are written to.
OUTPUT_SUB_DIR = os.path.join(OUTPUT_DIR,'Submission')
# One directory per trained model; each holds the fold checkpoints and oof_df.pkl.
# The trailing slash matters: file names are appended by plain string concatenation.
OUTPUT_MODEL_DIR1 = os.path.join(OUTPUT_DIR,'Model/DeBERTa-large/')
OUTPUT_MODEL_DIR2 = os.path.join(OUTPUT_DIR,'Model/DeBERTa-large[ver3]/')
OUTPUT_MODEL_DIR3 = os.path.join(OUTPUT_DIR,'Model/DeBERTa-large[ver4]/')

In [None]:
class CFG1:
    """Settings for the first DeBERTa-large model (artifacts in OUTPUT_MODEL_DIR1)."""
    num_workers=2  # DataLoader worker processes
    path=OUTPUT_MODEL_DIR1  # directory containing the per-fold checkpoints
    config_path=OUTPUT_MODEL_DIR1+'config.pth'
    model="microsoft/deberta-large"  # Hugging Face model id (also used to build checkpoint file names)
    batch_size=32  # inference batch size
    dropout=0.2  # rate of CustomModel1.drop
    target_size=4  # number of output classes
    max_len=1024  # tokenizer truncation length
    seed=42  # default seed of seed_everything
    n_fold=5  # presumably the number of CV folds used in training -- not read in this notebook
    trn_fold=[0, 1, 2, 3, 4]  # folds whose checkpoints are loaded for inference
    gradient_checkpointing=True  # CustomModel1 enables gradient checkpointing on the backbone
    freezing=True  # CustomModel1 freezes the embeddings and the first two encoder layers

In [None]:
class CFG2:
    """Settings for the second DeBERTa-large model (artifacts in OUTPUT_MODEL_DIR2)."""
    num_workers=2  # DataLoader worker processes
    path=OUTPUT_MODEL_DIR2  # directory containing the per-fold checkpoints
    config_path=OUTPUT_MODEL_DIR2+'config.pth'
    model="microsoft/deberta-large"  # Hugging Face model id (also used to build checkpoint file names)
    batch_size=32  # inference batch size
    # NOTE(review): this model is instantiated as CustomModel1, which reads
    # dropout / target_size / gradient_checkpointing / freezing from CFG1, so the
    # values of those fields here are not consulted by the visible code.
    dropout=0.2
    target_size=4
    max_len=1024
    seed=42
    n_fold=5
    trn_fold=[0, 1, 2]  # only three fold checkpoints are loaded for this model
    gradient_checkpointing=True
    freezing=True

In [None]:
class CFG3:
    """Settings for the third DeBERTa-large model (artifacts in OUTPUT_MODEL_DIR3)."""
    num_workers=2  # DataLoader worker processes
    path=OUTPUT_MODEL_DIR3  # directory containing the per-fold checkpoints
    config_path=OUTPUT_MODEL_DIR3+'config.pth'  # serialized model config loaded by CustomModel3
    model="microsoft/deberta-large"  # used to build checkpoint file names
    batch_size=32  # inference batch size
    dropout=0.2  # not read by CustomModel3 in the visible code
    target_size=4  # number of output classes of CustomModel3's head
    max_len=1024
    seed=42
    n_fold=5  # presumably the number of CV folds used in training -- not read in this notebook
    trn_fold=[0, 1, 2, 3, 4]  # folds whose checkpoints are loaded for inference
    gradient_checkpointing=True  # CustomModel3 enables gradient checkpointing on the backbone
    freezing=True  # CustomModel3 freezes the embeddings and the first two encoder layers

In [None]:
# Loss function
def criterion(outputs, labels):
    """Return the mean cross-entropy between logits ``outputs`` and class-index ``labels``."""
    cross_entropy = nn.CrossEntropyLoss()
    loss = cross_entropy(outputs, labels)
    return loss

def softmax(z):
    """Row-wise softmax of a 2-D NumPy array.

    The row maximum is subtracted before exponentiating so large logits do not
    overflow. Raises AssertionError when ``z`` is not two-dimensional.
    """
    assert len(z.shape) == 2
    # keepdims leaves a trailing axis of length 1 so the result broadcasts per row
    row_max = np.max(z, axis=1, keepdims=True)
    exponentials = np.exp(z - row_max)
    row_total = np.sum(exponentials, axis=1, keepdims=True)
    return exponentials / row_total
"""
def get_score(y_true, y_pred):
    y_pred = softmax(y_pred)
    score = log_loss(y_true, y_pred)
    return round(score, 5)
"""
def get_score(outputs, labels):
    """Macro-averaged F1 of the arg-max class of ``outputs`` against ``labels``.

    Args:
        outputs: 2-D array-like of per-class scores (logits or probabilities),
            one row per sample.
        labels: 1-D array-like of true class indices.

    Returns:
        The macro F1 score as a float.
    """
    # Softmax is monotonic within a row, so the arg-max of the raw scores is the
    # predicted class; this avoids the deprecated dim-less F.softmax call.
    predicted = np.argmax(np.asarray(outputs), axis=1)
    # scikit-learn's signature is f1_score(y_true, y_pred).
    return f1_score(labels, predicted, average='macro')

def get_logger(filename=OUTPUT_DIR+'train'):
    """Return the module logger, writing bare messages to the console and to ``<filename>.log``.

    Safe to call repeatedly (for example when the cell is re-run): handlers
    installed by an earlier call are removed and closed first, so messages are
    not emitted once per call.

    Args:
        filename: Log file path without the ``.log`` suffix.

    Returns:
        The configured ``logging.Logger``.
    """
    from logging import getLogger, INFO, FileHandler, Formatter, StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)

    # Drop handlers left over from a previous call instead of stacking new ones.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    # The captured cell output shows every message a second time, prefixed with
    # "INFO:__main__:" -- presumably from a handler on the root logger. Stop
    # propagation so each message appears once.
    logger.propagate = False

    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=CFG1.seed):
    """Seed Python, NumPy and PyTorch (CPU and current CUDA device) and request deterministic cuDNN."""
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True

seed_everything(seed=42)

In [None]:
# Load the out-of-fold prediction frames saved next to the checkpoints of
# models 1 and 3 and report their CV scores (get_score: macro F1).
# NOTE(review): pd.read_pickle unpickles arbitrary objects -- only load trusted files.
oof_df1 = pd.read_pickle(OUTPUT_MODEL_DIR1+'oof_df.pkl')
# Model 2 is excluded from the CV report (its lines are commented out).
#oof_df2 = pd.read_pickle(OUTPUT_MODEL_DIR2+'oof_df.pkl')
oof_df3 = pd.read_pickle(OUTPUT_MODEL_DIR3+'oof_df.pkl')
labels1 = oof_df1['label'].values
labels3 = oof_df3["label"].values
# The four per-class score columns, in the column order used everywhere below.
preds1 = oof_df1[['Data scientist','Machine learning engineer','Software engineer','Consultant']]
#preds2 = oof_df2[['Data scientist','Machine learning engineer','Software engineer','Consultant']]
preds3 = oof_df3[['Data scientist','Machine learning engineer','Software engineer','Consultant']]
score1 = get_score(preds1.values, labels1)
#score2 = get_score(preds2.values, labels)
score3 = get_score(preds3.values, labels3)
# Plain average of the two individual scores (not the score of a blended prediction).
mean_score = (score1+score3)/2
LOGGER.info(f'Deberta-large CV Score: {score1:<.4f}')
LOGGER.info(f'Deberta-large ver4 CV Score: {score3:<.4f}')
LOGGER.info(f'CV Mean Score: {mean_score:<.4f}')

Deberta-large CV Score: 0.7455
INFO:__main__:Deberta-large CV Score: 0.7455
Deberta-large ver4 CV Score: 0.7429
INFO:__main__:Deberta-large ver4 CV Score: 0.7429
CV Mean Score: 0.7442
INFO:__main__:CV Mean Score: 0.7442


In [None]:
def freeze(module):
    """Disable gradient tracking for every parameter yielded by ``module.parameters()``."""
    for weight in module.parameters():
        weight.requires_grad_(False)
        
def get_freezed_parameters(module):
    """List the names of the module's parameters whose ``requires_grad`` is False."""
    return [
        param_name
        for param_name, param in module.named_parameters()
        if not param.requires_grad
    ]

def set_embedding_parameters_bits(embeddings_path, optim_bits=32):
    """
    Register an optimizer-precision override for a model's embedding weights.

    For each of the `word_embeddings`, `position_embeddings` and
    `token_type_embeddings` attributes present on `embeddings_path`, registers
    an override of `{'optim_bits': optim_bits}` for that module's `weight`.

    NOTE(review): `bnb` is not imported anywhere in the visible notebook
    (presumably `import bitsandbytes as bnb`), so calling this as-is would raise
    NameError. The function is not called in this notebook.

    https://github.com/huggingface/transformers/issues/14819#issuecomment-1003427930
    """
    
    embedding_types = ("word", "position", "token_type")
    for embedding_type in embedding_types:
        attr_name = f"{embedding_type}_embeddings"
        
        # Skip embedding kinds this architecture does not have.
        if hasattr(embeddings_path, attr_name): 
            bnb.optim.GlobalOptimManager.get_instance().register_module_override(
                getattr(embeddings_path, attr_name), 'weight', {'optim_bits': optim_bits}
            )

In [None]:
# Test set to predict on; the `description` column holds the raw text.
test = pd.read_csv(os.path.join(INPUT_DIR, 'test.csv'))
# Sample submission has no header row; its two columns are later named id / label.
submission_df = pd.read_csv(os.path.join(INPUT_DIR, 'submit_sample.csv'),header=None)

def remove_tag(x):
    """Return ``x`` with every ``<...>`` span (HTML-style tag) deleted."""
    return re.sub(r"<[^>]*?>", '', x)

def cleaning(texts):
    """Return a new list with the HTML tags removed from each text in ``texts``."""
    # A further step replacing every non-alphabetic character with a space was
    # tried and is left disabled:
    #   clean_punc = re.sub(r'[^a-zA-Z]', ' ', text)
    return [remove_tag(raw_text) for raw_text in texts]



from text_unidecode import unidecode
from typing import Dict, List, Tuple
import codecs

def replace_encoding_with_utf8(error: UnicodeError) -> Tuple[bytes, int]:
    """Codec error handler: emit the un-encodable characters as UTF-8 bytes and resume after them."""
    offending = error.object[error.start : error.end]
    replacement = offending.encode("utf-8")
    return replacement, error.end


def replace_decoding_with_cp1252(error: UnicodeError) -> Tuple[str, int]:
    """Codec error handler: decode the un-decodable bytes as cp1252 and resume after them."""
    offending = error.object[error.start : error.end]
    replacement = offending.decode("cp1252")
    return replacement, error.end

# Register the two handlers above under the names that
# resolve_encodings_and_normalize passes as `errors=`; this must run before
# that function is called.
codecs.register_error("replace_encoding_with_utf8", replace_encoding_with_utf8)
codecs.register_error("replace_decoding_with_cp1252", replace_decoding_with_cp1252)

def resolve_encodings_and_normalize(text: str) -> str:
    """Repair mixed UTF-8 / cp1252 text, then transliterate it with ``unidecode``.

    Relies on the ``replace_decoding_with_cp1252`` and
    ``replace_encoding_with_utf8`` error handlers being registered with
    ``codecs`` beforehand.
    """
    escaped_bytes = text.encode("raw_unicode_escape")
    first_decode = escaped_bytes.decode("utf-8", errors="replace_decoding_with_cp1252")
    cp1252_bytes = first_decode.encode("cp1252", errors="replace_encoding_with_utf8")
    repaired = cp1252_bytes.decode("utf-8", errors="replace_decoding_with_cp1252")
    return unidecode(repaired)


# Strip HTML tags in place, then build the model input column from the
# encoding-normalized descriptions.
test['description'] = cleaning(test['description'])
test['inputs'] = test['description'].apply(resolve_encodings_and_normalize)
display(test.head())

Unnamed: 0,id,description,inputs
0,1516,Building decision-making models and proposing ...,Building decision-making models and proposing ...
1,1517,Educate homeowners on the benefits of solar en...,Educate homeowners on the benefits of solar en...
2,1518,"Design, develop, document, and implement web a...","Design, develop, document, and implement web a..."
3,1519,Apply advanced technical expertise and skills ...,Apply advanced technical expertise and skills ...
4,1520,Project manage and deliver against our roadmap...,Project manage and deliver against our roadmap...


In [None]:
# All three configs name the same pretrained model, so a single tokenizer
# instance is loaded once and attached to each config.
tokenizer = AutoTokenizer.from_pretrained(CFG1.model)
CFG1.tokenizer = tokenizer
CFG2.tokenizer = tokenizer
CFG3.tokenizer = tokenizer

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Tokenizes the ``inputs`` column of a frame one row at a time (inference only, no targets).

    The base class is spelled out in full so that re-running this cell does not
    make the class inherit from its own previous definition.

    Args:
        df: Frame-like object with an ``inputs`` column of strings.
        tokenizer: Object exposing ``encode_plus`` (a Hugging Face tokenizer).
        max_length: Truncation length passed to the tokenizer.
    """

    def __init__(self, df, tokenizer, max_length):
        self.df = df
        # Honor the constructor arguments rather than silently reading CFG1.
        self.max_len = max_length
        self.text = df['inputs'].values
        self.tokenizer = tokenizer

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.df)

    def __getitem__(self, index):
        """Return the un-padded encoding of row ``index`` as a dict of Python lists.

        Always contains ``input_ids`` and ``attention_mask``; ``token_type_ids``
        is included only when the tokenizer produces it. Padding is left to the
        collate function.
        """
        text = self.text[index]
        inputs = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len
        )
        samples = {
            'input_ids': inputs['input_ids'],
            'attention_mask': inputs['attention_mask'],
        }

        if 'token_type_ids' in inputs:
            samples['token_type_ids'] = inputs['token_type_ids']

        return samples

In [None]:
class Collate:
    """Batch collator that pads each batch only up to its own longest sequence.

    Args:
        tokenizer: Supplies ``padding_side`` and ``pad_token_id``.
        isTrain: When True every sample must also carry a ``target`` entry,
            which is returned as a long tensor.
    """

    def __init__(self, tokenizer, isTrain=True):
        self.tokenizer = tokenizer
        self.isTrain = isTrain

    def __call__(self, batch):
        """Pad ``input_ids`` / ``attention_mask`` of the samples and stack them into long tensors."""
        id_seqs = [sample["input_ids"] for sample in batch]
        mask_seqs = [sample["attention_mask"] for sample in batch]
        targets = [sample["target"] for sample in batch] if self.isTrain else None

        # dynamic padding: width of the longest sequence in this batch
        longest = max([len(seq) for seq in id_seqs])
        pad_on_right = self.tokenizer.padding_side == "right"

        def _pad(seq, fill):
            filler = (longest - len(seq)) * [fill]
            return seq + filler if pad_on_right else filler + seq

        padded_ids = [_pad(seq, self.tokenizer.pad_token_id) for seq in id_seqs]
        padded_masks = [_pad(seq, 0) for seq in mask_seqs]

        output = dict()
        output["input_ids"] = torch.tensor(padded_ids, dtype=torch.long)
        output["attention_mask"] = torch.tensor(padded_masks, dtype=torch.long)
        if self.isTrain:
            output["target"] = torch.tensor(targets, dtype=torch.long)

        return output
    
# Inference collators (isTrain=False: samples carry no "target"), one per model config.
collate_fn1 = Collate(CFG1.tokenizer, isTrain=False)
collate_fn2 = Collate(CFG2.tokenizer, isTrain=False)
collate_fn3 = Collate(CFG3.tokenizer, isTrain=False)

In [None]:
class MeanPooling(nn.Module):
    """Attention-mask-weighted mean over dimension 1 of the hidden states."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Average ``last_hidden_state`` over dim 1, ignoring positions where the mask is 0."""
        # Broadcast the mask across the trailing hidden dimension.
        weights = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        masked_total = (last_hidden_state * weights).sum(1)
        # Clamp the count so a fully-masked row divides by 1e-9 instead of 0.
        token_count = torch.clamp(weights.sum(1), min=1e-9)
        return masked_total / token_count

In [None]:
class CustomModel1(nn.Module):
    """Pretrained backbone + single linear head applied to the position-0 token.

    NOTE(review): all hyper-parameters are read from CFG1, even when the caller
    builds the model for another config (the notebook also instantiates it with
    CFG2.model).
    """
    def __init__(self, model_name):
        """Build the backbone from `model_name` and the classification head."""
        super(CustomModel1, self).__init__()
        # Backbone. from_pretrained loads the pretrained weights; the notebook
        # overwrites them afterwards with load_state_dict.
        self.model = AutoModel.from_pretrained(model_name)
        
        # Gradient_checkpointing
        if CFG1.gradient_checkpointing:
            (self.model).gradient_checkpointing_enable()
        
        # Freezing
        if CFG1.freezing:
            # freezing embeddings and first 2 layers of encoder
            freeze((self.model).embeddings)
            freeze((self.model).encoder.layer[:2])
            # NOTE(review): `filter` is a lazy, single-use iterator stored on the
            # config class; it keeps a reference to this model's parameter
            # generator until it is overwritten by the next instantiation.
            CFG1.after_freezed_parameters = filter(lambda parameter: parameter.requires_grad, (self.model).parameters())
        
        self.config = AutoConfig.from_pretrained(model_name)
        # `drop` is created but not used in forward().
        self.drop = nn.Dropout(p=CFG1.dropout)
        #self.pooler = MeanPooling()
        # Five dropout rates for the multi-sample dropout in forward().
        self.dropout1 = nn.Dropout(0.1)
        self.dropout2 = nn.Dropout(0.2)
        self.dropout3 = nn.Dropout(0.3)
        self.dropout4 = nn.Dropout(0.4)
        self.dropout5 = nn.Dropout(0.5)
        # Attribute names here determine the state-dict keys expected by the checkpoints.
        self.output = nn.Sequential( nn.Linear(self.config.hidden_size, CFG1.target_size) )

    def _init_weights(self, module):
        """Initialise `module` by type using config.initializer_range.

        Linear/Embedding weights are drawn from N(0, initializer_range); biases
        and the padding row are zeroed; LayerNorm is reset to weight 1, bias 0.
        Not called anywhere in this class.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def forward(self, ids, mask):        
        """Return class logits for token ids `ids` and attention mask `mask`."""
        output = self.model(input_ids=ids, 
                         attention_mask=mask,
                         output_hidden_states=False)
        # First element of the backbone output (presumably last_hidden_state),
        # taken at sequence position 0.
        output = output[0][:, 0, :]
        # Multi-sample dropout: the same head is applied under five dropout
        # rates and the logits are averaged. In eval mode dropout is the
        # identity, so all five terms are equal.
        logits1 = self.output(self.dropout1(output))
        logits2 = self.output(self.dropout2(output))
        logits3 = self.output(self.dropout3(output))
        logits4 = self.output(self.dropout4(output))
        logits5 = self.output(self.dropout5(output))
        outputs = (logits1 + logits2 + logits3 + logits4 + logits5) / 5
        return outputs

In [None]:
class CustomModel3(nn.Module):
    """Backbone built from a saved config + mean pooling + LayerNorm + linear head.

    Unlike CustomModel1, the backbone is created with ``AutoModel.from_config``
    (no pretrained weights are loaded); the weights are expected to come from a
    checkpoint via ``load_state_dict``. Attribute names determine the
    state-dict keys and must stay as they are.

    Args:
        model_name: Accepted for signature parity with CustomModel1; not used --
            the architecture comes from the config at ``CFG3.config_path``.
    """

    def __init__(self, model_name):
        super(CustomModel3, self).__init__()

        # Config object serialized at training time.
        # NOTE(review): torch.load unpickles arbitrary objects -- only load trusted files.
        self.config = torch.load(CFG3.config_path)

        self.model = AutoModel.from_config(self.config)

        # `dropout` is created but not applied in forward().
        self.dropout = nn.Dropout(self.config.hidden_dropout_prob)
        # Five dropout rates for the multi-sample dropout in forward().
        self.dropout1 = nn.Dropout(0.1)
        self.dropout2 = nn.Dropout(0.2)
        self.dropout3 = nn.Dropout(0.3)
        self.dropout4 = nn.Dropout(0.4)
        self.dropout5 = nn.Dropout(0.5)
        self.pooler = MeanPooling()
        self.output = nn.Linear(self.config.hidden_size, CFG3.target_size)
        self._init_weights(self.output)
        self.layer_norm1 = nn.LayerNorm(self.config.hidden_size)

        # Gradient_checkpointing
        if CFG3.gradient_checkpointing:
            (self.model).gradient_checkpointing_enable()

        # Freezing
        if CFG3.freezing:
            # freezing embeddings and first 2 layers of encoder
            freeze((self.model).embeddings)
            freeze((self.model).encoder.layer[:2])
            # Lazy, single-use iterator over the parameters left trainable.
            CFG3.after_freezed_parameters = filter(lambda parameter: parameter.requires_grad, (self.model).parameters())

    def _init_weights(self, module):
        """Initialise ``module`` by type using ``config.initializer_range``.

        Linear/Embedding weights are drawn from N(0, initializer_range); biases
        and the padding row are zeroed; LayerNorm is reset to weight 1, bias 0.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, ids, mask):
        """Return class logits for token ids ``ids`` and attention mask ``mask``."""
        transformer_out = self.model(input_ids=ids,
                         attention_mask=mask,
                         )
        sequence_output = transformer_out.last_hidden_state
        # Mask-aware mean over the sequence, then LayerNorm.
        sequence_output = self.pooler(sequence_output, mask)
        #sequence_output = self.dropout(sequence_output)
        sequence_output = self.layer_norm1(sequence_output)

        # Multi-sample dropout: one head, five dropout rates, averaged logits.
        # In eval mode dropout is the identity, so all five terms are equal.
        logits1 = self.output(self.dropout1(sequence_output))
        logits2 = self.output(self.dropout2(sequence_output))
        logits3 = self.output(self.dropout3(sequence_output))
        logits4 = self.output(self.dropout4(sequence_output))
        logits5 = self.output(self.dropout5(sequence_output))

        logits = (logits1 + logits2 + logits3 + logits4 + logits5) / 5

        return logits

In [None]:
def inference_one_epoch(model, dataloader, device):
    """Run ``model`` over every batch of ``dataloader`` and return the stacked outputs.

    The model is switched to eval mode and moved to ``device``. Each batch must
    provide ``input_ids`` and ``attention_mask`` tensors. Returns one NumPy
    array with the per-batch outputs concatenated along axis 0.
    """
    model.eval()
    model.to(device)
    collected = []
    with torch.no_grad():
        for batch in dataloader:
            ids = batch['input_ids'].to(device, dtype=torch.long)
            mask = batch['attention_mask'].to(device, dtype=torch.long)
            logits = model(ids, mask)
            collected.append(logits.to('cpu').numpy())
    return np.concatenate(collected)

In [None]:
# Model 1: predict the test set with every fold checkpoint listed in CFG1.trn_fold.
testdataset = Dataset(test, CFG1.tokenizer, CFG1.max_len)

test_loader = DataLoader(testdataset,
                         batch_size=CFG1.batch_size,
                         shuffle=False,  # keep row order aligned with `test` / the submission file
                         collate_fn=collate_fn1,
                         num_workers=CFG1.num_workers,
                         pin_memory=True,
                         drop_last=False,
                         )

# One (n_rows, n_classes) probability array per fold.
predictions1 = []

for fold in CFG1.trn_fold:
    model = CustomModel1(CFG1.model)
    checkpoint_path = CFG1.path + f"{CFG1.model.replace('/', '-')}_fold{fold}_best.pth"
    # Load on CPU first; inference_one_epoch moves the model to `device`.
    state = torch.load(checkpoint_path, map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])

    prediction = inference_one_epoch(model, test_loader, device)
    # Logits -> per-row class probabilities. `dim=1` is given explicitly because
    # the implicit-dim form of F.softmax is deprecated.
    prediction = F.softmax(torch.tensor(prediction), dim=1).numpy().astype(float)
    predictions1.append(prediction)

    # Release the model before the next fold is loaded.
    del model, state, prediction
    gc.collect()
    torch.cuda.empty_cache()

del testdataset, test_loader

Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_pre

In [None]:
# Model 1: average the fold probabilities and store one column per class.
sub1 = submission_df.copy()
sub1.columns = ["id", "label"]

# method 1: plain mean over folds
sub1_predictions = np.mean(predictions1, axis=0)
for class_index, class_name in enumerate(
    ['Data scientist', 'Machine learning engineer', 'Software engineer', 'Consultant']
):
    sub1[class_name] = sub1_predictions[:, class_index]

display(sub1)

Unnamed: 0,id,label,Data scientist,Machine learning engineer,Software engineer,Consultant
0,1516,1,0.992574,0.002775,0.000357,0.004294
1,1517,1,0.010402,0.000219,0.002974,0.986406
2,1518,1,0.000332,0.002413,0.993055,0.004200
3,1519,1,0.005065,0.000023,0.000847,0.994065
4,1520,1,0.378666,0.023385,0.586371,0.011578
...,...,...,...,...,...,...
1512,3028,1,0.073787,0.020873,0.859536,0.045804
1513,3029,1,0.993717,0.000814,0.000463,0.005006
1514,3030,1,0.000591,0.003917,0.990050,0.005441
1515,3031,1,0.689201,0.001036,0.002863,0.306900


In [None]:
# Model 3: predict the test set with every fold checkpoint listed in CFG3.trn_fold.
testdataset = Dataset(test, CFG3.tokenizer, CFG3.max_len)

test_loader = DataLoader(testdataset,
                         batch_size=CFG3.batch_size,
                         shuffle=False,  # keep row order aligned with `test` / the submission file
                         collate_fn=collate_fn3,
                         num_workers=CFG3.num_workers,
                         pin_memory=True,
                         drop_last=False,
                         )

# One (n_rows, n_classes) probability array per fold.
predictions3 = []

for fold in CFG3.trn_fold:
    model = CustomModel3(CFG3.model)
    checkpoint_path = CFG3.path + f"{CFG3.model.replace('/', '-')}_fold{fold}_best.pth"
    # Load on CPU first; inference_one_epoch moves the model to `device`.
    state = torch.load(checkpoint_path, map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])

    prediction = inference_one_epoch(model, test_loader, device)
    # Logits -> per-row class probabilities. `dim=1` is given explicitly because
    # the implicit-dim form of F.softmax is deprecated.
    prediction = F.softmax(torch.tensor(prediction), dim=1).numpy().astype(float)
    predictions3.append(prediction)

    # Release the model before the next fold is loaded.
    del model, state, prediction
    gc.collect()
    torch.cuda.empty_cache()

del testdataset, test_loader

In [None]:
# Model 3: average the fold probabilities and store one column per class.
sub3 = submission_df.copy()
sub3.columns = ["id", "label"]

# method 1: plain mean over folds
sub3_predictions = np.mean(predictions3, axis=0)
for class_index, class_name in enumerate(
    ['Data scientist', 'Machine learning engineer', 'Software engineer', 'Consultant']
):
    sub3[class_name] = sub3_predictions[:, class_index]

display(sub3)

Unnamed: 0,id,label,Data scientist,Machine learning engineer,Software engineer,Consultant
0,1516,1,0.999648,0.000217,0.000007,0.000128
1,1517,1,0.006463,0.000472,0.000843,0.992222
2,1518,1,0.000024,0.000025,0.999889,0.000062
3,1519,1,0.000392,0.000003,0.000016,0.999589
4,1520,1,0.182321,0.013379,0.803751,0.000549
...,...,...,...,...,...,...
1512,3028,1,0.000284,0.000179,0.997581,0.001956
1513,3029,1,0.999882,0.000025,0.000004,0.000089
1514,3030,1,0.000073,0.000064,0.999664,0.000199
1515,3031,1,0.471427,0.000030,0.000061,0.528483


In [None]:
# Model 2: predict the test set with the fold checkpoints listed in CFG2.trn_fold.
testdataset = Dataset(test, CFG2.tokenizer, CFG2.max_len)

test_loader = DataLoader(testdataset,
                         batch_size=CFG2.batch_size,
                         shuffle=False,  # keep row order aligned with `test` / the submission file
                         collate_fn=collate_fn2,
                         num_workers=CFG2.num_workers,
                         pin_memory=True,
                         drop_last=False,
                         )

# One (n_rows, n_classes) probability array per fold.
predictions2 = []

for fold in CFG2.trn_fold:
    # Model 2 reuses the CustomModel1 architecture; only the checkpoint directory differs.
    model = CustomModel1(CFG2.model)
    checkpoint_path = CFG2.path + f"{CFG2.model.replace('/', '-')}_fold{fold}_best.pth"
    # Load on CPU first; inference_one_epoch moves the model to `device`.
    state = torch.load(checkpoint_path, map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])

    prediction = inference_one_epoch(model, test_loader, device)
    # Logits -> per-row class probabilities. `dim=1` is given explicitly because
    # the implicit-dim form of F.softmax is deprecated.
    prediction = F.softmax(torch.tensor(prediction), dim=1).numpy().astype(float)
    predictions2.append(prediction)

    # Release the model before the next fold is loaded.
    del model, state, prediction
    gc.collect()
    torch.cuda.empty_cache()

del testdataset, test_loader

Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_pre

In [None]:
# Model 2: average the fold probabilities and store one column per class.
sub2 = submission_df.copy()
sub2.columns = ["id", "label"]

sub2_predictions = np.mean(predictions2, axis=0)
for class_index, class_name in enumerate(
    ['Data scientist', 'Machine learning engineer', 'Software engineer', 'Consultant']
):
    sub2[class_name] = sub2_predictions[:, class_index]

display(sub2)

Unnamed: 0,id,label,Data scientist,Machine learning engineer,Software engineer,Consultant
0,1516,1,0.999189,0.000682,0.000005,0.000123
1,1517,1,0.000160,0.000034,0.000116,0.999691
2,1518,1,0.000007,0.000029,0.999896,0.000069
3,1519,1,0.000102,0.000001,0.000012,0.999885
4,1520,1,0.288593,0.010193,0.700885,0.000328
...,...,...,...,...,...,...
1512,3028,1,0.000798,0.000837,0.997069,0.001296
1513,3029,1,0.999820,0.000038,0.000007,0.000135
1514,3030,1,0.000095,0.000114,0.997977,0.001814
1515,3031,1,0.550952,0.000034,0.000028,0.448986


In [None]:
# Equal-weight ensemble: average the three models' class probabilities and
# take the most probable class as the label.
sub = submission_df.copy()
sub.columns = ["id", "label"]

ensemble_columns = ['Data scientist', 'Machine learning engineer', 'Software engineer', 'Consultant']
for class_name in ensemble_columns:
    sub[class_name] = (sub1[class_name] + sub2[class_name] + sub3[class_name]) / 3

# argmax yields 0-based class positions; submitted labels are 1-based.
sub["label"] = np.argmax(sub[ensemble_columns].values, axis=1)
sub["label"] = sub["label"].astype("int")
sub["label"] = sub["label"] + 1
display(sub)

Unnamed: 0,id,label,Data scientist,Machine learning engineer,Software engineer,Consultant
0,1516,1,0.997137,0.001225,0.000123,0.001515
1,1517,4,0.005675,0.000242,0.001311,0.992773
2,1518,3,0.000121,0.000822,0.997613,0.001444
3,1519,4,0.001853,0.000009,0.000292,0.997846
4,1520,3,0.283193,0.015652,0.697002,0.004152
...,...,...,...,...,...,...
1512,3028,3,0.024956,0.007296,0.951395,0.016352
1513,3029,1,0.997806,0.000292,0.000158,0.001743
1514,3030,3,0.000253,0.001365,0.995897,0.002485
1515,3031,1,0.570527,0.000367,0.000984,0.428123


In [None]:
# Weighted ensemble: blend the three models' class probabilities with weights
# 0.35 / 0.25 / 0.40 (model 1 / model 2 / model 3) and label by arg-max.
submit = submission_df.copy()
submit.columns = ["id", "label"]

weighted_columns = ['Data scientist', 'Machine learning engineer', 'Software engineer', 'Consultant']
for class_name in weighted_columns:
    submit[class_name] = sub1[class_name]*.35 + sub2[class_name]*.25 + sub3[class_name]*.4

# Fix: take the arg-max over the weighted probabilities in `submit`. The
# previous code read them from `sub` (the equal-weight ensemble), so the
# weights above never influenced the submitted labels.
submit["label"] = np.argmax(submit[weighted_columns].values, axis=1)

# argmax yields 0-based class positions; submitted labels are 1-based.
submit["label"] = submit["label"].astype("int")
submit["label"] = submit["label"] + 1
display(submit)

Unnamed: 0,id,label,Data scientist,Machine learning engineer,Software engineer,Consultant
0,1516,1,0.997057,0.001229,0.000129,0.001585
1,1517,4,0.006266,0.000274,0.001407,0.992053
2,1518,3,0.000128,0.000862,0.997499,0.001512
3,1519,4,0.001955,0.000009,0.000306,0.997730
4,1520,3,0.277610,0.016085,0.701952,0.004354
...,...,...,...,...,...,...
1512,3028,3,0.026139,0.007586,0.949137,0.017138
1513,3029,1,0.997709,0.000304,0.000165,0.001821
1514,3030,3,0.000260,0.001425,0.995877,0.002437
1515,3031,1,0.567529,0.000383,0.001033,0.431054


In [None]:
#train,testでダブっているデータを参照
dup_test_ids = [1707,2122,2291,2775,2191,1700,2304,2149,2676,2844,2144,2764,1774,2446,2736,2301,1822,1852,2070,1609,2423,2695,
                2077,2409,2233,2076,1568,3001,1662,1997, 2896,2352,2842,2321,1630,2259,2968,1551, 1673, 2168]
sub[sub["id"].isin(dup_test_ids)]  

Unnamed: 0,id,label,Data scientist,Machine learning engineer,Software engineer,Consultant
35,1551,4,0.218886,0.000511,0.002262,0.778341
52,1568,1,0.998109,0.000617,0.000111,0.001163
93,1609,4,0.14823,5.9e-05,0.000348,0.851363
114,1630,4,0.088461,2.2e-05,0.000342,0.911174
146,1662,1,0.993967,0.000601,0.001223,0.004208
157,1673,4,0.218886,0.000511,0.002262,0.778341
184,1700,1,0.981522,0.002874,0.004834,0.010769
191,1707,3,0.002075,0.004471,0.977834,0.01562
258,1774,4,0.003284,1.8e-05,0.000435,0.996263
306,1822,4,0.00098,6e-06,0.000117,0.998896


In [None]:
# The same duplicated ids, taken from the weighted-ensemble frame.
submit[submit["id"].isin(dup_test_ids)] 

Unnamed: 0,id,label,Data scientist,Machine learning engineer,Software engineer,Consultant
35,1551,4,0.206612,0.00054,0.00237,0.790478
52,1568,1,0.998014,0.000646,0.000117,0.001223
93,1609,4,0.161927,6.2e-05,0.000364,0.837648
114,1630,4,0.069283,2.3e-05,0.000359,0.930334
146,1662,1,0.994439,0.000601,0.001012,0.003948
157,1673,4,0.206612,0.00054,0.00237,0.790478
184,1700,1,0.980183,0.003015,0.005076,0.011725
191,1707,3,0.001712,0.004019,0.980326,0.013942
258,1774,4,0.003447,1.9e-05,0.000457,0.996078
306,1822,4,0.001029,7e-06,0.000123,0.998841


In [3]:
import pandas as pd

# Reference table for the duplicated test ids: each id paired with a label
# (1-4), sorted by id. The original row positions are kept as the index.
duplicate_ids = [1707,2122,2291,2775,2191,1700,2304,2149,2676,2844,2144,2764,1774,2446,2736,2301,1822,1852,2070,1609,2423,2695,
                 2077,2409,2233,2076,1568,3001,1662,1997, 2896,2352,2842,2321,1630,2259,2968,1551, 1673, 2168]
duplicate_labels = [3,4,1,4,1,1,1,1,3,1,3,4,4,1,1,1,4,4,3,4,4,4,3,4,4,1,1,1,1,3,3,1,4,4,4,1,4,4,4,4]
T = pd.DataFrame({"id": duplicate_ids, "label": duplicate_labels}).sort_values(by="id")
T

Unnamed: 0,id,label
37,1551,4
26,1568,1
19,1609,4
34,1630,4
28,1662,1
38,1673,4
5,1700,1
0,1707,3
12,1774,4
16,1822,4


In [4]:
T.shape

(40, 2)

In [None]:
#sub[["id","label"]].to_csv(os.path.join(OUTPUT_SUB_DIR,"submission33 ensemble[12+30+31].csv"),index=False,header=False)
# Write the weighted-ensemble labels as a two-column CSV (id,label) with no
# header and no index, matching the header-less sample submission.
submit[["id","label"]].to_csv(os.path.join(OUTPUT_SUB_DIR,"submission33ensemble[12+30+31].csv"),index=False,header=False)

In [None]:
# Write the same id/label frame again under the final-submission file name.
submit[["id","label"]].to_csv(os.path.join(OUTPUT_SUB_DIR,"final_submission[12+30+31].csv"),index=False,header=False)