In [1]:
# Colab only: mount Google Drive at /content/drive; the input/output paths
# configured further down all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# %pip (rather than !pip) installs into the environment of the running kernel.
# TODO(review): pin the transformers version used for training so that the
# saved checkpoints keep loading identically.
%pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
# %pip (rather than !pip) installs into the environment of the running kernel.
# TODO(review): pin the sentencepiece version for reproducible tokenization.
%pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
import os
import gc
import math
import time
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')
from tqdm import tqdm
import re

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import StratifiedKFold,StratifiedGroupKFold,GroupKFold
from sklearn.metrics import log_loss,f1_score

from transformers import AutoModel, AutoConfig, AutoTokenizer, AdamW, DataCollatorWithPadding
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
# Input/output locations on the mounted Google Drive. The directory constants
# end with '/' because later cells extend them by plain string concatenation.
INPUT_DIR = '/content/drive/MyDrive/Competitions/Signate/Student Cup 2022/input/'
OUTPUT_DIR = '/content/drive/MyDrive/Competitions/Signate/Student Cup 2022/output/'

# Folder that receives the submission CSV files.
OUTPUT_SUB_DIR = OUTPUT_DIR + 'Submission'

# One folder per ensemble member; each holds that model's fold checkpoints
# and its out-of-fold prediction pickle.
OUTPUT_MODEL_DIR1 = OUTPUT_DIR + 'Model/DeBERTa-large/'
OUTPUT_MODEL_DIR2 = OUTPUT_DIR + 'Model/DeBERTa-base[ver2]/'
OUTPUT_MODEL_DIR3 = OUTPUT_DIR + 'Model/RoBERTa-large/'

In [6]:
class CFG1:
    """Settings for ensemble member 1 (microsoft/deberta-large)."""

    # --- backbone and saved artefacts ---
    model = "microsoft/deberta-large"
    path = OUTPUT_MODEL_DIR1                        # folder holding the fold checkpoints
    config_path = OUTPUT_MODEL_DIR1 + 'config.pth'

    # --- tokenization / data loading ---
    max_len = 1024       # truncation length passed to the tokenizer
    batch_size = 32
    num_workers = 2

    # --- model head and backbone options ---
    target_size = 4      # output width of the classification head
    dropout = 0.2
    gradient_checkpointing = True
    freezing = True      # freeze the embeddings and the first two encoder layers

    # --- cross-validation bookkeeping ---
    seed = 42
    n_fold = 5
    trn_fold = [0, 1, 2, 3, 4]   # folds whose checkpoints are loaded at inference

In [7]:
class CFG2:
    """Settings for ensemble member 2 (microsoft/deberta-v3-base)."""

    # --- backbone and saved artefacts ---
    model = "microsoft/deberta-v3-base"
    path = OUTPUT_MODEL_DIR2                        # folder holding the fold checkpoints
    config_path = OUTPUT_MODEL_DIR2 + 'config.pth'

    # --- tokenization / data loading ---
    max_len = 1024       # truncation length passed to the tokenizer
    batch_size = 16
    num_workers = 2

    # --- model head and backbone options ---
    target_size = 4      # output width of the classification head
    dropout = 0.1        # applied to the pooled embedding before the head
    gradient_checkpointing = True
    freezing = True      # freeze the embeddings and the first two encoder layers

    # --- cross-validation bookkeeping ---
    seed = 42
    n_fold = 5
    trn_fold = [0, 1, 2, 3, 4]   # folds whose checkpoints are loaded at inference

In [8]:
class CFG3:
    """Settings for ensemble member 3 (roberta-large)."""

    # --- backbone and saved artefacts ---
    model = "roberta-large"
    path = OUTPUT_MODEL_DIR3                        # folder holding the fold checkpoints
    config_path = OUTPUT_MODEL_DIR3 + 'config.pth'

    # --- tokenization / data loading ---
    max_len = 128        # note: much shorter than the 1024 used by CFG1/CFG2
    batch_size = 32
    num_workers = 2

    # --- model head and backbone options ---
    target_size = 4      # output width of the classification head
    dropout = 0.2        # not read by CustomModel3 in the cells shown here
    gradient_checkpointing = True
    freezing = True      # freeze the embeddings and the first two encoder layers

    # --- cross-validation bookkeeping ---
    seed = 42
    n_fold = 5
    trn_fold = [0, 1, 2, 3, 4]   # folds whose checkpoints are loaded at inference

In [9]:
# Loss Func
def criterion(outputs, labels):
    """Return the cross-entropy loss of `outputs` (logits) against `labels`.

    A default-configured ``nn.CrossEntropyLoss`` is created on every call.
    """
    loss_fn = nn.CrossEntropyLoss()
    return loss_fn(outputs, labels)

def softmax(z):
    """Row-wise softmax of a 2-D NumPy array.

    The row maximum is subtracted before exponentiating so that large
    inputs do not overflow.
    """
    assert len(z.shape) == 2
    # keepdims=True keeps a trailing axis of size 1 so the result broadcasts per row
    row_max = np.max(z, axis=1, keepdims=True)
    exps = np.exp(z - row_max)
    return exps / np.sum(exps, axis=1, keepdims=True)
"""
def get_score(y_true, y_pred):
    y_pred = softmax(y_pred)
    score = log_loss(y_true, y_pred)
    return round(score, 5)
"""
def get_score(outputs, labels):
    """Macro-averaged F1 of the arg-max class prediction.

    Args:
        outputs: 2-D array-like of per-class scores, one row per sample.
        labels: ground-truth class indices, one per row of `outputs`.

    Returns:
        The macro F1 score as computed by scikit-learn.
    """
    # Normalise over the class axis explicitly; calling F.softmax without
    # `dim` relies on a deprecated implicit choice.
    probabilities = F.softmax(torch.tensor(outputs), dim=1).numpy()
    predicted = np.argmax(probabilities, axis=1)
    # scikit-learn's signature is f1_score(y_true, y_pred).
    return f1_score(labels, predicted, average='macro')

def get_logger(filename=OUTPUT_DIR+'train'):
    """Return the module logger, writing to the console and to ``<filename>.log``.

    The logger is looked up by ``__name__``, so repeated calls (for example
    re-running this notebook cell) return the same object. Handlers are only
    attached when an equivalent one is not already present; otherwise every
    message would be emitted once per call made so far.

    Args:
        filename: path of the log file without the ``.log`` extension.

    Returns:
        The configured ``logging.Logger`` at INFO level.
    """
    from logging import getLogger, INFO, FileHandler, Formatter, StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)

    # FileHandler stores the absolute path of its target in `baseFilename`.
    log_path = os.path.abspath(f"{filename}.log")
    # FileHandler subclasses StreamHandler, hence the exact type check here.
    has_console = any(type(h) is StreamHandler for h in logger.handlers)
    has_file = any(
        isinstance(h, FileHandler) and h.baseFilename == log_path
        for h in logger.handlers
    )

    if not has_console:
        handler1 = StreamHandler()
        handler1.setFormatter(Formatter("%(message)s"))
        logger.addHandler(handler1)
    if not has_file:
        handler2 = FileHandler(filename=f"{filename}.log")
        handler2.setFormatter(Formatter("%(message)s"))
        logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=CFG1.seed):
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) random generators.

    Also exports PYTHONHASHSEED and asks cuDNN for deterministic algorithms.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)
    torch.backends.cudnn.deterministic = True

seed_everything(seed=42)

In [10]:
# Out-of-fold (OOF) predictions saved by the three training runs.
oof_df1 = pd.read_pickle(OUTPUT_MODEL_DIR1+'oof_df.pkl')
oof_df2 = pd.read_pickle(OUTPUT_MODEL_DIR2+'oof_df.pkl')
oof_df3 = pd.read_pickle(OUTPUT_MODEL_DIR3+'oof_df.pkl')
labels = oof_df1['label'].values
labels2 = oof_df2['label'].values

class_columns = ['Data scientist','Machine learning engineer','Software engineer','Consultant']
preds1 = oof_df1[class_columns]
preds3 = oof_df3[class_columns]
# Model 2 is scored against its own labels with its own column order
# (presumably it was trained with a different class-index encoding -- confirm).
preds2 = oof_df2[['Consultant','Data scientist','Machine learning engineer','Software engineer']]

# Equal-weight blend, one class column at a time. pandas aligns the three
# frames on their index when adding.
# NOTE(review): this assumes equal index values denote the same sample in all
# three OOF frames -- confirm against the training notebooks.
oof_df = pd.DataFrame()
for class_name in class_columns:
    oof_df[class_name] = (preds1[class_name] + preds2[class_name] + preds3[class_name]) / 3
preds = oof_df[class_columns].values.tolist()

score1 = get_score(preds1.values, labels)
score2 = get_score(preds2.values, labels2)
score3 = get_score(preds3.values, labels)
mean_score = (score1+score2+score3)/3
score = get_score(preds, labels)

for message in (
    f'Deberta-large CV Score: {score1:<.4f}',
    f'Deberta-v3-baseCV Score: {score2:<.4f}',
    f'Roberta-large CV Score: {score3:<.4f}',
    f'CV Mean Score: {mean_score:<.4f}',
    f'CV Score: {score:<.4f}',
):
    LOGGER.info(message)

Deberta-large CV Score: 0.7455
Deberta-v3-baseCV Score: 0.7327
Roberta-large CV Score: 0.7385
CV Mean Score: 0.7389
CV Score: 0.7473


In [11]:
def freeze(module):
    """
    Turns off gradient tracking for every parameter of the given module.
    """
    for param in module.parameters():
        param.requires_grad_(False)
        
def get_freezed_parameters(module):
    """
    Lists, in ``named_parameters()`` order, the names of the module's
    parameters that do not require gradients.
    """
    return [
        name
        for name, parameter in module.named_parameters()
        if not parameter.requires_grad
    ]

def set_embedding_parameters_bits(embeddings_path, optim_bits=32):
    """
    Registers an ``optim_bits`` override for the ``weight`` of each of the
    word/position/token_type embedding sub-modules present on `embeddings_path`.

    https://github.com/huggingface/transformers/issues/14819#issuecomment-1003427930

    NOTE(review): `bnb` is not imported anywhere in the visible notebook, so
    this raises NameError as soon as a matching attribute is found -- confirm
    whether the import was dropped or this helper is unused.
    """
    for embedding_type in ("word", "position", "token_type"):
        attr_name = f"{embedding_type}_embeddings"
        if not hasattr(embeddings_path, attr_name):
            continue
        embedding_module = getattr(embeddings_path, attr_name)
        manager = bnb.optim.GlobalOptimManager.get_instance()
        manager.register_module_override(
            embedding_module, 'weight', {'optim_bits': optim_bits}
        )

In [12]:
# Test set to predict on, and the headerless sample-submission template.
test_csv_path = os.path.join(INPUT_DIR, 'test.csv')
sample_submission_path = os.path.join(INPUT_DIR, 'submit_sample.csv')
test = pd.read_csv(test_csv_path)
submission_df = pd.read_csv(sample_submission_path, header=None)

def remove_tag(x):
    """Return `x` with every ``<...>`` span (shortest match) deleted."""
    return re.sub(r"<[^>]*?>", '', x)

def cleaning(texts):
    """Return a list holding each element of `texts` with its HTML tags removed."""
    # Only HTML tags are stripped. The former follow-up step that replaced
    # every non-alphabetic character with a space is disabled.
    return [remove_tag(text) for text in texts]



from text_unidecode import unidecode
from typing import Dict, List, Tuple
import codecs

def replace_encoding_with_utf8(error: UnicodeError) -> Tuple[bytes, int]:
    """Encode-error handler: emit the failing characters as UTF-8 bytes and resume right after them."""
    failed_text = error.object[error.start : error.end]
    return failed_text.encode("utf-8"), error.end


def replace_decoding_with_cp1252(error: UnicodeError) -> Tuple[str, int]:
    """Decode-error handler: decode the failing bytes as cp1252 and resume right after them."""
    failed_bytes = error.object[error.start : error.end]
    return failed_bytes.decode("cp1252"), error.end

# Make both handlers available by name through the `errors=` argument of
# str.encode / bytes.decode.
codecs.register_error("replace_encoding_with_utf8", replace_encoding_with_utf8)
codecs.register_error("replace_decoding_with_cp1252", replace_decoding_with_cp1252)

def resolve_encodings_and_normalize(text: str) -> str:
    """Repair mixed UTF-8/cp1252 encoding damage in `text`, then transliterate it with unidecode.

    Relies on the "replace_decoding_with_cp1252" and
    "replace_encoding_with_utf8" codec error handlers being registered.
    """
    raw_bytes = text.encode("raw_unicode_escape")
    decoded_once = raw_bytes.decode("utf-8", errors="replace_decoding_with_cp1252")
    cp1252_bytes = decoded_once.encode("cp1252", errors="replace_encoding_with_utf8")
    repaired = cp1252_bytes.decode("utf-8", errors="replace_decoding_with_cp1252")
    return unidecode(repaired)


# Strip HTML from the descriptions, then derive the model input text by
# repairing encodings and normalizing characters.
test['description'] = cleaning(test['description'])
test['inputs'] = test['description'].apply(resolve_encodings_and_normalize)
display(test.head())

Unnamed: 0,id,description,inputs
0,1516,Building decision-making models and proposing ...,Building decision-making models and proposing ...
1,1517,Educate homeowners on the benefits of solar en...,Educate homeowners on the benefits of solar en...
2,1518,"Design, develop, document, and implement web a...","Design, develop, document, and implement web a..."
3,1519,Apply advanced technical expertise and skills ...,Apply advanced technical expertise and skills ...
4,1520,Project manage and deliver against our roadmap...,Project manage and deliver against our roadmap...


In [13]:
# One tokenizer per backbone, kept both as a module-level name and on the
# matching config class.
tokenizer1, tokenizer2, tokenizer3 = [
    AutoTokenizer.from_pretrained(cfg.model) for cfg in (CFG1, CFG2, CFG3)
]
CFG1.tokenizer, CFG2.tokenizer, CFG3.tokenizer = tokenizer1, tokenizer2, tokenizer3

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [14]:
class Dataset1(Dataset):
    """Inference dataset for model 1: tokenizes the `inputs` column on access.

    Args:
        df: DataFrame with an `inputs` text column.
        tokenizer: tokenizer used for every sample; ``CFG1.tokenizer`` when None.
        max_length: truncation length; ``CFG1.max_len`` when None.

    Items are dicts of un-padded token lists (`input_ids`, `attention_mask`
    and, when the tokenizer returns them, `token_type_ids`); no label is
    included.
    """

    def __init__(self, df, tokenizer, max_length):
        self.df = df
        self.text = df['inputs'].values
        # Use the constructor arguments. They were previously ignored in
        # favour of module-level globals, so a caller passing a different
        # tokenizer or length silently got the CFG1 values.
        self.tokenizer = CFG1.tokenizer if tokenizer is None else tokenizer
        self.max_len = CFG1.max_len if max_length is None else max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        inputs = self.tokenizer.encode_plus(
            self.text[index],
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len
        )
        samples = {
            'input_ids': inputs['input_ids'],
            'attention_mask': inputs['attention_mask'],
        }

        if 'token_type_ids' in inputs:
            samples['token_type_ids'] = inputs['token_type_ids']

        return samples

In [15]:
class Dataset2(Dataset):
    """Inference dataset for model 2: tokenizes the `inputs` column on access.

    Args:
        df: DataFrame with an `inputs` text column.
        tokenizer: tokenizer used for every sample; ``CFG2.tokenizer`` when None.
        max_length: truncation length; ``CFG2.max_len`` when None.

    Items are dicts of un-padded token lists (`input_ids`, `attention_mask`
    and, when the tokenizer returns them, `token_type_ids`); no label is
    included.
    """

    def __init__(self, df, tokenizer, max_length):
        self.df = df
        self.text = df['inputs'].values
        # Use the constructor arguments. They were previously ignored in
        # favour of module-level globals, so a caller passing a different
        # tokenizer or length silently got the CFG2 values.
        self.tokenizer = CFG2.tokenizer if tokenizer is None else tokenizer
        self.max_len = CFG2.max_len if max_length is None else max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        inputs = self.tokenizer.encode_plus(
            self.text[index],
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len
        )
        samples = {
            'input_ids': inputs['input_ids'],
            'attention_mask': inputs['attention_mask'],
        }

        if 'token_type_ids' in inputs:
            samples['token_type_ids'] = inputs['token_type_ids']

        return samples

In [16]:
class Dataset3(Dataset):
    """Inference dataset for model 3: tokenizes the `inputs` column on access.

    Args:
        df: DataFrame with an `inputs` text column.
        tokenizer: tokenizer used for every sample; ``CFG3.tokenizer`` when None.
        max_length: truncation length; ``CFG3.max_len`` when None.

    Items are dicts of un-padded token lists (`input_ids`, `attention_mask`
    and, when the tokenizer returns them, `token_type_ids`); no label is
    included.
    """

    def __init__(self, df, tokenizer, max_length):
        self.df = df
        self.text = df['inputs'].values
        # Use the constructor arguments. They were previously ignored in
        # favour of module-level globals, so a caller passing a different
        # tokenizer or length silently got the CFG3 values.
        self.tokenizer = CFG3.tokenizer if tokenizer is None else tokenizer
        self.max_len = CFG3.max_len if max_length is None else max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        inputs = self.tokenizer.encode_plus(
            self.text[index],
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len
        )
        samples = {
            'input_ids': inputs['input_ids'],
            'attention_mask': inputs['attention_mask'],
        }

        if 'token_type_ids' in inputs:
            samples['token_type_ids'] = inputs['token_type_ids']

        return samples

In [17]:
class Collate:
    """Collate function that pads each batch to its own longest sequence.

    Padding goes on the side given by ``tokenizer.padding_side``: ids are
    filled with ``tokenizer.pad_token_id`` and the attention mask with 0.
    With ``isTrain=True`` every sample must also carry a ``"target"`` entry,
    which is returned as a long tensor.
    """

    def __init__(self, tokenizer, isTrain=True):
        self.tokenizer = tokenizer
        self.isTrain = isTrain

    def _pad(self, sequence, target_len, fill_value):
        """Extend `sequence` to `target_len` with `fill_value` on the tokenizer's padding side."""
        filler = (target_len - len(sequence)) * [fill_value]
        if self.tokenizer.padding_side == "right":
            return sequence + filler
        return filler + sequence

    def __call__(self, batch):
        ids_list = [sample["input_ids"] for sample in batch]
        mask_list = [sample["attention_mask"] for sample in batch]
        target_list = [sample["target"] for sample in batch] if self.isTrain else None

        # longest token sequence in this batch
        batch_max = max([len(ids) for ids in ids_list])

        pad_id = self.tokenizer.pad_token_id
        output = {
            "input_ids": torch.tensor(
                [self._pad(ids, batch_max, pad_id) for ids in ids_list], dtype=torch.long
            ),
            "attention_mask": torch.tensor(
                [self._pad(mask, batch_max, 0) for mask in mask_list], dtype=torch.long
            ),
        }
        if self.isTrain:
            output["target"] = torch.tensor(target_list, dtype=torch.long)

        return output
    
# Inference-mode collators (no "target" expected), one per tokenizer.
collate_fn1, collate_fn2, collate_fn3 = [
    Collate(cfg.tokenizer, isTrain=False) for cfg in (CFG1, CFG2, CFG3)
]

In [18]:
class MeanPooling(nn.Module):
    """Attention-mask-weighted mean of the hidden states over dimension 1."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        # Broadcast the mask to the shape of the hidden states.
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        masked_sum = (last_hidden_state * mask).sum(1)
        # Clamp the count so a fully masked row divides by 1e-9, not by zero.
        token_count = mask.sum(1).clamp(min=1e-9)
        return masked_sum / token_count

In [19]:
class CustomModel1(nn.Module):
    """Backbone plus linear head, configured through CFG1.

    The head is applied to the position-0 vector of the backbone's first
    output under five dropout rates (0.1 to 0.5); the five logit tensors
    are averaged.
    """

    def __init__(self, model_name):
        super().__init__()
        # Pretrained backbone
        self.model = AutoModel.from_pretrained(model_name)

        if CFG1.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        if CFG1.freezing:
            # Freeze the embeddings and the first 2 encoder layers.
            for frozen_part in (self.model.embeddings, self.model.encoder.layer[:2]):
                freeze(frozen_part)
            # Lazy iterator over the backbone parameters that remain trainable.
            CFG1.after_freezed_parameters = filter(
                lambda parameter: parameter.requires_grad,
                self.model.parameters(),
            )

        self.config = AutoConfig.from_pretrained(model_name)
        self.drop = nn.Dropout(p=CFG1.dropout)  # not used in forward()
        self.dropout1 = nn.Dropout(0.1)
        self.dropout2 = nn.Dropout(0.2)
        self.dropout3 = nn.Dropout(0.3)
        self.dropout4 = nn.Dropout(0.4)
        self.dropout5 = nn.Dropout(0.5)
        self.output = nn.Sequential(nn.Linear(self.config.hidden_size, CFG1.target_size))

    def _init_weights(self, module):
        """Initialise Linear / Embedding / LayerNorm modules; not invoked within this class."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
            return
        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
            return
        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, ids, mask):
        backbone_out = self.model(
            input_ids=ids,
            attention_mask=mask,
            output_hidden_states=False,
        )
        first_position = backbone_out[0][:, 0, :]
        dropouts = (self.dropout1, self.dropout2, self.dropout3, self.dropout4, self.dropout5)
        per_rate_logits = [self.output(dropout(first_position)) for dropout in dropouts]
        # Left-to-right sum of the five logit tensors, then their mean.
        return sum(per_rate_logits[1:], per_rate_logits[0]) / 5

In [20]:
class CustomModel2(nn.Module):
    """Backbone plus linear head, configured through CFG2.

    The backbone's last hidden state is mean-pooled with the attention mask,
    passed through dropout and then through the linear head.
    """

    def __init__(self, model_name):
        super().__init__()
        # Pretrained backbone
        self.model = AutoModel.from_pretrained(model_name)

        if CFG2.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        if CFG2.freezing:
            # Freeze the embeddings and the first 2 encoder layers.
            for frozen_part in (self.model.embeddings, self.model.encoder.layer[:2]):
                freeze(frozen_part)
            # Lazy iterator over the backbone parameters that remain trainable.
            CFG2.after_freezed_parameters = filter(
                lambda parameter: parameter.requires_grad,
                self.model.parameters(),
            )

        self.config = AutoConfig.from_pretrained(model_name)
        self.drop = nn.Dropout(p=CFG2.dropout)
        self.pooler = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, CFG2.target_size)

    def forward(self, ids, mask):
        backbone_out = self.model(
            input_ids=ids,
            attention_mask=mask,
            output_hidden_states=False,
        )
        pooled = self.pooler(backbone_out.last_hidden_state, mask)
        return self.fc(self.drop(pooled))

In [21]:
class CustomModel3(nn.Module):
    """Backbone plus linear head, configured through CFG3.

    The head is applied directly (no dropout) to the position-0 vector of
    the backbone's first output.
    """

    def __init__(self, model_name):
        super().__init__()
        # Pretrained backbone
        self.model = AutoModel.from_pretrained(model_name)

        if CFG3.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        if CFG3.freezing:
            # Freeze the embeddings and the first 2 encoder layers.
            for frozen_part in (self.model.embeddings, self.model.encoder.layer[:2]):
                freeze(frozen_part)
            # Lazy iterator over the backbone parameters that remain trainable.
            CFG3.after_freezed_parameters = filter(
                lambda parameter: parameter.requires_grad,
                self.model.parameters(),
            )

        self.config = AutoConfig.from_pretrained(model_name)
        self.fc = nn.Linear(self.config.hidden_size, CFG3.target_size)

    def forward(self, ids, mask):
        backbone_out = self.model(
            input_ids=ids,
            attention_mask=mask,
            output_hidden_states=False,
        )
        first_position = backbone_out[0][:, 0, :]
        return self.fc(first_position)

In [22]:
def inference_one_epoch(model, dataloader, device):
    """Run `model` in eval mode over `dataloader` and return the stacked outputs.

    Each batch must provide 'input_ids' and 'attention_mask' tensors; they
    are moved to `device` as long tensors. The per-batch outputs are
    concatenated along axis 0 into one NumPy array.
    """
    model.eval()
    model.to(device)
    batch_outputs = []
    with torch.no_grad():
        for batch in dataloader:
            ids = batch['input_ids'].to(device, dtype=torch.long)
            mask = batch['attention_mask'].to(device, dtype=torch.long)
            batch_outputs.append(model(ids, mask).to('cpu').numpy())
    return np.concatenate(batch_outputs)

In [23]:
# Test-set inference for model 1: one softmax probability array per fold
# checkpoint is collected in `predictions1`.
testdataset = Dataset1(test, CFG1.tokenizer, CFG1.max_len)

test_loader = DataLoader(testdataset,
                         batch_size=CFG1.batch_size,
                         shuffle=False,
                         collate_fn = collate_fn1,
                         num_workers = CFG1.num_workers,
                         pin_memory = True,
                         drop_last = False,
                         )

predictions1 = []

for fold in CFG1.trn_fold:
    model = CustomModel1(CFG1.model)
    config_path=CFG1.config_path
    # Load on the CPU first; inference_one_epoch moves the model to `device`.
    state = torch.load(CFG1.path+f"{CFG1.model.replace('/', '-')}_fold{fold}_best.pth",
                       map_location = torch.device('cpu'))
    model.load_state_dict(state['model'])

    prediction = inference_one_epoch(model, test_loader, device)
    # Normalise over the class axis explicitly; calling F.softmax without
    # `dim` relies on a deprecated implicit choice.
    prediction = F.softmax(torch.tensor(prediction), dim=1).numpy().astype(float)
    predictions1.append(prediction)
    # Release the model before the next fold is loaded.
    del model, state, prediction
    gc.collect()
    torch.cuda.empty_cache()

del testdataset,test_loader

Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions

In [24]:
sub1 = submission_df.copy()
sub1.columns = ["id","label"]


#method1: average the per-fold probabilities; output column i is stored
# under the i-th class name listed below.
sub1_predictions = np.mean(predictions1, axis=0)
for column_index, class_name in enumerate(
    ['Data scientist', 'Machine learning engineer', 'Software engineer', 'Consultant']
):
    sub1[class_name] = sub1_predictions[:, column_index]

display(sub1)

Unnamed: 0,id,label,Data scientist,Machine learning engineer,Software engineer,Consultant
0,1516,1,0.992574,0.002775,0.000357,0.004294
1,1517,1,0.010402,0.000219,0.002974,0.986406
2,1518,1,0.000332,0.002413,0.993055,0.004200
3,1519,1,0.005065,0.000023,0.000847,0.994065
4,1520,1,0.378666,0.023385,0.586371,0.011578
...,...,...,...,...,...,...
1512,3028,1,0.073787,0.020873,0.859536,0.045804
1513,3029,1,0.993717,0.000814,0.000463,0.005006
1514,3030,1,0.000591,0.003917,0.990050,0.005441
1515,3031,1,0.689201,0.001036,0.002863,0.306900


In [25]:
# Test-set inference for model 3: one softmax probability array per fold
# checkpoint is collected in `predictions3`.
testdataset = Dataset3(test, CFG3.tokenizer, CFG3.max_len)

test_loader = DataLoader(testdataset,
                         batch_size=CFG3.batch_size,
                         shuffle=False,
                         collate_fn = collate_fn3,
                         num_workers = CFG3.num_workers,
                         pin_memory = True,
                         drop_last = False,
                         )

predictions3 = []

for fold in CFG3.trn_fold:
    model = CustomModel3(CFG3.model)
    config_path=CFG3.config_path
    # Load on the CPU first; inference_one_epoch moves the model to `device`.
    state = torch.load(CFG3.path+f"{CFG3.model.replace('/', '-')}_fold{fold}_best.pth",
                       map_location = torch.device('cpu'))
    model.load_state_dict(state['model'])

    prediction = inference_one_epoch(model, test_loader, device)
    # Normalise over the class axis explicitly; calling F.softmax without
    # `dim` relies on a deprecated implicit choice.
    prediction = F.softmax(torch.tensor(prediction), dim=1).numpy().astype(float)
    predictions3.append(prediction)
    # Release the model before the next fold is loaded.
    del model, state, prediction
    gc.collect()
    torch.cuda.empty_cache()

del testdataset,test_loader

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaM

In [26]:
sub3 = submission_df.copy()
sub3.columns = ["id","label"]


#method1: average the per-fold probabilities; output column i is stored
# under the i-th class name listed below.
sub3_predictions = np.mean(predictions3, axis=0)
for column_index, class_name in enumerate(
    ['Data scientist', 'Machine learning engineer', 'Software engineer', 'Consultant']
):
    sub3[class_name] = sub3_predictions[:, column_index]

display(sub3)

Unnamed: 0,id,label,Data scientist,Machine learning engineer,Software engineer,Consultant
0,1516,1,0.996578,0.000880,0.000109,0.002433
1,1517,1,0.000700,0.000229,0.003241,0.995829
2,1518,1,0.000319,0.000522,0.997456,0.001704
3,1519,1,0.003260,0.000050,0.001089,0.995601
4,1520,1,0.180065,0.012080,0.805354,0.002502
...,...,...,...,...,...,...
1512,3028,1,0.006764,0.002124,0.964696,0.026416
1513,3029,1,0.997963,0.000995,0.000116,0.000925
1514,3030,1,0.000547,0.000670,0.996919,0.001864
1515,3031,1,0.548783,0.000077,0.000830,0.450310


In [27]:
# Test-set inference for model 2: one softmax probability array per fold
# checkpoint is collected in `predictions2`.
testdataset = Dataset2(test, CFG2.tokenizer, CFG2.max_len)

test_loader = DataLoader(testdataset,
                         batch_size=CFG2.batch_size,
                         shuffle=False,
                         collate_fn = collate_fn2,
                         num_workers = CFG2.num_workers,
                         pin_memory = True,
                         drop_last = False,
                         )

predictions2 = []

for fold in CFG2.trn_fold:
    model = CustomModel2(CFG2.model)
    config_path=CFG2.config_path
    # Load on the CPU first; inference_one_epoch moves the model to `device`.
    state = torch.load(CFG2.path+f"{CFG2.model.replace('/', '-')}_fold{fold}_best.pth",
                       map_location = torch.device('cpu'))
    model.load_state_dict(state['model'])

    prediction = inference_one_epoch(model, test_loader, device)
    # Normalise over the class axis explicitly; calling F.softmax without
    # `dim` relies on a deprecated implicit choice.
    prediction = F.softmax(torch.tensor(prediction), dim=1).numpy().astype(float)
    predictions2.append(prediction)
    # Release the model before the next fold is loaded.
    del model, state, prediction
    gc.collect()
    torch.cuda.empty_cache()

del testdataset,test_loader

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the m

In [28]:
sub2 = submission_df.copy()
sub2.columns = ["id","label"]


#method1: average the per-fold probabilities. Model 2 uses a different
# output-column order than models 1 and 3, hence the explicit mapping.
sub2_predictions = np.mean(predictions2, axis=0)
for class_name, column_index in (
    ('Consultant', 0),
    ('Machine learning engineer', 2),
    ('Software engineer', 3),
    ('Data scientist', 1),
):
    sub2[class_name] = sub2_predictions[:, column_index]

display(sub2)

Unnamed: 0,id,label,Consultant,Machine learning engineer,Software engineer,Data scientist
0,1516,1,0.002438,0.008911,0.000421,0.988230
1,1517,1,0.955653,0.003524,0.031781,0.009043
2,1518,1,0.003658,0.001237,0.994690,0.000416
3,1519,1,0.997450,0.000164,0.000700,0.001686
4,1520,1,0.010466,0.031057,0.854887,0.103590
...,...,...,...,...,...,...
1512,3028,1,0.108634,0.079283,0.748733,0.063350
1513,3029,1,0.002041,0.003801,0.000191,0.993967
1514,3030,1,0.002691,0.002433,0.994510,0.000366
1515,3031,1,0.853466,0.000740,0.002419,0.143376


In [29]:
sub = submission_df.copy()
sub.columns = ["id","label"]

# Equal-weight average of the three models' class probabilities.
equal_blend_columns = ['Data scientist','Machine learning engineer','Software engineer','Consultant']
for class_name in equal_blend_columns:
    sub[class_name] = (sub1[class_name] + sub2[class_name] + sub3[class_name]) / 3

# arg-max gives a 0-based column position; the written labels are 1-based.
sub["label"] = np.argmax(sub[equal_blend_columns].values,axis=1)
sub["label"] = sub["label"].astype("int")
sub["label"] = sub["label"] + 1

submission_path = os.path.join(OUTPUT_SUB_DIR,"0.7473submission22[2+3+5].csv")
sub[["id","label"]].to_csv(submission_path,index=False,header=False)
display(sub)

Unnamed: 0,id,label,Data scientist,Machine learning engineer,Software engineer,Consultant
0,1516,1,0.992461,0.004188,0.000296,0.003055
1,1517,4,0.006715,0.001324,0.012665,0.979296
2,1518,3,0.000355,0.001391,0.995067,0.003187
3,1519,4,0.003337,0.000079,0.000879,0.995705
4,1520,3,0.220773,0.022174,0.748871,0.008182
...,...,...,...,...,...,...
1512,3028,3,0.047967,0.034093,0.857655,0.060285
1513,3029,1,0.995216,0.001870,0.000257,0.002657
1514,3030,3,0.000501,0.002340,0.993827,0.003332
1515,3031,4,0.460453,0.000618,0.002037,0.536892


In [30]:
submit = submission_df.copy()
submit.columns = ["id","label"]

# Weighted blend of the three models' class probabilities:
# model 1 -> 0.45, model 2 -> 0.30, model 3 -> 0.25.
weighted_blend_columns = ['Data scientist','Machine learning engineer','Software engineer','Consultant']
for class_name in weighted_blend_columns:
    submit[class_name] = sub1[class_name]*.45 + sub2[class_name]*.3 + sub3[class_name]*.25

# Take the arg-max over the weighted probabilities stored in `submit`.
# The labels were previously read from the equal-weight frame, so the
# weights above never influenced the written submission.
submit["label"] = np.argmax(submit[weighted_blend_columns].values,axis=1)

# arg-max gives a 0-based column position; the written labels are 1-based.
submit["label"] = submit["label"].astype("int")
submit["label"] = submit["label"] + 1
submit[["id","label"]].to_csv(os.path.join(OUTPUT_SUB_DIR,"7473submission23[2+3+5].csv"),index=False,header=False)
display(submit)

Unnamed: 0,id,label,Data scientist,Machine learning engineer,Software engineer,Consultant
0,1516,1,0.992272,0.004142,0.000314,0.003272
1,1517,4,0.007569,0.001213,0.011683,0.979536
2,1518,3,0.000354,0.001588,0.994645,0.003413
3,1519,4,0.003600,0.000072,0.000863,0.995464
4,1520,3,0.246493,0.022860,0.721672,0.008976
...,...,...,...,...,...,...
1512,3028,3,0.053900,0.033709,0.852585,0.059806
1513,3029,1,0.994854,0.001755,0.000295,0.003096
1514,3030,3,0.000513,0.002660,0.993106,0.003722
1515,3031,4,0.490349,0.000708,0.002221,0.506722
