ソース：https://github.com/hnoya/StudentCup2020

# 前処理

In [None]:
!pip install -q googletrans==4.0.0-rc1
!pip install -q catboost
!pip install -q transformers
!pip install -q nlp

[K     |████████████████████████████████| 55 kB 2.3 MB/s 
[K     |████████████████████████████████| 1.4 MB 10.6 MB/s 
[K     |████████████████████████████████| 42 kB 1.5 MB/s 
[K     |████████████████████████████████| 65 kB 2.8 MB/s 
[K     |████████████████████████████████| 53 kB 2.1 MB/s 
[?25h  Building wheel for googletrans (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 76.6 MB 1.3 MB/s 
[K     |████████████████████████████████| 4.7 MB 5.1 MB/s 
[K     |████████████████████████████████| 101 kB 11.1 MB/s 
[K     |████████████████████████████████| 596 kB 73.2 MB/s 
[K     |████████████████████████████████| 6.6 MB 56.4 MB/s 
[K     |████████████████████████████████| 1.7 MB 5.1 MB/s 
[K     |████████████████████████████████| 212 kB 74.7 MB/s 
[?25h

In [None]:
import os, gc, sys
import re
import random
import pickle
import csv

import pandas as pd
import numpy as np
from tqdm import tqdm

from googletrans import Translator

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

import lightgbm as lgb
from catboost import CatBoost, Pool

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from transformers import AutoTokenizer, AutoModel, AdamW
import nlp

import matplotlib.pyplot as plt
import seaborn as sns

import time

import warnings
warnings.filterwarnings('ignore')

In [None]:
def seed_everything(seed):
    """
    Seed every random number generator used by this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and configures cuDNN for deterministic runs.

    Parameters
    ----------
    seed: int
        Seed value to fix.
    """
    random.seed(seed)
    # Exported for child processes; the hash seed of the running interpreter
    # was already fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # manual_seed_all seeds every CUDA device, not only the current one, and
    # is a no-op when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may pick different kernels from run to run, so it
    # has to be off for reproducible results.
    torch.backends.cudnn.benchmark = False


In [None]:
def get_googletranslate(params):
    """
    Load the back-translated (round-trip translated) augmentation CSV files.

    The translation loops themselves are disabled: they survive only as bare
    string literals in the body, so no ``translate()`` call is executed here.
    What actually runs is the loading of the four cached CSV files located
    under ``params.BASE_PATH + "data/"``.
    Original author's note: the API is used with a 3-second interval between
    requests, so a full run takes about 8 hours (the disabled code below
    sleeps 1 second per text).
    
    Parameters
    ----------
    params: class(object)
        Parameters instance holding the paths and column names.

    Returns
    -------
    train_fr, test_fr, train_de, test_de: list of list of str
        Rows of each cached CSV exactly as yielded by ``csv.reader``. No row
        is skipped, so a header line (if the file has one) is the first item.
    """
    # train/test and the text lists are only consumed by the disabled
    # translation code below; the reads still fail fast if a file is missing.
    train = pd.read_csv(params.TRAIN_FILE)
    test = pd.read_csv(params.TEST_FILE)
    train_texts = train[params.TEXT_COL].tolist()
    test_texts = test[params.TEXT_COL].tolist()
 
    # The client is still constructed although nothing live uses it.
    tslr = Translator()
    # Disabled: EN -> FR -> EN round trip over the training texts, written to
    # data/train_fr.csv.
    """
    train_texts_en2fr2en = []
    for train_text in tqdm(train_texts):
        fr = tslr.translate(train_text, dest="fr")
        en = tslr.translate(fr.text, dest="en")
        train_texts_en2fr2en.append(en.text)
        time.sleep(1)
    train_fr = pd.DataFrame({
        "id":train["id"].tolist(),
        params.TEXT_COL:train_texts_en2fr2en,
        params.TARGET:train["jobflag"].tolist()
    })
    train_fr.to_csv(params.BASE_PATH+"data/train_fr.csv", index=False)
    """
    #train_fr = pd.load_csv(params.BASE_PATH+"data/train_fr.csv", index=False)
    # Live path: read the cached French round-trip training file as raw rows.
    with open(params.BASE_PATH+"data/train_fr.csv", newline='') as f:
      reader = csv.reader(f)
      train_fr = list(reader)

    # Disabled: EN -> FR -> EN round trip over the test texts; on any error
    # the untranslated text was kept.
    """
    test_texts_en2fr2en = []
    for test_text in tqdm(test_texts):
        try:
          fr = tslr.translate(test_text, dest="fr")
          en = tslr.translate(fr.text, dest="en")
          test_texts_en2fr2en.append(en.text)
          time.sleep(1)
        except:
          test_texts_en2fr2en.append(test_text)

    test_fr = pd.DataFrame({
        "id":test["id"].tolist(),
        params.TEXT_COL:test_texts_en2fr2en,
    })
    """
    #test_fr.to_csv(params.BASE_PATH+"data/test_fr.csv", index=False)
    # Live path: read the cached French round-trip test file as raw rows.
    with open(params.BASE_PATH+"data/test_fr.csv", newline='') as f:
      reader = csv.reader(f)
      test_fr = list(reader)
 
    # Disabled: EN -> DE -> EN round trip over the training texts.
    """
    train_texts_en2de2en = []
    for train_text in tqdm(train_texts):
        de = tslr.translate(train_text, dest="de")
        en = tslr.translate(de.text, dest="en")
        train_texts_en2de2en.append(en.text)
        time.sleep(1)
    train_de = pd.DataFrame({
        "id":train["id"].tolist(),
        params.TEXT_COL:train_texts_en2de2en,
        params.TARGET:train["jobflag"].tolist()
    })
    """
    #train_de.to_csv(params.BASE_PATH+"data/train_de.csv", index=False)
    # Live path: read the cached German round-trip training file as raw rows.
    with open(params.BASE_PATH+"data/train_de.csv", newline='') as f:
      reader = csv.reader(f)
      train_de = list(reader)
 
    # Disabled: EN -> DE -> EN round trip over the test texts.
    """
    test_texts_en2de2en = []
    for test_text in tqdm(test_texts):
        de = tslr.translate(test_text, dest="de")
        en = tslr.translate(de.text, dest="en")
        test_texts_en2de2en.append(en.text)
        time.sleep(1)
    test_de = pd.DataFrame({
        "id":test["id"].tolist(),
        params.TEXT_COL:test_texts_en2de2en,
    })
    """
    #test_de.to_csv(params.BASE_PATH+"data/test_de.csv", index=False)
    # Live path: read the cached German round-trip test file as raw rows.
    with open(params.BASE_PATH+"data/test_de.csv", newline='') as f:
      reader = csv.reader(f)
      test_de = list(reader)

    return train_fr, test_fr, train_de, test_de

In [None]:
def del_space(x):
    """
    Cleaning helper: collapse every run of consecutive spaces into one space.

    Only the space character is affected; tabs and newlines are left alone.

    Parameters:
    -----------
    x: str
        Text to clean.

    Returns:
    -----------
    x: str
        Cleaned text.
    """
    return re.sub(r' {2,}', ' ', x)

In [None]:
def cleaning(texts):
    """from https://signate.jp/competitions/281/tutorials/17
    Text cleaning routine taken from the SIGNATE tutorial.

    For every text: replace each non-ASCII-letter character by a space, drop
    tokens of three characters or fewer, Porter-stem the remaining tokens and
    join them back with single spaces.

    Parameters:
    -----------
    texts: List[str]
        Texts to clean.
    Returns:
    -----------
    clean_texts: List[str]
        Cleaned texts, in the same order as the input.
    """
    stemmer = PorterStemmer()

    def _clean_one(raw):
        letters_only = re.sub(r'[^a-zA-Z]', ' ', raw)
        stems = (stemmer.stem(token)
                 for token in letters_only.split()
                 if len(token) > 3)
        return ' '.join(stems)

    return [_clean_one(raw) for raw in texts]

In [None]:
def feature_extraction_vc(df, bottom_thld=0.0025, upper_thld=0.5):
    """
    Count-based (bag-of-words) text feature extraction.

    A term is kept when its total count over all documents lies strictly
    between ``n_docs * bottom_thld`` and ``n_docs * upper_thld``.

    Parameters:
    -----------
    df: pd.DataFrame
        Data frame holding the text in column ``params.TEXT_COL``.
    bottom_thld, upper_thld: float, float
        Lower and upper bound of the term frequency, expressed as a fraction
        of the number of documents.

    Returns:
    -----------
    voc_df: pd.DataFrame
        One row per document and one ``<term>_voc`` column per kept term.
    """
    vc = CountVectorizer()
    counts = vc.fit_transform(df[params.TEXT_COL])
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back on releases that predate it.
    get_names = getattr(vc, "get_feature_names_out", None) or vc.get_feature_names
    names = np.asarray(get_names())

    # Filter on the sparse matrix so that only the kept columns are densified.
    n_docs = counts.shape[0]
    totals = np.asarray(counts.sum(axis=0)).ravel()
    keep = np.flatnonzero((n_docs * bottom_thld < totals) & (totals < n_docs * upper_thld))

    voc_df = pd.DataFrame(counts[:, keep].toarray(),
                          columns=[str(name) + '_voc' for name in names[keep]])
    return voc_df


In [None]:
def feature_extraction_tfidf(df, bottom_thld=0.9):
    """
    TF-IDF based text feature extraction.

    Only the terms whose column standard deviation is strictly above the
    ``bottom_thld`` quantile of all column standard deviations are kept.

    Parameters:
    -----------
    df: pd.DataFrame or iterable of str
        Data frame holding the text in column ``params.TEXT_COL``, or the
        texts themselves.
    bottom_thld: float
        Quantile (0-1) of the standard deviations used as lower bound.
    Returns:
    -----------
    tfidf_df: pd.DataFrame
        One row per document and one ``<term>_tfidf`` column per kept term.
    """
    # Iterating a DataFrame yields its column labels, so fitting on the frame
    # itself would vectorize the column names instead of the documents.
    texts = df[params.TEXT_COL] if isinstance(df, pd.DataFrame) else df

    tfidf = TfidfVectorizer()
    weights = tfidf.fit_transform(texts)
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back on releases that predate it.
    get_names = getattr(tfidf, "get_feature_names_out", None) or tfidf.get_feature_names
    tfidf_df = pd.DataFrame(weights.toarray(), columns=get_names())

    # One vectorized pass instead of recomputing the std per column.
    col_std = tfidf_df.std()
    thld = np.percentile(col_std.values, bottom_thld*100)
    tfidf_df = tfidf_df.loc[:, col_std > thld]
    tfidf_df = tfidf_df.rename(columns={col: col + '_tfidf' for col in tfidf_df.columns})
    return tfidf_df

In [None]:
def preprocessing_lgb(vc_btm_thld=0.0025, vc_upr_thld=0.5, tfidf_thld=0.9):
    """
    Build the LightGBM feature tables from the raw train/test CSV files.

    Train and test are stacked, the text column is cleaned, count and TF-IDF
    features are extracted over the stacked texts, and the result is split
    back into train and test. The ``id`` and text columns are dropped.

    Parameters:
    ------------
    vc_btm_thld: float
        Lower bound of the term frequency for the count features.
    vc_upr_thld: float
        Upper bound of the term frequency for the count features.
    tfidf_thld: float
        Quantile of the standard deviations used as lower bound for TF-IDF.
    Returns:
    ------------
    train: pd.DataFrame
        Training frame (target column plus features).
    test: pd.DataFrame
        Test frame (features only).
    """
    train = pd.read_csv(params.TRAIN_FILE)
    test = pd.read_csv(params.TEST_FILE)
    # Placeholder target so both frames share the same columns when stacked.
    test[params.TARGET] = -1
    n_train = train.shape[0]

    raw = pd.concat([train, test]).reset_index(drop=True)
    text_df = raw.copy()
    text_df[params.TEXT_COL] = cleaning(text_df[params.TEXT_COL].apply(del_space))

    voc_df = feature_extraction_vc(text_df, vc_btm_thld, vc_upr_thld)
    # Hand over the text column itself: a whole DataFrame would be iterated
    # as column labels by the vectorizer, yielding no usable TF-IDF feature.
    tfidf_df = feature_extraction_tfidf(text_df[params.TEXT_COL], tfidf_thld)

    df = pd.concat([raw, voc_df, tfidf_df], axis=1)
    train = df.iloc[:n_train, :]
    test = df.iloc[n_train:, :]

    del voc_df, tfidf_df, text_df
    gc.collect()

    col = [c for c in train.columns if c not in ['id', params.TEXT_COL]]
    train = train[col]
    test = test[col].drop([params.TARGET], axis=1)
    return train, test

In [None]:
def get_train_data(params):
    """Run main() and return the first element of its result (training data).
    """
    bundles = main(params)
    return bundles[0]

def get_test_data(params):
    """Run main() and return the second element of its result (test data).
    """
    bundles = main(params)
    return bundles[1]

In [None]:
def main(params):
    """Run the preprocessing stage.

    Loads the cached back-translations, builds the LightGBM feature tables
    and writes them to ``data/lgb_train.csv`` / ``data/lgb_test.csv``, then
    re-reads the raw CSV files.

    Returns a pair of tuples: ``(train, train_fr, train_de, lgb_train)`` and
    ``(test, test_fr, test_de, lgb_test)``.
    """
    back_translations = get_googletranslate(params)
    train_fr, test_fr, train_de, test_de = back_translations

    lgb_train, lgb_test = preprocessing_lgb()
    for frame, relative_path in ((lgb_train, "data/lgb_train.csv"),
                                 (lgb_test, "data/lgb_test.csv")):
        frame.to_csv(params.BASE_PATH+relative_path, index=False)

    train = pd.read_csv(params.TRAIN_FILE)
    test = pd.read_csv(params.TEST_FILE)
    train_bundle = (train, train_fr, train_de, lgb_train)
    test_bundle = (test, test_fr, test_de, lgb_test)
    return train_bundle, test_bundle


class Parameters(object):
    """
    Container for the settings of the preprocessing stage.
    """
    def __init__(self):
        self.SEED = 2022
        # Root directory of the project. Pass a str directly when
        # os.getcwd() cannot be used.
        self.BASE_PATH = "/content/drive/MyDrive/Competition/SIGNATE/Datascientist/"
        #self.BASE_PATH = os.getcwd() + '/'

        # Attribute name -> path relative to BASE_PATH.
        data_files = (
            ("TRAIN_FILE", "data/train.csv"),
            ("TEST_FILE", "data/test.csv"),
            ("TRAIN_FILE_FR", "data/train_fr.csv"),
            ("TEST_FILE_FR", "data/test_fr.csv"),
            ("TRAIN_FILE_DE", "data/train_de.csv"),
            ("TEST_FILE_DE", "data/test_de.csv"),
        )
        for attribute, relative_path in data_files:
            setattr(self, attribute, self.BASE_PATH + relative_path)

        self.TEXT_COL = "description"
        self.TARGET = "jobflag"
        self.NUM_CLASS = 4


params = Parameters()

In [None]:
# if __name__ == "__main__":
seed_everything(params.SEED)
# makedirs(exist_ok=True) replaces the listdir + mkdir pair: it does not fail
# when the directory already exists and has no check-then-create race.
os.makedirs(params.BASE_PATH + "models/", exist_ok=True)
main(params)

# 学習

In [None]:
def metric_f1(labels, preds):
    """
    Class-weighted F1 score.

    Computes the per-class F1 scores and returns their dot product with
    ``params.CLASS_WEIGHT``.

    Parameters:
    -----------
    labels: np.array
        Ground-truth labels.
    preds: np.array
        Predicted labels (class ids, not probabilities).
    Returns:
    -----------
    score: F1 score weighted by params.CLASS_WEIGHT.
    """
    per_class_f1 = f1_score(labels, preds, average=None)
    weighted_f1 = per_class_f1 @ params.CLASS_WEIGHT
    return weighted_f1

In [None]:
def metric_f1_lgb(preds, data):
    """
    Class-weighted F1 as a custom LightGBM evaluation function (``feval``).

    Parameters:
    -----------
    preds: np.array
        Raw multiclass predictions. Accepted either flattened class-major
        (all rows of class 0, then class 1, ...) or as a 2-D array of shape
        (n_samples, n_classes).
    data: lightgbm.Dataset
        Dataset the predictions belong to; provides the true labels.

    Returns:
    -----------
    "metric_f1": str
        Name of the metric.
    score: float
        Metric value.
    True: bool
        Whether a higher value is better (it would be False for a loss).
    """
    y_true = data.get_label()
    preds = np.asarray(preds)
    if preds.ndim == 2:
        # LightGBM >= 4.0 passes (n_samples, n_classes) for multiclass.
        y_pred = np.argmax(preds, axis=1)
    else:
        # Older releases pass a flat array grouped by class first.
        preds = preds.reshape(params.NUM_CLASS, len(preds) // params.NUM_CLASS)
        y_pred = np.argmax(preds, axis=0)
    score = f1_score(y_true, y_pred, average=None) @ params.CLASS_WEIGHT
    return "metric_f1", score, True

In [None]:
def make_weight(x):
    """
    Map a label to its sample weight for LightGBM.

    Parameters:
    -----------
    x: int
        Label id, used as index into ``params.CLASS_WEIGHT``.

    Returns:
    -----------
    params.CLASS_WEIGHT[x]: float
        Weight of the given label.
    """
    class_weights = params.CLASS_WEIGHT
    return class_weights[x]

In [None]:
def make_folded_df(csv_file, num_splits=4):
    """
    Read a CSV file and assign a stratified fold number to every row.

    The target column is shifted by -1 and renamed to ``labels``, which is
    the column the stratification is based on.

    Parameters:
    -----------
    csv_file: str
        Path of the CSV file.
    num_splits: int
        Number of folds.

    Returns:
    -----------
    df: pd.DataFrame
        Frame with a ``labels`` column and a ``fold`` column holding the
        index of the fold in which the row is used for validation.
    """
    df = pd.read_csv(csv_file)
    df[params.TARGET] = df[params.TARGET] - 1
    df["fold"] = -1
    df = df.rename(columns={params.TARGET: 'labels'})
    label = df["labels"].tolist()
    fold_col = df.columns.get_loc("fold")

    skfold = StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=params.SEED)
    for fold, (_, valid_index) in enumerate(skfold.split(range(len(label)), label)):
        # Single positional assignment on the frame itself: the chained form
        # df['fold'].iloc[...] = ... does not write through under pandas
        # copy-on-write and triggers SettingWithCopyWarning before that.
        df.iloc[valid_index, fold_col] = fold
    return df

In [None]:
def make_dataset(df, tokenizer, device, model_name):
    """
    Build the tokenized dataset fed to the NLP models.

    Each row's text is tokenized with padding/truncation to
    ``params.MAX_TOKEN_LEN`` and the dataset is formatted as torch tensors.
    ``token_type_ids`` is exposed for every model except "roberta-base" and
    "distilbert-base-cased".

    Parameters:
    -----------
    df: pd.DataFrame
        Data for the model.
    tokenizer: transformers.AutoTokenizer.from_pretrained
        Tokenizer matching the model.
    device: str
        Device to use, "cpu" or "cuda".
    model_name: str
        Name of the model.

    Returns:
    ----------
    dataset: nlp.Dataset.from_pandas
        Dataset ready for a DataLoader.
    """
    def _tokenize(example):
        return tokenizer(example[params.TEXT_COL],
                         padding="max_length",
                         truncation=True,
                         max_length=params.MAX_TOKEN_LEN)

    dataset = nlp.Dataset.from_pandas(df)
    dataset = dataset.map(_tokenize)

    without_token_types = ["roberta-base", "distilbert-base-cased"]
    if model_name in without_token_types:
        tensor_columns = ['input_ids', 'attention_mask', 'labels']
    else:
        tensor_columns = ['input_ids', 'token_type_ids', 'attention_mask', 'labels']
    dataset.set_format(type='torch', columns=tensor_columns, device=device)
    return dataset

In [None]:
def train_lgb(X, y, weight, n_folds=4):
    """
    Cross-validated LightGBM training.

    One model per fold is trained and pickled to
    ``params.MODELS_DIR + "lgb_fold{fold}.lgbmodel"``.

    Parameters:
    -----------
    X: pd.DataFrame
        Explanatory variables.
    y: pd.Series
        Target labels (0-based class ids).
    weight: pd.Series
        Per-sample training weights.
    n_folds: int
        Number of folds.

    Returns:
    -----------
    scores: float
        Out-of-fold class-weighted F1.
    feature_importances: pd.DataFrame
        Gain importance per fold plus their mean, sorted by the mean.
    train_pred: np.array
        Out-of-fold predictions, one column per class.
    """
    train_pred = np.zeros((X.shape[0], y.nunique()), dtype='float32')
    feature_importances = pd.DataFrame()
    feature_importances['feature'] = X.columns
    kfold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=params.SEED)

    print("LightGBM Training...")
    for fold, (train_idx, valid_idx) in enumerate(tqdm(kfold.split(X, y))):
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]
        weight_train, weight_valid = weight.iloc[train_idx], weight.iloc[valid_idx]
        train_data = lgb.Dataset(X_train, label=y_train, weight=weight_train)
        valid_data = lgb.Dataset(X_valid, label=y_valid, weight=weight_valid)
        lgb_params = {
            'objective': 'multiclass',
            # Kept in sync with the rest of the pipeline instead of a literal.
            'num_class': params.NUM_CLASS,
            # Built-in metrics are disabled; only the custom feval is used.
            'metric': 'None',
            'learning_rate': 0.01,
            'max_depth': -1,
            'num_leaves': 31,
            'max_bin': 31,
            'min_data_in_leaf': 3,
            'verbose': -1,
            'seed': params.SEED,
            'drop_seed': params.SEED,
            'data_random_seed':params.SEED
        }
        # NOTE(review): early_stopping_rounds / verbose_eval are keyword
        # arguments of older LightGBM releases; newer ones expect callbacks.
        # Confirm against the installed version.
        model = lgb.train(lgb_params, train_data, valid_sets=[train_data,valid_data],
                          num_boost_round=params.GBDT_ROUNDS,
                          early_stopping_rounds=params.GBDT_EARLY_STOPPING,
                          feval=metric_f1_lgb,
                          verbose_eval=False, )
        # Context manager so the file handle is flushed and closed.
        with open(params.MODELS_DIR+"lgb_fold{}.lgbmodel".format(fold), "wb") as model_file:
            pickle.dump(model, model_file)
        y_val_pred = model.predict(X_valid)
        train_pred[valid_idx,:] = y_val_pred
        feature_importances['fold_{}'.format(fold)] = model.feature_importance(importance_type='gain')
        gc.collect()

    # Columns 1..n_folds are the per-fold importances added above.
    feature_importances['importance'] = feature_importances.iloc[:,1:1+n_folds].mean(axis=1)
    feature_importances = feature_importances.sort_values(by='importance', ascending=False)
    scores = f1_score(y, np.argmax(train_pred, axis=1), average=None) @ params.CLASS_WEIGHT
    return scores, feature_importances, train_pred

In [None]:
def train_ctb(X, y, n_folds=4):
    """
    Cross-validated CatBoost training on the raw text column.

    One model per fold is trained and pickled to
    ``params.MODELS_DIR + "ctb_fold{fold}.ctbmodel"``.

    Parameters:
    -----------
    X: pd.DataFrame
        Explanatory variables; must contain ``params.TEXT_COL``.
    y: pd.Series
        Target labels (0-based class ids).
    n_folds: int
        Number of folds.

    Returns:
    -----------
    scores: float
        Out-of-fold class-weighted F1.
    train_pred: np.array
        Out-of-fold predictions, one column per class.
    """
    train_pred = np.zeros((X.shape[0], y.nunique()), dtype='float32')
    kfold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=params.SEED)

    print("CatBoost Training...")
    for fold, (train_idx, valid_idx) in enumerate(tqdm(kfold.split(X, y))):
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]
        train_data = Pool(X_train, label=y_train, text_features=[params.TEXT_COL])
        valid_data = Pool(X_valid, label=y_valid, text_features=[params.TEXT_COL])
        ctb_params = {
            # 'objective' and 'loss_function' are aliases of one parameter,
            # so only one may be given. MultiClass is the optimized loss;
            # TotalF1 is a metric and is therefore used for evaluation,
            # best-model selection and early stopping.
            'loss_function': 'MultiClass',
            'eval_metric': 'TotalF1',
            'class_weights': params.CLASS_WEIGHT.tolist(),
            'num_boost_round':params.GBDT_ROUNDS,
            'early_stopping_rounds':params.GBDT_EARLY_STOPPING,
            'learning_rate':0.03,
            'l2_leaf_reg':3.0,
            #'subsample':0.66,
            'max_depth':6,
            'grow_policy':'SymmetricTree',
            'min_data_in_leaf':1,
            # NOTE(review): max_leaves / min_data_in_leaf look like options of
            # non-symmetric grow policies - confirm CatBoost accepts them
            # together with SymmetricTree.
            'max_leaves':31,
            'verbose':False,
            'random_seed':params.SEED,
        }
        model = CatBoost(ctb_params)
        model.fit(train_data, eval_set=[valid_data], use_best_model=True, plot=False)
        # Context manager so the file handle is flushed and closed.
        with open(params.MODELS_DIR+"ctb_fold{}.ctbmodel".format(fold), "wb") as model_file:
            pickle.dump(model, model_file)
        train_pred[valid_idx, :] = model.predict(X_valid)
        gc.collect()
    scores = f1_score(y, np.argmax(train_pred, axis=1), average=None) @ params.CLASS_WEIGHT
    return scores, train_pred

In [None]:
def train_fn(dataloader, model, criterion, optimizer, device, epoch):
    """
    Run one training epoch of the NLP model.

    Parameters:
    -----------
    dataloader: torch.utils.data.DataLoader
        Loader yielding mappings with ``input_ids``, ``attention_mask``,
        ``labels`` and optionally ``token_type_ids``.
    model: torch.nn.Module
        Model to train.
    criterion: torch.nn.*Loss
        Loss function (a custom callable works as well).
    optimizer: torch.optim.*
        Optimizer.
    device: str
        "cuda" or "cpu". Unused here: the batches are expected to be on the
        right device already.
    epoch: int
        Index of the current epoch. Unused here.

    Returns:
    ---------
    train_losses: float
        Mean loss per batch.
    train_acc: float
        Accuracy over the whole dataset.
    train_f1: float
        Class-weighted F1 over the whole dataset.
    """
    model.train()
    train_losses = 0
    correct_counts = 0
    train_labels = []
    train_preds = []
    for batch in dataloader:
        # Access by key: unpacking batch.values() positionally depends on
        # the order in which the columns happen to be stored.
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        token_type_ids = batch["token_type_ids"] if "token_type_ids" in batch else None

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask, token_type_ids)
        loss = criterion(outputs, labels)
        _, preds = torch.max(outputs, axis=1)
        loss.backward()
        optimizer.step()

        train_losses += loss.item()
        correct_counts += torch.sum(preds == labels).item()

        train_labels += labels.tolist()
        train_preds += preds.tolist()

    train_losses = train_losses / len(dataloader)
    train_acc = np.float64(correct_counts) / len(dataloader.dataset)
    train_f1 = metric_f1(train_labels, train_preds)

    return train_losses, train_acc, train_f1


def eval_fn(dataloader, model, criterion, device):
    """
    Run one validation pass of the NLP model (no gradient, eval mode).

    Parameters:
    -----------
    dataloader: torch.utils.data.DataLoader
        Loader yielding mappings with ``input_ids``, ``attention_mask``,
        ``labels`` and optionally ``token_type_ids``.
    model: torch.nn.Module
        Model to evaluate.
    criterion: torch.nn.*Loss
        Loss function (a custom callable works as well).
    device: str
        "cuda" or "cpu". Unused here: the batches are expected to be on the
        right device already.
    Returns:
    ---------
    valid_losses: float
        Mean loss per batch.
    valid_acc: float
        Accuracy over the whole dataset.
    valid_f1: float
        Class-weighted F1 over the whole dataset.
    """
    model.eval()
    valid_losses = 0
    total_corrects = 0
    all_labels = []
    all_preds = []

    with torch.no_grad():
        for batch in dataloader:
            # Access by key: unpacking batch.values() positionally depends
            # on the order in which the columns happen to be stored.
            input_ids = batch["input_ids"]
            attention_mask = batch["attention_mask"]
            labels = batch["labels"]
            token_type_ids = batch["token_type_ids"] if "token_type_ids" in batch else None

            outputs = model(input_ids, attention_mask, token_type_ids)
            loss = criterion(outputs, labels)
            _, preds = torch.max(outputs, 1)
            valid_losses += loss.item()
            total_corrects += torch.sum(preds == labels).item()
            all_labels += labels.tolist()
            all_preds += preds.tolist()

    valid_losses = valid_losses / len(dataloader)
    valid_acc = np.float64(total_corrects) / len(dataloader.dataset)

    valid_f1 = metric_f1(all_labels, all_preds)

    return valid_losses, valid_acc, valid_f1

In [None]:
def trainer(fold, df, model_name, oof_pred, typ):
    """
    Train the NLP model on one fold and fill in its out-of-fold predictions.

    The weights of the epoch with the best validation F1 are saved to
    ``params.MODELS_DIR + "best_{model_name}_{typ}_{fold}.pth"`` and reloaded
    to predict the validation rows.

    Parameters:
    -----------
    fold: int
        Index of the fold used for validation.
    df: pd.DataFrame
        Training data with ``fold`` and ``labels`` columns.
    model_name: str
        Name of the pretrained model.
    oof_pred: np.array
        Out-of-fold prediction buffer; the rows of this fold are overwritten.
    typ: str
        Where the features are taken from in the model output.
    Returns:
    ----------
    best_f1: float
        Validation F1 of the saved model.
    oof_pred: np.array
        The updated out-of-fold prediction buffer.
    """
    is_valid = df.fold == fold
    train_df = df[df.fold != fold].reset_index(drop=True)
    valid_df = df[is_valid].reset_index(drop=True)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    train_dataset = make_dataset(train_df, tokenizer, params.DEVICE, model_name)
    valid_dataset = make_dataset(valid_df, tokenizer, params.DEVICE, model_name)
    train_dataloader = torch.utils.data.DataLoader(
        train_dataset, batch_size=params.TRAIN_BATCH_SIZE, shuffle=True
    )
    valid_dataloader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=params.VALID_BATCH_SIZE, shuffle=False
    )

    model = Classifier(model_name, typ, num_classes=params.NUM_CLASS)
    model = model.to(params.DEVICE)

    criterion = nn.CrossEntropyLoss(weight=params.CLASS_WEIGHT_TENSOR.float())
    optimizer = AdamW(model.parameters(), lr=2e-5)

    checkpoint_path = params.MODELS_DIR + f"best_{model_name}_{typ}_{fold}.pth"
    # Start below any reachable F1 so the first epoch always writes a
    # checkpoint; starting at 0 leaves none (or a stale one from a previous
    # run) when the validation F1 never rises above 0.
    best_f1 = -np.inf

    for epoch in range(params.EPOCHS):
        train_fn(train_dataloader, model, criterion, optimizer, params.DEVICE, epoch)
        _, _, valid_f1 = eval_fn(valid_dataloader, model, criterion, params.DEVICE)
        if valid_f1 > best_f1:
            best_f1 = valid_f1
            torch.save(model.state_dict(), checkpoint_path)

    model.load_state_dict(torch.load(checkpoint_path))
    model.to(params.DEVICE)
    # Make the inference mode explicit instead of relying on the last
    # eval_fn call having left the model in eval mode.
    model.eval()

    valid_pred = []
    with torch.no_grad():
        for batch in valid_dataloader:
            # Access by key rather than by the position of the column.
            token_type_ids = batch["token_type_ids"] if "token_type_ids" in batch else None
            outputs = model(batch["input_ids"], batch["attention_mask"], token_type_ids)
            valid_pred += outputs.tolist()
    # The validation loader is not shuffled, so the predictions line up with
    # the original row positions of this fold.
    oof_pred[df[is_valid].index, :] = valid_pred
    return best_f1, oof_pred

In [None]:
class Classifier(nn.Module):
    """
    Text classifier: pretrained encoder + dropout + linear head.

    Parameters:
    -----------
    model_name: str
        Name of the pretrained model passed to ``AutoModel.from_pretrained``.
    typ: str
        Pooling applied to the encoder's token-level output:
        "h" first position, "m" mean over positions, "t"/"FRt"/"DEt" last
        position, "ht" first and last concatenated; anything else behaves
        like "h".
    num_classes: int
        Number of target classes.
    """
    # Models that are called without token_type_ids.
    _NO_TOKEN_TYPE_MODELS = ("roberta-base", "distilbert-base-cased")

    def __init__(self, model_name, typ, num_classes=4):
        super().__init__()

        self.name = model_name
        self.typ = typ

        self.bert = AutoModel.from_pretrained(model_name)
        # Read the width from the loaded config rather than a per-name table;
        # the table remains as a fallback for configs without hidden_size.
        nodes = getattr(self.bert.config, "hidden_size", None)
        if nodes is None:
            if model_name in ["albert-large-v2", "xlm-mlm-ende-1024"]:
                nodes = 1024
            else:
                nodes = 768

        self.dropout = nn.Dropout(0.3)
        # "ht" concatenates two positions, hence twice the width.
        in_features = nodes*2 if typ == "ht" else nodes
        self.linear = nn.Linear(in_features, num_classes)
        nn.init.normal_(self.linear.weight, std=0.02)

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        """Return the class logits for a batch of token ids."""
        encoder_inputs = {"input_ids": input_ids, "attention_mask": attention_mask}
        if token_type_ids is not None and self.name not in self._NO_TOKEN_TYPE_MODELS:
            encoder_inputs["token_type_ids"] = token_type_ids
        # Index 0 is the token-level output for both tuple and ModelOutput
        # returns; tuple-unpacking a dict-like ModelOutput would yield its
        # keys. A single code path also covers model names outside the
        # original lists, which used to leave `output` unassigned.
        output = self.bert(**encoder_inputs)[0]

        if self.typ == "h":
            output = output[:, 0, :]
        elif self.typ == "m":
            output = torch.mean(output, dim=1)
        elif self.typ == "t" or self.typ=="FRt" or self.typ=="DEt":
            output = output[:, -1, :]
        elif self.typ ==  "ht":
            output = torch.cat((output[:, 0, :], output[:, -1, :]), dim=-1)
        else:
            output = output[:, 0, :]
        output = self.dropout(output)
        output = self.linear(output)
        return output

In [None]:
class Parameters(object):
    """
    Container for the settings of the training stage.
    """
    def __init__(self):
        self.SEED = 2020
        # Root directory of the project. Pass a str directly when
        # os.getcwd() cannot be used.
        #BASE_PATH = "C:/StudentCup2020/2nd/"
        self.BASE_PATH = "/content/drive/MyDrive/Competition/SIGNATE/Datascientist/"
        self.TRAIN_FILE = self.BASE_PATH + "data/train.csv"
        self.TRAIN_FILE_FR = self.BASE_PATH+"data/train_fr.csv"
        self.TRAIN_FILE_DE = self.BASE_PATH+"data/train_de.csv"
        self.TEXT_COL = "description"
        self.TARGET = "jobflag"
        self.NUM_CLASS = 4

        self.LGB_TRAIN_FILE = self.BASE_PATH+"data/lgb_train.csv"
        self.OUTPUT_PATH = self.BASE_PATH + "outputs/"

        # Class shares of the training and test sets. Author's note: decide
        # them from df[TARGET].value_counts()/len(df) and from the
        # predictions. Flip the switch to fall back to uniform shares.
        use_estimated_shares = True
        if use_estimated_shares:
            self.TRAIN_WEIGHT = np.array([0.31, 0.07, 0.31, 0.31])
            self.TEST_WEIGHT = np.array([0.31, 0.07, 0.31, 0.31])  ## 0.25
        else:
            self.TRAIN_WEIGHT = np.array([0.25, 0.25, 0.25, 0.25])
            self.TEST_WEIGHT = np.array([0.25, 0.25, 0.25, 0.25])

        # Per-class weight = test share / train share, normalized to sum to 1.
        self.CLASS_WEIGHT = self.TEST_WEIGHT / self.TRAIN_WEIGHT
        self.CLASS_WEIGHT /= sum(self.CLASS_WEIGHT)

        # Fall back to the CPU so that merely building the parameters does
        # not crash on a machine without CUDA.
        self.DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
        self.CLASS_WEIGHT_TENSOR = torch.tensor(self.CLASS_WEIGHT).to(self.DEVICE)

        self.MODELS_DIR = self.BASE_PATH + "models/"
        self.EPOCHS = 10
        self.GBDT_ROUNDS = 2000
        self.GBDT_EARLY_STOPPING = 100
        self.NUM_SPLITS = 4

        self.TRAIN_BATCH_SIZE = 32
        self.VALID_BATCH_SIZE = 128
        self.MAX_TOKEN_LEN = 128


params = Parameters()

In [None]:
def _train_roberta_variant(df, model_name, typ, params, show_progress=False):
    """Train one model variant over all folds, print its OOF score and save
    the OOF predictions to ``<OUTPUT_PATH><model_name>_<typ>_trap.npy``."""
    oof_pred = np.zeros((len(df), params.NUM_CLASS), dtype='float32')
    print("="*10 + "roberta {} Training".format(typ) + "="*10)
    folds = range(params.NUM_SPLITS)
    if show_progress:
        folds = tqdm(folds)
    for fold in folds:
        _, oof_pred = trainer(fold, df, model_name, oof_pred, typ)
    scores = metric_f1(df['labels'], np.argmax(oof_pred, axis=1))
    print("roberta {} Score: {}".format(typ, scores))
    np.save(params.OUTPUT_PATH+model_name+"_"+typ+"_trap", oof_pred)
    return oof_pred


def _search_ensemble_weights(train_pred, labels, params, n_trials=100_000):
    """Random search of model weights and class multipliers maximizing the
    class-weighted F1 of the blended OOF predictions.

    ``train_pred`` has shape (n_samples, n_classes, n_models). Returns
    ``(best_w, best_cw, best_score)``.
    """
    n_classes, n_models = train_pred.shape[1], train_pred.shape[2]

    def _score(w, cw):
        blended = np.average(train_pred, axis=2, weights=w) * cw
        return f1_score(labels, np.argmax(blended, axis=1), average=None) @ params.CLASS_WEIGHT

    # Baseline: uniform model weights and uniform class multipliers.
    best_w = np.ones(n_models)
    best_w /= sum(best_w)
    best_cw = 0.5 + np.ones(n_classes)
    best_cw /= sum(best_cw)
    best_score = _score(best_w, best_cw)

    for _ in tqdm(range(n_trials)):
        w = np.random.random(n_models)
        w /= sum(w)
        # The 0.5 offset keeps every class multiplier away from zero.
        cw = 0.5 + np.random.random(n_classes)
        cw /= sum(cw)
        score = _score(w, cw)
        if score > best_score:
            best_score = score
            best_w = w
            best_cw = cw
    return best_w, best_cw, best_score


def main(params):
    """Run the training stage.

    Trains LightGBM and the RoBERTa variants (four pooling types on the
    original text plus one model each on the FR and DE back-translations),
    saves every model's out-of-fold predictions under ``params.OUTPUT_PATH``
    and finally searches the ensemble weights on those predictions.
    """
    # --- lightgbm --- #
    lgb_df = pd.read_csv(params.LGB_TRAIN_FILE)
    X = lgb_df.drop([params.TARGET], axis=1)
    y = lgb_df[params.TARGET] - 1
    weight = y.apply(make_weight)
    scores, feature_importances, train_pred = train_lgb(X, y, weight, n_folds=params.NUM_SPLITS)
    print("LightGBM Score: {}".format(scores))
    feature_importances.to_csv(params.OUTPUT_PATH+"lgb_feature_importances.csv", index=False)
    np.save(params.OUTPUT_PATH+"lgb_trap", train_pred)

    # --- catboost (disabled) --- #
    # train = pd.read_csv(params.TRAIN_FILE).drop(['id'], axis=1)
    # train[params.TARGET] -= 1
    # col = [c for c in train.columns if c not in ['id', params.TARGET]]
    # X = train[col]
    # y = train[params.TARGET].astype(int)
    # scores, train_pred = train_ctb(X, y, n_folds=params.NUM_SPLITS)
    # print("CatBoost Score: {}".format(scores))
    # np.save(params.OUTPUT_PATH+"cat_trap", train_pred)

    # --- roberta --- #
    print("roberta Training...")
    model_name = "roberta-base"
    df = make_folded_df(params.TRAIN_FILE, params.NUM_SPLITS)
    for typ in ["h", "m", "t", "ht"]:
        _train_roberta_variant(df, model_name, typ, params, show_progress=True)

    # Same architecture trained on the back-translated texts.
    df = make_folded_df(params.TRAIN_FILE_FR, params.NUM_SPLITS)
    _train_roberta_variant(df, model_name, "FRt", params)

    df = make_folded_df(params.TRAIN_FILE_DE, params.NUM_SPLITS)
    _train_roberta_variant(df, model_name, "DEt", params)

    # --- ensemble --- #
    print("Ensemble...")
    model_names = ["lgb", #"cat",
                   "roberta-base_h", "roberta-base_m", "roberta-base_t", "roberta-base_ht",
                   "roberta-base_FRt", "roberta-base_DEt"]
    train = pd.read_csv(params.TRAIN_FILE)
    train["label"] = train[params.TARGET] - 1
    # NOTE(review): assumes every saved OOF array is row-aligned with
    # train.csv (including the FR/DE files) - confirm.
    train_pred = np.zeros((train.shape[0], params.NUM_CLASS, len(model_names)))
    for i, name in enumerate(model_names):
        train_pred[:, :, i] = np.load(params.OUTPUT_PATH+name+"_trap.npy")

    best_w, best_cw, best_score = _search_ensemble_weights(train_pred, train['label'], params)
    print("Best Ensemble Score: {}".format(best_score))
    oof_pred = np.average(train_pred, axis=2, weights=best_w)
    oof_pred = oof_pred * best_cw
    np.save(params.OUTPUT_PATH+"trap_ensemble", oof_pred)
    np.save(params.OUTPUT_PATH+"config_ensemble_bestw", best_w)
    np.save(params.OUTPUT_PATH+"config_ensemble_bestcw", best_cw)


In [None]:
# Notebook entry point: runs at cell level instead of under a __main__ guard.
seed_everything(params.SEED)
# Ensure the artifact directories exist. makedirs(exist_ok=True) replaces the
# separate listdir check and does nothing when the directory is already there
# (it also creates BASE_PATH itself if that is missing).
for subdir in ("models/", "outputs/"):
    os.makedirs(params.BASE_PATH + subdir, exist_ok=True)
main(params)

LightGBM Training...


4it [00:45, 11.34s/it]


LightGBM Score: 0.6302219533886811
roberta Training...


  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/1137 [00:00<?, ?it/s]

  0%|          | 0/379 [00:00<?, ?it/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
 25%|██▌       | 1/4 [02:56<08:48, 176.01s/it]

  0%|          | 0/1137 [00:00<?, ?it/s]

  0%|          | 0/379 [00:00<?, ?it/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
 50%|█████     | 2/4 [06:01<06:03, 181.76s/it]

  0%|          | 0/1137 [00:00<?, ?it/s]

  0%|          | 0/379 [00:00<?, ?it/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
 75%|███████▌  | 3/4 [09:24<03:11, 191.34s/it]

  0%|          | 0/1137 [00:00<?, ?it/s]

  0%|          | 0/379 [00:00<?, ?it/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 4/4 [12:18<00:00, 184.67s/it]


roberta h Score: 0.7096381454615299


  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/1137 [00:00<?, ?it/s]

  0%|          | 0/379 [00:00<?, ?it/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
 25%|██▌       | 1/4 [02:58<08:54, 178.01s/it]

  0%|          | 0/1137 [00:00<?, ?it/s]

  0%|          | 0/379 [00:00<?, ?it/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
 50%|█████     | 2/4 [05:56<05:57, 178.55s/it]

  0%|          | 0/1137 [00:00<?, ?it/s]

  0%|          | 0/379 [00:00<?, ?it/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
 75%|███████▌  | 3/4 [08:53<02:57, 177.78s/it]

  0%|          | 0/1137 [00:00<?, ?it/s]

  0%|          | 0/379 [00:00<?, ?it/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 4/4 [11:48<00:00, 177.00s/it]


roberta m Score: 0.7128366209305326


  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/1137 [00:00<?, ?it/s]

  0%|          | 0/379 [00:00<?, ?it/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
 25%|██▌       | 1/4 [02:53<08:41, 173.80s/it]

  0%|          | 0/1137 [00:00<?, ?it/s]

  0%|          | 0/379 [00:00<?, ?it/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
 50%|█████     | 2/4 [05:51<05:52, 176.37s/it]

  0%|          | 0/1137 [00:00<?, ?it/s]

  0%|          | 0/379 [00:00<?, ?it/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
 75%|███████▌  | 3/4 [08:48<02:56, 176.49s/it]

  0%|          | 0/1137 [00:00<?, ?it/s]

  0%|          | 0/379 [00:00<?, ?it/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 4/4 [11:50<00:00, 177.63s/it]


roberta t Score: 0.7041361256361457


  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/1137 [00:00<?, ?it/s]

  0%|          | 0/379 [00:00<?, ?it/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
 25%|██▌       | 1/4 [02:57<08:52, 177.42s/it]

  0%|          | 0/1137 [00:00<?, ?it/s]

  0%|          | 0/379 [00:00<?, ?it/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
 50%|█████     | 2/4 [05:54<05:54, 177.17s/it]

  0%|          | 0/1137 [00:00<?, ?it/s]

  0%|          | 0/379 [00:00<?, ?it/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
 75%|███████▌  | 3/4 [08:55<02:58, 178.83s/it]

  0%|          | 0/1137 [00:00<?, ?it/s]

  0%|          | 0/379 [00:00<?, ?it/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 4/4 [11:53<00:00, 178.36s/it]


roberta ht Score: 0.708753234421042


  0%|          | 0/1137 [00:00<?, ?it/s]

  0%|          | 0/379 [00:00<?, ?it/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/1137 [00:00<?, ?it/s]

  0%|          | 0/379 [00:00<?, ?it/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/1137 [00:00<?, ?it/s]

  0%|          | 0/379 [00:00<?, ?it/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/1137 [00:00<?, ?it/s]

  0%|          | 0/379 [00:00<?, ?it/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


roberta FRt Score: 0.675273585897328


  0%|          | 0/1137 [00:00<?, ?it/s]

  0%|          | 0/379 [00:00<?, ?it/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/1137 [00:00<?, ?it/s]

  0%|          | 0/379 [00:00<?, ?it/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/1137 [00:00<?, ?it/s]

  0%|          | 0/379 [00:00<?, ?it/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/1137 [00:00<?, ?it/s]

  0%|          | 0/379 [00:00<?, ?it/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


roberta DEt Score: 0.6597612259788233
Ensemble...


100%|██████████| 100000/100000 [02:44<00:00, 607.50it/s]


Best Ensemble Score: 0.7335066012787663


# 推論

In [None]:
!pip install -q pulp

[K     |████████████████████████████████| 14.2 MB 5.2 MB/s 
[?25h

In [None]:
import pulp

In [None]:
def np_rounder(x):
    """
    Round values to the nearest integer, with halves always rounded up.

    A fractional part of exactly 0.5 rounds towards +infinity
    (0.5 -> 1, 2.5 -> 3, -0.5 -> 0).

    Parameters:
    -----------
    x: array-like of float
        Values to round. Lists and scalars are accepted as well as
        np.ndarray; the input is never modified.
    Returns:
    ----------
    np.ndarray[int]
        Rounded values with the same shape as the input.
    """
    values = np.asarray(x)
    whole = values // 1   # floor of each value
    frac = values % 1     # fractional part, always in [0, 1)
    # The boolean mask adds 1 exactly where the fractional part is >= 0.5.
    return (whole + (frac >= 0.5)).astype(int)

In [None]:
def sigmoid(x):
    """
    Logistic function, applied element-wise: 1 / (1 + exp(-x)).

    Parameters:
    -----------
    x: np.array[float]
        Raw scores.
    Returns:
    ----------
    np.array[float]
        Transformed scores, same shape as x.
    """
    decay = np.exp(-x)
    denominator = 1 + decay
    return 1 / denominator

In [None]:
def make_dataset(df, tokenizer, device, model_name):
    """
    Build a tokenized, torch-formatted dataset for the NLP models.

    Parameters:
    -----------
    df: pd.DataFrame
        Source rows. The text is read from the params.TEXT_COL column;
        a 'labels' column is exposed as a tensor, so it is expected to exist.
    tokenizer: transformers.AutoTokenizer.from_pretrained
        Tokenizer matching the model.
    device: str
        Device the tensors are placed on. "cpu" or "cuda".
    model_name: str
        Name of the model; decides whether 'token_type_ids' is exposed.

    Returns:
    ----------
    dataset: nlp.Dataset
        Dataset whose rows are padded/truncated to params.MAX_TOKEN_LEN tokens.
    """
    def _tokenize(example):
        # Every example is padded and truncated to the same fixed length.
        return tokenizer(example[params.TEXT_COL],
                         padding="max_length",
                         truncation=True,
                         max_length=params.MAX_TOKEN_LEN)

    tokenized = nlp.Dataset.from_pandas(df).map(_tokenize)

    # roberta / distilbert are handled without token_type_ids.
    if model_name in ["roberta-base", "distilbert-base-cased"]:
        tensor_columns = ['input_ids', 'attention_mask', 'labels']
    else:
        tensor_columns = ['input_ids', 'token_type_ids', 'attention_mask', 'labels']

    tokenized.set_format(type='torch', columns=tensor_columns, device=device)
    return tokenized

In [None]:
def predict_lgb(X_test, n_folds=4):
    """
    Predict with the saved LightGBM fold models and average the results.

    Parameters:
    -----------
    X_test: pd.DataFrame
        Feature table to predict on.
    n_folds: int
        Number of fold models to average. Must not exceed the number of
        folds used in training, since one model file is loaded per fold.

    Returns:
    ----------
    y_pred: np.array[float]
        Fold-averaged model outputs, shape (len(X_test), params.NUM_CLASS).
    """
    y_pred = np.zeros((X_test.shape[0], params.NUM_CLASS), dtype='float32')
    for fold in range(n_folds):
        model_path = params.MODELS_DIR+"lgb_fold{}.lgbmodel".format(fold)
        # NOTE: unpickling runs arbitrary code; only load model files produced
        # by this project's own training run.
        # The context manager closes the file handle as soon as loading is done.
        with open(model_path, "rb") as model_file:
            model = pickle.load(model_file)
        y_pred += model.predict(X_test, num_iteration=model.best_iteration) / n_folds
    return y_pred

In [None]:
def predict_ctb(X_test, n_folds=4):
    """
    Predict with the saved CatBoost fold models and average the results.

    Parameters:
    -----------
    X_test: pd.DataFrame
        Feature table to predict on.
    n_folds: int
        Number of fold models to average. Must not exceed the number of
        folds used in training, since one model file is loaded per fold.

    Returns:
    ----------
    y_pred: np.array[float]
        Fold-averaged model outputs, shape (len(X_test), params.NUM_CLASS).
    """
    y_pred = np.zeros((X_test.shape[0], params.NUM_CLASS), dtype='float32')
    for fold in range(n_folds):
        model_path = params.MODELS_DIR+"ctb_fold{}.ctbmodel".format(fold)
        # NOTE: unpickling runs arbitrary code; only load model files produced
        # by this project's own training run.
        # The context manager closes the file handle as soon as loading is done.
        with open(model_path, "rb") as model_file:
            model = pickle.load(model_file)
        y_pred += model.predict(X_test) / n_folds
    return y_pred

In [None]:
def predict_nlp(model_name, typ, file_path):
    """
    Predict with the saved NLP fold models and average their outputs.

    Parameters:
    -----------
    model_name: str
        Name of the pretrained model to use.
    typ: str
        Which part of the backbone output the classifier head reads
        (also part of the checkpoint file name).
    file_path: str
        Path of the CSV file to predict on.

    Returns:
    ----------
    preds: list[list[float]]
        One row per input row, params.NUM_CLASS raw model outputs each,
        averaged over the params.NUM_SPLITS fold models.
    """
    models = []
    for fold in range(params.NUM_SPLITS):
        model = Classifier(model_name, typ)
        model.load_state_dict(torch.load(params.MODELS_DIR + f"best_{model_name}_{typ}_{fold}.pth"))
        model.to(params.DEVICE)
        model.eval()
        models.append(model)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    test_df = pd.read_csv(file_path)
    # make_dataset exposes a 'labels' column, so give the test rows a dummy one.
    test_df["labels"] = -1
    test_dataset = make_dataset(test_df, tokenizer, params.DEVICE, model_name)
    test_dataloader = torch.utils.data.DataLoader(
        test_dataset, batch_size=params.VALID_BATCH_SIZE, shuffle=False)

    preds = []
    with torch.no_grad():
        for batch in test_dataloader:
            # Read tensors by name: unpacking batch.values() positionally only
            # works while the columns happen to come out in one specific order.
            input_ids = batch["input_ids"]
            attention_mask = batch["attention_mask"]
            token_type_ids = batch["token_type_ids"] if "token_type_ids" in batch else None
            pred = np.zeros((input_ids.shape[0], params.NUM_CLASS))
            for model in models:
                pred += model(input_ids, attention_mask, token_type_ids).cpu().numpy()
            preds += (pred/params.NUM_SPLITS).tolist()
    return preds

In [None]:
def hack(prob):
    """
    from: https://signate.jp/competitions/281/discussions/20200816040343-8180
    Assign labels by maximizing total log-likelihood under fixed class counts.

    Solves a binary program in which every row gets exactly one class and
    class k is assigned to exactly params.N_CLASSES[k] rows.

    Parameters:
    ------------
    prob: np.array[float]
        Predicted probabilities, shape (n_rows, n_classes).

    Returns:
    ------------
    np.array[int]
        0-based predicted label for each row.
    """
    logp = np.log(prob + 1e-16)  # epsilon keeps log finite for zero probabilities
    N = prob.shape[0]
    K = prob.shape[1]
    m = pulp.LpProblem('Problem', pulp.LpMaximize)
    x = pulp.LpVariable.dicts('x', [(i, j) for i in range(N) for j in range(K)], 0, 1, pulp.LpBinary)
    log_likelihood = pulp.lpSum([x[(i, j)] * logp[i, j] for i in range(N) for j in range(K)])
    m += log_likelihood
    # Each row is assigned exactly one class.
    for i in range(N):
        m += pulp.lpSum([x[(i, k)] for k in range(K)]) == 1
    # Each class receives exactly its prescribed number of rows.
    for k in range(K):
        m += pulp.lpSum([x[(i, k)] for i in range(N)]) == params.N_CLASSES[k]
    m.solve()
    # Status 1 means the solver reported an optimal solution.
    assert m.status == 1, "assignment problem not solved to optimality (status={})".format(m.status)
    # Round before casting: a solver may report a binary variable as e.g.
    # 0.9999999, which int() alone would truncate to 0 and turn the row's
    # argmax into a silent class 0.
    x_ast = np.array([[int(round(x[(i, j)].value())) for j in range(K)] for i in range(N)])
    return x_ast.argmax(axis=1)


In [None]:
class Classifier(nn.Module):
    """
    Classification model: a pretrained backbone plus dropout and a linear head.

    Parameters:
    -----------
    model_name: str
        Name of the pretrained model to load as the backbone.
    typ: str
        Which part of the backbone's sequence output feeds the head:
        "h" first position, "m" mean over all positions (no masking),
        "t" / "FRt" / "DEt" last position, "ht" first and last concatenated.
        Any other value behaves like "h".
    num_classes: int
        Number of target classes.
    """
    # Backbones with a 1024-wide hidden state; all others are treated as 768.
    _WIDE_MODELS = ("albert-large-v2", "xlm-mlm-ende-1024")
    # Backbones that are called without token_type_ids.
    _NO_TOKEN_TYPE_MODELS = ("roberta-base", "distilbert-base-cased")

    def __init__(self, model_name, typ, num_classes=4):
        super().__init__()

        self.name = model_name
        self.typ = typ
        nodes = 1024 if model_name in self._WIDE_MODELS else 768

        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        # "ht" concatenates two positions, so the head input is twice as wide.
        in_features = nodes*2 if typ == "ht" else nodes
        self.linear = nn.Linear(in_features, num_classes)
        nn.init.normal_(self.linear.weight, std=0.02)

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        """
        Return raw class scores of shape (batch, num_classes).

        token_type_ids is forwarded to every backbone except those listed in
        _NO_TOKEN_TYPE_MODELS, so model names outside the originally
        hard-coded lists no longer fail with an unbound local variable.
        """
        if self.name in self._NO_TOKEN_TYPE_MODELS:
            backbone_out = self.bert(
                input_ids = input_ids,
                attention_mask = attention_mask)
        else:
            backbone_out = self.bert(
                input_ids = input_ids,
                attention_mask = attention_mask,
                token_type_ids = token_type_ids)
        # Index 0 is the per-token hidden states for both tuple outputs and
        # dict-like model outputs; tuple-unpacking breaks on the latter.
        hidden = backbone_out[0]

        if self.typ == "m":
            pooled = torch.mean(hidden, dim=1)
        elif self.typ in ("t", "FRt", "DEt"):
            pooled = hidden[:, -1, :]
        elif self.typ == "ht":
            pooled = torch.cat((hidden[:, 0, :], hidden[:, -1, :]), dim=-1)
        else:
            # "h" and any unrecognized typ: first position.
            pooled = hidden[:, 0, :]
        return self.linear(self.dropout(pooled))

In [None]:
class Parameters(object):
    """
    Container for the inference settings (paths, class priors, model options).

    Constructing an instance reads the test CSV to size the per-class quotas
    and moves the class-weight tensor to the GPU.
    """
    def __init__(self):
        self.SEED = 2020
        # Project root. If deriving it (e.g. via os.getcwd()) does not work,
        # hard-code the path as a string here.
        self.BASE_PATH = "/content/drive/MyDrive/Competition/SIGNATE/Datascientist/"
        self.TEST_FILE = self.BASE_PATH + "data/test.csv"
        self.TEST_FILE_FR = self.BASE_PATH+"data/test_fr.csv"
        self.TEST_FILE_DE = self.BASE_PATH+"data/test_de.csv"
        self.TEXT_COL = "description"
        self.TARGET = "jobflag"
        self.NUM_CLASS = 4

        self.LGB_TEST_FILE = self.BASE_PATH+"data/lgb_test.csv"
        self.OUTPUT_PATH = self.BASE_PATH + "outputs/"

        # Class proportions of the train and test sets. For other data, set
        # these from df[TARGET].value_counts()/len(df) and from the predictions.
        # (The former `if True: ... else:` switch was removed: its else branch
        # was unreachable and guarded only by a no-op `assert True`.)
        self.TRAIN_WEIGHT = np.array([0.31, 0.07, 0.31, 0.31])
        self.TEST_WEIGHT = np.array([0.31, 0.07, 0.31, 0.31])

        # Ratio of test to train priors, normalized to sum to 1.
        self.CLASS_WEIGHT = self.TEST_WEIGHT / self.TRAIN_WEIGHT
        self.CLASS_WEIGHT /= sum(self.CLASS_WEIGHT)
        self.CLASS_WEIGHT_TENSOR = torch.tensor(self.CLASS_WEIGHT).cuda()

        # Per-class row quotas for the test set: round the expected counts,
        # then push any rounding surplus or deficit onto the single class
        # whose fractional part is closest to 0.5 so the quotas sum to len_test.
        len_test = len(pd.read_csv(self.TEST_FILE))
        expected_counts = len_test*self.TEST_WEIGHT
        self.N_CLASSES = np_rounder(expected_counts).tolist()
        adjust_idx = np.argmin(np.abs(0.5 - expected_counts%1))
        self.N_CLASSES[adjust_idx] += len_test - sum(self.N_CLASSES)

        self.DEVICE = "cuda"
        self.MODELS_DIR = self.BASE_PATH + "models/"
        self.NUM_SPLITS = 4

        self.VALID_BATCH_SIZE = 128
        self.MAX_TOKEN_LEN = 128
# Module-level settings instance; the helper functions in this notebook read it as a global.
params = Parameters()

In [None]:
def main(params):
    """
    Run inference with every model, blend the outputs and write the submission.

    Each model's predictions are saved under params.OUTPUT_PATH, the blend
    uses the weights stored in config_ensemble_bestw.npy and
    config_ensemble_bestcw.npy, and the final labels are written to
    data/submission.csv under params.BASE_PATH.

    Parameters:
    -----------
    params: Parameters
        Inference settings.
    """
    out_dir = params.OUTPUT_PATH

    # --- lightgbm --- #
    print("LightGBM Predicting...")
    lgb_features = pd.read_csv(params.LGB_TEST_FILE)
    np.save(out_dir+"lgb_yprd", predict_lgb(lgb_features, n_folds=params.NUM_SPLITS))

    # --- catboost --- #
    print("CatBoost Predicting...")
    test = pd.read_csv(params.TEST_FILE)
    excluded = ['id', params.TARGET]
    ctb_features = test[[c for c in test.columns if c not in excluded]]
    np.save(out_dir+"cat_yprd", predict_ctb(ctb_features, n_folds=params.NUM_SPLITS))

    # --- roberta --- #
    model_name = "roberta-base"
    # (typ, input file) pairs: four heads on the original text, then the
    # FR and DE test files.
    runs = [(typ, params.TEST_FILE) for typ in ["h", "m", "t", "ht"]]
    runs.append(("FRt", params.TEST_FILE_FR))
    runs.append(("DEt", params.TEST_FILE_DE))
    for typ, source_file in runs:
        print("Robert {} Predicting...".format(typ))
        preds = predict_nlp(model_name, typ, source_file)
        np.save(out_dir+model_name+"_"+typ+"_yprd", preds)

    # --- ensemble --- #
    model_names = ["lgb", #"cat",
                   "roberta-base_h", "roberta-base_m", "roberta-base_t", "roberta-base_ht",
                   "roberta-base_FRt", "roberta-base_DEt"]
    stacked = np.zeros((test.shape[0], 4, len(model_names)))
    for slot, member in enumerate(model_names):
        stacked[:, :, slot] = np.load(out_dir+member+"_yprd.npy")
    best_w = np.load(out_dir+"config_ensemble_bestw.npy")
    best_cw = np.load(out_dir+"config_ensemble_bestcw.npy")
    # Weighted mean across models, then per-class scaling.
    blended = np.average(stacked, axis=2, weights=best_w) * best_cw

    # --- post processing --- #
    blended = sigmoid(blended)
    blended = blended / blended.sum(axis=1, keepdims=True)  # rows sum to 1
    labels = hack(blended) + 1  # back to 1-based labels

    submit = pd.DataFrame({'index':test['id'], 'pred':labels})
    submit.to_csv(params.BASE_PATH+"data/submission.csv", index=False, header=False)

In [None]:
# Notebook entry point: this cell runs directly at cell level, so no
# `if __name__ == "__main__":` guard is used.

# Fix the RNG seeds before doing any work.
seed_everything(params.SEED)

# Make sure the "models/" and "outputs/" directories exist under BASE_PATH.
# `exist_ok=True` makes the call idempotent and avoids the check-then-create
# race of listing the parent directory first and calling `os.mkdir` afterwards.
for subdir in ("models/", "outputs/"):
    os.makedirs(params.BASE_PATH + subdir, exist_ok=True)

# Run the pipeline defined by `main`.
main(params)

LightGBM Predicting...
CatBoost Predicting...
Robert h Predicting...


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaMod

  0%|          | 0/1517 [00:00<?, ?it/s]

Robert m Predicting...


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaMod

  0%|          | 0/1517 [00:00<?, ?it/s]

Robert t Predicting...


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaMod

  0%|          | 0/1517 [00:00<?, ?it/s]

Robert ht Predicting...


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaMod

  0%|          | 0/1517 [00:00<?, ?it/s]

Robert FRt Predicting...


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaMod

  0%|          | 0/1517 [00:00<?, ?it/s]

Robert DEt Predicting...


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaMod

  0%|          | 0/1517 [00:00<?, ?it/s]