In [1]:
# ====================================================
# Directory settings
# ====================================================
import os

OUTPUT_DIR = './'
# exist_ok=True makes this safe to re-run and removes the check-then-create race
# of the previous `if not os.path.exists(...)` guard.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [2]:
# ====================================================
# CFG
# ====================================================
class CFG:
    """Notebook-wide settings, read as class attributes (``CFG.seed``, ``CFG.n_fold``, ...)."""

    # --- run control ---
    debug = False
    train = True
    apex = True
    print_freq = 20
    num_workers = 4
    seed = 42

    # --- backbone ---
    model = "microsoft/deberta-v3-large"
    gradient_checkpointing = True
    freezing = True
    max_len = 1024

    # --- head / targets ---
    target_cols = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
    target_size = 6
    fc_dropout = 0.2

    # --- schedule ---
    scheduler = 'cosine'  # ['linear', 'cosine']
    batch_scheduler = True
    num_cycles = 0.5
    num_warmup_steps = 0
    epochs = 4

    # --- optimiser ---
    encoder_lr = 2e-5
    decoder_lr = 2e-5
    min_lr = 1e-6
    eps = 1e-6
    betas = (0.9, 0.999)
    weight_decay = 0.01
    batch_size = 8
    gradient_accumulation_steps = 1
    max_grad_norm = 1000

    # --- cross-validation ---
    n_fold = 4
    trn_fold = [0, 1, 2, 3]


# Debug mode shortens the run: two epochs, first fold only.
if CFG.debug:
    CFG.epochs, CFG.trn_fold = 2, [0]

In [3]:
# ====================================================
# Library
# ====================================================
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

from text_unidecode import unidecode
from typing import Dict, List, Tuple
import codecs

os.system('pip install iterative-stratification==0.1.7')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.model_selection._split import _BaseKFold

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

os.system('pip uninstall -y transformers')
os.system('pip uninstall -y tokenizers')
os.system('python -m pip install --no-index --find-links=../input/fb3-pip-wheels transformers')
os.system('python -m pip install --no-index --find-links=../input/fb3-pip-wheels tokenizers')
import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
# Environment variables are set BEFORE the first CUDA query below: changing
# CUDA_VISIBLE_DEVICES after CUDA has been probed is not reliably honoured.

# CPMP: declare the two GPUs
os.environ['CUDA_VISIBLE_DEVICES'] = "0,1"

# CPMP: avoids some issues when using more than one worker
# (this is the single source of truth; the earlier contradictory
# `%env TOKENIZERS_PARALLELISM=true` line was overridden anyway and has been removed)
os.environ["TOKENIZERS_PARALLELISM"] = "false"

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Collecting iterative-stratification==0.1.7
  Downloading iterative_stratification-0.1.7-py3-none-any.whl (8.5 kB)
Installing collected packages: iterative-stratification
Successfully installed iterative-stratification-0.1.7




Found existing installation: transformers 4.20.1
Uninstalling transformers-4.20.1:
  Successfully uninstalled transformers-4.20.1




Found existing installation: tokenizers 0.12.1
Uninstalling tokenizers-0.12.1:
  Successfully uninstalled tokenizers-0.12.1




Looking in links: ../input/fb3-pip-wheels
Processing /kaggle/input/fb3-pip-wheels/transformers-4.21.2-py3-none-any.whl
Processing /kaggle/input/fb3-pip-wheels/tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl
Installing collected packages: tokenizers, transformers


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
allennlp 2.10.1 requires transformers<4.21,>=4.1, but you have transformers 4.21.2 which is incompatible.


Successfully installed tokenizers-0.12.1 transformers-4.21.2
Looking in links: ../input/fb3-pip-wheels




tokenizers.__version__: 0.12.1
transformers.__version__: 4.21.2
env: TOKENIZERS_PARALLELISM=true


In [4]:
# Read the feedback-prize-2021 training CSV and record the character length of
# every discourse span in a new `textlen` column.
df_fb2021 = pd.read_csv(
    '../input/feedback-prize-2021/train.csv',
    dtype={'discourse_id': int},
)
df_fb2021['textlen'] = df_fb2021['discourse_text'].str.len()
df_fb2021

Unnamed: 0,id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring,textlen
0,423A1CA112E2,1622627660524,8.0,229.0,Modern humans today are always on their phone....,Lead,Lead 1,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...,221
1,423A1CA112E2,1622627653021,230.0,312.0,They are some really bad consequences when stu...,Position,Position 1,45 46 47 48 49 50 51 52 53 54 55 56 57 58 59,82
2,423A1CA112E2,1622627671020,313.0,401.0,Some certain areas in the United States ban ph...,Evidence,Evidence 1,60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75,88
3,423A1CA112E2,1622627696365,402.0,758.0,"When people have phones, they know about certa...",Evidence,Evidence 2,76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 9...,356
4,423A1CA112E2,1622627759780,759.0,886.0,Driving is one of the way how to get around. P...,Claim,Claim 1,139 140 141 142 143 144 145 146 147 148 149 15...,127
...,...,...,...,...,...,...,...,...,...
144288,4C471936CD75,1618153340639,2234.0,3203.0,if I'm not sure what college I want to attend...,Evidence,Evidence 2,386 387 388 389 390 391 392 393 394 395 396 39...,969
144289,4C471936CD75,1618153383399,3221.0,4509.0,seeking multiple opinions before making a har...,Evidence,Evidence 3,576 577 578 579 580 581 582 583 584 585 586 58...,1288
144290,4C471936CD75,1618024996127,4510.0,4570.0,it is better to seek multiple opinions instead...,Position,Position 1,828 829 830 831 832 833 834 835 836 837 838,60
144291,4C471936CD75,1618025268756,4570.0,4922.0,The impact of asking people to help you make a...,Evidence,Evidence 4,839 840 841 842 843 844 845 846 847 848 849 85...,352


In [5]:
def join_texts(v):
    """Join the ``discourse_text`` entries of ``v`` into one single-spaced string.

    The pieces are joined with a space, newline and ``"\\a"`` (ASCII bell)
    characters become spaces, and runs of spaces collapse to one.
    NOTE(review): ``"\\a"`` may have been meant as a non-breaking space — confirm.
    """
    joined = ' '.join(v.discourse_text)
    for unwanted in ("\n", "\a"):
        joined = joined.replace(unwanted, ' ')
    return re.sub(' +', ' ', joined)

# One row per essay id: its discourse spans merged into a single `text` string.
df_fb2021_agg = (
    df_fb2021
    .groupby('id')
    .apply(join_texts)
    .to_frame('text')
)
df_fb2021_agg

Unnamed: 0_level_0,text
id,Unnamed: 1_level_1
0000D23A521A,"Some people belive that the so called ""face"" o..."
00066EA9880D,Driverless cars are exaclty what you would exp...
000E6DE9E817,I am arguing against the policy change even th...
001552828BD0,Would you be able to give your car up? Having ...
0016926B079C,I think that students would benefit from learn...
...,...
FFF1442D6698,"Every student looks forward to summer break, i..."
FFF1ED4F8544,Many citizens argue that the Electoral college...
FFF868E06176,"Every summer break, students are given project..."
FFFD0AF13501,they get to see tons of awesome landmarks. If...


In [6]:
# Training CSV of the feedback-prize-english-language-learning dataset.
df_fb3 = pd.read_csv(
    '../input/feedback-prize-english-language-learning/train.csv'
)

In [7]:
# Essay ids present in both the FB2021 aggregate and the FB3 training set.
common_ids = set(df_fb2021_agg.index).intersection(df_fb3.text_id)
len(common_ids)

452

In [8]:
# Preview only (nothing is assigned): the aggregate with FB3-style column names.
df_fb2021_agg.reset_index().rename(
    columns={'id': 'text_id', 'text': 'full_text'}
)

Unnamed: 0,text_id,full_text
0,0000D23A521A,"Some people belive that the so called ""face"" o..."
1,00066EA9880D,Driverless cars are exaclty what you would exp...
2,000E6DE9E817,I am arguing against the policy change even th...
3,001552828BD0,Would you be able to give your car up? Having ...
4,0016926B079C,I think that students would benefit from learn...
...,...,...
15589,FFF1442D6698,"Every student looks forward to summer break, i..."
15590,FFF1ED4F8544,Many citizens argue that the Electoral college...
15591,FFF868E06176,"Every summer break, students are given project..."
15592,FFFD0AF13501,they get to see tons of awesome landmarks. If...


In [9]:
# Candidate essays, with columns renamed to `text_id` / `full_text`.
df_sample = (
    df_fb2021_agg
    .reset_index()
    .rename(columns={'id': 'text_id', 'text': 'full_text'})
)

In [10]:
# Drop every essay whose id also occurs in the FB3 training data; print the
# row count before and after.
print(len(df_sample))
in_fb3 = df_sample["text_id"].isin(common_ids)
df_sample = df_sample.loc[~in_fb3].reset_index(drop=True)
print(len(df_sample))

15594
15142


In [11]:
# Seeded random subsample of 10000 essays.
df_sample = (
    df_sample
    .sample(n=10000, random_state=CFG.seed)
    .reset_index(drop=True)
)

In [12]:
# ====================================================
# CV split
# ====================================================
# Shuffled K-fold with CFG.n_fold splits; each row stores the index of the fold
# in which it is a validation row.
fold_col = 'fold_a4'
Fold = KFold(n_splits=CFG.n_fold, shuffle=True, random_state=CFG.seed)
for n, (train_index, val_index) in enumerate(Fold.split(df_sample)):
    df_sample.loc[val_index, fold_col] = int(n)
# Filling via .loc leaves the column as float; cast back to int.
df_sample[fold_col] = df_sample[fold_col].astype(int)
display(df_sample.groupby(fold_col).size())

fold_a4
0    2500
1    2500
2    2500
3    2500
dtype: int64

In [13]:
# ====================================================
# CV split
# ====================================================
# Same random assignment as above, with five folds.
fold_col = 'fold_a5'
Fold = KFold(n_splits=5, shuffle=True, random_state=CFG.seed)
for n, (train_index, val_index) in enumerate(Fold.split(df_sample)):
    df_sample.loc[val_index, fold_col] = int(n)
# Filling via .loc leaves the column as float; cast back to int.
df_sample[fold_col] = df_sample[fold_col].astype(int)
display(df_sample.groupby(fold_col).size())

fold_a5
0    2000
1    2000
2    2000
3    2000
4    2000
dtype: int64

In [14]:
# Show the sampled essays together with the fold_a4 / fold_a5 assignments.
df_sample

Unnamed: 0,text_id,full_text,fold_a4,fold_a5
0,09DB075C4ACE,Every teacher should use this in their class r...,0,0
1,476DAA9251E5,In my opinon I don't think that driverless car...,3,4
2,23D479DD8197,"When you are seeking advice, it is best to see...",2,2
3,8F8BCC081DC6,"Home schooling, also known as home education, ...",0,0
4,A737705B81A4,some people need help on choosing what car the...,3,4
...,...,...,...,...
9995,E7BF5968CCDA,"In the article ""The Challenge of Exploring Ven...",2,2
9996,28A31D350C06,In 1976 NASA's Viking 1 spacecraft was circlin...,1,1
9997,B15CAAF6DF04,I don't think your ideas on the grade B averag...,3,4
9998,46372618BF29,"We should work to abolish electoral collage, n...",0,0


In [15]:
# Per-essay topic predictions, without the `prob` column and with `id`
# renamed to `essay_id`.
topic_pred_df = (
    pd.read_csv('../input/fb3-bert-topics/topic_model_feedback.csv')
    .drop(columns=['prob'])
    .rename(columns={'id': 'essay_id'})
)

# Topic metadata: lower-case column names, `Count` removed.
topic_meta_df = (
    pd.read_csv('../input/fb3-bert-topics/topic_model_metadata.csv')
    .rename(columns={'Topic': 'topic', 'Name': 'topic_name'})
    .drop(columns=['Count'])
)
# Discard the first "_"-separated piece of each name and space-join the rest.
topic_meta_df['topic_name'] = topic_meta_df['topic_name'].apply(
    lambda n: ' '.join(n.split('_')[1:])
)

topic_pred_df = topic_pred_df.merge(topic_meta_df, on='topic', how='left')

In [16]:
import sys
sys.path.append('../input/fb3-bert-topics/site-packages')
from bertopic import BERTopic
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

topic_model = BERTopic.load("../input/fb3-bert-topics/feedback3_topic_model")

# `sws` stays a list (as before) for any later use; the frozenset copy gives
# O(1) membership tests instead of scanning the list once per token.
sws = stopwords.words("english") + ["n't",  "'s", "'ve"]
sws_lookup = frozenset(sws)

# Tokenise each essay and drop stop words (case-insensitively) before the
# documents are handed to the topic model.
docs = []
for fl in tqdm(df_sample['full_text']):
    word_tokens = word_tokenize(fl)
    txt = " ".join(w for w in word_tokens if w.lower() not in sws_lookup)
    docs.append(txt)

topics, probs = topic_model.transform(docs)

  0%|          | 0/10000 [00:00<?, ?it/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

In [17]:
# Ids reduced to the text after the last "/" and before the first ".".
dids = [tid.split("/")[-1].split(".")[0] for tid in df_sample["text_id"]]

pred_topics = pd.DataFrame()
pred_topics["id"] = dids
pred_topics["topic"] = topics
pred_topics["prob"] = probs
# Drop the probability again, rename the id column and attach topic names.
pred_topics = (
    pred_topics
    .drop(columns=["prob"])
    .rename(columns={"id": "text_id"})
    .merge(topic_meta_df, on="topic", how="left")
)
pred_topics

Unnamed: 0,text_id,topic,topic_name
0,09DB075C4ACE,25,older younger younger students older students
1,476DAA9251E5,3,technology people use use technology
2,23D479DD8197,14,advice ask multiple choice
3,8F8BCC081DC6,0,online classes home students
4,A737705B81A4,14,advice ask multiple choice
...,...,...,...
9995,E7BF5968CCDA,3,technology people use use technology
9996,28A31D350C06,6,first impression impressions change
9997,B15CAAF6DF04,19,extracurricular activities students extracurri...
9998,46372618BF29,4,accomplish something always idle


In [18]:
# Attach the predicted topic id and topic name to every sampled essay.
# Re-running this cell merges again and yields suffixed duplicate columns.
df_sample = pd.merge(df_sample, pred_topics, how='left', on='text_id')

In [19]:
# https://www.kaggle.com/c/lish-moa/discussion/195195

class MultilabelStratifiedGroupKFold(_BaseKFold):
    """K-fold splitter that stratifies on multilabel targets and keeps small groups whole.

    Groups with at most 30 samples are placed in a fold as a unit, stratified on
    the per-group mean of ``y``. Samples of larger groups are stratified one by
    one, so such a group may be spread over several folds.

    ``y`` and ``groups`` are accessed through pandas APIs (``groupby``,
    ``value_counts``, ``isin``, ``map``, ``.loc``); presumably a DataFrame and a
    Series sharing one index — confirm against the caller.

    Parameters
    ----------
    n_splits : int, default 5
    random_state : forwarded to ``MultilabelStratifiedKFold`` and the base class
    shuffle : bool, default False
        The base class only accepts a real bool, so the former default of
        ``None`` made ``MultilabelStratifiedGroupKFold()`` raise ``TypeError``.
    """

    def __init__(self, n_splits=5, random_state=None, shuffle=False):
        super().__init__(n_splits=n_splits, random_state=random_state, shuffle=shuffle)

    def _iter_test_indices(self, X=None, y=None, groups=None):
        """Yield, for each fold, the positional indices of its test samples."""
        cv = MultilabelStratifiedKFold(
            n_splits=self.n_splits,
            random_state=self.random_state,
            shuffle=self.shuffle,
        )

        # Partition group labels by size: <= 30 samples vs. more than 30.
        value_counts = groups.value_counts()
        regular_indices = value_counts.loc[value_counts <= 30].index.sort_values()
        irregular_indices = value_counts.loc[value_counts > 30].index.sort_values()

        # Small groups: one row per group (mean target), whole group -> one fold.
        group_to_fold = {}
        tmp = y.groupby(groups).mean().loc[regular_indices]

        for fold, (_, test) in enumerate(cv.split(tmp, tmp)):
            group_to_fold.update({group: fold for group in tmp.index[test]})

        # Large groups: each sample gets its own fold.
        sample_to_fold = {}
        tmp = y.loc[groups.isin(irregular_indices)]

        for fold, (_, test) in enumerate(cv.split(tmp, tmp)):
            sample_to_fold.update({sample: fold for sample in tmp.index[test]})

        # Group-level assignment first; rows still NaN belong to large groups
        # and are filled from the per-sample mapping.
        folds = groups.map(group_to_fold)
        is_na = folds.isna()
        folds[is_na] = folds[is_na].index.map(sample_to_fold).values

        for i in range(self.n_splits):
            yield np.where(folds == i)[0]

In [20]:
# ====================================================
# CV split
# ====================================================
# Group K-fold on the predicted topic, CFG.n_fold splits: rows sharing a topic
# always land in the same validation fold.
fold_col = 'fold_c4'
Fold = GroupKFold(n_splits=CFG.n_fold)
for n, (train_index, val_index) in enumerate(Fold.split(df_sample, groups=df_sample["topic"])):
    df_sample.loc[val_index, fold_col] = int(n)
# Filling via .loc leaves the column as float; cast back to int.
df_sample[fold_col] = df_sample[fold_col].astype(int)
display(df_sample.groupby(fold_col).size())

fold_c4
0    2500
1    2500
2    2500
3    2500
dtype: int64

In [21]:
# ====================================================
# CV split
# ====================================================
# Topic-grouped assignment again, with five folds.
fold_col = 'fold_c5'
Fold = GroupKFold(n_splits=5)
for n, (train_index, val_index) in enumerate(Fold.split(df_sample, groups=df_sample["topic"])):
    df_sample.loc[val_index, fold_col] = int(n)
# Filling via .loc leaves the column as float; cast back to int.
df_sample[fold_col] = df_sample[fold_col].astype(int)
display(df_sample.groupby(fold_col).size())

fold_c5
0    2000
1    2000
2    2000
3    2000
4    2000
dtype: int64

In [22]:
# First rows of the sample, now carrying the topic columns and all four fold columns.
df_sample.head()

Unnamed: 0,text_id,full_text,fold_a4,fold_a5,topic,topic_name,fold_c4,fold_c5
0,09DB075C4ACE,Every teacher should use this in their class r...,0,0,25,older younger younger students older students,0,0
1,476DAA9251E5,In my opinon I don't think that driverless car...,3,4,3,technology people use use technology,0,0
2,23D479DD8197,"When you are seeking advice, it is best to see...",2,2,14,advice ask multiple choice,2,2
3,8F8BCC081DC6,"Home schooling, also known as home education, ...",0,0,0,online classes home students,3,3
4,A737705B81A4,some people need help on choosing what car the...,3,4,14,advice ask multiple choice,2,2


In [23]:
# Write the sampled essays (with fold and topic columns) to the working directory.
df_sample.to_csv(
    'feedback3_pseudo_dataset.csv',
    index=False,
)

In [24]:
# ====================================================
# CFG
# ====================================================
class CFG1:
    """Settings for the first checkpoint set (weights and OOF file under ``path``)."""

    # --- artifacts ---
    # earlier value of `path`: "../input/fb3-deberta-v3-base-train/"
    path = "../input/fb3-debertav3base-meanpool-exp030/"
    config_path = '../input/fb3-deberta-v3-base-train/config.pth'

    # --- backbone ---
    model = "microsoft/deberta-v3-base"
    gradient_checkpointing = False
    freezing = False
    is_reinit_layer = False
    is_mixout = False

    # --- data loading ---
    num_workers = 4
    batch_size = 24
    max_len = 800

    # --- head / targets ---
    target_cols = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
    target_size = 6
    fc_dropout = 0.2

    # --- folds ---
    seed = 42
    n_fold = 5
    trn_fold = [0, 1, 2, 3, 4]

In [25]:
class CFG2:
    """Settings for the second checkpoint set (weights and OOF file under ``path``)."""

    # --- artifacts ---
    # earlier value of `path`: "../input/fb3-deberta-v3-base-train/"
    path = "../input/fb3-debertav3base-lhl-exp031/"
    config_path = '../input/fb3-deberta-v3-base-train/config.pth'

    # --- backbone ---
    model = "microsoft/deberta-v3-base"
    gradient_checkpointing = False
    freezing = False
    is_reinit_layer = False
    is_mixout = False

    # --- data loading ---
    num_workers = 4
    batch_size = 24
    max_len = 800

    # --- head / targets ---
    target_cols = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
    target_size = 6
    fc_dropout = 0.2

    # --- folds ---
    seed = 42
    n_fold = 5
    trn_fold = [0, 1, 2, 3, 4]

In [26]:
class CFG3:
    """Settings for the third checkpoint set (weights and OOF file under ``path``)."""

    # --- artifacts ---
    # earlier value of `path`: "../input/fb3-deberta-v3-base-train/"
    path = "../input/fb3-debertav3base-lhl-exp032/"
    config_path = '../input/fb3-deberta-v3-base-train/config.pth'

    # --- backbone ---
    model = "microsoft/deberta-v3-base"
    gradient_checkpointing = False
    freezing = False
    is_reinit_layer = False
    is_mixout = False

    # --- data loading ---
    num_workers = 4
    batch_size = 24
    max_len = 800

    # --- head / targets ---
    target_cols = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
    target_size = 6
    fc_dropout = 0.2

    # --- folds ---
    seed = 42
    n_fold = 5
    trn_fold = [0, 1, 2, 3, 4]

In [27]:
# ====================================================
# tokenizer
# ====================================================
# All three configs load the tokenizer from the same saved directory; each one
# receives its own tokenizer instance.
tokenizer_dir = '../input/fb3-deberta-v3-base-train/tokenizer/'
for cfg_cls in (CFG1, CFG2, CFG3):
    cfg_cls.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)

In [28]:
# ====================================================
# Utils
# ====================================================
def MCRMSE(y_trues, y_preds):
    """Mean column-wise root-mean-squared error.

    Args:
        y_trues: 2-D array-like of shape (n_samples, n_targets) with reference values.
        y_preds: 2-D array-like of the same shape with predictions.

    Returns:
        ``(mcrmse_score, scores)``: the mean of the per-column RMSE values and
        the list of those per-column values (one entry per target column).

    The RMSE is computed directly with NumPy rather than through
    ``mean_squared_error(..., squared=False)``, whose ``squared`` argument is
    deprecated/removed in newer scikit-learn releases.
    """
    y_trues = np.asarray(y_trues)
    y_preds = np.asarray(y_preds)
    # Squared error averaged over rows (axis 0) gives one MSE per target column.
    per_column_rmse = np.sqrt(np.mean((y_trues - y_preds) ** 2, axis=0))
    scores = list(per_column_rmse)
    mcrmse_score = np.mean(scores)
    return mcrmse_score, scores


def get_score(y_trues, y_preds):
    """Return ``(mcrmse_score, scores)`` exactly as produced by ``MCRMSE``."""
    return MCRMSE(y_trues, y_preds)


def get_logger(filename='inference'):
    """Return this module's logger, writing bare messages to the console and a file.

    Args:
        filename: log file path without extension; ``.log`` is appended.

    Returns:
        The ``logging.Logger`` registered under ``__name__`` at INFO level with
        exactly two handlers (stream + file).

    ``getLogger`` returns the same object on every call, so handlers left over
    from an earlier call (for example when the cell is re-run) are closed and
    removed first; otherwise every message would be emitted once per call.
    """
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)

    for stale in list(logger.handlers):
        logger.removeHandler(stale)
        stale.close()

    for handler in (StreamHandler(), FileHandler(filename=f"{filename}.log")):
        handler.setFormatter(Formatter("%(message)s"))
        logger.addHandler(handler)
    return logger

# Module-wide logger; with the default filename it also writes to "inference.log".
LOGGER = get_logger()


def seed_everything(seed=42):
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) RNGs with ``seed``.

    Also exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic mode.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True
    
# Seed all RNGs once with the notebook-wide seed.
seed_everything(seed=CFG.seed)

In [29]:
# ====================================================
# oof
# ====================================================
def oof_score(CFG):
    """Load ``oof_df.pkl`` from ``CFG.path``, log its MCRMSE, return ``(preds, labels)``.

    ``labels`` are the ``CFG.target_cols`` columns and ``preds`` the matching
    ``pred_<col>`` columns, both as NumPy arrays.
    NOTE: ``read_pickle`` unpickles the file, so ``CFG.path`` must point at
    trusted artifacts only.
    """
    oof_frame = pd.read_pickle(CFG.path+'oof_df.pkl')
    pred_cols = [f"pred_{c}" for c in CFG.target_cols]
    preds = oof_frame[pred_cols].values
    labels = oof_frame[CFG.target_cols].values
    score, scores = get_score(labels, preds)
    LOGGER.info(f'Score: {score:<.4f}  Scores: {scores}')
    return preds,labels
    
# Score each model's out-of-fold predictions, then the plain average of the three.
# NOTE(review): averaging assumes the three OOF files list rows in the same
# order; `oof_labels` is simply taken from the last file read — confirm.
oof_by_model = []
for banner, cfg_cls in (
    ("CFG1 meanpool", CFG1),
    ("CFG2 Last hidden layer", CFG2),
    ("CFG3 MLP", CFG3),
):
    print(banner)
    model_preds, oof_labels = oof_score(cfg_cls)
    oof_by_model.append(model_preds)
    print()

oof_preds1, oof_preds2, oof_preds3 = oof_by_model
oof_preds = (oof_preds1+oof_preds2+oof_preds3)/3
print("ensemble score")
score, scores = get_score(oof_labels, oof_preds)
LOGGER.info(f'Score: {score:<.4f}  Scores: {scores}')

Score: 0.4529  Scores: [0.48548174861770604, 0.449232026659146, 0.41372176682887285, 0.45265425139941706, 0.4718113115607885, 0.4443987631087937]


CFG1 meanpool

CFG2 Last hidden layer


Score: 0.4541  Scores: [0.48860057205567936, 0.4474881525438986, 0.4164453838811333, 0.4533338333317574, 0.47056790109757873, 0.44804496599067967]
Score: 0.4531  Scores: [0.4889163803498122, 0.44698736838296643, 0.4145833086049907, 0.4523602645462454, 0.4683720779773138, 0.44760953065221415]
Score: 0.4506  Scores: [0.4846499126590049, 0.4453008330337343, 0.4118317158138092, 0.45035050001907356, 0.4675350293003804, 0.4438148764573951]



CFG3 MLP

ensemble score


In [30]:
def freeze(module):
    """Turn off gradient tracking for every parameter yielded by ``module.parameters()``."""
    for param in module.parameters():
        param.requires_grad_(False)

In [31]:
def replace_encoding_with_utf8(error: UnicodeError) -> Tuple[bytes, int]:
    """Codec error handler: emit the offending characters as UTF-8 bytes and resume after them."""
    offending = error.object[error.start : error.end]
    resume_at = error.end
    return offending.encode("utf-8"), resume_at


def replace_decoding_with_cp1252(error: UnicodeError) -> Tuple[str, int]:
    """Codec error handler: decode the offending bytes as cp1252 and resume after them."""
    offending = error.object[error.start : error.end]
    resume_at = error.end
    return offending.decode("cp1252"), resume_at

# Make both handlers available by name to the `errors=` argument of
# str.encode / bytes.decode.
for handler_name, handler in (
    ("replace_encoding_with_utf8", replace_encoding_with_utf8),
    ("replace_decoding_with_cp1252", replace_decoding_with_cp1252),
):
    codecs.register_error(handler_name, handler)

def resolve_encodings_and_normalize(text: str) -> str:
    """Repair mixed utf-8 / cp1252 encoding damage, then pass the text through ``unidecode``.

    The text goes through an encode/decode round trip; whenever a step fails,
    the registered error handlers fall back to the other encoding.
    """
    raw = text.encode("raw_unicode_escape")
    as_text = raw.decode("utf-8", errors="replace_decoding_with_cp1252")
    as_bytes = as_text.encode("cp1252", errors="replace_encoding_with_utf8")
    repaired = as_bytes.decode("utf-8", errors="replace_decoding_with_cp1252")
    return unidecode(repaired)

def del_empty_text(text):
    """Trim the text, replace newlines and tabs with spaces, and collapse repeated spaces."""
    flattened = text.strip().replace('\n', ' ').replace('\t', ' ')
    return re.sub(' +', ' ', flattened).strip()

In [32]:
# Clean every essay: repair encodings first, then normalise whitespace.
df_sample['full_text'] = (
    df_sample['full_text']
    .apply(resolve_encodings_and_normalize)
    .apply(del_empty_text)
)

In [33]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    """Tokenise one text with ``cfg.tokenizer`` into fixed-length int32 tensors.

    Args:
        cfg: config object providing ``tokenizer`` and ``max_len``.
        text: the string to encode.

    Returns:
        The tokenizer output with every value converted to a ``torch.int32``
        tensor, padded/truncated to ``cfg.max_len`` tokens.

    The length now comes from the ``cfg`` that was passed in; previously the
    global ``CFG.max_len`` was used, silently ignoring the per-model setting.
    ``padding='max_length'`` replaces the deprecated ``pad_to_max_length=True``.
    """
    inputs = cfg.tokenizer.encode_plus(
        text,
        return_tensors=None,
        add_special_tokens=True,
        max_length=cfg.max_len,
        padding='max_length',
        truncation=True
    )
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.int32)
    return inputs


class TestDataset(Dataset):
    """Dataset over the ``full_text`` column of ``df``; items are tokenised on access."""

    def __init__(self, cfg, df):
        self.cfg, self.texts = cfg, df['full_text'].values

    def __len__(self):
        n_texts = len(self.texts)
        return n_texts

    def __getitem__(self, item):
        # Tokenisation is delegated to prepare_input with this dataset's config.
        return prepare_input(self.cfg, self.texts[item])
    
    
def collate(inputs):
    """Trim every tensor in the batch to the longest attended sequence.

    The cut-off is the largest row sum of ``inputs["attention_mask"]``. The
    mapping is modified in place and returned.
    """
    longest = int(inputs["attention_mask"].sum(axis=1).max())
    for key in list(inputs.keys()):
        inputs[key] = inputs[key][:, :longest]
    return inputs

In [34]:
# ====================================================
# Model
# ====================================================
class MeanPooling(nn.Module):
    """Average the token vectors along dim 1, counting only positions where the mask is non-zero."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        # Broadcast the mask over the hidden dimension so it can weight each token.
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        masked_sum = (last_hidden_state * mask).sum(1)
        # Clamp guards against division by zero for rows with an all-zero mask.
        token_count = mask.sum(1).clamp(min=1e-9)
        return masked_sum / token_count
    
class MaxPooling(nn.Module):
    """Element-wise maximum over dim 1, with masked-out positions set to -1e4 first."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        # Out-of-place fill leaves the caller's tensor untouched.
        filled = last_hidden_state.masked_fill(mask == 0, -1e4)
        return filled.max(dim=1).values
    

class CustomModel1(nn.Module):
    """Transformer backbone + masked mean pooling + LayerNorm + linear regression head.

    Args:
        cfg: config object; reads ``model``, ``gradient_checkpointing``,
            ``freezing`` and ``target_size``.
        config_path: optional path to a saved backbone config (loaded with
            ``torch.load``); when ``None`` the config is fetched for ``cfg.model``
            with all dropout probabilities set to zero.
        pretrained: load pretrained backbone weights when True, otherwise build
            the backbone from the config alone.
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            # NOTE(review): torch.load unpickles the file; only load trusted configs.
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel.from_config(self.config)
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        # Freezing
        if cfg.freezing:
            # freezing embeddings and first 2 layers of encoder
            freeze((self.model).embeddings)
            freeze((self.model).encoder.layer[:2])
            # A one-shot iterator over the parameters that remain trainable.
            cfg.after_freezed_parameters = filter(lambda parameter: parameter.requires_grad, (self.model).parameters())

        #if 'deberta-v2-xxlarge' in cfg.model:
        #    self.model.embeddings.requires_grad_(False)
        #    self.model.encoder.layer[:24].requires_grad_(False) # freeze 24/48
        #if 'deberta-v2-xlarge' in cfg.model:
        #    self.model.embeddings.requires_grad_(False)
        #    self.model.encoder.layer[:12].requires_grad_(False) # freeze 12/24
        #if 'funnel-transformer-xlarge' in cfg.model:
        #    self.model.embeddings.requires_grad_(False)
        #    self.model.encoder.blocks[:1].requires_grad_(False)

        self.high_dropout = nn.Dropout(p=0.5)

        self.pool = MeanPooling()

        # Kept inside nn.Sequential so parameter names stay "fc.0.*".
        # The output size comes from the cfg passed in (was the global CFG).
        self.fc = nn.Sequential(
            nn.Linear(self.config.hidden_size, cfg.target_size)
        )
        self.layer_norm1 = nn.LayerNorm(self.config.hidden_size,eps=1e-5)

        # _init_weights only matches Linear/Embedding/LayerNorm directly, so the
        # Sequential wrapper around the Linear layer is not re-initialised here.
        self._init_weights(self.fc)
        self._init_weights(self.layer_norm1)

    def _init_weights(self, module):
        """Initialise a Linear/Embedding/LayerNorm module in the backbone's style."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        """Return ``(pooled_feature, backbone_outputs)`` for a tokenised batch."""
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        feature = self.pool(last_hidden_states, inputs['attention_mask'])
        feature = self.layer_norm1(feature)
        return feature, outputs

    def forward(self, inputs=None, labels=None):
        """Predict the targets for a batch.

        Returns:
            Without ``labels``: the predictions as nested Python lists (after
            ``squeeze()``, so a batch of one yields a flat list).
            With ``labels``: ``(loss, predictions)`` where ``loss`` is the
            SmoothL1 loss tensor. The previous ``(loss,) + predictions``
            concatenated a tuple with a list and always raised ``TypeError``.
        """
        feature, _ = self.feature(inputs)
        # Multi-sample dropout: average the head over five dropout masks.
        logits = torch.mean(
            torch.stack(
                [self.fc(self.high_dropout(feature)) for _ in range(5)],
                dim=0,
            ),
            dim=0,
        )

        # calculate loss
        loss = None
        if labels is not None:
            loss_fn = nn.SmoothL1Loss(reduction='mean')
            loss = loss_fn(logits, labels)

        output = logits.detach().cpu().numpy().squeeze().tolist()
        return (loss, output) if loss is not None else output

In [35]:
class CustomModel2(nn.Module):
    """Transformer backbone whose first-token vector from hidden state 12 feeds a linear head.

    Args:
        cfg: config object; reads ``model``, ``is_mixout``, ``is_reinit_layer``,
            ``gradient_checkpointing``, ``freezing`` and ``target_size``.
        config_path: optional path to a saved backbone config (loaded with
            ``torch.load``); when ``None`` the config is fetched for ``cfg.model``
            with all dropout probabilities set to zero.
        pretrained: load pretrained backbone weights when True, otherwise build
            the backbone from the config alone.
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            # NOTE(review): torch.load unpickles the file; only load trusted configs.
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel.from_config(self.config)
        # NOTE(review): replace_mixout / reinit_layers and CFG.mixout /
        # CFG.num_reinit_layers are not defined in the visible part of this
        # notebook; both branches are disabled by the current configs — confirm
        # before enabling them.
        if cfg.is_mixout:
            self.model = replace_mixout(self.model, CFG.mixout)
            print(f"Initialized Mixout (p={CFG.mixout}) Regularization")
        if cfg.is_reinit_layer:
            self.model = reinit_layers(self.model)
            print(f'Reinitializing Last {CFG.num_reinit_layers} Layers.')
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        # Freezing
        if cfg.freezing:
            # freezing embeddings and first 2 layers of encoder
            freeze((self.model).embeddings)
            freeze((self.model).encoder.layer[:2])
            # A one-shot iterator over the parameters that remain trainable.
            cfg.after_freezed_parameters = filter(lambda parameter: parameter.requires_grad, (self.model).parameters())

        #if 'microsoft/deberta-xlarge' in CFG.model:
        #    self.model.embeddings.requires_grad_(False)
        #    self.model.encoder.layer[:24].requires_grad_(False)
        #if 'microsoft/deberta-v2-xlarge' in CFG.model:
        #    self.model.embeddings.requires_grad_(False)
        #    self.model.encoder.layer[:12].requires_grad_(False)

        self.high_dropout = nn.Dropout(p=0.5)

        self.fc = nn.Linear(self.config.hidden_size, cfg.target_size)

        self._init_weights(self.fc)

    def _init_weights(self, module):
        """Initialise a Linear/Embedding/LayerNorm module in the backbone's style."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        """Return ``(first_token_feature, backbone_outputs)`` for a tokenised batch."""
        outputs = self.model(**inputs)
        all_hidden_states = torch.stack(outputs.hidden_states)
        # hidden_states[layer_index + 1] is read, i.e. entry 12. Presumably entry 0
        # is the embedding output, making this the last layer of a 12-layer
        # encoder (not the second to last) — confirm for the backbone in use.
        layer_index = 11
        feature = all_hidden_states[layer_index+1, :, 0]
        return feature, outputs

    def forward(self, inputs=None, labels=None):
        """Predict the targets for a batch.

        Returns:
            Without ``labels``: the predictions as nested Python lists (after
            ``squeeze()``, so a batch of one yields a flat list).
            With ``labels``: ``(loss, predictions)`` where ``loss`` is the
            SmoothL1 loss tensor. The previous ``(loss,) + predictions``
            concatenated a tuple with a list and always raised ``TypeError``.
        """
        feature, _ = self.feature(inputs)
        # Multi-sample dropout: average the head over five dropout masks.
        logits = torch.mean(
            torch.stack(
                [self.fc(self.high_dropout(feature)) for _ in range(5)],
                dim=0,
            ),
            dim=0,
        )

        # calculate loss
        loss = None
        if labels is not None:
            loss_fn = nn.SmoothL1Loss(reduction='mean')
            loss = loss_fn(logits, labels)

        output = logits.detach().cpu().numpy().squeeze().tolist()
        return (loss, output) if loss is not None else output

In [36]:
class CustomModel3(nn.Module):
    """Transformer backbone with a three-way pooled regression head.

    The last hidden state is summarised three ways - ``MeanPooling`` over the
    attention mask, a learned attention-weighted sum, and the first token - and
    the concatenation is projected back to ``hidden_size``, layer-normalised
    and mapped to ``cfg.target_size`` outputs with multi-sample dropout.
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        """Build the backbone and the pooling / regression head.

        Args:
            cfg: configuration object; this constructor reads ``model``,
                ``is_mixout``, ``is_reinit_layer``, ``gradient_checkpointing``,
                ``freezing`` and ``target_size`` from it and, when freezing is
                on, writes ``after_freezed_parameters`` back to it.
            config_path: optional path to a saved backbone config. When
                ``None`` the config is created from ``cfg.model`` with all
                dropout probabilities set to 0.
            pretrained: load pretrained backbone weights when True, otherwise
                build the backbone from the config only.
        """
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            # NOTE: torch.load unpickles the file - only point this at trusted configs.
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel.from_config(self.config)
        if cfg.is_mixout:
            # NOTE(review): the probability comes from the global CFG, not from
            # the `cfg` argument that enabled mixout - confirm this is intended.
            self.model = replace_mixout(self.model, CFG.mixout)
            print(f"Initialized Mixout (p={CFG.mixout}) Regularization")
        if cfg.is_reinit_layer:
            self.model = reinit_layers(self.model)
            print(f'Reinitializing Last {CFG.num_reinit_layers} Layers.')
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        # Freezing
        if cfg.freezing:
            # freezing embeddings and first 2 layers of encoder
            freeze((self.model).embeddings)
            freeze((self.model).encoder.layer[:2])
            # `filter` yields a one-shot iterator: it can be consumed only once.
            cfg.after_freezed_parameters = filter(lambda parameter: parameter.requires_grad, (self.model).parameters())

        #if 'microsoft/deberta-xlarge' in CFG.model:
        #    self.model.embeddings.requires_grad_(False)
        #    self.model.encoder.layer[:24].requires_grad_(False)
        #if 'microsoft/deberta-v2-xlarge' in CFG.model:
        #    self.model.embeddings.requires_grad_(False)
        #    self.model.encoder.layer[:12].requires_grad_(False)

        self.high_dropout = nn.Dropout(p=0.5)
        self.layer_norm1 = nn.LayerNorm(self.config.hidden_size,eps=1e-5)

        self.pool = MeanPooling()
        # Projects [mean | attention | first-token] (3 * hidden) back to hidden.
        self.concat_pool = nn.Linear(self.config.hidden_size*3, self.config.hidden_size)
        self.fc = nn.Linear(self.config.hidden_size, cfg.target_size)
        self._init_weights(self.fc)
        self._init_weights(self.concat_pool)
        self._init_weights(self.layer_norm1)

        # defining attention network for attention scores
        self.attention = nn.Sequential(
            nn.Linear(self.config.hidden_size, 512),
            nn.LayerNorm(512),
            nn.GELU(),
            #nn.Tanh(),
            nn.Linear(512, 1),
            #nn.Softmax(dim=1)
        )
        # Fix: `_init_weights(self.attention)` was a silent no-op because an
        # nn.Sequential matches none of its isinstance branches. `apply` visits
        # every sub-module so the Linear / LayerNorm layers really get
        # initialised. Weights loaded from a checkpoint afterwards are unaffected.
        self.attention.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise one Linear / Embedding / LayerNorm module in place.

        Linear and Embedding weights are drawn from N(0,
        ``self.config.initializer_range``); a linear bias and an embedding's
        padding row are zeroed; LayerNorm gets weight 1 and bias 0. Other
        module types are ignored.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        """Encode a batch and return the pooled feature plus the raw outputs.

        Args:
            inputs: mapping forwarded to the backbone; must contain
                ``'attention_mask'``, which is also used for both poolings.

        Returns:
            tuple ``(feature, outputs)``: the layer-normalised projection of
            the concatenated poolings, and the backbone output.
        """
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        # mean pooled sentence representation
        mean_feature = self.pool(last_hidden_states, inputs['attention_mask'])

        # attention based sentence representation
        weights = self.attention(last_hidden_states).float()
        # Masked positions get -inf so that softmax assigns them zero weight.
        weights[inputs['attention_mask']==0]=float('-inf')
        weights = torch.softmax(weights,1)
        attention_feature = torch.sum(weights * last_hidden_states, dim=1)

        # CLS Token representation
        cls_token_feature = last_hidden_states[:, 0, :] # only cls token
        # Concat them
        combine_feature = torch.cat([mean_feature, attention_feature, cls_token_feature], dim = -1)

        # MLP
        feature = self.concat_pool(combine_feature)

        feature = self.layer_norm1(feature)
        return feature, outputs

    def forward(self, inputs=None, labels=None):
        """Predict the targets and, when ``labels`` is given, the training loss.

        The head is applied five times, each after a fresh pass through
        ``self.high_dropout``, and the results are averaged.

        Returns:
            Without ``labels``: the predictions as a (nested) Python list.
            With ``labels``: the tuple ``(loss, predictions)`` where ``loss``
            is ``nn.SmoothL1Loss(reduction='mean')`` of logits vs. labels.

            ``squeeze()`` removes every size-1 axis, so a batch holding a single
            sample comes back as a flat list rather than a list of rows.
        """
        feature, _outputs = self.feature(inputs)
        logits = torch.mean(
            torch.stack(
                [self.fc(self.high_dropout(feature)) for _ in range(5)],
                dim=0,
            ),
            dim=0,
        )

        # calculate loss
        loss = None
        if labels is not None:
            loss_fn = nn.SmoothL1Loss(reduction='mean')
            loss = loss_fn(logits, labels)

        # The former `(logits,) + outputs[2:]` tuple was discarded right away;
        # only the logits are converted and returned.
        output = logits.detach().cpu().numpy().squeeze().tolist()

        # Fix: `(loss,) + output` added a tuple to a list -> TypeError whenever
        # labels were supplied.
        if loss is not None:
            return (loss, output)
        return output

In [37]:
# ====================================================
# inference
# ====================================================
def inference_fn(test_loader, model, device):
    """Run ``model`` over ``test_loader`` and return all predictions stacked.

    Args:
        test_loader: iterable of batches; each batch is a dict whose values
            support ``.to(device)``.
        model: callable module; ``model(inputs)`` must return something
            ``np.asarray`` can convert (the models in this notebook return
            nested Python lists).
        device: device the model and every batch are moved to.

    Returns:
        ``np.ndarray`` with the per-batch predictions concatenated along axis 0.

    Raises:
        ValueError: from ``np.concatenate`` when the loader yields no batches.
    """
    preds = []
    model.eval()
    model.to(device)
    tk0 = tqdm(test_loader, total=len(test_loader))
    for inputs in tk0:
        #inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            y_preds = model(inputs)

        batch_preds = np.asarray(y_preds)
        # The models' forward() squeezes its output, so a batch that holds a
        # single sample comes back 1-D. Restore the batch axis, otherwise a
        # trailing one-sample batch cannot be concatenated with the 2-D ones.
        n_samples = next((len(v) for v in inputs.values()), None)
        if n_samples == 1 and batch_preds.ndim == 1:
            batch_preds = batch_preds.reshape(1, -1)
        preds.append(batch_preds)
    predictions = np.concatenate(preds)
    return predictions

In [38]:
def _predict_with_model(model_cls, CFG, pseudo_folds, fold_id=None):
    """Load one fold checkpoint into ``model_cls`` and predict ``pseudo_folds``.

    Shared implementation of ``make_prediction1/2/3``, which previously were
    three copies differing only in the model class.

    Args:
        model_cls: model class, called as
            ``model_cls(CFG, config_path=CFG.config_path, pretrained=False)``.
        CFG: configuration object; ``batch_size``, ``num_workers``,
            ``config_path``, ``path`` and ``model`` are read from it.
        pseudo_folds: data handed to ``TestDataset``.
        fold_id: fold number used in the checkpoint file name. When ``None``
            the module-level variable ``fold`` is used, which is what the
            original functions always did.

    Returns:
        Whatever ``inference_fn`` returns for this model and loader.
    """
    if fold_id is None:
        # Backward-compatible fallback to the notebook-level loop variable.
        fold_id = fold
    test_dataset = TestDataset(CFG, pseudo_folds)
    test_loader = DataLoader(test_dataset,
                             batch_size=CFG.batch_size,
                             shuffle=False,
                             #collate_fn=collator,
                             num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
    model = model_cls(CFG, config_path=CFG.config_path, pretrained=False)
    # NOTE: torch.load unpickles the file - only load trusted checkpoints.
    state = torch.load(CFG.path+f"{CFG.model.replace('/', '-')}_fold{fold_id}_best.pth",
                       map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])
    # `device` is a module-level name defined elsewhere in the notebook.
    prediction = inference_fn(test_loader, model, device)
    del model,state
    return prediction


def make_prediction1(CFG,pseudo_folds,fold=None):
    """Predict ``pseudo_folds`` with ``CustomModel1``.

    ``fold`` selects the checkpoint; when omitted the module-level ``fold``
    variable is used.
    """
    return _predict_with_model(CustomModel1, CFG, pseudo_folds, fold)


def make_prediction2(CFG,pseudo_folds,fold=None):
    """Predict ``pseudo_folds`` with ``CustomModel2``.

    ``fold`` selects the checkpoint; when omitted the module-level ``fold``
    variable is used.
    """
    return _predict_with_model(CustomModel2, CFG, pseudo_folds, fold)


def make_prediction3(CFG,pseudo_folds,fold=None):
    """Predict ``pseudo_folds`` with ``CustomModel3``.

    ``fold`` selects the checkpoint; when omitted the module-level ``fold``
    variable is used.
    """
    return _predict_with_model(CustomModel3, CFG, pseudo_folds, fold)

In [39]:
# Pseudo-label every fold with the equal-weight average of the three models.
# Per-fold frames are collected and concatenated once after the loop instead
# of re-concatenating the growing frame on every iteration.
fold_frames = []
# The loop variable must stay named `fold`: the make_prediction* helpers pick
# the checkpoint file from the module-level `fold` when none is passed in.
for fold in CFG1.trn_fold:
    pseudo_folds = df_sample[df_sample['fold_c5']==fold].reset_index(drop=True)
    pred1 = make_prediction1(CFG1,pseudo_folds)
    pred2 = make_prediction2(CFG2,pseudo_folds)
    pred3 = make_prediction3(CFG3,pseudo_folds)
    prediction = (pred1+pred2+pred3)/3
    # NOTE(review): the target columns come from the global CFG rather than
    # CFG1/CFG2/CFG3 - confirm they all list the same columns.
    pseudo_folds[CFG.target_cols] = prediction
    fold_frames.append(pseudo_folds)
    del prediction, pred1, pred2, pred3; gc.collect()
    torch.cuda.empty_cache()

# pd.concat([]) raises, so keep the original "empty frame" result for no folds.
pseudo_df = pd.concat(fold_frames) if fold_frames else pd.DataFrame()

  0%|          | 0/84 [00:00<?, ?it/s]

  0%|          | 0/84 [00:00<?, ?it/s]

  0%|          | 0/84 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f0ba1d160e0>
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1358, in __del__
    self._shutdown_workers()
  File "/opt/conda/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1341, in _shutdown_workers
    if w.is_alive():
  File "/opt/conda/lib/python3.7/multiprocessing/process.py", line 151, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
AssertionError: can only test a child process
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f0ba1d160e0>
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1358, in __del__
    self._shutdown_workers()
  File "/opt/conda/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1341, in _shutdown_workers
    if w.is_alive():
  File "/opt/con

  0%|          | 0/84 [00:00<?, ?it/s]

  0%|          | 0/84 [00:00<?, ?it/s]

  0%|          | 0/84 [00:00<?, ?it/s]

  0%|          | 0/84 [00:00<?, ?it/s]

  0%|          | 0/84 [00:00<?, ?it/s]

  0%|          | 0/84 [00:00<?, ?it/s]

  0%|          | 0/84 [00:00<?, ?it/s]

  0%|          | 0/84 [00:00<?, ?it/s]

  0%|          | 0/84 [00:00<?, ?it/s]

  0%|          | 0/84 [00:00<?, ?it/s]

  0%|          | 0/84 [00:00<?, ?it/s]

  0%|          | 0/84 [00:00<?, ?it/s]

In [40]:
# Preview the first rows of the pseudo-labelled frame.
pseudo_df.head()

Unnamed: 0,text_id,full_text,fold_a4,fold_a5,topic,topic_name,fold_c4,fold_c5,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,09DB075C4ACE,Every teacher should use this in their class r...,0,0,25,older younger younger students older students,0,0,3.073352,3.059598,3.168643,3.166605,3.336765,3.029826
1,476DAA9251E5,In my opinon I don't think that driverless car...,3,4,3,technology people use use technology,0,0,3.783849,3.760673,3.734389,3.885926,4.013044,3.577289
2,FB9569BDE4BB,My option to this device is it is valueable to...,0,0,3,technology people use use technology,0,0,2.808153,2.689648,2.974849,2.784267,2.777584,2.659379
3,049B62690A5E,The new technology that is called the Facial A...,3,3,3,technology people use use technology,0,0,3.387714,3.298499,3.411449,3.366476,3.343423,2.966201
4,2D10F2D7A580,Driver good is very important for all people. ...,2,3,3,technology people use use technology,0,0,3.101705,3.026651,3.204725,3.066348,3.115303,2.757733


In [41]:
# Write the pseudo-labelled frame to a CSV file in the working directory,
# leaving out the DataFrame index.
pseudo_df.to_csv('feedback3_pseudo_v3base_ensemble.csv',index=False)