In [1]:
import sys

if 'google.colab' in sys.modules:

    from google.colab import drive
    drive.mount('/content/drive')
    
    COMPETION_DATA_DIR = '/content/drive/MyDrive/input/kaggle/feedback-prize-english-language-learning'

    !pip install tokenizers
    !pip install transformers

    from sklearn.svm import SVR

    train_path = COMPETION_DATA_DIR + '/train.csv'
    test_path = COMPETION_DATA_DIR + '/test.csv'
    sub_path = COMPETION_DATA_DIR + '/sample_submission.csv'
    iter_path = COMPETION_DATA_DIR + '/iterativestratification'
    npy_path = COMPETION_DATA_DIR + '/embedding-ko'

elif 'kaggle_web_client' in sys.modules:

    COMPETION_DATA_DIR = '/kaggle/input'
    sys.path.append('../input/iterativestratification')
    from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
    from cuml.svm import SVR
    import cuml

    train_path = COMPETION_DATA_DIR + '/feedback-prize-english-language-learning/train.csv'
    test_path = COMPETION_DATA_DIR + '/feedback-prize-english-language-learning/test.csv'
    sub_path = COMPETION_DATA_DIR + '/feedback-prize-english-language-learning/sample_submission.csv'
    iter_path = COMPETION_DATA_DIR + '/iterativestratification'
    npy_path = COMPETION_DATA_DIR + '/embedding-ko'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import numpy as np
import pandas as pd
import os
import os
import gc
import re
import ast
# import sys
import copy
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
sys.path.append(iter_path)
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset
from torch.utils import checkpoint
import tokenizers
import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
%env TOKENIZERS_PARALLELISM=true
%matplotlib inline


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

env: TOKENIZERS_PARALLELISM=true


In [3]:
class CFG:
    eda = False
    debug = True
    load_bert_data = False
    train_staking = True
    train_ensumbling = True
    headrow = 3
    num_workers=4
    gradient_checkpointing=True
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=4
    encoder_lr=2e-5
    decoder_lr=2e-5
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=8
    max_len=512
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    target_cols=['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
    seed=42
    n_fold=25
    train=True
    
if CFG.debug:
    CFG.n_fold = 5

In [4]:
train = pd.read_csv(train_path)
train.head()

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5


In [5]:
dftr = pd.read_csv(train_path)
dftr["src"]="train"
dfte = pd.read_csv(test_path)
dfte["src"]="test"
print('Train shape:',dftr.shape,'Test shape:',dfte.shape,'Test columns:',dfte.columns)
df = pd.concat([dftr,dfte],ignore_index=True)

dftr.head()

Train shape: (3911, 9) Test shape: (3, 3) Test columns: Index(['text_id', 'full_text', 'src'], dtype='object')


Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions,src
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0,train
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5,train
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5,train
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0,train
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5,train


In [6]:
target_cols = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions',]

In [7]:
FOLDS = CFG.n_fold

skf = MultilabelStratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)
for i,(train_index, val_index) in enumerate(skf.split(dftr,dftr[target_cols])):
    dftr.loc[val_index,'FOLD'] = i
print('Train samples per fold:')
dftr.FOLD.value_counts()

Train samples per fold:


1.0    783
0.0    782
4.0    782
3.0    782
2.0    782
Name: FOLD, dtype: int64

In [8]:
from transformers import AutoModel,AutoTokenizer
import torch
import torch.nn.functional as F
from tqdm import tqdm

In [9]:
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output.last_hidden_state.detach().cpu()
    input_mask_expanded = (
        attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    )
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
        input_mask_expanded.sum(1), min=1e-9
    )

In [10]:
BATCH_SIZE = 4

class EmbedDataset(torch.utils.data.Dataset):
    def __init__(self,df):
        self.df = df.reset_index(drop=True)
    def __len__(self):
        return len(self.df)
    def __getitem__(self,idx):
        text = self.df.loc[idx,"full_text"]
        tokens = tokenizer(
                text,
                None,
                add_special_tokens=True,
                padding='max_length',
                truncation=True,
                max_length=MAX_LEN,return_tensors="pt")
        tokens = {k:v.squeeze(0) for k,v in tokens.items()}
        return tokens

ds_tr = EmbedDataset(dftr)
embed_dataloader_tr = torch.utils.data.DataLoader(ds_tr,\
                        batch_size=BATCH_SIZE,\
                        shuffle=False)
ds_te = EmbedDataset(dfte)
embed_dataloader_te = torch.utils.data.DataLoader(ds_te,\
                        batch_size=BATCH_SIZE,\
                        shuffle=False)

In [11]:
tokenizer = None
MAX_LEN = 640

def get_embeddings(MODEL_NM='', MAX=640, BATCH_SIZE=4, verbose=True):
    global tokenizer, MAX_LEN
    DEVICE="cuda"
    model = AutoModel.from_pretrained( MODEL_NM )
    tokenizer = AutoTokenizer.from_pretrained( MODEL_NM )
    MAX_LEN = MAX
    
    model = model.to(DEVICE)
    model.eval()
    all_train_text_feats = []
    for batch in tqdm(embed_dataloader_tr,total=len(embed_dataloader_tr)):
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        with torch.no_grad():
            model_output = model(input_ids=input_ids,attention_mask=attention_mask)
        sentence_embeddings = mean_pooling(model_output, attention_mask.detach().cpu())
        # Normalize the embeddings
        sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
        sentence_embeddings =  sentence_embeddings.squeeze(0).detach().cpu().numpy()
        all_train_text_feats.extend(sentence_embeddings)
    all_train_text_feats = np.array(all_train_text_feats)
    if verbose:
        print('Train embeddings shape',all_train_text_feats.shape)
        
    te_text_feats = []
    for batch in tqdm(embed_dataloader_te,total=len(embed_dataloader_te)):
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        with torch.no_grad():
            model_output = model(input_ids=input_ids,attention_mask=attention_mask)
        sentence_embeddings = mean_pooling(model_output, attention_mask.detach().cpu())
        # Normalize the embeddings
        sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
        sentence_embeddings =  sentence_embeddings.squeeze(0).detach().cpu().numpy()
        te_text_feats.extend(sentence_embeddings)
    te_text_feats = np.array(te_text_feats)
    if verbose:
        print('Test embeddings shape',te_text_feats.shape)
        
    return all_train_text_feats, te_text_feats

In [12]:
if CFG.load_bert_data:
    MODEL_NM = COMPETION_DATA_DIR + '/huggingface-deberta-variants/deberta-base/deberta-base'
    all_train_text_feats, te_text_feats = get_embeddings(MODEL_NM)

In [13]:
if CFG.load_bert_data:
    MODEL_NM =  COMPETION_DATA_DIR + '/deberta-v3-large/deberta-v3-large'
    all_train_text_feats2, te_text_feats2 = get_embeddings(MODEL_NM)

In [14]:
if CFG.load_bert_data:
    MODEL_NM = COMPETION_DATA_DIR +  '/huggingface-deberta-variants/deberta-large/deberta-large'
    all_train_text_feats3, te_text_feats3 = get_embeddings(MODEL_NM)

In [15]:
if CFG.load_bert_data:
    MODEL_NM = COMPETION_DATA_DIR + '/huggingface-deberta-variants/deberta-large-mnli/deberta-large-mnli'
    all_train_text_feats4, te_text_feats4 = get_embeddings(MODEL_NM, MAX=512)

In [16]:
if CFG.load_bert_data:
    MODEL_NM = COMPETION_DATA_DIR + '/huggingface-deberta-variants/deberta-xlarge/deberta-xlarge'
    all_train_text_feats5, te_text_feats5 = get_embeddings(MODEL_NM, MAX=512)

In [17]:
if CFG.load_bert_data:
    all_train_text_feats = np.concatenate([
        all_train_text_feats,
        all_train_text_feats2,
        all_train_text_feats3,
        all_train_text_feats4,
        all_train_text_feats5
    ],
        axis=1
    )

    te_text_feats = np.concatenate([
        te_text_feats,
        te_text_feats2,
        te_text_feats3,
        te_text_feats4,
        te_text_feats5
    ],
        axis=1
    )

    del all_train_text_feats2, te_text_feats2
    del all_train_text_feats3, te_text_feats3
    del all_train_text_feats4, te_text_feats4
    del all_train_text_feats5, te_text_feats5
    gc.collect()

    print('Our concatenated embeddings have shape', all_train_text_feats.shape )

In [18]:
if CFG.load_bert_data == False:
    all_train_text_feats = np.load(npy_path + '/all_train_text_feats.npy')
    te_text_feats = np.load(npy_path + '/te_text_feats.npy')

In [19]:
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression as LR
from sklearn.ensemble import RandomForestRegressor as RFR

In [20]:
if CFG.train_staking:
    models = [
        SVR(C = 1), 
        Ridge(alpha = 1), 
        # Ridge(alpha = 0.01), 
        # SVR(C=2), 
        Lasso(alpha = 1), 
        RFR(max_depth = 5, n_estimators = 10)
    ]
else:
    models = [
        Ridge(alpha = 1), 
    ]

In [21]:
from sklearn.metrics import mean_squared_error

preds = []
scores = []
stacks = []
def comp_score(y_true,y_pred):
    rmse_scores = []
    for i in range(len(target_cols)):
        rmse_scores.append(np.sqrt(mean_squared_error(y_true[:,i],y_pred[:,i])))
    return np.mean(rmse_scores)

for model in models:

    #for fold in tqdm(range(FOLDS),total=FOLDS):
    for fold in range(FOLDS):
        print('#'*25)
        print('### Fold',fold+1)
        print('#'*25)
        
        dftr_ = dftr[dftr["FOLD"]!=fold]
        dfev_ = dftr[dftr["FOLD"]==fold]
        
        tr_text_feats = all_train_text_feats[list(dftr_.index),:]
        ev_text_feats = all_train_text_feats[list(dfev_.index),:]
        
        ev_preds = np.zeros((len(ev_text_feats),6))
        test_preds = np.zeros((len(te_text_feats),6))
        stack_preds = np.zeros((len(all_train_text_feats[list(dftr.index),:]),6))
        
        for i,t in enumerate(target_cols):
            print(t,', ',end='')
            clf = model
            clf.fit(tr_text_feats, dftr_[t].values)
            ev_preds[:,i] = clf.predict(ev_text_feats)
            test_preds[:,i] = clf.predict(te_text_feats)
            
            stack_preds[:,i] = clf.predict(all_train_text_feats[list(dftr.index),:])
            
        print()
        score = comp_score(dfev_[target_cols].values,ev_preds)
        scores.append(score)
        
        stacks.append(stack_preds)
        
        print("Fold : {} RSME score: {}".format(fold,score))
        preds.append(test_preds)
    
print('#'*25)
print('Overall CV RSME =',np.mean(scores))

#########################
### Fold 1
#########################
cohesion , syntax , vocabulary , phraseology , grammar , conventions , 
Fold : 0 RSME score: 0.44901525564904227
#########################
### Fold 2
#########################
cohesion , syntax , vocabulary , phraseology , grammar , conventions , 
Fold : 1 RSME score: 0.45505588784493683
#########################
### Fold 3
#########################
cohesion , syntax , vocabulary , phraseology , grammar , conventions , 
Fold : 2 RSME score: 0.4549194404128242
#########################
### Fold 4
#########################
cohesion , syntax , vocabulary , phraseology , grammar , conventions , 
Fold : 3 RSME score: 0.4544619324404233
#########################
### Fold 5
#########################
cohesion , syntax , vocabulary , phraseology , grammar , conventions , 
Fold : 4 RSME score: 0.44908381558417226
#########################
### Fold 1
#########################
cohesion , syntax , vocabulary , phraseology , grammar , co

In [36]:
if CFG.train_staking:
    stack_scores = []
    stack_models = []
    for i,t in enumerate(target_cols):
        # stack_model = Ridge(alpha=1)
        stack_model = LR()
        stack_train = pd.DataFrame()
        stack_train_mean = pd.DataFrame()

        for ii in range(len(stacks)):
            stack_train = pd.concat([stack_train, pd.DataFrame(stacks[ii])[i]], axis = 1)

        for jj in range(1, len(models)+1):
            stack_train_mean = pd.concat([stack_train_mean, stack_train.iloc[:, CFG.n_fold * (jj-1): CFG.n_fold * (jj)].mean(axis = 1)], axis = 1)
            
        stack_model.fit(stack_train_mean, dftr[t].values)
        stack_score = np.sqrt(mean_squared_error(stack_model.predict(stack_train_mean) ,dftr[t].values))
        stack_models.append(stack_model)
        stack_scores.append(stack_score)
    print(stack_scores)
    print(np.mean(stack_scores))

[0.4273038260720669, 0.3992602696861912, 0.36648094464951275, 0.40296297107336226, 0.4103651194561909, 0.4008378413713191]
0.40120182871810717


In [37]:
if CFG.train_staking:
    sub = dfte.copy()
    for i,t in enumerate(target_cols): 

        stack_final_pred = pd.DataFrame()
        stack_final_pred_mean = pd.DataFrame()

        for ii in range(len(stacks)):
            stack_final_pred = pd.concat([stack_final_pred, pd.DataFrame(preds[ii])[i]], axis = 1)

        for jj in range(1, len(models)+1):
            stack_final_pred_mean = pd.concat([stack_final_pred_mean, stack_final_pred.iloc[:, CFG.n_fold * (jj-1): CFG.n_fold * (jj)].mean(axis = 1)], axis = 1)

        sub[t] = stack_models[i].predict(stack_final_pred_mean)

    sub_columns = pd.read_csv(sub_path).columns
    sub = sub[sub_columns]

In [34]:
sub.head()

Unnamed: 0,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0000C359D63E,3.174865,2.70234,2.976938,2.872545,2.654279,2.590044
1,000BAD50D026,2.538735,2.390987,2.649609,2.354741,2.18272,2.497001
2,00367BB2546B,3.528941,3.358224,3.521134,3.600692,3.364029,3.287919


In [45]:
if CFG.train_ensumbling == True:
    sub = dfte.copy()
    # sub.loc[:,target_cols] = np.average(np.array(preds),axis=0) #,weights=[1/s for s in scores]
    sub.loc[:,target_cols] = np.median(np.array(preds),axis=0) #,weights=[1/s for s in scores]
    sub_columns = pd.read_csv(sub_path).columns
    sub = sub[sub_columns]

# Postprocessing

In [46]:
for target_col in target_cols:
    sub[target_col] = sub[target_col].clip(1, 5)

In [47]:
sub.to_csv("submission.csv",index=None)
sub.head()

Unnamed: 0,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0000C359D63E,2.9992,2.817235,3.107124,2.967416,2.699145,2.666275
1,000BAD50D026,2.713738,2.490091,2.715764,2.404659,2.176832,2.650931
2,00367BB2546B,3.542935,3.382604,3.527941,3.583837,3.384495,3.333607


# Compare with high-score ensemble sub

In [48]:
sub_pred = pd.DataFrame(np.array([
    [
    '0000C359D63E', 
    2.94845644, 
    2.8085049300000002, 
    3.123012555,
    2.9891588150000006, 
    2.685284665, 
    2.66401342
    ], [
    '000BAD50D026', 
    2.67736871, 
    2.4952658199999997, 
    2.719941535,
    2.3817005, 
    2.1445083800000004, 
    2.636744025
    ], [
    '00367BB2546B', 
    3.60374894, 
    3.3973600150000003, 
    3.59641946, 
    3.518199885, 
    3.361896565, 
    3.240423105
    ]], dtype=object))
sub_pred.columns = sub.columns

In [49]:
sub_pred

Unnamed: 0,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0000C359D63E,2.948456,2.808505,3.123013,2.989159,2.685285,2.664013
1,000BAD50D026,2.677369,2.495266,2.719942,2.3817,2.144508,2.636744
2,00367BB2546B,3.603749,3.39736,3.596419,3.5182,3.361897,3.240423


In [50]:
sub.copy().drop(columns = 'text_id') - sub_pred.copy().drop(columns = 'text_id')

Unnamed: 0,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0.050743,0.00873,-0.015888,-0.021743,0.01386,0.002261
1,0.036369,-0.005174,-0.004177,0.022958,0.032323,0.014187
2,-0.060814,-0.014756,-0.068478,0.065637,0.022598,0.093184


In [51]:
np.mean(sub.copy().drop(columns = 'text_id') - sub_pred.copy().drop(columns = 'text_id'))

cohesion       0.008766
syntax        -0.003734
vocabulary    -0.029515
phraseology    0.022284
grammar        0.022927
conventions    0.036544
dtype: float64

In [52]:
np.mean(np.mean(sub.copy().drop(columns = 'text_id') - sub_pred.copy().drop(columns = 'text_id')))

0.009545623877222858