This notebook builds its predictions on top of the notebooks listed below.

1. Pretrain Roberta Model: https://www.kaggle.com/maunish/clrp-pytorch-roberta-pretrain
2. Finetune Roberta Model: https://www.kaggle.com/maunish/clrp-pytorch-roberta-finetune
3. Inference Notebook: https://www.kaggle.com/maunish/clrp-pytorch-roberta-inference
4. Roberta + SVM: this notebook

In [1]:
import os
import gc
import sys
import cv2
import math
import time
import tqdm
import random
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

from sklearn.svm import SVR
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold,StratifiedKFold

import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim import Adam, lr_scheduler
from torch.utils.data import Dataset, DataLoader

from transformers import (AutoModel, AutoTokenizer, 
                          AutoModelForSequenceClassification)
from sklearn.preprocessing import MinMaxScaler

import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff


from colorama import Fore, Back, Style
y_ = Fore.YELLOW
r_ = Fore.RED
g_ = Fore.GREEN
b_ = Fore.BLUE
m_ = Fore.MAGENTA
c_ = Fore.CYAN
sr_ = Style.RESET_ALL

In [2]:
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Softmax, Conv2D, MaxPooling2D, Dropout, BatchNormalization
from tensorflow.keras import regularizers

In [3]:
%%capture
!pip install ../input/nlp-packages/nltk-3.6.2-py3-none-any.whl
!pip install ../input/nlp-packages/pyphen-0.11.0-py3-none-any.whl

In [4]:
#import pandas as pd
#import plotly.express as px
#import matplotlib as plt
import spacy
#import numpy as np
import string
import nltk
from nltk.tokenize import SyllableTokenizer
from nltk import word_tokenize
import pyphen
import itertools
#from g2p_en import G2p
#from wordfreq import zipf_frequency
from collections import Counter
from spacy.matcher import Matcher
from spacy.util import filter_spans
#from sklearn.model_selection import train_test_split

nlp = spacy.load("en_core_web_lg")


In [5]:
# All three competition files come from the same Kaggle input directory.
# The original cell mixed an absolute '/kaggle/input/...' path with a relative
# '../input/...' one; a single relative root is used here, matching every other
# input path in this notebook.
DATA_DIR = '../input/commonlitreadabilityprize'

df_train = pd.read_csv(f'{DATA_DIR}/train.csv')
df_test = pd.read_csv(f'{DATA_DIR}/test.csv')
sample = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

In [6]:
# The feature helpers and the dataset class read the passage from a 'Text'
# column, so rename the competition's 'excerpt' column in both frames.
_excerpt_to_text = {'excerpt': 'Text'}
df_train = df_train.rename(columns=_excerpt_to_text)
df_test = df_test.rename(columns=_excerpt_to_text)

In [7]:
def add_feature(df, func, features, on='Text', args=()):
    """Return a copy of ``df`` with new column(s) computed from column ``on``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    func : callable
        Applied to every value of ``df[on]`` through ``Series.apply``.
    features : str, list or tuple
        Name(s) of the column(s) that receive the result. A single name may be
        passed bare; a list or tuple supplies several names.
    on : str
        Source column handed to ``func``.
    args : iterable
        Extra positional arguments forwarded to ``func``.

    Returns
    -------
    pandas.DataFrame
        The original columns (minus any that are overwritten) followed by the
        new feature column(s).
    """
    # Accept tuples as well as lists; anything else is one column name.
    if isinstance(features, (list, tuple)):
        features = list(features)
    else:
        features = [features]

    if not set(df.columns).isdisjoint(features):
        print('Overwriting existing features')

    # Compute before dropping so a feature may overwrite its own source column.
    computed = df[on].apply(func, args=tuple(args))

    base = df.drop(columns=features, errors='ignore')
    result = pd.concat([base, computed], axis=1)
    result.columns = list(base.columns) + features
    print(features)
    return result

def get_sent_list_spacy(doc):
    """Materialise the ``sents`` iterable of ``doc`` as a list."""
    sentences = [*doc.sents]
    return sentences

def get_sent_count(sent_list):
    """Return how many sentences ``sent_list`` holds."""
    n_sentences = len(sent_list)
    return n_sentences

def get_token_list_by_sent(sent):
    """Keep, sentence by sentence, only the tokens whose text is alphanumeric.

    ``sent`` is an iterable of sentences, each an iterable of tokens. A token
    is kept when ``str(token).isalnum()`` is true, which drops punctuation and
    whitespace tokens.
    """
    filtered = []
    for sentence in sent:
        kept = []
        for token in sentence:
            if str(token).isalnum():
                kept.append(token)
        filtered.append(kept)
    return filtered

def get_word_list_by_sent(token_list):
    """Map a per-sentence list of tokens to a per-sentence list of ``.text`` strings."""
    words_by_sentence = []
    for sentence_tokens in token_list:
        words_by_sentence.append([token.text for token in sentence_tokens])
    return words_by_sentence

def get_token_list_full(token_list):
    """Flatten a per-sentence list of tokens into one flat list, keeping order."""
    flat = []
    for sentence_tokens in token_list:
        flat.extend(sentence_tokens)
    return flat

def get_word_list_full(token_list):
    """Return the ``.text`` of every token in a flat token list."""
    words = []
    for token in token_list:
        words.append(token.text)
    return words

def get_sent_len(token_list):
    """Return the length of each sentence in a per-sentence token list."""
    return list(map(len, token_list))

def get_word_list_nltk(text):
    """Tokenize raw ``text`` with ``nltk.word_tokenize`` and return the tokens."""
    tokens = nltk.word_tokenize(text)
    return tokens

def get_word_len(token_list):
    """Return ``len(token)`` for every token in a flat token list."""
    lengths = []
    for token in token_list:
        lengths.append(len(token))
    return lengths

def get_pos_tag_spacy(token_list_sent):
    """Return, per sentence, the ``pos_`` attribute of each token."""
    tags_by_sentence = []
    for sentence_tokens in token_list_sent:
        tags_by_sentence.append([token.pos_ for token in sentence_tokens])
    return tags_by_sentence

def get_dep_tag_spacy(token_list_sent):
    """Return, per sentence, the ``dep_`` attribute of each token."""
    deps_by_sentence = []
    for sentence_tokens in token_list_sent:
        deps_by_sentence.append([token.dep_ for token in sentence_tokens])
    return deps_by_sentence

def get_verb_phrase_count(spacy_sent):
    '''Count verb phrases in each sentence.

    input : iterable of spaCy sentence spans
    output: list with one verb-phrase count per sentence

    A verb phrase is matched as an optional verb, any number of adverbs, any
    number of auxiliaries, then one or more verbs. Overlapping matches are
    reduced with ``filter_spans`` before counting.
    '''
    pattern = [{'POS': 'VERB', 'OP': '?'},
               {'POS': 'ADV', 'OP': '*'},
               {'POS': 'AUX', 'OP': '*'},
               {'POS': 'VERB', 'OP': '+'}]

    # The matcher does not depend on the sentence, so build it once instead of
    # once per sentence as the original loop did.
    matcher = Matcher(nlp.vocab)
    matcher.add("Verb phrase", [pattern])

    count_list = []
    for sent in spacy_sent:
        matches = matcher(sent)
        spans = [sent[start:end] for _, start, end in matches]
        count_list.append(len(filter_spans(spans)))

    return count_list

def get_noun_phrase_count(spacy_sent):
    '''Count noun chunks in each sentence.

    input : iterable of sentence objects exposing ``noun_chunks``
    output: list with one noun-chunk count per sentence
    '''
    return [sum(1 for _ in sent.noun_chunks) for sent in spacy_sent]


In [8]:
def generate_features(data):
    """Build the hand-crafted linguistic feature columns for every row of ``data``.

    ``data`` must contain a 'Text' column. Each step goes through
    ``add_feature``, which returns a new frame, so the caller's frame is left
    untouched. The result holds the intermediate columns (spaCy docs, token
    lists, counters) plus the numeric features: POS and dependency ratios per
    100 words, and percentile/mean statistics of the per-sentence and per-word
    lists.

    NOTE(review): ``get_pos_tag_nltk``, ``get_stop_word_count``,
    ``get_unique_word_count``, ``get_pos_count`` and ``get_dep_count`` are not
    defined in the cells visible here; they must exist in the kernel before
    this function is called.
    """
    data = add_feature(data, nlp, 'Spacy_Doc')
    data = add_feature(data, get_sent_list_spacy, 'Spacy_Sents', on='Spacy_Doc')
    data = add_feature(data, get_sent_count, 'Num_Sents', on='Spacy_Sents')
    data = add_feature(data, get_token_list_by_sent, 'Token_list_sent', on='Spacy_Sents')
    data = add_feature(data, get_word_list_by_sent, 'Word_list_sent', on='Token_list_sent')
    data = add_feature(data, get_token_list_full, 'Token_list', on='Token_list_sent')
    data = add_feature(data, get_word_list_full, 'Word_list_full', on='Token_list')
    data = add_feature(data, get_sent_len, 'Sent_len', on='Token_list_sent')
    data = add_feature(data, get_word_list_nltk, 'Word_list', on='Text')
    data = add_feature(data, len, 'Num_Words', on='Token_list')
    data = add_feature(data, len, 'Num_Chars', on='Text')
    data = add_feature(data, get_word_len, 'Word_len', on='Token_list')
##     data = add_feature(data, get_phone_count, 'Phone_count', on='Token_list')
##     data = add_feature(data, get_phone_list, 'Phone_list', on='Token_list')
    data = add_feature(data, get_pos_tag_nltk, 'Pos_tag_nltk', on='Word_list_sent')
    data = add_feature(data, get_pos_tag_spacy, 'Pos_tag_spacy', on='Token_list_sent')
    data = add_feature(data, get_dep_tag_spacy, 'Dep_tag_spacy', on='Token_list_sent')
##    data = add_feature(data, get_commonality_of_word, 'Commonality_list', on='Token_list')
    data = add_feature(data, get_stop_word_count, 'Stopwords_ratio', on='Token_list')
    data = add_feature(data, get_unique_word_count, 'Unique_words_ratio', on='Token_list')

    # Share of each part-of-speech tag, per 100 alphanumeric tokens.
    data = add_feature(data, get_pos_count, 'POS_counter', on='Pos_tag_spacy')
    for i in (['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN', 
               'NUM', 'PART', 'PRON', 'PROPN', 'SCONJ', 'VERB']):
        data[i+'_ratio'] = data['POS_counter'].apply(lambda x:x[i])
        data[i+'_ratio'] = data[i+'_ratio']*100/data['Num_Words']

    # Share of each dependency label, per 100 alphanumeric tokens.
    data = add_feature(data, get_dep_count, 'DEP_counter', on='Dep_tag_spacy')
    for i in (['ROOT', 'acl', 'acomp', 'advcl', 'advmod', 'agent', 'amod', 'appos', 'attr', 'aux', 'auxpass', 'case', 'cc', 
               'ccomp', 'compound', 'conj', 'csubj', 'csubjpass', 'dative', 'dep', 'det', 'dobj', 'expl', 'intj', 'mark', 
               'meta', 'neg', 'nmod', 'npadvmod', 'nsubj', 'nsubjpass', 'nummod', 'oprd', 'parataxis', 'pcomp', 'pobj', 'poss', 
               'preconj', 'predet', 'prep', 'prt', 'punct', 'quantmod', 'relcl', 'xcomp']):
        data[i+'_ratio'] = data['DEP_counter'].apply(lambda x:x[i])
        data[i+'_ratio'] = data[i+'_ratio']*100/data['Num_Words']

    data['Content_Word_ratio'] = data['NOUN_ratio'] + data['VERB_ratio'] + data['ADJ_ratio'] + data['ADV_ratio']
    data['Connectives_ratio'] = data['SCONJ_ratio'] + data['CCONJ_ratio']
    data = add_feature(data, get_verb_phrase_count, 'Verb_phrase_count', on='Spacy_Sents')
    data = add_feature(data, get_noun_phrase_count, 'Noun_phrase_count', on='Spacy_Sents')

    # Summary statistics of each list-valued column. The _Q1.._Q5 suffixes are
    # the 10th/25th/50th/75th/90th percentiles, not quartiles.
    for i in ['Sent_len', 'Word_len', 'Syl_count_pyphen', 'Syl_count_nltk', 
              'Verb_phrase_count', 'Noun_phrase_count']:
        if i not in data.columns:
            # The syllable columns are never created by this function; skip any
            # missing list column rather than failing with a KeyError.
            print(f'Skipping statistics for missing column {i}')
            continue
        quantiles = data[i].apply(lambda x:np.quantile(x, [0.10, 0.25, 0.50, 0.75, 0.90])).tolist()
        # Reuse data's index so the column-wise concat aligns row for row even
        # when data does not carry a default RangeIndex.
        df1 = pd.DataFrame(quantiles,
                           columns=[i+'_Q1', i+'_Q2', i+'_Q3', i+'_Q4', i+'_Q5'],
                           index=data.index)
        data = pd.concat([data, df1], axis=1)
        data[i+'_mean'] = data[i].apply(lambda x:np.mean(x))
    return data

In [9]:
# Numeric feature columns selected from the output of generate_features and
# later concatenated with the RoBERTa embeddings. List-valued and intermediate
# columns (spaCy docs, token lists, counters) are not listed here.
columns_to_keep = ['Num_Sents', 'Num_Words', 'Num_Chars', 'Stopwords_ratio', 'Unique_words_ratio', 'ADJ_ratio', 'ADP_ratio', 'ADV_ratio', 'AUX_ratio', 
    'CCONJ_ratio','DET_ratio', 'INTJ_ratio', 'NOUN_ratio', 'NUM_ratio', 'PART_ratio','PRON_ratio', 'PROPN_ratio', 'SCONJ_ratio', 'VERB_ratio',
    'ROOT_ratio', 'acl_ratio', 'acomp_ratio', 'advcl_ratio', 'advmod_ratio', 'agent_ratio', 'amod_ratio', 'appos_ratio', 'attr_ratio', 
    'aux_ratio', 'auxpass_ratio', 'case_ratio', 'cc_ratio', 'ccomp_ratio', 'compound_ratio', 'conj_ratio', 'csubj_ratio', 'csubjpass_ratio',
    'dative_ratio', 'dep_ratio', 'det_ratio', 'dobj_ratio', 'expl_ratio', 'intj_ratio', 'mark_ratio', 'meta_ratio', 'neg_ratio',
    'nmod_ratio', 'npadvmod_ratio', 'nsubj_ratio', 'nsubjpass_ratio', 'nummod_ratio', 'oprd_ratio', 'parataxis_ratio', 'pcomp_ratio',
    'pobj_ratio', 'poss_ratio', 'preconj_ratio', 'predet_ratio', 'prep_ratio', 'prt_ratio', 'punct_ratio', 'quantmod_ratio', 'relcl_ratio',
    'xcomp_ratio', 'Content_Word_ratio', 'Connectives_ratio', 'Sent_len_Q1', 'Sent_len_Q2', 'Sent_len_Q3', 'Sent_len_Q4', 'Sent_len_Q5',
    'Sent_len_mean', 'Word_len_Q1', 'Word_len_Q2', 'Word_len_Q3', 'Word_len_Q4', 'Word_len_Q5', 'Word_len_mean',
    'Verb_phrase_count_Q1', 'Verb_phrase_count_Q2', 'Verb_phrase_count_Q3', 'Verb_phrase_count_Q4', 'Verb_phrase_count_Q5', 'Verb_phrase_count_mean',
    'Noun_phrase_count_Q1', 'Noun_phrase_count_Q2', 'Noun_phrase_count_Q3', 'Noun_phrase_count_Q4', 'Noun_phrase_count_Q5', 'Noun_phrase_count_mean']

In [10]:
#df_train['weight'] = np.log(1 + (1/(df_train['standard_error'] + 1)))

In [11]:
# Build every linguistic feature for the training excerpts, then keep only the
# numeric columns used for modelling.
train_features = generate_features(df_train)[columns_to_keep]

['Spacy_Doc']
['Spacy_Sents']
['Num_Sents']
['Token_list_sent']
['Word_list_sent']
['Token_list']
['Word_list_full']
['Sent_len']
['Word_list']
['Num_Words']
['Num_Chars']
['Word_len']
['Syl_count_pyphen']
['Syl_count_nltk']
['Syl_list_pyphen']
['Syl_list_nltk']
['Pos_tag_nltk']
['Pos_tag_spacy']
['Dep_tag_spacy']
['Stopwords_ratio']
['Unique_words_ratio']
['POS_counter']
['DEP_counter']
['Verb_phrase_count']
['Noun_phrase_count']
['Adverb_phrase_count']
['Prepositional_phrase_count']
['Tree_max_depth_count']
['Tree_length_count']


In [12]:
# Same feature pipeline for the test excerpts, restricted to the same columns
# so both matrices share one layout.
test_features = generate_features(df_test)[columns_to_keep]

['Spacy_Doc']
['Spacy_Sents']
['Num_Sents']
['Token_list_sent']
['Word_list_sent']
['Token_list']
['Word_list_full']
['Sent_len']
['Word_list']
['Num_Words']
['Num_Chars']
['Word_len']
['Syl_count_pyphen']
['Syl_count_nltk']
['Syl_list_pyphen']
['Syl_list_nltk']
['Pos_tag_nltk']
['Pos_tag_spacy']
['Dep_tag_spacy']
['Stopwords_ratio']
['Unique_words_ratio']
['POS_counter']
['DEP_counter']
['Verb_phrase_count']
['Noun_phrase_count']
['Adverb_phrase_count']
['Prepositional_phrase_count']
['Tree_max_depth_count']
['Tree_length_count']


In [13]:
# Number of equal-width target bins: floor(1 + log2(n_rows)).
# The bins stand in for class labels so StratifiedKFold can stratify a
# continuous regression target.
n_train_rows = len(df_train)
num_bins = int(np.floor(1 + np.log2(n_train_rows)))
print(num_bins)
df_train.loc[:, 'bins'] = pd.cut(df_train['target'], bins=num_bins, labels=False)

target = df_train['target'].to_numpy()
bins = df_train['bins'].to_numpy()

def rmse_score(y_true,y_pred):
    """Return the root of ``mean_squared_error(y_true, y_pred)``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

12


In [14]:
# NOTE(review): holds the same values as `target` from the previous cell and is
# not referenced again in the visible cells.
train_targets = df_train['target'].to_numpy()

In [15]:
# Run-wide settings read by the dataset, the data loader, the fold splitters
# and the seeding helper.
config = dict(
    batch_size=128,   # excerpts per inference batch
    max_len=256,      # tokenizer padding / truncation length
    nfolds=5,         # folds for the stratified K-fold splits
    seed=42,          # seed for the RNGs and the fold shuffling
)

def seed_everything(seed=42):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Parameters
    ----------
    seed : int
        Value used for every random number generator.
    """
    random.seed(seed)
    # Fixed typo: the variable is PYTHONHASHSEED (was 'PYTHONASSEED'). Setting
    # it here only affects interpreters started after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one; a no-op without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels from run to run, which
    # works against the determinism requested above, so it is switched off.
    torch.backends.cudnn.benchmark = False

# Seed all RNGs once, before any model is loaded or any fold split is drawn.
seed_everything(seed=config['seed'])

In [16]:
class CLRPDataset(Dataset):
    """Dataset that tokenizes one excerpt per item, for inference only.

    Reads the 'Text' column of ``df``. Each item is the tokenizer output for
    one excerpt, padded or truncated to ``config['max_len']`` and returned as
    PyTorch tensors. No target is returned.
    """

    def __init__(self,df,tokenizer):
        self.tokenizer = tokenizer
        self.Text = df['Text'].to_numpy()

    def __len__(self):
        return len(self.Text)

    def __getitem__(self,idx):
        excerpt = self.Text[idx]
        return self.tokenizer(
            excerpt,
            padding='max_length',
            truncation=True,
            max_length=config['max_len'],
            return_tensors='pt',
        )

In [17]:
class AttentionHead(nn.Module):
    """Additive attention pooling over dimension 1 of the input.

    Each position is scored with ``V(tanh(W(x)))``, the scores are
    softmax-normalised along dim 1, and the input is summed along dim 1 with
    those weights.

    ``num_targets`` is accepted but unused. ``out_features`` stores
    ``hidden_dim``, although the pooled vector keeps the last dimension of the
    input (``in_features``); the two agree only when both sizes are equal.
    """

    def __init__(self, in_features, hidden_dim, num_targets):
        super().__init__()
        self.in_features = in_features
        self.middle_features = hidden_dim

        # Creation order is kept (W, then V) so seeded initialisation is unchanged.
        self.W = nn.Linear(in_features, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)
        self.out_features = hidden_dim

    def forward(self, features):
        scores = self.V(torch.tanh(self.W(features)))
        weights = torch.softmax(scores, dim=1)
        pooled = (weights * features).sum(dim=1)
        return pooled

In [18]:
class Model(nn.Module):
    """RoBERTa encoder with attention pooling, used here as a feature extractor.

    ``forward`` returns the attention-pooled vector and never applies
    ``dropout`` or ``linear``. Both layers are still constructed;
    NOTE(review): ``linear`` is presumably needed so that
    ``load_state_dict`` finds matching keys in the checkpoints — confirm before
    removing it.
    """

    def __init__(self):
        super().__init__()
        self.roberta = AutoModel.from_pretrained('../input/roberta-base')
        self.head = AttentionHead(768,768,1)
        self.dropout = nn.Dropout(0.1)
        self.linear = nn.Linear(self.head.out_features,1)

    def forward(self,**xb):
        # First element of the encoder output is pooled by the attention head.
        encoder_output = self.roberta(**xb)[0]
        return self.head(encoder_output)

In [19]:
def get_embeddings(df,path,plot_losses=True, verbose=True):
    """Embed every excerpt in ``df`` with the checkpoint stored at ``path``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Text' column.
    path : str
        File holding a ``Model`` state dict.
    plot_losses, verbose : bool
        Unused; kept so existing calls keep working.

    Returns
    -------
    numpy.ndarray
        One pooled embedding per row of ``df``, in row order.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"{device} is used")

    model = Model()
    # map_location lets a checkpoint saved on GPU load on a CPU-only machine.
    model.load_state_dict(torch.load(path, map_location=device))
    model.to(device)
    model.eval()

    tokenizer = AutoTokenizer.from_pretrained('../input/roberta-base')

    ds = CLRPDataset(df,tokenizer)
    dl = DataLoader(ds,
                  batch_size = config["batch_size"],
                  shuffle=False,
                  num_workers = 4,
                  pin_memory=True,
                  drop_last=False
                 )

    embeddings = list()
    with torch.no_grad():
        # Iterating the loader directly lets tqdm show the total batch count.
        for inputs in tqdm(dl):
            # Flatten everything after the batch axis before it reaches the model.
            inputs = {key:val.reshape(val.shape[0],-1).to(device) for key,val in inputs.items()}
            outputs = model(**inputs)
            outputs = outputs.detach().cpu().numpy()
            embeddings.extend(outputs)
    return np.array(embeddings)

In [20]:
# Embed the train and test excerpts with each of the five checkpoints.
_checkpoint_paths = [
    '../input/clr-roberta/model0/model0.bin',
    '../input/clr-roberta/model1/model1.bin',
    '../input/clr-roberta/model2/model2.bin',
    '../input/clr-roberta/model3/model3.bin',
    '../input/clr-roberta/model4/model4.bin',
]

# For every checkpoint the train frame is embedded first, then the test frame.
_embeddings_per_checkpoint = [
    (get_embeddings(df_train, _path), get_embeddings(df_test, _path))
    for _path in _checkpoint_paths
]

(
    (train_embeddings1, test_embeddings1),
    (train_embeddings2, test_embeddings2),
    (train_embeddings3, test_embeddings3),
    (train_embeddings4, test_embeddings4),
    (train_embeddings5, test_embeddings5),
) = _embeddings_per_checkpoint

cuda is used


Some weights of the model checkpoint at ../input/roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
23it [00:22,  1.00it/s]


cuda is used


Some weights of the model checkpoint at ../input/roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
1it [00:00,  4.85it/s]


cuda is used


Some weights of the model checkpoint at ../input/roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
23it [00:21,  1.05it/s]


cuda is used


Some weights of the model checkpoint at ../input/roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
1it [00:00,  4.21it/s]


cuda is used


Some weights of the model checkpoint at ../input/roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
23it [00:21,  1.05it/s]


cuda is used


Some weights of the model checkpoint at ../input/roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
1it [00:00,  4.91it/s]


cuda is used


Some weights of the model checkpoint at ../input/roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
23it [00:22,  1.04it/s]


cuda is used


Some weights of the model checkpoint at ../input/roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
1it [00:00,  4.97it/s]


cuda is used


Some weights of the model checkpoint at ../input/roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
23it [00:21,  1.05it/s]


cuda is used


Some weights of the model checkpoint at ../input/roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
1it [00:00,  4.90it/s]


In [21]:
def concat_data(train_embeddings, train_features, test_embeddings, test_features):
    """Join embeddings with hand-crafted features and min-max scale the result.

    The feature frames are converted to arrays and appended column-wise to the
    matching embedding matrix. A ``MinMaxScaler`` is fitted on the training
    matrix only and then applied to both matrices.

    Returns ``(train_data, test_data)``.
    """
    train_matrix = np.concatenate((train_embeddings, train_features.to_numpy()), axis=1)
    test_matrix = np.concatenate((test_embeddings, test_features.to_numpy()), axis=1)

    scaler = MinMaxScaler()
    scaled_train = scaler.fit_transform(train_matrix)
    scaled_test = scaler.transform(test_matrix)
    return scaled_train, scaled_test

In [22]:
# One scaled (train, test) design matrix per checkpoint: that checkpoint's
# embeddings plus the shared hand-crafted features.
(
    (train_data1, test_data1),
    (train_data2, test_data2),
    (train_data3, test_data3),
    (train_data4, test_data4),
    (train_data5, test_data5),
) = [
    concat_data(_train_emb, train_features, _test_emb, test_features)
    for _train_emb, _test_emb in [
        (train_embeddings1, test_embeddings1),
        (train_embeddings2, test_embeddings2),
        (train_embeddings3, test_embeddings3),
        (train_embeddings4, test_embeddings4),
        (train_embeddings5, test_embeddings5),
    ]
]

## svm

In [23]:
def get_preds_svm(X,y,X_test,bins=bins,nfolds=5,C=1,kernel='rbf'):
    """Fit an SVR per stratified fold and average the test predictions.

    Parameters
    ----------
    X, y : numpy.ndarray
        Training matrix and targets.
    X_test : numpy.ndarray
        Matrix to predict on.
    bins : array-like
        Discretised targets used as stratification labels. The default is
        bound when the function is defined.
    nfolds : int
        Number of folds; used for both the split and the final average.
    C, kernel
        Passed to ``SVR``.

    Returns
    -------
    numpy.ndarray
        Mean of the per-fold predictions for ``X_test``.

    NOTE(review): the fold scores printed in this notebook show one clearly
    worse fold per checkpoint — presumably the fold that checkpoint did not see
    during fine-tuning. Confirm before reading the mean as an out-of-fold
    estimate.
    """
    scores = list()
    preds = np.zeros((X_test.shape[0]))

    # Fix: the splitter used config['nfolds'] while the average divided by
    # `nfolds`; any mismatch mis-scaled the predictions.
    kfold = StratifiedKFold(n_splits=nfolds,shuffle=True,random_state=config['seed'])
    for k, (train_idx,valid_idx) in enumerate(kfold.split(X,bins)):
        model = SVR(C=C,kernel=kernel,gamma='auto')
        X_train,y_train = X[train_idx], y[train_idx]
        X_valid,y_valid = X[valid_idx], y[valid_idx]

        model.fit(X_train,y_train)
        prediction = model.predict(X_valid)
        score = rmse_score(y_valid,prediction)
        print(f'Fold {k} , rmse score: {score}')
        scores.append(score)
        preds += model.predict(X_test)

    print("mean rmse",np.mean(scores))
    return np.array(preds)/nfolds

In [24]:
# Cross-validated SVR predictions, one run per checkpoint's design matrix.
svm_preds1, svm_preds2, svm_preds3, svm_preds4, svm_preds5 = [
    get_preds_svm(_train_matrix, target, _test_matrix)
    for _train_matrix, _test_matrix in [
        (train_data1, test_data1),
        (train_data2, test_data2),
        (train_data3, test_data3),
        (train_data4, test_data4),
        (train_data5, test_data5),
    ]
]

Fold 0 , rmse score: 0.475105521713043
Fold 1 , rmse score: 0.2814163772050671
Fold 2 , rmse score: 0.28066531135409917
Fold 3 , rmse score: 0.26959251185277494
Fold 4 , rmse score: 0.2809759899547531
mean rmse 0.31755114241594745
Fold 0 , rmse score: 0.2509852681743025
Fold 1 , rmse score: 0.501382279324724
Fold 2 , rmse score: 0.24856747354374117
Fold 3 , rmse score: 0.24100000687770365
Fold 4 , rmse score: 0.2499254345645391
mean rmse 0.2983720924970021
Fold 0 , rmse score: 0.387631166523031
Fold 1 , rmse score: 0.41037792407406376
Fold 2 , rmse score: 0.4950962177771016
Fold 3 , rmse score: 0.3645208490806746
Fold 4 , rmse score: 0.3833122394248934
mean rmse 0.40818767937595285
Fold 0 , rmse score: 0.29535446388767217
Fold 1 , rmse score: 0.28628875452534563
Fold 2 , rmse score: 0.2909706430542025
Fold 3 , rmse score: 0.45483560724655364
Fold 4 , rmse score: 0.2899225503298736
mean rmse 0.3234744038087295
Fold 0 , rmse score: 0.40074538690793726
Fold 1 , rmse score: 0.4239944976419

In [25]:
# Equal-weight average of the five SVR prediction vectors, added left to right.
svm_preds = sum((svm_preds2, svm_preds3, svm_preds4, svm_preds5), svm_preds1) / 5

In [26]:
def get_preds_random_forest(X,y,X_test,bins=bins,nfolds=5):
    """Fit a random forest per stratified fold and average the test predictions.

    Parameters
    ----------
    X, y : numpy.ndarray
        Training matrix and targets.
    X_test : numpy.ndarray
        Matrix to predict on.
    bins : array-like
        Discretised targets used as stratification labels. The default is
        bound when the function is defined.
    nfolds : int
        Number of folds; used for both the split and the final average.

    Returns
    -------
    numpy.ndarray
        Mean of the per-fold predictions for ``X_test``.
    """
    scores = list()
    preds = np.zeros((X_test.shape[0]))

    # Fix: split with `nfolds` so it matches the divisor of the final average
    # (the splitter previously used config['nfolds']).
    kfold = StratifiedKFold(n_splits=nfolds,shuffle=True,random_state=config['seed'])
    for k, (train_idx,valid_idx) in enumerate(kfold.split(X,bins)):
        # Seeded so repeated runs grow the same forest; it was unseeded before.
        model = RandomForestRegressor(n_estimators = 20, max_depth=4,
                                      random_state=config['seed'])
        X_train,y_train = X[train_idx], y[train_idx]
        X_valid,y_valid = X[valid_idx], y[valid_idx]

        model.fit(X_train,y_train)
        prediction = model.predict(X_valid)
        score = rmse_score(y_valid,prediction)
        print(f'Fold {k} , rmse score: {score}')
        scores.append(score)
        preds += model.predict(X_test)

    print("mean rmse",np.mean(scores))
    return np.array(preds)/nfolds

In [27]:
# Cross-validated random-forest predictions, one run per checkpoint's design matrix.
rf_preds1, rf_preds2, rf_preds3, rf_preds4, rf_preds5 = [
    get_preds_random_forest(_train_matrix, target, _test_matrix)
    for _train_matrix, _test_matrix in [
        (train_data1, test_data1),
        (train_data2, test_data2),
        (train_data3, test_data3),
        (train_data4, test_data4),
        (train_data5, test_data5),
    ]
]

Fold 0 , rmse score: 0.4867270658484281
Fold 1 , rmse score: 0.31246027624311407
Fold 2 , rmse score: 0.3159134277561694
Fold 3 , rmse score: 0.2954668825123225
Fold 4 , rmse score: 0.3074782426879434
mean rmse 0.34360917900959553
Fold 0 , rmse score: 0.29129574826326177
Fold 1 , rmse score: 0.5318808836283744
Fold 2 , rmse score: 0.2856377091522112
Fold 3 , rmse score: 0.2845229604252749
Fold 4 , rmse score: 0.2913696787897526
mean rmse 0.336941396051775
Fold 0 , rmse score: 0.4148810077674712
Fold 1 , rmse score: 0.43771031708872427
Fold 2 , rmse score: 0.5253526362198212
Fold 3 , rmse score: 0.3966940814584421
Fold 4 , rmse score: 0.40910013757965413
mean rmse 0.43674763602282257
Fold 0 , rmse score: 0.3323736002777699
Fold 1 , rmse score: 0.33045178156127186
Fold 2 , rmse score: 0.33180497461738484
Fold 3 , rmse score: 0.47247894255046613
Fold 4 , rmse score: 0.32949179581259375
mean rmse 0.35932021896389726
Fold 0 , rmse score: 0.4293431875424499
Fold 1 , rmse score: 0.44912802179

In [28]:
# Equal-weight average of the five random-forest prediction vectors, added left to right.
rf_preds = sum((rf_preds2, rf_preds3, rf_preds4, rf_preds5), rf_preds1) / 5

In [29]:
def get_preds_adaboost(X,y,X_test,bins=bins,nfolds=5):
    """Fit an AdaBoost regressor per stratified fold and average the test predictions.

    Parameters
    ----------
    X, y : numpy.ndarray
        Training matrix and targets.
    X_test : numpy.ndarray
        Matrix to predict on.
    bins : array-like
        Discretised targets used as stratification labels. The default is
        bound when the function is defined.
    nfolds : int
        Number of folds; used for both the split and the final average.

    Returns
    -------
    numpy.ndarray
        Mean of the per-fold predictions for ``X_test``.
    """
    scores = list()
    preds = np.zeros((X_test.shape[0]))

    # Fix: split with `nfolds` so it matches the divisor of the final average
    # (the splitter previously used config['nfolds']).
    kfold = StratifiedKFold(n_splits=nfolds,shuffle=True,random_state=config['seed'])
    for k, (train_idx,valid_idx) in enumerate(kfold.split(X,bins)):
        # Seeded so repeated runs give the same ensemble; it was unseeded before.
        model = AdaBoostRegressor(n_estimators = 72, random_state=config['seed'])
        X_train,y_train = X[train_idx], y[train_idx]
        X_valid,y_valid = X[valid_idx], y[valid_idx]

        model.fit(X_train,y_train)
        prediction = model.predict(X_valid)
        score = rmse_score(y_valid,prediction)
        print(f'Fold {k} , rmse score: {score}')
        scores.append(score)
        preds += model.predict(X_test)

    print("mean rmse",np.mean(scores))
    return np.array(preds)/nfolds

In [30]:
# Cross-validated AdaBoost predictions, one run per checkpoint's design matrix.
adb_preds1, adb_preds2, adb_preds3, adb_preds4, adb_preds5 = [
    get_preds_adaboost(_train_matrix, target, _test_matrix)
    for _train_matrix, _test_matrix in [
        (train_data1, test_data1),
        (train_data2, test_data2),
        (train_data3, test_data3),
        (train_data4, test_data4),
        (train_data5, test_data5),
    ]
]

Fold 0 , rmse score: 0.4832599080339161
Fold 1 , rmse score: 0.3126622620506336
Fold 2 , rmse score: 0.3167980866278161
Fold 3 , rmse score: 0.29523696739996375
Fold 4 , rmse score: 0.31300414304427887
mean rmse 0.3441922734313217
Fold 0 , rmse score: 0.28953981808201656
Fold 1 , rmse score: 0.516734221805008
Fold 2 , rmse score: 0.2911677215046883
Fold 3 , rmse score: 0.2805088888845369
Fold 4 , rmse score: 0.3034286661633856
mean rmse 0.3362758632879271
Fold 0 , rmse score: 0.40853084897383546
Fold 1 , rmse score: 0.42985831144473324
Fold 2 , rmse score: 0.5066027327275975
Fold 3 , rmse score: 0.3935285885831211
Fold 4 , rmse score: 0.4056741148785237
mean rmse 0.42883891932156215
Fold 0 , rmse score: 0.32325396219320607
Fold 1 , rmse score: 0.3212028542592625
Fold 2 , rmse score: 0.31806155473060405
Fold 3 , rmse score: 0.4633780582287487
Fold 4 , rmse score: 0.3173333916302745
mean rmse 0.34864596420841915
Fold 0 , rmse score: 0.42633565287376685
Fold 1 , rmse score: 0.447856232043

In [31]:
# Equal-weight average of the five AdaBoost prediction vectors, added left to right.
adb_preds = sum((adb_preds2, adb_preds3, adb_preds4, adb_preds5), adb_preds1) / 5

In [32]:
# Final blend: equal-weight mean of the SVR, random-forest and AdaBoost ensembles.
# Item assignment is used instead of `sample.target = ...`: attribute
# assignment only updates a column that already exists and would otherwise
# set a plain Python attribute, leaving the CSV without predictions.
sample['target'] = (svm_preds+rf_preds+adb_preds)/3
sample.to_csv('submission.csv',index=False)

In [33]:
# Display the submission frame as a final sanity check of ids and predictions.
sample

Unnamed: 0,id,target
0,c0f722661,-0.434773
1,f0953f0a5,-0.648937
2,0df072751,-0.491356
3,04caf4e0c,-2.453608
4,0e63f8bea,-1.766856
5,12537fe78,-1.360294
6,965e592c0,0.278843
