In [1]:
# !pip install spacy
# !pip install datasets
# !pip install transformers
# !pip install sentencepiece


In [3]:
import pandas as pd
import numpy as np
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from transformers import AutoTokenizer, AutoConfig, TFAutoModel
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense, Input, Dropout, GlobalAveragePooling1D,GlobalMaxPooling1D
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
import pickle
import os
from tensorflow.keras.backend import clear_session


os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [3]:
path = "/kaggle/working/models/"
# Check whether the specified path exists or not
isExist = os.path.exists(path)
if not isExist:

   # Create a new directory because it does not exist
   os.makedirs(path)

In [8]:
path_dir = "/kaggle/input/commonlit-evaluate-student-summaries"
Prompts_train = pd.read_csv(f'{path_dir}/prompts_train.csv')
Prompts_test = pd.read_csv(f'{path_dir}/prompts_test.csv')
Summaries_train = pd.read_csv(f'{path_dir}/summaries_train.csv')
Summaries_test = pd.read_csv(f'{path_dir}/summaries_test.csv')

In [9]:
Summaries_train = Summaries_train.merge(Prompts_train, on="prompt_id", how="left")
Summaries_test = Summaries_test.merge(Prompts_test, on="prompt_id", how="left")

In [15]:
def clean_data(text):
    text = str(text).lower()
    text = re.sub(r'[\n\r]', '', str(text))
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = re.sub(' +', ' ', text)
    text = re.sub('\w*\d\w*', '', text)
    text = ' '.join([word for word in text.split() if word not in STOP_WORDS])
    return text

def get_max_length(df, column):
    max_length = df[column].apply(lambda x: len(x.split())).max()
    return max_length

def select_features(df,x_features, y_features):
    X = df[x_features].values
    y = df[y_features].values
    return X, y

def deberta_tokenizer(model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True, add_prefix_space=True)
    return tokenizer

def deberta_encode(texts, tokenizer, MAX_LENGTH):
    input_ids = []
    attention_mask = []
    
    for text in texts.tolist():
        token = tokenizer(text, 
                          add_special_tokens=True, 
                          max_length=MAX_LENGTH, 
                          return_attention_mask=True, 
                          return_tensors="np", 
                          truncation=True, 
                          padding='max_length')
        input_ids.append(token['input_ids'][0])
        attention_mask.append(token['attention_mask'][0])
    return np.array(input_ids, dtype="int32"), np.array(attention_mask, dtype="int32")

def get_dataset(df,input_ids,attention_mask):
    inputs = np.concatenate((input_ids, attention_mask), axis=1)
    targets = np.array(df, dtype="float32")
    return inputs, targets

def MCRMSE(y_trues, y_preds):
    scores = []
    idxes = y_trues.shape[1]
    for i in range(idxes):
        y_true = y_trues[:,i]
        y_pred = y_preds[:,i]
        score = mean_squared_error(y_true, y_pred, squared=False)
        scores.append(score)
    mcrmse_score = np.mean(scores)
    return mcrmse_score, scores

def score_loss(y_trues, y_preds):
    mcrmse_score, scores = MCRMSE(y_trues, y_preds)
    return {
        'mcrmse_score' : mcrmse_score,
        'Content_score' : scores[0],
        'Wording_score' : scores[1]
    }

def split_Kfold(data,splits):
    kf = KFold(n_splits=splits, shuffle=True, random_state=42)
    split = kf.split(data)
   
    for train_index, test_index in split:
        train_data = data.iloc[train_index]
        test_data = data.iloc[test_index]
        yield train_data, test_data

In [16]:
Summaries_train['text'] = Summaries_train['text'].apply(lambda x: clean_data(x))
Summaries_train['prompt_question'] = Summaries_train['prompt_question'].apply(lambda x: clean_data(x))
Summaries_train['prompt_title'] = Summaries_train['prompt_title'].apply(lambda x: clean_data(x))
Summaries_train['prompt_text'] = Summaries_train['prompt_text'].apply(lambda x: clean_data(x))

In [18]:
class CONFIG:
    model_name = "/kaggle/input/debertav3base"
    seed = 42
    batch_size = 4
    epochs = 10
    max_length = 100
    learning_rate = 2e-5
    dropout = 0.5
    n_splits = 4
    shuffle = True
    device = tf.config.list_physical_devices('GPU')
    train_cols = ['text','prompt_question','prompt_title','prompt_text']
    test_cols = ['content', 'wording']
    max_length = get_max_length(Summaries_train, 'prompt_text')
    decay_steps= 2800
    decay_rate = 0.2
    

In [9]:
# class CustomDebertaV2Embeddings(layers.Layer):
#     def __init__(self, config, **kwargs):
#         super(CustomDebertaV2Embeddings, self).__init__(**kwargs)
#         self.embeddings = TFDebertaV2Embeddings(config, name="embeddings")  # Use TFDebertaV2Embeddings here

#     def call(self, input_ids=None, position_ids=None, token_type_ids=None, inputs_embeds=None, mask=None, training=False):
#         if mask is not None:
#             if mask.dtype == tf.float32:  # Check if the mask is in float32 format
#                 mask = tf.cast(mask, dtype=tf.float16)  # Cast mask to float16 if it's in float32 format
#             mask = tf.expand_dims(mask, axis=2)  # Add an extra dimension for element-wise multiplication
#             final_embeddings = self.embeddings(  # Use self.embeddings to calculate embeddings
#                 input_ids=input_ids,
#                 position_ids=position_ids,
#                 token_type_ids=token_type_ids,
#                 inputs_embeds=inputs_embeds,
#                 mask=mask,
#                 training=training,
#             )
#             final_embeddings = final_embeddings * mask

#         return final_embeddings



In [10]:
def get_model(model_name, MAX_LENGTH):
    if CONFIG.device:
        with tf.device('/GPU:0'):
            config = AutoConfig.from_pretrained(model_name)
            config.update({'output_hidden_states': True, 
                        'hidden_dropout_prob': 0.005,
                        'layer_norm_eps': 1e-7,
                        'num_labels': 2,
                        "problem_type": "regression"})

            deberta_model = TFAutoModel.from_pretrained(model_name, config=config)

            input_ids = Input(shape=(MAX_LENGTH,), dtype=tf.int32, name="input_ids")
            attention_mask = Input(shape=(MAX_LENGTH,), dtype=tf.int32, name="attention_mask")

            sequence_output = deberta_model.deberta(input_ids, attention_mask=attention_mask)
            #sequence_output_layer = sequence_output.last_hidden_state[:, 0, :]
            #print(sequence_output_layer)
            max_pooling = GlobalAveragePooling1D()(sequence_output.last_hidden_state)
            dense_layer = Dense(2, activation="sigmoid")(max_pooling)
            output = layers.Rescaling(scale=6.0, offset=-2.0)(dense_layer)
            model = Model(inputs=[input_ids, attention_mask], outputs=output)
            lr_schedules = tf.keras.optimizers.schedules.ExponentialDecay(
            initial_learning_rate=CONFIG.learning_rate, 
            decay_steps=CONFIG.decay_steps, 
            decay_rate=CONFIG.decay_rate)
            model.compile(optimizer=Adam(learning_rate=CONFIG.learning_rate), loss='mse', metrics=['mse'])

    return model

def scheduler(epoch):
    learning_rate = CONFIG.learning_rate
    if epoch == 0:
        return learning_rate
    else:
        return learning_rate * (0.9**epoch)


In [11]:
MCRMSE_scores = []

def train_and_save_model(index,train_data, valid_data):
        tokenizer = deberta_tokenizer(CONFIG.model_name)

        # select features
        X_train, y_train = select_features(train_data, CONFIG.train_cols, CONFIG.test_cols)
        X_valid, y_valid = select_features(valid_data, CONFIG.train_cols, CONFIG.test_cols)

        # tokenize
        input_ids_train, attention_masks_train = deberta_encode(X_train, tokenizer, CONFIG.max_length)
        input_ids_valid, attention_masks_valid = deberta_encode(X_valid, tokenizer, CONFIG.max_length)

        # create dataset
        inputs_trains = [input_ids_train, attention_masks_train]
        targets_trains = np.array(y_train, dtype="float32")

        inputs_valids = [input_ids_valid, attention_masks_valid]
        targets_valids = np.array(y_valid, dtype="float32")

       
        tf.keras.backend.clear_session()
        # fit model
        model = get_model(CONFIG.model_name, CONFIG.max_length)
        print(model.summary())
        callbacks = [
        tf.keras.callbacks.ModelCheckpoint(f"{path}/model_fold_{index}.ckpt",
                                           monitor="val_loss",
                                           mode="min",
                                           save_best_only=True,
                                           verbose=1,
                                           save_weights_only=True,),
        tf.keras.callbacks.EarlyStopping(monitor='val_loss', 
                                         min_delta=1e-5, 
                                         patience=2, 
                                         verbose=1,
                                         mode='min',
                                        restore_best_weights=True), tf.keras.callbacks.LearningRateScheduler(scheduler)
        ]

        # Validation and other steps
        history = model.fit(
            inputs_trains,
            targets_trains,
            validation_data=(inputs_valids, targets_valids),
            epochs=CONFIG.epochs, 
            batch_size=CONFIG.batch_size,
            verbose=1,
            callbacks = callbacks
        )
        tf.keras.backend.clear_session()
    
        # evaluate model
        model.evaluate(inputs_valids, targets_valids)

        # Score model
        y_pred = model.predict(inputs_valids)

        # MCRMSE score
        print('MCRMSE score: ', MCRMSE(y_valid, y_pred))
        print('Score loss: ', score_loss(y_valid, y_pred))
        MCRMSE_scores.append(MCRMSE(y_valid, y_pred))
        
        model.save(f'{path}/model_{index}_{CONFIG.model_name}') 
#         tf.keras.saving.save_model(
#     model, filepath, overwrite=True, save_format=None, **kwargs
# )

In [12]:
for gpu in CONFIG.device:
    tf.config.experimental.set_memory_growth(gpu, True)

kf = KFold(n_splits=CONFIG.n_splits, shuffle=CONFIG.shuffle, random_state=CONFIG.seed)
for fold, (train_index, valid_index) in enumerate(kf.split(Summaries_train, groups = Summaries_train['prompt_id'])):
    tf.keras.backend.clear_session()
    train_data = Summaries_train.iloc[train_index]
    valid_data = Summaries_train.iloc[valid_index]

    print(f"Training Fold {fold}")
    train_and_save_model(fold,train_data, valid_data)

Training Fold 0


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
All model checkpoint layers were used when initializing TFDebertaV2Model.

All the layers of TFDebertaV2Model were initialized from the model checkpoint at /kaggle/input/debertav3base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDebertaV2Model for predictions without further training.


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 360)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 360)]        0           []                               
                                                                                                  
 deberta (TFDebertaV2MainLayer)  TFBaseModelOutput(l  183831552  ['input_ids[0][0]',              
                                ast_hidden_state=(N               'attention_mask[0][0]']         
                                one, 360, 768),                                                   
                                 hidden_states=((No                                           

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
All model checkpoint layers were used when initializing TFDebertaV2Model.

All the layers of TFDebertaV2Model were initialized from the model checkpoint at /kaggle/input/debertav3base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDebertaV2Model for predictions without further training.


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 360)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 360)]        0           []                               
                                                                                                  
 deberta (TFDebertaV2MainLayer)  TFBaseModelOutput(l  183831552  ['input_ids[0][0]',              
                                ast_hidden_state=(N               'attention_mask[0][0]']         
                                one, 360, 768),                                                   
                                 hidden_states=((No                                           

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
All model checkpoint layers were used when initializing TFDebertaV2Model.

All the layers of TFDebertaV2Model were initialized from the model checkpoint at /kaggle/input/debertav3base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDebertaV2Model for predictions without further training.


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 360)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 360)]        0           []                               
                                                                                                  
 deberta (TFDebertaV2MainLayer)  TFBaseModelOutput(l  183831552  ['input_ids[0][0]',              
                                ast_hidden_state=(N               'attention_mask[0][0]']         
                                one, 360, 768),                                                   
                                 hidden_states=((No                                           

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
All model checkpoint layers were used when initializing TFDebertaV2Model.

All the layers of TFDebertaV2Model were initialized from the model checkpoint at /kaggle/input/debertav3base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDebertaV2Model for predictions without further training.


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 360)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 360)]        0           []                               
                                                                                                  
 deberta (TFDebertaV2MainLayer)  TFBaseModelOutput(l  183831552  ['input_ids[0][0]',              
                                ast_hidden_state=(N               'attention_mask[0][0]']         
                                one, 360, 768),                                                   
                                 hidden_states=((No                                           

In [12]:
# preprocessing test data
Summaries_test['text'] = Summaries_test['text'].apply(lambda x: clean_data(x))
Summaries_test['prompt_question'] = Summaries_test['prompt_question'].apply(lambda x: clean_data(x))
Summaries_test['prompt_title'] = Summaries_test['prompt_title'].apply(lambda x: clean_data(x))
Summaries_test['prompt_text'] = Summaries_test['prompt_text'].apply(lambda x: clean_data(x))


In [19]:
tokenizer = deberta_tokenizer(CONFIG.model_name)
input_ids_train, attention_masks_train = deberta_encode(Summaries_test[CONFIG.train_cols].values, tokenizer, CONFIG.max_length)
inputs_trains = [input_ids_train, attention_masks_train]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
best_model = tf.keras.models.load_model('/kaggle/working/models/model_2_/kaggle/input/debertav3base')

In [20]:
best_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 360)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 360)]        0           []                               
                                                                                                  
 deberta (TFDebertaV2MainLayer)  {'last_hidden_state  183831552  ['input_ids[0][0]',              
                                ': (None, 360, 768)               'attention_mask[0][0]']         
                                , 'hidden_states':                                                
                                ((None, 360, 768),                                            

In [21]:
y_test_pred = best_model.predict(inputs_trains)



In [22]:
valid_final_df = pd.DataFrame(y_test_pred, columns = ['content', 'wordings']) 
Valid_final = pd.concat([Summaries_test['student_id'], valid_final_df], axis=1)
Valid_final.to_csv('submission.csv', index = False)