In [1]:
import re
import os
import numpy as np
import pandas as pd
import random
import math
import tensorflow as tf
import logging
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from tensorflow.keras import backend as K
from transformers import RobertaTokenizer, TFRobertaModel
from kaggle_datasets import KaggleDatasets
tf.get_logger().setLevel(logging.ERROR)
from kaggle_datasets import KaggleDatasets
from tqdm.notebook import tqdm
import re
import nltk

from nltk.corpus import stopwords
from nltk import pos_tag

In [2]:
# Configurations
EPOCHS = 50
# Batch size
BATCH_SIZE = 24
# Seed
SEED = 123
# Learning rate
LR = 0.000040
# Verbosity
VERBOSE = 1
# Number of folds for training
FOLDS = 5

# Max length
MAX_LEN = 250

ES_PATIENCE = 7
# PATIENCE = 2

# Get the trained model we want to use
MODEL = '../input/huggingface-roberta/roberta-base'

# Let's load our model tokenizer
tokenizer = RobertaTokenizer.from_pretrained(MODEL)

# For tf.dataset
AUTO = tf.data.experimental.AUTOTUNE

In [3]:
def preprocess(data):
    excerpt_processed = []
    for i in tqdm(data['excerpt']):
        
        i = re.sub("[^a-zA-Z]"," ",i)
        i = i.lower()
        i = nltk.word_tokenize(i)
        i = [word for word in i if not word in set(stopwords.words("english"))]
        
        lemma = nltk.WordNetLemmatizer()
        i = [lemma.lemmatize(word) for word in i]
        i=" ".join(i)
        excerpt_processed.append(i)
    return excerpt_processed

In [4]:
# Function to seed everything
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    tf.random.set_seed(seed)


In [5]:
# This function tokenize the text according to a transformers model tokenizer
def regular_encode(text, tokenizer, max_len=MAX_LEN):
    encode_dict = tokenizer.batch_encode_plus(
                text,
                padding = 'max_length',
                truncation = True,
                max_length = max_len)
    return np.array(encode_dict['input_ids'])

In [6]:
# This function encode our training sentences
def encode_texts(x_train, x_val, MAX_LEN):
    x_train = regular_encode(x_train.tolist(), tokenizer, max_len = MAX_LEN)
    x_val = regular_encode(x_val.tolist(), tokenizer, max_len = MAX_LEN)
    return x_train, x_val

def encode_texts_test(x_test, MAX_LEN):
    x_test = regular_encode(x_test.tolist(), tokenizer, max_len = MAX_LEN)
    return x_test

In [7]:
# Function to transform arrays to tensors
def transform_to_tensors(x_train, x_val, y_train, y_val):
    train_dataset = (
        tf.data.Dataset
        .from_tensor_slices((x_train, y_train))
        .repeat()
        .shuffle(2048)
        .batch(BATCH_SIZE)
        .prefetch(AUTO)
    )

    valid_dataset = (
        tf.data.Dataset
        .from_tensor_slices((x_val, y_val))
        .batch(BATCH_SIZE)
        .prefetch(AUTO)
    )
    
    
    return train_dataset, valid_dataset

def transform_to_tensors_test(x_test):
    test_dataset = (
        tf.data.Dataset
        .from_tensor_slices((x_test))
        .batch(BATCH_SIZE)
        .prefetch(AUTO)
    )
    
    
    return test_dataset

In [8]:
# Function to build our model
def build_roberta_base_model(max_len = MAX_LEN):
    transformer = TFRobertaModel.from_pretrained(MODEL)
    input_word_ids = tf.keras.layers.Input(shape = (max_len, ), dtype = tf.int32, name = 'input_word_ids')
    sequence_output = transformer(input_word_ids)[0]
    # We only need the cls_token, resulting in a 2d array
    cls_token = sequence_output[:, 0, :]
#     x = tf.keras.layers.Dropout(0.09)(cls_token)
    output = tf.keras.layers.Dense(1, activation = 'linear', dtype = 'float32')(cls_token)
    model = tf.keras.models.Model(inputs = [input_word_ids], outputs = [output])
    model.compile(optimizer = tf.keras.optimizers.Adam(lr = LR),
                  loss = [tf.keras.losses.MeanSquaredError()],
                  metrics = [tf.keras.metrics.RootMeanSquaredError()])
    return model

# Function to train and evaluate our model
test_predictions = []
def train_and_evaluate():
    
    # Read our training data
    df = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
    df_test = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
#     df['excerpt'] = preprocess(df)
#     df_test['excerpt'] = preprocess(df_test)
    # Seed everything
    seed_everything(SEED)
    
    # Initiate kfold object with shuffle and a specific seed
    kfold = KFold(n_splits = FOLDS, shuffle = True, random_state = SEED)
    # Create out of folds array to store predictions
    oof_predictions = np.zeros(len(df))
    
    for fold, (trn_ind, val_ind) in enumerate(kfold.split(df)):
        print('\n')
        print('-'*50)
        print(f'Training fold {fold + 1}')
        K.clear_session()
        # Get text features and target
        x_train, x_val = df['excerpt'].iloc[trn_ind], df['excerpt'].iloc[val_ind]
        y_train, y_val = df['target'].iloc[trn_ind].values, df['target'].iloc[val_ind].values
        # Encode our text with Roberta tokenizer
        x_train, x_val = encode_texts(x_train, x_val, MAX_LEN)
        x_test = encode_texts_test(df_test['excerpt'], MAX_LEN)
        # Function to transform our numpy array to a tf Dataset
        train_dataset, valid_dataset = transform_to_tensors(x_train, x_val, y_train, y_val)
        test_dataset = transform_to_tensors_test(x_test)
        # Build model
        model = build_roberta_base_model(max_len = MAX_LEN)
        # Model checkpoint
        es = tf.keras.callbacks.EarlyStopping(monitor='val_root_mean_squared_error', mode='min', 
                       patience=ES_PATIENCE, restore_best_weights=True, verbose=1)
        checkpoint = tf.keras.callbacks.ModelCheckpoint(f'Roberta_Base_{SEED}_{fold + 1}.h5', 
                                                        monitor = 'val_root_mean_squared_error', 
                                                        verbose = VERBOSE, 
                                                        save_best_only = True,
                                                        save_weights_only = True, 
                                                        mode = 'min')
        steps = x_train.shape[0] // (BATCH_SIZE)
        # Training phase
        history = model.fit(train_dataset,
                            batch_size = BATCH_SIZE,
                            epochs = EPOCHS,
                            verbose = VERBOSE,
                            callbacks = [es,checkpoint],
                            validation_data = valid_dataset,
                            steps_per_epoch = steps)
        
        
        # Load best epoch weights
        model.load_weights(f'Roberta_Base_{SEED}_{fold + 1}.h5')
        # Predict validation set to save them in the out of folds array
        val_pred = model.predict(valid_dataset)
        oof_predictions[val_ind] = val_pred.reshape(-1)
        test_pred = model.predict(test_dataset)
        test_predictions.append(test_pred)
    print('\n')
    print('-'*50)
    # Calculate out of folds root mean squared error
    oof_rmse = np.sqrt(mean_squared_error(df['target'], oof_predictions))
    print(f'Our out of folds RMSE is {oof_rmse}')
    return oof_predictions,test_predictions
    

a,b=train_and_evaluate()



--------------------------------------------------
Training fold 1


Some layers from the model checkpoint at ../input/huggingface-roberta/roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at ../input/huggingface-roberta/roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Epoch 1/50

Epoch 00001: val_root_mean_squared_error improved from inf to 0.73794, saving model to Roberta_Base_123_1.h5
Epoch 2/50

Epoch 00002: val_root_mean_squared_error improved from 0.73794 to 0.68246, saving model to Roberta_Base_123_1.h5
Epoch 3/50

Epoch 00003: val_root_mean_squared_error improved from 0.68246 to 0.57667, saving model to Roberta_Base_123_1.h5
Epoch 4/50

Epoch 00004: val_root_mean_squared_error did not improve from 0.57667
Epoch 5/50

Epoch 00005: val_root_mean_squared_error did not improve from 0.57667
Epoch 6/50

Epoch 00006: val_root_mean_squared_error improved from 0.57667 to 0.54098, saving model to Roberta_Base_123_1.h5
Epoch 7/50

Epoch 00007: val_root_mean_squared_error did not improve from 0.54098
Epoch 8/50

Epoch 00008: val_root_mean_squared_error did not improve from 0.54098
Epoch 9/50

Epoch 00009: val_root_mean_squared_error did not improve from 0.54098
Epoch 10/50

Epoch 00010: val_root_mean_squared_error did not improve from 0.54098
Epoch 11/50

Some layers from the model checkpoint at ../input/huggingface-roberta/roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at ../input/huggingface-roberta/roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Epoch 1/50

Epoch 00001: val_root_mean_squared_error improved from inf to 0.82647, saving model to Roberta_Base_123_2.h5
Epoch 2/50

Epoch 00002: val_root_mean_squared_error improved from 0.82647 to 0.61180, saving model to Roberta_Base_123_2.h5
Epoch 3/50

Epoch 00003: val_root_mean_squared_error did not improve from 0.61180
Epoch 4/50

Epoch 00004: val_root_mean_squared_error improved from 0.61180 to 0.59745, saving model to Roberta_Base_123_2.h5
Epoch 5/50

Epoch 00005: val_root_mean_squared_error improved from 0.59745 to 0.55529, saving model to Roberta_Base_123_2.h5
Epoch 6/50

Epoch 00006: val_root_mean_squared_error did not improve from 0.55529
Epoch 7/50

Epoch 00007: val_root_mean_squared_error did not improve from 0.55529
Epoch 8/50

Epoch 00008: val_root_mean_squared_error did not improve from 0.55529
Epoch 9/50

Epoch 00009: val_root_mean_squared_error did not improve from 0.55529
Epoch 10/50

Epoch 00010: val_root_mean_squared_error did not improve from 0.55529
Epoch 11/50

Some layers from the model checkpoint at ../input/huggingface-roberta/roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at ../input/huggingface-roberta/roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Epoch 1/50

Epoch 00001: val_root_mean_squared_error improved from inf to 0.54007, saving model to Roberta_Base_123_3.h5
Epoch 2/50

Epoch 00002: val_root_mean_squared_error did not improve from 0.54007
Epoch 3/50

Epoch 00003: val_root_mean_squared_error did not improve from 0.54007
Epoch 4/50

Epoch 00004: val_root_mean_squared_error did not improve from 0.54007
Epoch 5/50

Epoch 00005: val_root_mean_squared_error did not improve from 0.54007
Epoch 6/50

Epoch 00006: val_root_mean_squared_error did not improve from 0.54007
Epoch 7/50

Epoch 00007: val_root_mean_squared_error did not improve from 0.54007
Epoch 8/50
Restoring model weights from the end of the best epoch.

Epoch 00008: val_root_mean_squared_error did not improve from 0.54007
Epoch 00008: early stopping


--------------------------------------------------
Training fold 4


Some layers from the model checkpoint at ../input/huggingface-roberta/roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at ../input/huggingface-roberta/roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Epoch 1/50

Epoch 00001: val_root_mean_squared_error improved from inf to 0.58787, saving model to Roberta_Base_123_4.h5
Epoch 2/50

Epoch 00002: val_root_mean_squared_error did not improve from 0.58787
Epoch 3/50

Epoch 00003: val_root_mean_squared_error improved from 0.58787 to 0.50663, saving model to Roberta_Base_123_4.h5
Epoch 4/50

Epoch 00004: val_root_mean_squared_error did not improve from 0.50663
Epoch 5/50

Epoch 00005: val_root_mean_squared_error did not improve from 0.50663
Epoch 6/50

Epoch 00006: val_root_mean_squared_error did not improve from 0.50663
Epoch 7/50

Epoch 00007: val_root_mean_squared_error did not improve from 0.50663
Epoch 8/50

Epoch 00008: val_root_mean_squared_error did not improve from 0.50663
Epoch 9/50

Epoch 00009: val_root_mean_squared_error did not improve from 0.50663
Epoch 10/50
Restoring model weights from the end of the best epoch.

Epoch 00010: val_root_mean_squared_error did not improve from 0.50663
Epoch 00010: early stopping


-----------

Some layers from the model checkpoint at ../input/huggingface-roberta/roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at ../input/huggingface-roberta/roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Epoch 1/50

Epoch 00001: val_root_mean_squared_error improved from inf to 0.58448, saving model to Roberta_Base_123_5.h5
Epoch 2/50

Epoch 00002: val_root_mean_squared_error improved from 0.58448 to 0.58445, saving model to Roberta_Base_123_5.h5
Epoch 3/50

Epoch 00003: val_root_mean_squared_error improved from 0.58445 to 0.57852, saving model to Roberta_Base_123_5.h5
Epoch 4/50

Epoch 00004: val_root_mean_squared_error did not improve from 0.57852
Epoch 5/50

Epoch 00005: val_root_mean_squared_error did not improve from 0.57852
Epoch 6/50

Epoch 00006: val_root_mean_squared_error did not improve from 0.57852
Epoch 7/50

Epoch 00007: val_root_mean_squared_error did not improve from 0.57852
Epoch 8/50

Epoch 00008: val_root_mean_squared_error did not improve from 0.57852
Epoch 9/50

Epoch 00009: val_root_mean_squared_error improved from 0.57852 to 0.49702, saving model to Roberta_Base_123_5.h5
Epoch 10/50

Epoch 00010: val_root_mean_squared_error did not improve from 0.49702
Epoch 11/50

In [9]:
test = pd.read_csv('/kaggle/input/commonlitreadabilityprize/test.csv')
submission = test[['id']]
submission['target'] = np.mean(b, axis=0)
submission.to_csv('submission.csv', index=False)
display(submission.head(10))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,id,target
0,c0f722661,-0.197872
1,f0953f0a5,-0.239901
2,0df072751,-0.36553
3,04caf4e0c,-2.489808
4,0e63f8bea,-1.903589
5,12537fe78,-1.071037
6,965e592c0,0.281837
