In [None]:
# Mount Google Drive into the Colab runtime; the requirements file and the
# training CSV used below are read from paths under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Print the runtime's current working directory (sanity check of the environment).
!pwd

/content


In [None]:
!pip install -r /content/drive/MyDrive/ColabNotebooks/requirements.txt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Ignoring tensorflow-macos: markers 'sys_platform == "darwin" and "ARM" in platform_version' don't match your environment
Ignoring tensorflow: markers 'sys_platform == "darwin" and "ARM" not in platform_version' don't match your environment


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import os
from transformers import  TFDebertaModel, DebertaTokenizerFast

import torch
#import pytorch_lightning as pl
#from pytorch_lightning import Trainer, seed_everything
#from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras import layers, Input, Model, models

# Configuration

In [None]:
class config:
    """Notebook-wide settings: data location, tokenizer options, model and training values."""

    # ---- data ----
    train_dataset_path = "/content/drive/MyDrive/ColabNotebooks/train.csv"
    # save_dir = "../input/colab-models-download-v2-0/"

    # ---- tokenizer ----
    truncation = True
    padding = True  # alternative noted by the author: 'max_length'
    max_length = 512

    # ---- model ----
    train_col = 'full_text'  # column holding the essay text
    # NOTE(review): the cells visible in this notebook hard-code
    # "microsoft/deberta-base" and never read `model_name` — confirm which
    # checkpoint is intended.
    model_name = "microsoft/deberta-v3-large"
    target_cols = [
        'cohesion',
        'syntax',
        'vocabulary',
        'phraseology',
        'grammar',
        'conventions',
    ]
    load_from_disk = None

    # ---- training ----
    # NOTE(review): the visible compile/fit cells use optimizer='adam',
    # epochs=40 and batch_size=16 and do not read the values below.
    learning_rate = 9e-6
    batch_size = 2
    epochs = 3
    NFOLDS = 5

# Load the Tokenizer
First, we define a function that tokenizes the text column of a DataFrame.

In [None]:
def tokenize(df):
    """Tokenize the text column of ``df`` and attach per-text targets.

    Relies on the module-level ``tokenizer`` and ``config`` objects.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``config.train_col`` and every column listed in
        ``config.target_cols``.

    Returns
    -------
    The tokenizer output, extended with two extra keys:

    * ``labels`` - one list of target values per text, ordered like
      ``config.target_cols``;
    * ``length`` - number of non-padding tokens of each text.
    """
    texts = list(df[config.train_col])
    tokenized = tokenizer(texts,
                          padding=config.padding,
                          # the keyword is `truncation`; `truncating` is not a
                          # tokenizer argument, so no truncation was applied
                          truncation=config.truncation,
                          max_length=config.max_length)
    # Row-major: labels[i] belongs to input_ids[i] (previously a list of six columns).
    tokenized["labels"] = df[config.target_cols].values.tolist()
    # Per-text token count, ignoring padding (previously the number of texts).
    tokenized["length"] = [sum(mask) for mask in tokenized["attention_mask"]]

    return tokenized

Then, we load the tokenizer.

In [None]:
# Fast tokenizer matching the "microsoft/deberta-base" checkpoint used for the encoder below.
TOKENIZER_CHECKPOINT = "microsoft/deberta-base"
tokenizer = DebertaTokenizerFast.from_pretrained(TOKENIZER_CHECKPOINT)

# Load the data

In [None]:
# Read the training CSV and separate the essay text from the six score columns.
# (read_csv already returns a fresh DataFrame, so no extra .copy() is needed.)
data = pd.read_csv(config.train_dataset_path)
texts = data[config.train_col]
targets = data[config.target_cols]

# 70/30 train/validation split. The seed is fixed so that a
# "Restart & Run All" reproduces the same split and comparable metrics.
SPLIT_SEED = 42
train_texts, val_texts, train_targets, val_targets = train_test_split(
    texts, targets, test_size=0.3, random_state=SPLIT_SEED
)

# Convert the data into a tokenized form
Here we tokenize the texts into TensorFlow tensors so that they can be fed to a tf.keras model.

In [None]:
# Pad AND truncate every text to exactly config.max_length tokens.
# With padding=True the width is the longest text of each call, so the train
# and validation tensors could end up with different widths, and neither is
# guaranteed to match the fixed 512-wide Input layers of the model.
tokenizer_kwargs = dict(
    return_tensors='tf',
    truncation=config.truncation,
    padding='max_length',
    max_length=config.max_length,
)
tokenized_train_texts = tokenizer(list(train_texts), **tokenizer_kwargs)
tokenized_val_texts = tokenizer(list(val_texts), **tokenizer_kwargs)

In [None]:
# Inspect which tensors the tokenizer produced for the training texts.
tokenized_train_texts.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [None]:
# Shape of the token-id tensor: (number of training texts, padded sequence length).
tokenized_train_texts['input_ids'].shape

TensorShape([2737, 512])

# Defining the model
### Defining the loss function

In [None]:
from keras import backend as K
def root_mean_squared_error(y_true, y_pred):
    """Root mean squared error over all elements of the batch.

    Each regression head of this notebook outputs a single value per sample,
    so reducing over ``axis=-1`` averaged over one element only and
    ``sqrt(square(e))`` collapsed to ``|e|`` per sample - i.e. the reported
    value was a mean absolute error. Reducing over the whole batch before the
    square root yields an actual RMSE.

    Parameters
    ----------
    y_true, y_pred : tensors of matching (broadcastable) shape.

    Returns
    -------
    Scalar tensor ``sqrt(mean((y_pred - y_true) ** 2))``.
    """
    return K.sqrt(K.mean(K.square(y_pred - y_true)))

### Model architecture
Here we feed the output of the pretrained DeBERTa model through a global max-pooling layer and a dense intermediate layer, and then pass the result to six parallel linear regression output layers, one for each target.

In [None]:
# 512-wide int32 input for the token ids.
# NOTE(review): `input_ids` is bound again (with an explicit layer name) in the
# model-building cell below, which makes this definition redundant.
input_ids = Input(shape=(512,), dtype='int32')

In [None]:
# Load the pretrained DeBERTa encoder.
# NOTE(review): the model-building cell below loads the same checkpoint again
# and rebinds `transformer_model`, so this load is duplicated work.
transformer_model = TFDebertaModel.from_pretrained(
    'microsoft/deberta-base',
    output_hidden_states=True,
    return_dict=True,
)

All model checkpoint layers were used when initializing TFDebertaModel.

All the layers of TFDebertaModel were initialized from the model checkpoint at microsoft/deberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDebertaModel for predictions without further training.


In [None]:
# Pretrained DeBERTa encoder, frozen: only the layers added below are trained.
transformer_model = TFDebertaModel.from_pretrained(
    'microsoft/deberta-base',
    output_hidden_states=True,
    return_dict=True,
)
transformer_model.trainable = False

# Fixed-width (512 tokens) model inputs.
input_ids = Input(shape=(512,), dtype='int32', name='input_ids')
attention_mask = Input(shape=(512,), dtype='int32', name='attention_mask')

transformer = transformer_model({'input_ids': input_ids, 'attention_mask': attention_mask})
# Element 0 of the encoder output is `last_hidden_state`, shape (batch, 512, 768);
# despite the variable name it is NOT the tuple of all hidden states.
hidden_states = transformer[0]

# (An earlier, disabled variant concatenated the last 4 hidden states instead.)

# Max over the token axis -> one 768-d vector per text.
# NOTE(review): padding positions take part in the max; the attention mask is
# not applied here.
pooling_layer = layers.GlobalMaxPooling1D()(hidden_states)

# Shared intermediate representation feeding all regression heads.
last_hidden_layer = layers.Dense(128, activation='relu')(pooling_layer)

# One single-unit linear head per target, created in this exact order.
head_names = ('cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions')
output = [
    layers.Dense(1, activation="linear", name=head)(last_hidden_layer)
    for head in head_names
]
(cohesion_output, syntax_output, vocabulary_output,
 phraseology_output, grammar_output, conventions_output) = output

# Assemble and display the multi-output model.
model = Model(inputs=[input_ids, attention_mask], outputs=output)
model.summary()

All model checkpoint layers were used when initializing TFDebertaModel.

All the layers of TFDebertaModel were initialized from the model checkpoint at microsoft/deberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDebertaModel for predictions without further training.


Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 attention_mask (InputLayer)    [(None, 512)]        0           []                               
                                                                                                  
 input_ids (InputLayer)         [(None, 512)]        0           []                               
                                                                                                  
 tf_deberta_model_3 (TFDebertaM  TFBaseModelOutput(l  138601728  ['attention_mask[0][0]',         
 odel)                          ast_hidden_state=(N               'input_ids[0][0]']              
                                one, 512, 768),                                                   
                                 hidden_states=((No                                         

In [None]:
# MSE on every head, each contributing 1/6 of the total loss; RMSE is tracked
# as a metric on every head.
# NOTE(review): optimizer='adam' does not read config.learning_rate — confirm
# that the optimizer's default learning rate is intended.
model.compile(
    optimizer='adam',
    loss='mse',
    loss_weights=[1/6] * 6,
    metrics=root_mean_squared_error,
)

In [None]:
from tensorflow.keras import callbacks

# Stop once the monitored quantity (Keras default: val_loss) has not improved
# for 5 epochs, and roll the model back to its best weights.
es = callbacks.EarlyStopping(patience=5, restore_best_weights=True)

# The model has six single-unit heads named after the target columns. Give each
# head ITS OWN column; passing the whole (N, 6) frame as one `y` does not map
# one column to one head.
train_targets_by_head = {
    name: train_targets[name].to_numpy() for name in model.output_names
}

history = model.fit(
    x={'input_ids': tokenized_train_texts['input_ids'],
       'attention_mask': tokenized_train_texts['attention_mask']},
    y=train_targets_by_head,
    epochs=40,
    batch_size=16,
    validation_split=0.2,
    callbacks=[es],
    verbose=1,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20

In [None]:
# Saving is currently disabled.
# NOTE(review): a later cell calls model.load_weights() on the weights file
# named below, so that cell depends on a file written by a previous run.
#model.save('/content/drive/MyDrive/ColabNotebooks/Houssam-Deberta-V1-Maxpool-Dense-128/houssam_deberta_V1_maxpool_dense_128_model.h5')
#model.save_weights('/content/drive/MyDrive/ColabNotebooks/Houssam-Deberta-V1-Maxpool-Dense-128/houssam_deberta_V1_maxpool_dense_128_model_weights.h5')

In [None]:
# Persist the training curves: history.history (metric name -> per-epoch values)
# becomes a DataFrame, which is then written to a CSV file on Drive.
hist_deberta_df = pd.DataFrame(history.history)
hist_csv_file = '/content/drive/MyDrive/ColabNotebooks/Houssam-Deberta-V1-Maxpool-Dense-128/deberta_v1_history.csv'
with open(hist_csv_file, mode='w') as csv_handle:
    hist_deberta_df.to_csv(csv_handle)

In [None]:
def plot_history(history, title='', axs=None, exp_name=""):
    """Plot training/validation curves for the total loss, MCRMSE and each target.

    The mean column-wise RMSE (MCRMSE) is computed per epoch as the average of
    the six ``<target>_root_mean_squared_error`` series and stored back into
    ``history.history`` under ``'MCRMSE'`` and ``'val_MCRMSE'``.

    Parameters
    ----------
    history : object with a ``history`` dict (as returned by ``model.fit``)
        Must contain ``loss``/``val_loss`` and, for every target,
        ``<target>_loss`` and ``<target>_root_mean_squared_error`` together
        with their ``val_`` counterparts.
    title : str
        Optional figure title; nothing is drawn when empty.
    axs : 7x2 grid of axes, optional
        Existing axes to draw on (indexed as ``axs[row][col]``). A new
        7x2 figure is created when omitted.
    exp_name : str
        Optional suffix appended to the legend labels (``train_<exp_name>``).

    Returns
    -------
    The grid of axes that was drawn on.
    """
    metrics = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
    scores = ['loss', 'root_mean_squared_error']
    records = history.history
    n_epochs = len(records['loss'])

    # Per-epoch average of the six per-target RMSE series (train, then validation).
    for prefix in ('', 'val_'):
        records[f'{prefix}MCRMSE'] = [
            sum(records[f'{prefix}{m}_root_mean_squared_error'][epoch] for m in metrics) / len(metrics)
            for epoch in range(n_epochs)
        ]

    fig = None
    if axs is not None:
        # Previously this branch bound `ax1, ax2` while the body used `ax`,
        # so passing `axs` always failed.
        ax = axs
    else:
        n_rows = len(metrics) + 1
        fig, ax = plt.subplots(n_rows, 2, figsize=(12, 2 * n_rows))

    if len(exp_name) > 0 and exp_name[0] != '_':
        exp_name = '_' + exp_name

    def _draw(axis, key):
        # One panel: train vs. validation series for a single history key.
        axis.plot(records[key], label='train' + exp_name)
        axis.plot(records[f'val_{key}'], label='val' + exp_name)
        axis.set_title(key)
        axis.legend()

    # Row 0: aggregate curves; rows 1..6: one row per target (loss | RMSE).
    _draw(ax[0][0], 'loss')
    _draw(ax[0][1], 'MCRMSE')
    for row, m in enumerate(metrics, start=1):
        for col, s in enumerate(scores):
            _draw(ax[row][col], f'{m}_{s}')

    if title:
        target_fig = fig if fig is not None else ax[0][0].figure
        target_fig.suptitle(title)
    if fig is not None:
        # Only touch the layout of a figure this function created itself.
        fig.tight_layout()

    return ax
# Draw the curves of the run recorded in `history` (all optional arguments left at their defaults).
plot_history(history)

In [None]:

# Replace the in-memory weights with the checkpoint stored on Drive.
# NOTE(review): the cell that would write this file is commented out, so this
# relies on a file from an earlier run and discards the weights just trained —
# confirm this is intended.
weights_path = '/content/drive/MyDrive/ColabNotebooks/Houssam-Deberta-V1-Maxpool-Dense-128/houssam_deberta_V1_maxpool_dense_128_model_weights.h5'
model.load_weights(weights_path)

In [None]:
# Evaluate on the held-out split. Each of the six heads is scored against its
# own target column (keyed by output name) instead of the whole (N, 6) frame.
val_targets_by_head = {
    name: val_targets[name].to_numpy() for name in model.output_names
}
model_eval = model.evaluate(
    x={'input_ids': tokenized_val_texts['input_ids'],
       'attention_mask': tokenized_val_texts['attention_mask']},
    y=val_targets_by_head,
)

In [None]:
# Average of the last six values returned by model.evaluate.
# NOTE(review): presumably these are the six per-head root_mean_squared_error
# metrics (i.e. the MCRMSE) — confirm the ordering with model.metrics_names.
np.mean(model_eval[-6:])

In [None]:
# NOTE(review): debugging leftover — displays the model object's full attribute
# dictionary; consider removing before sharing the notebook.
model.__dict__