In [55]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import os
from transformers import  TFDebertaV2Model, DebertaV2TokenizerFast

import torch
import pytorch_lightning as pl
from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras import layers, Input, Sequential, Model

# Configuration

In [56]:
class config:
    """Central place for the paths and hyper-parameters used in this notebook."""

    # --- locations -----------------------------------------------------
    base_dir = "/kaggle/working/"
    # The training CSV is resolved relative to the parent of the current
    # working directory.
    parent_path = os.path.join(os.getcwd(), os.pardir)
    train_dataset_path = os.path.join(parent_path, "raw_data/train.csv")
    # save_dir = "../input/colab-models-download-v2-0/"

    # --- reproducibility -----------------------------------------------
    seed = 69

    # --- tokenizer -----------------------------------------------------
    truncation = True
    padding = True  # alternative: 'max_length'
    max_length = 512

    # --- model ---------------------------------------------------------
    train_col = "full_text"
    model_name = "microsoft/deberta-v3-large"
    target_cols = [
        "cohesion",
        "syntax",
        "vocabulary",
        "phraseology",
        "grammar",
        "conventions",
    ]
    load_from_disk = None

    # --- training ------------------------------------------------------
    learning_rate = 9e-6
    batch_size = 2
    epochs = 3
    NFOLDS = 5

# Seed the global random number generators (via pytorch_lightning's
# seed_everything) with the value from the config so runs are repeatable.
seed_everything(config.seed)

Global seed set to 69


69

# Load the Tokenizer
First, we define a function that tokenizes the text column of a DataFrame.

In [57]:
def tokenize(df):
    """Tokenize the essay column of ``df`` and attach the regression targets.

    Relies on the notebook-level ``tokenizer`` and ``config`` objects.

    Args:
        df: DataFrame containing ``config.train_col`` (the text) and every
            column listed in ``config.target_cols``.

    Returns:
        The tokenizer output, extended with two per-essay fields that are
        aligned with ``input_ids``:

        * ``labels``: one list of target values per essay, ordered as
          ``config.target_cols``.
        * ``length``: number of non-padding tokens of each essay.
    """
    texts = df[config.train_col].tolist()
    tokenized = tokenizer(
        texts,
        padding=config.padding,
        # The tokenizer argument is `truncation`; `truncating` is not a
        # recognised keyword.
        truncation=config.truncation,
        max_length=config.max_length,
    )

    # One row of targets per essay (row-major), so labels[i] belongs to
    # input_ids[i]; iterating over the columns would give one entry per
    # target instead.
    tokenized["labels"] = df[config.target_cols].values.tolist()

    # Count the real tokens of each essay from the attention mask; the length
    # of input_ids itself would only be the number of essays.
    tokenized["length"] = [sum(mask) for mask in tokenized["attention_mask"]]

    return tokenized

Then, we load the tokenizer.

In [58]:
# Load the fast DeBERTa-v2 tokenizer for the xlarge checkpoint.
# NOTE(review): config.model_name is "microsoft/deberta-v3-large", while this
# checkpoint name is hard-coded (the same literal is used when the model is
# loaded further down) — confirm which checkpoint is intended.
tokenizer = DebertaV2TokenizerFast.from_pretrained("microsoft/deberta-v2-xlarge")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Load the data

In [59]:
# Read the training CSV and separate the essay text from the target columns.
# read_csv already returns a fresh DataFrame, so no extra copy is needed.
data = pd.read_csv(config.train_dataset_path)
texts = data[config.train_col]
targets = data[config.target_cols]

# Hold out 20% of the essays for validation. The seed is passed explicitly so
# the split does not depend on the global RNG state or on the order in which
# the cells were executed.
train_texts, val_texts, train_targets, val_targets = train_test_split(
    texts,
    targets,
    test_size=0.2,
    random_state=config.seed,
)

# Convert the data into a tokenized form
Here we tokenize the texts into TensorFlow tensors so that they can be fed to a tf.keras model.

In [60]:
# Pad and truncate every essay to exactly config.max_length tokens.
# With padding=True the width would be the longest essay of each call, so the
# train and validation tensors would not be guaranteed to match each other or
# the fixed-length Keras Input used by the model.
tokenizer_kwargs = dict(
    return_tensors='tf',
    truncation=config.truncation,
    padding='max_length',
    max_length=config.max_length,
)
tokenized_train_texts = tokenizer(list(train_texts), **tokenizer_kwargs)
tokenized_val_texts = tokenizer(list(val_texts), **tokenizer_kwargs)

In [61]:
# Check which fields the tokenizer returned.
tokenized_train_texts.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [62]:
# Display the tokenizer output (input_ids, token_type_ids and attention_mask tensors).
tokenized_train_texts

{'input_ids': <tf.Tensor: shape=(3128, 512), dtype=int32, numpy=
array([[   1, 3031,  334, ...,    0,    0,    0],
       [   1, 2074,  189, ...,    0,    0,    0],
       [   1,   16, 8455, ...,    0,    0,    0],
       ...,
       [   1,   16,  214, ...,    0,    0,    0],
       [   1,   23,  569, ...,    0,    0,    0],
       [   1,   23,  307, ...,    0,    0,    0]], dtype=int32)>, 'token_type_ids': <tf.Tensor: shape=(3128, 512), dtype=int32, numpy=
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(3128, 512), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]], dtype=int32)>}

In [63]:
# Shape of the token-id tensor: (number of training essays, sequence length).
tokenized_train_texts['input_ids'].shape

TensorShape([3128, 512])

# Defining the model
### Defining the loss function

In [64]:
from keras import backend as K
def root_mean_squared_error(y_true, y_pred):
    """Root of the mean squared error, reduced over the last axis only.

    Args:
        y_true: tensor of ground-truth values.
        y_pred: tensor of predictions, broadcastable against ``y_true``.

    Returns:
        A tensor with the last axis removed, holding
        ``sqrt(mean((y_pred - y_true) ** 2))`` for each remaining position.
    """
    squared_error = K.square(y_pred - y_true)
    mean_squared_error = K.mean(squared_error, axis=-1)
    return K.sqrt(mean_squared_error)

### Model architecture
Here we feed the output of the pretrained DeBERTa model into an intermediate dense layer, then pass the result to parallel linear regression output layers, one per target.

In [65]:
# Symbolic placeholder for a batch of 512 token ids per essay.
# The same placeholder is created again in the model-building cell below.
input_ids = Input(shape=(512,), dtype="int32")

In [66]:
# Load the pretrained DeBERTa-v2 backbone. output_hidden_states=True makes the
# model return the activations of every layer, not only the last one.
transformer_model = TFDebertaV2Model.from_pretrained('microsoft/deberta-v2-xlarge', output_hidden_states=True)

# Symbolic inputs: fixed-length token ids and the matching padding mask.
input_ids = Input(shape=(config.max_length,), dtype='int32')
attention_mask = Input(shape=(config.max_length,), dtype='int32')

transformer = transformer_model(dict(input_ids=input_ids, attention_mask=attention_mask))
hidden_states = transformer[1]  # per-layer hidden states (requested via output_hidden_states)

# Concatenate the last `hidden_states_size` layers along the feature axis.
hidden_states_size = 4  # count of the last states
hiddes_states_ind = list(range(-hidden_states_size, 0, 1))
selected_hiddes_states = layers.concatenate([hidden_states[i] for i in hiddes_states_ind])

# Collapse the sequence axis so that the heads predict one score per essay.
# Without pooling, the Dense layers below act on every token position and
# each head would emit one value per token. The attention mask keeps padding
# tokens out of the average.
pooled_states = layers.GlobalAveragePooling1D()(selected_hiddes_states, mask=attention_mask)

# Small shared bottleneck feeding all regression heads.
last_hidden_layer = layers.Dense(8, activation='relu')(pooled_states)

# One linear regression head per target, named after its target column.
output = [
    layers.Dense(1, activation="linear", name=target)(last_hidden_layer)
    for target in config.target_cols
]

# Keep the individual head variables available under their original names.
(cohesion_output, syntax_output, vocabulary_output,
 phraseology_output, grammar_output, conventions_output) = output

# Assembling the model
model = Model(inputs=[input_ids, attention_mask], outputs=output)
model.summary()

Downloading (…)"tf_model.h5";: 100%|██████████| 3.54G/3.54G [05:07<00:00, 11.5MB/s]
2023-03-08 15:28:31.208272: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 787046400 exceeds 10% of free system memory.
2023-03-08 15:28:33.736233: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 787046400 exceeds 10% of free system memory.


: 

: 