In [34]:
!pip install -r requirements.txt
!pip install -U sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Ignoring tensorflow-macos: markers 'sys_platform == "darwin" and "ARM" in platform_version' don't match your environment
Ignoring tensorflow: markers 'sys_platform == "darwin" and "ARM" not in platform_version' don't match your environment
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [35]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import os
from transformers import  TFDebertaV2Model, DebertaV2TokenizerFast
from tensorflow.keras import callbacks
from tensorflow.keras import layers, Input, Sequential, Model

# Configuration

In [36]:
class config:
    base_dir = "/kaggle/working/"
  
    # dataset path   
    train_dataset_path='/content/train.csv'
    test_dataset_path = '/content/test.csv'
    
    #tokenizer params
    truncation = True
    padding = True #'max_length'
    max_length = 512
    
    # model params
    train_col='full_text'
    model_name = "microsoft/deberta-v2-xlarge"
    target_cols = ['cohesion', 'syntax', 'vocabulary',
       'phraseology', 'grammar', 'conventions']

# Load the Tokenizer
first, we define a function to tokenize the text from a dataframe

In [37]:
tokenizer = DebertaV2TokenizerFast.from_pretrained("microsoft/deberta-v2-xlarge")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [38]:
def tokenize (df):
    texts=list(df[config.train_col])
    tokenized=tokenizer(texts,
                       padding=config.padding,
                       truncating=True,
                       max_length=config.max_length)
    tokenized["labels"]= [df[column] for column in config.target_cols]
    tokenized['length'] = len(tokenized['input_ids'])
    
    return tokenized

# Load the data

In [39]:
data=pd.read_csv('/content/train.csv').copy()
texts=data[config.train_col]
targets=data[config.target_cols]
train_texts, val_texts, train_targets, val_targets=train_test_split(texts, targets, test_size=0.2)

In [40]:
train_targets.shape

(3128, 6)

In [48]:
data_test_texts = pd.read_csv('/content/test.csv').copy()
test_texts = data_test_texts[config.train_col]

# Convert the data into a tokenized form
Here we want the tokens to be read by a tf.keras model

In [41]:
tokenized_train_texts = tokenizer(list(train_texts), return_tensors='tf',truncation=config.truncation, padding=config.padding)
tokenized_val_texts = tokenizer(list(val_texts), return_tensors='tf', truncation=config.truncation, padding=config.padding)

In [49]:
tokenized_test_texts = tokenizer(list(test_texts), return_tensors='tf', truncation=config.truncation, padding=config.padding)

# Defining the model
### Defining the loss function

In [42]:
from keras import backend as K
def root_mean_squared_error(y_true, y_pred):
        return K.sqrt(K.mean(K.square(y_pred - y_true), axis=-1))

### Model architecture
Here we use the output of the pretrained DeBerta model as an input of a dense intermediate layer, then we input the result in the linear regression parallele output layers, for each target.

In [43]:
input_ids = Input(shape=((512)),dtype='int32')

In [44]:
# Import the needed model(Bert, Roberta or DistilBert) with output_hidden_states=True
transformer_model = TFDebertaV2Model.from_pretrained('microsoft/deberta-v2-xlarge', output_hidden_states=True, return_dict=True)
transformer_model.trainable = False

input_ids = Input(shape=((512)),dtype='int32', name='input_ids')
attention_mask = Input(shape=((512)), dtype='int32', name='attention_mask')

transformer = transformer_model(dict(input_ids=input_ids,attention_mask=attention_mask))    
hidden_states = transformer[0] # get output_hidden_states


# Add a layer maxpool 1D
pooling_layer = layers.GlobalMaxPooling1D()(hidden_states)

# Now we can use selected_hiddes_states as we want
last_hidden_layer = layers.Dense(64, activation='relu')(pooling_layer)

# Defining the regression layer
cohesion_output=layers.Dense(1, activation="linear", name="cohesion")(last_hidden_layer)
syntax_output=layers.Dense(1, activation="linear", name="syntax")(last_hidden_layer)
vocabulary_output=layers.Dense(1, activation="linear", name="vocabulary")(last_hidden_layer)
phraseology_output=layers.Dense(1, activation="linear", name="phraseology")(last_hidden_layer)
grammar_output=layers.Dense(1, activation="linear", name="grammar")(last_hidden_layer)
conventions_output=layers.Dense(1, activation="linear", name="conventions")(last_hidden_layer)

# output in a list
output= [cohesion_output, syntax_output, vocabulary_output, phraseology_output, grammar_output, conventions_output]

#Assembling the model
model = Model(inputs = [input_ids, attention_mask], outputs = output)
model.summary()

All model checkpoint layers were used when initializing TFDebertaV2Model.

All the layers of TFDebertaV2Model were initialized from the model checkpoint at microsoft/deberta-v2-xlarge.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDebertaV2Model for predictions without further training.


Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 attention_mask (InputLayer)    [(None, 512)]        0           []                               
                                                                                                  
 input_ids (InputLayer)         [(None, 512)]        0           []                               
                                                                                                  
 tf_deberta_v2_model_2 (TFDeber  TFBaseModelOutput(l  884593152  ['attention_mask[0][0]',         
 taV2Model)                     ast_hidden_state=(N               'input_ids[0][0]']              
                                one, 512, 1536),                                                  
                                 hidden_states=((No                                         

In [45]:
# Compile the model 
model.compile(loss='mse', optimizer='adam',loss_weights=[1/6 for i in range(6)], metrics= root_mean_squared_error)

In [46]:
# Fit the model
es = callbacks.EarlyStopping(patience=2, restore_best_weights=True)
history = model.fit(x={'input_ids':tokenized_train_texts['input_ids'],
                        'attention_mask':tokenized_train_texts['attention_mask']},
                    y=train_targets,epochs=100,batch_size=6,validation_split=0.2, callbacks=[es],
          verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100


In [50]:
tokenized_test_text = tokenizer(list(test_texts), return_tensors='tf',truncation=config.truncation, padding=config.padding)

In [51]:
# Make predictions on the test set
test_predictions = model.predict({'input_ids':tokenized_test_texts['input_ids'], 'attention_mask':tokenized_test_texts['attention_mask']})

# Create a list of dictionaries with the predictions
predictions_list = []
for i in range(len(test_predictions[0])):
    prediction_dict = {'cohesion': test_predictions[0][i],
                       'syntax': test_predictions[1][i],
                       'vocabulary': test_predictions[2][i],
                       'phraseology': test_predictions[3][i],
                       'grammar': test_predictions[4][i],
                       'conventions': test_predictions[5][i]}
    predictions_list.append(prediction_dict)

# Convert the list to a dataframe
predictions_df = pd.DataFrame(predictions_list)





In [52]:
# Save the dataframe to a CSV file
predictions_df.to_csv('predictions.csv', index=False)

In [53]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [60]:
!mkdir -p /content/drive/MyDrive/mathieu_first_deberta_v2_xlarge_model


In [61]:
# Save the model weigths on google drive
model.save('/content/drive/MyDrive/mathieu_first_deberta_v2_xlarge_model/mathieu_first_deberta_v2_xlarge_model.h5')
model.save_weights('/content/drive/MyDrive/mathieu_first_deberta_v2_xlarge_model/mathieu_first_deberta_v2_xlarge_model_weights.h5')
