In [1]:
# Attach Google Drive to the Colab runtime; the CSV paths used by the data
# loading cell live under this mount point.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Uncomment on a fresh runtime that does not already ship these packages:
# %pip install -q tensorflow keras keras-nlp


In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf
import keras
import keras_nlp
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [4]:
# Read the train and test CSVs from the mounted Drive folder into DataFrames.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/Automated_essay_scoring/data/train.csv',
        '/content/drive/MyDrive/Automated_essay_scoring/data/test.csv',
    )
)

In [5]:
# Text Preprocessing function
# Every character in this set is replaced by a single space before splitting.
# It is the same set Keras' `text_to_word_sequence` uses as its default
# `filters`, so results match the previous TensorFlow-based implementation.
_FILTERED_TO_SPACE = str.maketrans(
    {ch: ' ' for ch in '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'}
)


def preprocess_text(text):
    """Normalise one essay: lower-case it, drop punctuation, single-space words.

    Args:
        text: The raw essay. ``None`` and float NaN (how pandas represents a
            missing cell) are treated as an empty essay; any other non-string
            value is converted with ``str()``.

    Returns:
        str: The lower-cased words joined by single spaces. Apostrophes are
        kept (they are not in the filter set).

    NOTE(review): the classifier cell loads the "bert_base_en" preset; if that
    preset is case-sensitive, lower-casing and stripping punctuation here may
    throw away useful signal -- confirm this step is wanted.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    spaced = text.lower().translate(_FILTERED_TO_SPACE)
    # Split on the space character only (not arbitrary whitespace) and drop
    # the empty pieces produced by consecutive spaces.
    words = [piece for piece in spaced.split(' ') if piece]
    return ' '.join(words)

# Normalise the essay text of both the training and the test frame.
for frame in (train_data, test_data):
    frame['full_text'] = frame['full_text'].apply(preprocess_text)

In [6]:
# Shift every score down by one to obtain the integer class labels
# (the +1 is added back when predictions are decoded).
train_labels = train_data['score'] - 1

# Hold out 20% of the essays for validation; the fixed random_state keeps the
# split identical between runs.
X_train, X_val, y_train, y_val = train_test_split(
    train_data['full_text'],
    train_labels,
    test_size=0.2,
    random_state=42,
)

# Convert the pandas objects to plain NumPy arrays for Keras.
X_train, X_val = (np.array(texts.tolist()) for texts in (X_train, X_val))
y_train, y_val = (labels.to_numpy() for labels in (y_train, y_val))

In [7]:
# Seed Python, NumPy and the Keras backend RNGs before any weights are
# created, so that randomly initialised weights and other seeded randomness
# are repeatable across runs (matches the random_state used for the split).
keras.utils.set_random_seed(42)

# Pretrained classifier with keras_nlp: BERT preset with a 6-way output,
# one class per shifted score label.
classifier = keras_nlp.models.BertClassifier.from_preset(
    "bert_base_en",
    num_classes=6,
)
# Compile the model. Labels are integer class ids and the loss is configured
# with from_logits=True, i.e. the model output is treated as raw logits.
classifier.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=keras.optimizers.Adam(5e-5),
    metrics=['accuracy'],
)
# early stopping and model checkpoint: both watch val_loss. Training stops
# after 5 epochs without improvement (restoring the best weights), and the
# best model so far is written to 'best_model_bert.keras'.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
model_checkpoint = ModelCheckpoint('best_model_bert.keras', save_best_only=True, monitor='val_loss', mode='min')

Downloading from https://www.kaggle.com/api/v1/models/keras/bert/keras/bert_base_en/2/download/model.safetensors...
Downloading from https://www.kaggle.com/api/v1/models/keras/bert/keras/bert_base_en/2/download/model.safetensors.index.json...
Downloading from https://www.kaggle.com/api/v1/models/keras/bert/keras/bert_base_en/2/download/task.json...
Downloading from https://www.kaggle.com/api/v1/models/keras/bert/keras/bert_base_en/2/download/model.safetensors...
Downloading from https://www.kaggle.com/api/v1/models/keras/bert/keras/bert_base_en/2/download/model.safetensors.index.json...
Downloading from https://www.kaggle.com/api/v1/models/keras/bert/keras/bert_base_en/2/download/model.safetensors...
Downloading from https://www.kaggle.com/api/v1/models/keras/bert/keras/bert_base_en/2/download/model.safetensors.index.json...
Downloading from https://www.kaggle.com/api/v1/models/keras/bert/keras/bert_base_en/2/download/preprocessor.json...


In [None]:
# Fine-tune the classifier on the training split, validating on the held-out
# split after each epoch; the callbacks may end training before epoch 10.
history = classifier.fit(
    X_train,
    y_train,
    batch_size=16,
    epochs=10,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping, model_checkpoint],
    verbose=2,
)

Epoch 1/10


In [None]:
# Restore the weights written by the ModelCheckpoint callback.
classifier.load_weights('best_model_bert.keras')

# Validation predictions: pick the highest-scoring class per essay, then add 1
# to undo the label shift and get back to the original score scale.
val_outputs = classifier.predict(x=X_val, batch_size=16)
val_predictions = val_outputs.argmax(axis=-1) + 1

# Quadratic weighted kappa between true and predicted scores (both on the
# original score scale).
val_scores = y_val + 1
kappa_score = cohen_kappa_score(val_scores, val_predictions, weights='quadratic')
print(f'Validation Quadratic Weighted Kappa: {kappa_score}')

# Same decoding for the unlabeled test essays.
test_texts = test_data['full_text'].tolist()
test_outputs = classifier.predict(x=test_texts, batch_size=16)
test_predictions = test_outputs.argmax(axis=-1) + 1

In [None]:
# Collect the predicted score for every test essay into one table and preview
# it. (This cell only builds the frame; it does not write it to disk.)
result_columns = {
    'essay_id': test_data['essay_id'],
    'score': test_predictions,
}
result = pd.DataFrame(result_columns)
result.head(10)