In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import transformers
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, Input, Concatenate, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn import preprocessing
from tqdm import tqdm
from transformers import RobertaTokenizer, TFRobertaModel
import keras
from sklearn.model_selection import KFold

In [None]:
seed = 1234


def set_seeds(seed_value):
    """Seed the global random number generators used by this notebook.

    Seeds Python's built-in ``random`` module, NumPy's legacy global
    generator and TensorFlow's global generator with the same value.

    Args:
        seed_value: Integer seed applied to all three generators.
    """
    # Local import: `random` is not part of the notebook's import cell.
    import random

    random.seed(seed_value)
    np.random.seed(seed_value)
    tf.random.set_seed(seed_value)


set_seeds(seed)

In [None]:
# Directory holding the competition CSV files; defined once so that the
# location only has to be changed in a single place.
DATA_DIR = "/kaggle/input/commonlit-evaluate-student-summaries"

# Training prompts and the student summaries written for them.
prompts_train = pd.read_csv(f"{DATA_DIR}/prompts_train.csv")
summaries_train = pd.read_csv(f"{DATA_DIR}/summaries_train.csv")

# Same two tables for the test split.
prompts_test = pd.read_csv(f"{DATA_DIR}/prompts_test.csv")
summaries_test = pd.read_csv(f"{DATA_DIR}/summaries_test.csv")

In [None]:
def _merge_and_clean(summaries, prompts):
    """Left-join summaries with their prompt and strip line breaks from text.

    Args:
        summaries: DataFrame with a ``prompt_id`` and a ``text`` column.
        prompts: DataFrame with ``prompt_id``, ``prompt_question`` and
            ``prompt_text`` columns.

    Returns:
        A new DataFrame with one row per summary in which every ``\\r`` and
        ``\\n`` has been removed from the three text columns. Missing values
        stay missing instead of raising ``AttributeError``.
    """
    merged = summaries.merge(prompts, how='left', on='prompt_id')
    text_cols = ['prompt_question', 'prompt_text', 'text']
    # Vectorised per-column string replacement; avoids the deprecated
    # DataFrame.applymap and tolerates NaN cells.
    # NOTE(review): line breaks are removed rather than replaced by a space,
    # so words separated only by a newline are joined - confirm this is intended.
    merged[text_cols] = merged[text_cols].apply(
        lambda col: col.str.replace('\r', '', regex=False).str.replace('\n', '', regex=False)
    )
    return merged


merged_test_data = _merge_and_clean(summaries_test, prompts_test)

merged_data = _merge_and_clean(summaries_train, prompts_train)

In [None]:
# Working aliases for the merged frames. Plain assignment does not copy, so
# each alias refers to the same DataFrame object as the merged_* name.
data, submission_test_data = merged_data, merged_test_data

In [None]:
# Preview only the first rows instead of rendering the whole training frame.
data.head()

In [None]:
# Short summaries of each prompt text (produced with ChatGPT), keyed by prompt_id.
# The "ebad26" entry previously mentioned "the Third Wave experiment", which
# belongs to prompt "814d6b"; that stray reference has been removed.
summary_dict = {
    "39c16e" : "The poet crafting tragedy should avoid simple plots. The change in fortune should affect a character who isn't entirely virtuous or wicked but faces downfall due to error. The best tragedy ends in misfortune. Double-thread plots are weaker, more fitting for comedy.",
    "3b9047": "Egyptian society resembled a pyramid, with gods atop, followed by pharaohs and nobles. Pharaohs held great power, responsible for protection, laws, and taxes. Viziers aided them, scribes managed records, nobles and priests held influence, soldiers maintained order, craftsmen and merchants formed the middle class, while slaves and farmers were at the bottom. Social mobility existed, allowing some to rise through education and bureaucracy.",
    "814d6b": "In 1967, history teacher Ron Jones conducted the Third Wave experiment at a California high school. Creating a disciplined movement called 'The Third Wave' he showed how people easily follow authority and adopt superiority. The experiment revealed the potential for justifying harmful actions through blind obedience.",
    "ebad26": "A family with firsthand experience of Packingtown's meat industry learns how spoiled meat is repurposed for consumption. With horrifying details, the text exposes unsanitary practices in processing, including mixing tainted meat, chemically altering odors, and using contaminated materials. The shocking narrative underscores the industry's disregard for safety."
}

In [None]:
# Add the prompt summary as a new column on both frames by looking up each
# row's prompt_id in summary_dict (ids missing from the dict map to NaN).
for frame in (data, submission_test_data):
    frame['chatgpt_summary'] = frame['prompt_id'].map(summary_dict)

In [None]:
# Preview the first rows to check the newly added chatgpt_summary column.
data.head()

In [None]:
def encoder(title, text, max_len=190, tokenizer=None):
    """Tokenize (title, text) pairs into fixed-length arrays.

    Each pair is encoded as one sequence. Sequences are padded to ``max_len``
    and, when too long, only the second segment (``text``) is truncated.

    Args:
        title: List of first-segment strings.
        text: List of second-segment strings, aligned with ``title``.
        max_len: Length every encoded sequence is padded/truncated to.
            Defaults to 190, the value that was previously hard-coded.
        tokenizer: Optional callable with the tokenizer call signature used
            below. When omitted, the RoBERTa tokenizer is loaded from
            ``../input/roberta-base/``. Pass one in to avoid reloading it
            from disk on every call.

    Returns:
        dict with two int32 arrays: ``"input_ids"`` and ``"attention_masks"``.
    """
    if tokenizer is None:
        tokenizer = RobertaTokenizer.from_pretrained("../input/roberta-base/")

    encoded = tokenizer(
        title,
        text,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        truncation="only_second",
        return_attention_mask=True,
    )

    input_ids = np.array(encoded['input_ids'], dtype="int32")
    attention_masks = np.array(encoded['attention_mask'], dtype="int32")

    # The output key is plural ("attention_masks") although the tokenizer's
    # own key is singular; callers index the result with the plural name.
    return {"input_ids": input_ids, "attention_masks": attention_masks}


In [None]:
# TODO: decide whether to define a custom loss instead of the built-in "mse".

In [None]:
def build_model(max_len=190):
    """Build and compile a RoBERTa regressor with two scalar output heads.

    The pretrained RoBERTa model is loaded from ``../input/roberta-base/``.
    Its last hidden state is mean-pooled over the sequence axis and fed to
    two independent ``Dense(1)`` heads named ``content_output`` and
    ``wording_output``.

    The pooling receives the attention mask, so positions whose mask value
    is 0 (padding) are excluded from the average instead of diluting it.

    Args:
        max_len: Sequence length of both inputs. Defaults to 190, the value
            that was previously hard-coded.

    Returns:
        A compiled ``tf.keras.Model`` taking ``[input_ids, attention_mask]``
        and returning ``[content_output, wording_output]``. It is compiled
        with MSE as both loss and metric and a default-configured Adam
        optimizer.
    """
    backbone = TFRobertaModel.from_pretrained("../input/roberta-base/")

    model_ids = Input(shape=(max_len,), dtype='int32')
    model_masks = Input(shape=(max_len,), dtype='int32')

    hidden_states = backbone(input_ids=model_ids,
                             attention_mask=model_masks).last_hidden_state

    # Boolean (batch, steps) mask: True for positions that take part in the mean.
    token_mask = tf.cast(model_masks, tf.bool)
    x_content = tf.keras.layers.GlobalAveragePooling1D()(hidden_states, mask=token_mask)
    x_wording = tf.keras.layers.GlobalAveragePooling1D()(hidden_states, mask=token_mask)

    # The layer names determine the per-output metric keys reported by Keras
    # (e.g. "val_content_output_mse"), so they must stay stable.
    content_output = Dense(1, name='content_output')(x_content)
    wording_output = Dense(1, name='wording_output')(x_wording)

    model = tf.keras.Model(inputs=[model_ids, model_masks],
                           outputs=[content_output, wording_output])
    model.compile(
        optimizer=tf.keras.optimizers.Adam(),
        loss="mse",
        metrics=["mse"])
    return model

In [None]:
# Encode every (prompt title, student summary) pair of the training frame.
train_titles = data["prompt_title"].tolist()
train_texts = data["text"].tolist()
X_train = encoder(train_titles, train_texts)

# Regression targets as a list with one [content, wording] pair per row.
y_train = data[["content", "wording"]].to_numpy().tolist()

#y_train = []
#y_train.append(list(data[["content", "wording"]]))
#y_train.append(list(data["wording"]))

In [None]:
def scheduler(epoch):
    """Return the learning rate to use for the given epoch index.

    Epochs 0 through 3 run at 5% of the base rate (2e-5); every later epoch
    uses the base rate multiplied by ``0.9 ** epoch``.
    """
    base_lr = 2e-5
    last_warmup_epoch = 3

    if epoch > last_warmup_epoch:
        return base_lr * (0.9**epoch)
    return base_lr * 0.05
    
# Keras callback that applies scheduler() to set the learning rate each epoch.
callback_lr = tf.keras.callbacks.LearningRateScheduler(schedule=scheduler)

In [None]:
# Disabled cell: the Keras Tuner setup below sits inside a string literal, so
# none of it is executed; the cell merely evaluates to that string.
'''''

#Keras Auto Tuner
import keras_tuner as kt

tuner = kt.BayesianOptimization(
    build_model,
    objective = 'loss',
    max_trials = 20,
    executions_per_trial = 2,
    directory = "kt_test",
    overwrite = False
)
'''''


In [None]:
# Disabled cell: the search-space summary call is kept inside a string literal
# and is therefore not executed.
'''''

tuner.search_space_summary()

'''''


In [None]:
# Disabled cell: the tuner search call is kept inside a string literal and is
# therefore not executed.
'''''
tuner.search(
    X_train, y_train,
    batch_size = 20,
    epochs = 12,
    validation_split=0.2,
    callbacks=[
            EarlyStopping(
                monitor='val_loss', patience=3, restore_best_weights=True
            ),
            ModelCheckpoint(
                'roberta_uspppm.h5',
                monitor='val_loss',
                save_best_only=True,
                save_weights_only=True
            ),
            callback_lr
        ],
    shuffle=True
)
'''''


In [None]:
# Disabled cell: printing of the best hyperparameters is kept inside a string
# literal and is therefore not executed.
'''''

top_n = 5
best_hps = tuner.get_best_hyperparameters(top_n)

for hp in best_hps:
    print("Hyperparameters:")
    for key, value in hp.values.items():
        print(f"{key}: {value}")
    print("\n")
    
'''''



In [None]:
def train_model(train_col, X_train_input_ids, X_train_attention_masks):
    """Build a fresh model and fit it on the given encoded examples.

    Args:
        train_col: Sequence of ``[content, wording]`` target pairs, one per
            example (anything ``np.asarray`` turns into an ``(N, 2)`` array).
        X_train_input_ids: Token-id array for the examples.
        X_train_attention_masks: Attention-mask array aligned with
            ``X_train_input_ids``.

    Returns:
        The fitted Keras model. Because early stopping restores the best
        weights, the model holds the weights of the epoch with the lowest
        ``val_loss`` when early stopping triggers.

    Raises:
        ValueError: If ``train_col`` does not form an ``(N, 2)`` array.
    """
    model = build_model()

    targets = np.asarray(train_col, dtype="float32")
    if targets.ndim != 2 or targets.shape[1] != 2:
        raise ValueError(
            "train_col must contain one [content, wording] pair per example; "
            f"got an array of shape {targets.shape}"
        )

    # Give every output head its own target column. Passing the single (N, 2)
    # array would hand the same two-column target to both heads, so each head
    # would be fitted against content and wording at once.
    targets_by_output = {
        "content_output": targets[:, 0:1],
        "wording_output": targets[:, 1:2],
    }

    model.fit(
        [np.asarray(X_train_input_ids), np.asarray(X_train_attention_masks)],
        targets_by_output,
        epochs=12,
        shuffle=True,
        callbacks=[
            EarlyStopping(
                monitor='val_loss', patience=2, restore_best_weights=True
            ),
            # The checkpoint path is fixed, so each call overwrites the file
            # written by the previous one.
            ModelCheckpoint(
                'roberta_uspppm.h5',
                monitor='val_loss',
                save_best_only=True,
                save_weights_only=True
            ),
            callback_lr
        ],
        batch_size=36,
        validation_split=0.2
    )

    return model

In [None]:
# Number of cross-validation folds, and the shuffled splitter (seeded with the
# notebook-wide seed) that produces them.
k = 5
kf = KFold(random_state=seed, shuffle=True, n_splits=k)

In [None]:
# Train one model per fold and keep the one that scores best on the fold's
# held-out data.
#
# Each model is scored with model.evaluate() on the indices KFold held out,
# i.e. on data the model never saw. The score therefore belongs to the weights
# the model actually ends up with, rather than to whichever epoch happened to
# run last before early stopping.

best_val_metrics = float('inf')
best_model = None

for fold_index, (train_index, val_index) in enumerate(kf.split(X_train['input_ids'])):
    print(f"Training Fold {fold_index + 1}/{k}")

    X_train_input_ids_fold = X_train['input_ids'][train_index]
    X_train_attention_masks_fold = X_train['attention_masks'][train_index]
    y_train_fold = [y_train[i] for i in train_index]

    model = train_model(y_train_fold, X_train_input_ids_fold, X_train_attention_masks_fold)

    # Held-out targets, split into one column per output head (keyed by the
    # output layer names).
    y_val_fold = np.asarray([y_train[i] for i in val_index], dtype="float32")
    fold_scores = model.evaluate(
        [X_train['input_ids'][val_index], X_train['attention_masks'][val_index]],
        {"content_output": y_val_fold[:, 0:1], "wording_output": y_val_fold[:, 1:2]},
        batch_size=36,
        verbose=0,
        return_dict=True,
    )

    val_content_mse = fold_scores['content_output_mse']
    val_wording_mse = fold_scores['wording_output_mse']
    average_val_mse = (val_content_mse + val_wording_mse) / 2
    print(f"Fold {fold_index + 1} held-out MSE: {average_val_mse:.4f}")

    if average_val_mse < best_val_metrics:
        best_val_metrics = average_val_mse
        best_model = model

if best_model is not None:
    best_model.save("best_model.h5")

In [None]:
# Encode the test (prompt title, student summary) pairs the same way as the
# training pairs.
X_test = encoder(
    submission_test_data["prompt_title"].tolist(),
    submission_test_data["text"].tolist(),
)

In [None]:
# Run the selected model on the encoded test set; with two output heads the
# result holds one prediction array per head.
test_inputs = (np.array(X_test['input_ids']), np.array(X_test['attention_masks']))
pred = best_model.predict(test_inputs)
pred

In [None]:
sample_submission = pd.read_csv("/kaggle/input/commonlit-evaluate-student-summaries/sample_submission.csv")

# Fill the target columns from the model's predictions, which are indexed in
# the same order as target_cols.
# NOTE(review): this assumes the rows of sample_submission are in the same
# order as the rows that were encoded into X_test - confirm.
target_cols = ["content", "wording"]
for col_name, col_pred in zip(target_cols, pred):
    # Flatten each head's predictions to 1-D so the column assignment does not
    # depend on pandas accepting a two-dimensional array.
    sample_submission[col_name] = np.ravel(col_pred)


In [None]:
# Write the filled-in submission to disk, leaving out the DataFrame index.
sample_submission.to_csv(path_or_buf="submission.csv", index=False)

In [None]:
# Preview the first rows of the written submission.
sample_submission.head()