# **CommonLit Readability**

## **Import libraries**

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pylab import rcParams
import torch
import transformers
from transformers import BertModel, BertTokenizer, RobertaTokenizer, RobertaModel
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasRegressor, KerasClassifier
from sklearn.model_selection import KFold, StratifiedKFold
from keras import optimizers
import json
import itertools
import warnings
%matplotlib inline

In [None]:
# Pick the first CUDA GPU when one is available, otherwise fall back to the CPU.
# NOTE(review): `device` is not referenced anywhere else in the visible notebook —
# confirm whether the model and tensors were meant to be moved onto it.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
# Do not truncate long cell contents (the excerpts) when DataFrames are displayed.
pd.set_option("display.max_colwidth", None)

In [None]:
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings("ignore")

## **Load data**

#### Load and describe the training data

In [None]:
# Read the competition training CSV and discard the two licensing columns,
# which are not used anywhere later in the notebook.
filename = "../input/commonlitreadabilityprize/train.csv"
df_train = pd.read_csv(filename).drop(columns=["url_legal", "license"])

In [None]:
# Preview the first training row.
df_train.head(1)

In [None]:
# Print the column dtypes and non-null counts of the training data.
df_train.info()

In [None]:
# Summary statistics of the readability target.
df_train['target'].describe()

In [None]:
# Mean and standard deviation of the target; both are reused further down to build
# the lower/upper bounds that define the binary `group` label.
# NOTE(review): these are computed before the later cell that drops rows with
# target == 0, so they include those rows — confirm that this is intended.
std = df_train['target'].std()
mean = df_train['target'].mean()
print('mean:', mean)
print('std: ', std)

#### Load the test data

In [None]:
# Read the competition test CSV and discard the same two licensing columns
# that are removed from the training data.
filename = "../input/commonlitreadabilityprize/test.csv"
df_test = pd.read_csv(filename).drop(columns=["url_legal", "license"])

In [None]:
# Preview the first test row.
df_test.head(1)

## **Transform data**

#### Remove new lines

In [None]:
def to_string(row_text):
  """Collapse a multi-line text into a single line.

  The text is split on newline characters and every resulting piece is
  prefixed with one space, so the result always starts with a space and
  consecutive newlines become consecutive spaces.

  Args:
    row_text: the text to flatten (a ``str``).

  Returns:
    The flattened single-line string.
  """
  # A single join avoids rebuilding the string on every iteration.
  return "".join(" " + line for line in row_text.split('\n'))

#### Remove new lines from the training data

In [None]:
# Flatten every training excerpt to a single line with to_string.
df_train['excerpt'] = df_train['excerpt'].apply(to_string)

In [None]:
# Check the first training row after the newline removal.
df_train.head(1)

#### Remove new lines from the test data

In [None]:
# Flatten every test excerpt to a single line with to_string.
df_test['excerpt'] = df_test['excerpt'].apply(to_string)

In [None]:
# Check the first test row after the newline removal.
df_test.head(1)

## **Exploratory Data Analysis**

In [None]:
# Plot styling for the whole notebook: seaborn dark grid and a 9x6 default figure size.
sns.set_style("darkgrid")
rcParams['figure.figsize'] = 9, 6

In [None]:
# Kernel density estimate of the target values.
# NOTE(review): `shade` is reported as deprecated in newer seaborn releases in
# favour of `fill` — confirm against the installed seaborn version.
sns.kdeplot(df_train.target, shade=True, color="r")
plt.xlabel('Average ratings')
plt.show()

In [None]:
# Kernel density estimate of the per-excerpt standard errors.
# NOTE(review): `shade` is reported as deprecated in newer seaborn releases in
# favour of `fill` — confirm against the installed seaborn version.
sns.kdeplot(df_train.standard_error, shade=True, color="r")
plt.xlabel('Standard errors')
plt.show()

In [None]:
# Scatter plot of target against standard error. The orange "remove" arrow points
# at the data point (0, 0); rows with target == 0 are dropped in the next cell.
x=df_train['target']
y=df_train['standard_error']
plt.scatter(x=x, y=y)
# xy is in data coordinates; the label position (xytext) is in axes-fraction coordinates.
plt.annotate("remove", xy=(0, 0), arrowprops=dict(facecolor='orange', shrink=0.05), 
             xytext=(0.6, 0.3), textcoords='axes fraction', fontsize=12, weight='bold',
             horizontalalignment='right', verticalalignment='top', color='orange')
plt.xlabel('Targets')
plt.ylabel('Standard errors')
plt.show()

In [None]:
# Remove the rows whose target is exactly 0 (the point flagged as "remove"
# in the scatter plot of target vs. standard error).
ind = df_train.index[(df_train['target'] == 0).to_numpy()]
df_train = df_train.drop(index=ind)

In [None]:
# Band of one standard deviation around the mean target. `mean` and `std` come from
# the earlier statistics cell; group_by later labels targets outside this band as 1.
lower_bound = mean - std
upper_bound = mean + std
lower_bound, upper_bound

In [None]:
# Same scatter plot after the target == 0 rows were dropped, with the lower and
# upper bounds drawn as dashed vertical lines spanning the full axis height.
plt.scatter(x=df_train['target'], y=df_train['standard_error'])

plt.axvline(x=lower_bound, ymin=0, ymax=1, linewidth=1.5, linestyle="--", color='darkorchid')
plt.axvline(x=upper_bound, ymin=0, ymax=1, linewidth=1.5, linestyle="--", color='darkorchid')

plt.xlabel('Targets')
plt.ylabel('Standard errors')
plt.show()

In [None]:
# Range of the target values that remain after the target == 0 rows were dropped.
min_value = df_train["target"].min()
max_value = df_train["target"].max()
print("min: ",  min_value)
print("max: ",  max_value)

## **Choose sequence length**

In [None]:
# Checkpoint name passed to both the tokenizer and the encoder model below.
# The commented-out line is the BERT alternative.
PRE_TRAINED_MODEL = "roberta-base"
#PRE_TRAINED_MODEL = 'bert-base-uncased'

In [None]:
# Tokenizer matching PRE_TRAINED_MODEL; the commented-out line is the BERT alternative.
# NOTE(review): `do_lower_case` is a BERT-style option and "roberta-base" is a cased
# checkpoint; whether this flag lowercases the input or is ignored presumably depends
# on the installed transformers version — confirm that it is intended.
tokenizer = RobertaTokenizer.from_pretrained(PRE_TRAINED_MODEL, do_lower_case=True)
#tokenizer = transformers.BertTokenizer.from_pretrained(PRE_TRAINED_MODEL)

In [None]:
%%time

# For each dataset: count the tokens of every excerpt, print the smallest and
# largest count, and plot the distribution. The counts guide the MAX_LEN choice.
for label, frame in [("training data", df_train), ("test data", df_test)]:
  excerpt_tokens = [len(tokenizer.tokenize(text)) for text in frame.excerpt]

  min_tokens, max_tokens = min(excerpt_tokens), max(excerpt_tokens)
  print(label, ":")
  print("-" * 100)
  print('min ve max tokens:', min_tokens, max_tokens)
  print('\n')

  # Histogram of token counts, with 50 tokens of margin on both sides of the x axis.
  sns.distplot(excerpt_tokens)
  plt.xlim([min_tokens - 50, max_tokens + 50])
  plt.xlabel('Token count')
  plt.show()

  print('\n')


## **Parameters-I**

In [None]:
# MAX_LEN: max_length passed to the tokenizer in encode_excerpt (pad/truncate length).
MAX_LEN = 320
# RANDOM_SEED: random_state of the train/validation split in split_train_val.
RANDOM_SEED = 42
# SPLIT_RATIO: value passed as split_ratio (the test_size) when the data is split.
SPLIT_RATIO = 0.2

## **Encoding**

#### Add input_ids, attention_mask, last_hidden_states columns

In [None]:
# Run above

# Create the three columns with empty-string placeholders; they are filled by the
# encoding and embedding cells that follow.
df_train['input_ids']=''
df_train['attention_mask']=''
df_train['last_hidden_states']=''

#### Encode excerpts

In [None]:
def encode_excerpt(row_data):
    """Tokenize one excerpt into fixed-length model inputs.

    The text is encoded with the module-level ``tokenizer``, with special
    tokens added, and is truncated or padded to exactly ``MAX_LEN`` tokens.

    Args:
        row_data: the excerpt text (a ``str``).

    Returns:
        A ``pd.Series`` of two PyTorch tensors: the input ids at position 0
        and the attention mask at position 1 (presumably each of shape
        ``(1, MAX_LEN)`` since a single text is encoded — TODO confirm).
    """
    encoding = tokenizer.encode_plus(
        row_data,
        max_length=MAX_LEN,
        truncation=True,
        add_special_tokens=True,
        # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
        padding='max_length',
        return_attention_mask=True,
        return_token_type_ids=False,
        return_tensors='pt'
    )

    return pd.Series([encoding['input_ids'], encoding['attention_mask']])

In [None]:
%%time

# Encode every excerpt; encode_excerpt returns a two-element Series per row, so the
# result of apply has two columns, which are stored as input_ids and attention_mask.
df_train[['input_ids', 'attention_mask']] = df_train['excerpt'].apply(encode_excerpt)

In [None]:
# Check the first row after encoding.
df_train.head(1)

## **Embeddings**

#### Model definition

In [None]:
# Load the pretrained encoder for PRE_TRAINED_MODEL; the commented-out line is the
# BERT alternative.
# NOTE(review): the model is never moved to `device` in the visible notebook, so
# the embeddings are presumably computed on the CPU — confirm.
model = RobertaModel.from_pretrained(PRE_TRAINED_MODEL) # Run
#model = BertModel.from_pretrained(PRE_TRAINED_MODEL) # Run

#### Find excerpt embeddings (first-token hidden state)

In [None]:
def find_last_hidden_states(input_ids, attention_mask):
    """Return the first-token hidden state of the encoded input as a flat list.

    The module-level ``model`` is run without gradient tracking. From its
    last hidden state, only the vector at sequence position 0 is kept for
    every item in the batch, and those vectors are concatenated into one list.

    Args:
        input_ids: token-id tensor as produced by ``encode_excerpt``.
        attention_mask: attention-mask tensor as produced by ``encode_excerpt``.

    Returns:
        A flat Python list holding the scalar values of the position-0 vectors.
    """
    # Inference only, so no gradients need to be recorded.
    with torch.no_grad():
        # With return_dict=False the model returns a plain tuple; the second
        # element (the pooled output) is not used.
        hidden_states, _pooled = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False,
        )
        first_token = hidden_states[:, 0, :].numpy()

    # Concatenate the per-item vectors into one flat list.
    flat_features = []
    for vector in first_token:
        flat_features.extend(vector)
    return flat_features

# Observed shape of last_hidden_state for one excerpt: torch.Size([1, 320, 768])

In [None]:
%%time

# Compute the embedding of every excerpt, row by row, from its encoded
# input ids and attention mask.
df_train['last_hidden_states'] = df_train[['input_ids', 'attention_mask']].apply(
    lambda encoded: find_last_hidden_states(encoded['input_ids'], encoded['attention_mask']),
    axis=1,
)

In [None]:
# Check the first row after the embeddings were added.
df_train.head(1)

## **Save and load new data**

#### Save the training data

In [None]:
# Persist the training data together with its embeddings. The file name is derived
# from the checkpoint family ("roberta" for "roberta-base", "bert" for
# "bert-base-uncased") so that the saved file is labelled with the model that
# actually produced the embeddings.
df_train.to_pickle(f"./train_data_{PRE_TRAINED_MODEL.split('-')[0]}_embed.pkl")

#### Load the training data with embeddings

In [None]:
# Run below

# Load the precomputed training data with RoBERTa embeddings; the commented-out
# lines load the BERT variant instead.
# NOTE(review): unpickling executes code from the file — only load pickles from a
# trusted source.
filename = "../input/commonlit-readability/train_data_roberta_embed.pkl"
df_embed = pd.read_pickle(filename)
# filename = "../input/commonlit-readability/train_data_bert_embed.pkl"
# df_embed = pd.read_pickle(filename)

In [None]:
# Preview the first row of the loaded data.
df_embed.head(1)

In [None]:
# Embedding length, used as the input dimension of the Keras models below.
# `.iloc[0]` takes the first row by position; a `[0]` lookup on the Series is
# label-based and fails when the index label 0 is not present (e.g. after rows
# have been dropped).
input_size = len(df_embed['last_hidden_states'].iloc[0])
input_size

## **Train-validation data features and labels**

In [None]:
def split_train_val(full_data, split_ratio):
  """Split a dataset into a training part and a validation part.

  The split is delegated to ``train_test_split`` and seeded with the
  module-level ``RANDOM_SEED``. The shape of both parts is printed.

  Args:
    full_data: the complete dataset to split.
    split_ratio: passed to ``train_test_split`` as ``test_size``.

  Returns:
    A ``(training part, validation part)`` tuple.
  """
  train_part, val_part = train_test_split(
      full_data, test_size=split_ratio, random_state=RANDOM_SEED
  )

  for label, part in (("training data:", train_part), ("validation data:", val_part)):
    print(label, part.shape)

  return train_part, val_part

## **Prepare the dataset**

#### Extend embeddings across df columns

In [None]:
# Spread every embedding list over its own columns, named 1..input_size,
# keeping the rows aligned through the original index.
df_embed[list(range(1, input_size + 1))] = pd.DataFrame(
    df_embed["last_hidden_states"].tolist(),
    index=df_embed.index,
)

In [None]:
# Insert an empty 'group' column at column position 4; it is filled by group_by below.
df_embed.insert(4,'group','')

In [None]:
def group_by(row):
  """Label a target value by its position relative to the bounds.

  Args:
    row: a single target value.

  Returns:
    1 when the value is below the module-level ``lower_bound`` or above the
    module-level ``upper_bound``, otherwise 0.
  """
  outside_band = row < lower_bound or row > upper_bound
  return 1 if outside_band else 0

In [None]:
# Derive the binary group label (1 = outside the bounds, 0 = inside) from every target.
df_embed['group'] = df_embed['target'].apply(group_by)
df_embed.head(1)

#### Simplify the dataset

In [None]:
# Keep only the columns needed for modelling: drop the raw text, the standard
# error, and the intermediate tokenizer/encoder outputs.
df_simplified = df_embed.drop(
    columns=["excerpt", "standard_error", "input_ids", "attention_mask", "last_hidden_states"]
)

In [None]:
# Preview the simplified dataset.
df_simplified.head()

#### Split the dataset

In [None]:
# Split the simplified dataset, holding out the SPLIT_RATIO fraction for validation.
df_train_set, df_val_set = split_train_val(df_simplified, SPLIT_RATIO)

In [None]:
# Convert both splits to plain arrays so that columns can be sliced by position below.
train_dataset = df_train_set.values
val_dataset = df_val_set.values

In [None]:
# Training features and labels, selected by column position: column 2 is the label
# and columns 3 onward are the features.
# NOTE(review): this presumes the column order [id, target, group, embedding...] in
# df_simplified, i.e. the label is `group` — verify against the loaded data.
# The commented-out lines use offsets 1 / 2, presumably for the `target` label.
# X_train = train_dataset[:,2:].tolist()
#y_train = train_dataset[:,1].tolist()
X_train = train_dataset[:,3:].tolist()
y_train = train_dataset[:,2].tolist()

In [None]:
# Validation features and labels, selected with the same column positions as the
# training split: column 2 is the label and columns 3 onward are the features.
# NOTE(review): this presumes the column order [id, target, group, embedding...] in
# df_simplified — verify against the loaded data.
#X_val = val_dataset[:,2:].tolist()
# y_val = val_dataset[:,1].tolist()
X_val = val_dataset[:,3:].tolist()
y_val = val_dataset[:,2].tolist()

#### Define the keras model

In [None]:
def create_model(optimizer, activation_function, init):
    """Build and compile the binary classifier used by the grid search.

    Architecture: a dense layer of width ``input_size`` (module-level), a dense
    layer of width 3, dropout with rate 0.1 and a single sigmoid output unit.

    Args:
        optimizer: optimizer name or instance handed to ``compile``.
        activation_function: activation of the two hidden dense layers.
        init: kernel initializer of all dense layers.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    keras_model = Sequential()
    keras_model.add(Dense(input_size, input_dim=input_size, kernel_initializer=init, activation=activation_function))
    keras_model.add(Dense(3, kernel_initializer=init, activation=activation_function))
    keras_model.add(Dropout(0.1))
    # The output must be a probability: 'binary_crossentropy' given by name expects
    # probabilities, not raw logits, so the output unit needs a sigmoid.
    keras_model.add(Dense(1, kernel_initializer=init, activation='sigmoid'))

    # Use the optimizer that was passed in; a hard-coded optimizer here would make
    # a search over the `optimizer` parameter meaningless.
    keras_model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    return keras_model

#### Tuning for the best model

In [None]:
%%time

# Grid search over optimizer, activation function, initializer, epochs and batch size.
# The list of optimizer names is deliberately NOT called `optimizers`: that name is
# the keras module imported at the top of the notebook and is used further down as
# `optimizers.Adam(...)`; rebinding it to a list breaks that later call.
optimizer_names = ['Adam', 'Adamax'] # 'SGD', 'RMSprop', 'Adagrad', 'Adadelta', 'Adam', 'Adamax', 'Nadam'
activation_functions = ['relu','tanh', 'linear', 'sigmoid'] # 'softmax', 'softplus', 'softsign', 'relu', 'tanh', 'sigmoid', 'hard_sigmoid', 'linear'
inits = ['glorot_uniform', 'normal', 'uniform'] # 'uniform', 'lecun_uniform', 'normal', 'zero', 'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform'
epochs = [50]
batches = [16, 32]
kfold = 5

# Scikit-learn wrapper around create_model; the epochs and batch_size given here
# are overridden by the values in the parameter grid.
# keras_model = KerasRegressor(build_fn=create_model, epochs=25, batch_size=16, verbose=0)
keras_model = KerasClassifier(build_fn=create_model, epochs=25, batch_size=16, verbose=0)

# The `optimizer`, `activation_function` and `init` keys are forwarded to create_model.
param_grid = dict(optimizer=optimizer_names,
                  activation_function=activation_functions,
                  epochs=epochs,
                  batch_size=batches,
                  init=inits,
                 )

# Exhaustive search with `kfold`-fold cross-validation; the best configuration is
# refitted on the full training split.
grid = GridSearchCV(estimator=keras_model, param_grid=param_grid, cv=kfold, n_jobs=-1, refit=True, verbose=3)
grid_result = grid.fit(X_train, y_train)


In [None]:
# Report the best cross-validated score and the parameter combination that achieved it.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

In [None]:
# Store the outcome of the grid search as JSON next to the notebook and display it.
grid_results = {
    "best_score": grid_result.best_score_,
    "best_parameters": grid_result.best_params_,
}
with open('./grid_results.json', 'w') as out_file:
    json.dump(grid_results, out_file, indent=4)
grid_results

#### Evaluation

In [None]:
# Hyperparameters of the final model below.
# EPOCHS and BATCH_SIZE are passed to fit().
EPOCHS = 50
BATCH_SIZE = 32
# NOTE(review): OPTIMIZER is only referenced from commented-out code in the visible
# notebook; the final model constructs Adam directly.
OPTIMIZER = 'Adam'
# Initial learning rate of the exponential-decay schedule.
LEARNING_RATE = 1e-2
# Activation and kernel initializer of the hidden layer of the final model.
ACTIVATION_FUNCTION = 'relu'
INIT = 'uniform'

In [None]:
# %%time

# estimator = KerasRegressor(build_fn=create_model(OPTIMIZER, ACTIVATION_FUNCTION, INIT), epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=1)
# kfold = KFold(n_splits=5)
# results = cross_val_score(estimator, X_val, y_val, cv=kfold)
# print("Baseline: %.2f (%.2f) MSE" % (abs(results.mean()), results.std()))
# print(f"MSE: {abs(results.mean())}")

In [None]:
# Learning-rate schedule for the final model: starts at LEARNING_RATE and, with
# staircase=True, is multiplied by 0.96 once per 100000 optimizer steps.
# NOTE(review): with EPOCHS = 50 and BATCH_SIZE = 32 the run presumably never
# reaches 100000 steps, which would leave the rate constant — confirm the
# intended decay_steps.
initial_learning_rate = LEARNING_RATE
lr_schedule = keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate,
    decay_steps=100000,
    decay_rate=0.96,
    staircase=True)

In [None]:
%%time

# Final binary classifier: one hidden dense layer as wide as the embedding,
# followed by a single sigmoid output unit.
keras_model = Sequential([
    Dense(input_size, input_dim=input_size, kernel_initializer=INIT, activation=ACTIVATION_FUNCTION),
    Dense(1, activation='sigmoid'),
])

# Binary cross-entropy loss, Adam driven by the learning-rate schedule, accuracy as metric.
keras_model.compile(
    loss='binary_crossentropy',
    optimizer=optimizers.Adam(learning_rate=lr_schedule),
    metrics=['accuracy'],
)

# Train on the training split and evaluate on the validation split after every epoch.
history = keras_model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
)

In [None]:
# Best validation results over all epochs: lowest validation loss and highest
# validation accuracy (not necessarily reached in the same epoch).
history_dict = history.history
val_loss_values = history_dict['val_loss']
val_accuracy = history_dict['val_accuracy']

print('minimum value loss:', min(val_loss_values))
print('maximum value accuracy:', max(val_accuracy))

In [None]:
# Plot the training history: accuracy on the left, loss on the right.
history_dict = history.history
loss_values = history_dict['loss']
val_loss_values = history_dict['val_loss']
# The model is compiled with metrics=['accuracy'], so the history holds the keys
# 'accuracy' / 'val_accuracy'; there are no 'mae' / 'val_mae' entries.
accuracy = history_dict['accuracy']
val_accuracy = history_dict['val_accuracy']

epochs = range(1, len(loss_values) + 1)
fig, ax = plt.subplots(1, 2, figsize=(14, 6))

# Plot the model accuracy vs Epochs
# (only `color=` is given: combining it with a 'b' format string specifies the
# line colour twice)
ax[0].plot(epochs, accuracy, color='#b3b300', label='Training accuracy')
ax[0].plot(epochs, val_accuracy, color='#cc5200', label='Validation accuracy')
ax[0].set_title('Training & Validation Accuracy', fontsize=16)
ax[0].set_xlabel('Epochs', fontsize=16)
ax[0].set_ylabel('Accuracy', fontsize=16)
ax[0].legend()

# Plot the loss vs Epochs
ax[1].plot(epochs, loss_values, color='#b3b300', label='Training loss')
ax[1].plot(epochs, val_loss_values, color='#cc5200', label='Validation loss')
ax[1].set_title('Training & Validation Loss', fontsize=16)
ax[1].set_xlabel('Epochs', fontsize=16)
ax[1].set_ylabel('Loss', fontsize=16)
ax[1].legend()

plt.show()

In [None]:
# Square root of the lowest validation loss.
# NOTE(review): `val_loss_values` comes from the history of a model compiled with
# loss='binary_crossentropy', so this minimum is a cross-entropy value, not a mean
# squared error. The `mse` / `rmse` names and the square root look like leftovers
# from the regression variant that is commented out elsewhere in this notebook —
# confirm before reporting this number as an RMSE.
mse = min(val_loss_values)
rmse = np.sqrt(mse)
rmse