# **CommonLit Readability**

## **Import libraries**

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pylab import rcParams
import torch
import transformers
from transformers import RobertaTokenizer, RobertaModel
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import itertools
import warnings
%matplotlib inline

In [None]:
# Select the first CUDA GPU when one is available, otherwise the CPU.
# NOTE(review): no cell visible in this notebook moves the model or the encoded
# tensors to `device`, so inference appears to run on the CPU — confirm.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
# Do not truncate long cell contents (e.g. full excerpts) when displaying frames.
pd.set_option("display.max_colwidth", None)

In [None]:
# Silence every warning for the rest of the notebook.
# NOTE(review): this also hides deprecation warnings from the libraries used
# below; consider narrowing the filter while developing.
warnings.filterwarnings("ignore")

## **Load data**

#### Load and describe the training data

In [None]:
# Read the training CSV and discard the two licensing metadata columns,
# which no later cell in this notebook reads.
filename = "../input/commonlitreadabilityprize/train.csv"
df_train = pd.read_csv(filename).drop(columns=["url_legal", "license"])

In [None]:
df_train.head(1)

In [None]:
df_train.info()

In [None]:
df_train['target'].describe()

In [None]:
# Mean and (sample) standard deviation of the target column.
# NOTE(review): these assignments rebind the names `mean` and `std` that the
# import cell took from numpy; later cells use them as plain numbers.
mean, std = df_train['target'].mean(), df_train['target'].std()
print('mean:', mean)
print('std: ', std)

#### Load the test data

In [None]:
# Read the test CSV and discard the two licensing metadata columns,
# mirroring what is done for the training data.
filename = "../input/commonlitreadabilityprize/test.csv"
df_test = pd.read_csv(filename).drop(columns=["url_legal", "license"])

In [None]:
df_test.head(1)

## **Transform data**

#### Remove new lines

In [None]:
def to_string(row_text):
    """Flatten a multi-line text into a single line.

    Every newline in ``row_text`` becomes one space, and the result is
    prefixed with a single leading space.

    Args:
        row_text: Text that may contain ``'\\n'`` characters.

    Returns:
        The flattened string (always starts with a space).
    """
    pieces = row_text.split('\n')
    return " " + " ".join(pieces)

#### Remove new lines from the training data

In [None]:
# Flatten every training excerpt to a single line.
df_train['excerpt'] = df_train['excerpt'].map(to_string)

In [None]:
df_train.head(1)

#### Remove new lines from the test data

In [None]:
# Flatten every test excerpt to a single line.
df_test['excerpt'] = df_test['excerpt'].map(to_string)

In [None]:
df_test.head(1)

## **Exploratory Data Analysis**

In [None]:
# Plot defaults for the whole notebook: dark grid theme, 9x6 inch figures.
sns.set_style("darkgrid")
rcParams['figure.figsize'] = (9, 6)

In [None]:
# Kernel density estimate of the target column.
# `fill=True` is the current spelling of the deprecated `shade=True` alias
# and draws the same filled area under the curve.
sns.kdeplot(df_train.target, fill=True, color="r")
plt.xlabel('Average ratings')
plt.show()

In [None]:
# Kernel density estimate of the standard_error column.
# `fill=True` is the current spelling of the deprecated `shade=True` alias
# and draws the same filled area under the curve.
sns.kdeplot(df_train.standard_error, fill=True, color="r")
plt.xlabel('Standard errors')
plt.show()

In [None]:
# Standard error plotted against target. The orange arrow points at the
# data coordinate (0, 0) and labels it "remove".
plt.scatter(x=df_train['target'], y=df_train['standard_error'])
plt.annotate(
    "remove",
    xy=(0, 0),
    xytext=(0.6, 0.3),
    textcoords='axes fraction',
    arrowprops=dict(facecolor='orange', shrink=0.05),
    fontsize=12,
    weight='bold',
    color='orange',
    horizontalalignment='right',
    verticalalignment='top',
)
plt.xlabel('Targets')
plt.ylabel('Standard errors')
plt.show()

In [None]:
# Remove the rows whose target is exactly 0 (the point marked "remove" above);
# the remaining rows keep their original index labels.
df_train = df_train[df_train['target'] != 0]

In [None]:
# One-standard-deviation band around the mean target.
# The statistics are taken from the current df_train so that they describe the
# rows that are actually plotted (the target == 0 rows have already been
# dropped) and so that this cell does not depend on the `mean`/`std` names,
# which are also bound to numpy functions by the import cell.
lower_bound = df_train['target'].mean() - df_train['target'].std()
upper_bound = df_train['target'].mean() + df_train['target'].std()
lower_bound, upper_bound

In [None]:
# Same scatter as before, now with dashed vertical lines at both bounds.
plt.scatter(x=df_train['target'], y=df_train['standard_error'])

for bound in (lower_bound, upper_bound):
    plt.axvline(x=bound, ymin=0, ymax=1, linewidth=1.5, linestyle="--", color='darkorchid')

plt.xlabel('Targets')
plt.ylabel('Standard errors')
plt.show()

In [None]:
# Range of the target column after the row removal above.
min_value, max_value = df_train["target"].min(), df_train["target"].max()
print("min: ",  min_value)
print("max: ",  max_value)

## **Choose sequence length**

In [None]:
PRE_TRAINED_MODEL = 'roberta-base'

In [None]:
# Load the pretrained tokenizer that matches PRE_TRAINED_MODEL.
# NOTE(review): RoBERTa checkpoints are, as far as I know, case-sensitive, and
# whether `do_lower_case=True` is ignored or actually lower-cases the input
# depends on the installed transformers version — confirm this is intended.
tokenizer = RobertaTokenizer.from_pretrained(PRE_TRAINED_MODEL, do_lower_case=True)

In [None]:
%%time

# For each data set: count the tokens of every excerpt, print the shortest and
# longest counts, and plot the distribution. The result informs MAX_LEN below.
for label, frame in [("training data", df_train), ("test data", df_test)]:
    excerpt_tokens = [len(tokenizer.tokenize(text)) for text in frame.excerpt]

    min_tokens, max_tokens = min(excerpt_tokens), max(excerpt_tokens)
    print(label, ":")
    print("-" * 100)
    # "ve" is Turkish for "and"; the printed label is kept verbatim.
    print('min ve max tokens:', min_tokens, max_tokens)
    print('\n')

    sns.distplot(excerpt_tokens)
    # Leave a 50-token margin on both sides of the observed range.
    plt.xlim([min_tokens - 50, max_tokens + 50])
    plt.xlabel('Token count')
    plt.show()

    print('\n')


## **Parameters**

In [None]:
# Seed passed as random_state to the train/validation split.
RANDOM_SEED = 42
# Token length every excerpt is truncated or padded to when encoded.
MAX_LEN = 320
# Fraction of rows held out as the validation set.
SPLIT_RATIO = 0.2
# Targets are multiplied by (1 + MULTIPLIER) before modelling.
MULTIPLIER = 0.10

## **Encoding**

#### Add input_ids, attention_mask, last_hidden_states columns

In [None]:
# Create the three result columns up front, filled with empty strings;
# they are overwritten by the encoding and embedding steps below.
for column in ('input_ids', 'attention_mask', 'last_hidden_states'):
    df_train[column] = ''

#### Encode excerpts

In [None]:
def encode_excerpt(row_data):
    """Tokenize one excerpt into fixed-length model inputs.

    Uses the module-level ``tokenizer``; the text is truncated or padded so
    that the encoding is exactly ``MAX_LEN`` tokens long, special tokens
    included.

    Args:
        row_data: Excerpt text to encode.

    Returns:
        A ``pd.Series`` with two PyTorch tensors, in this order: the token
        ids and the attention mask.
    """
    encoding = tokenizer.encode_plus(
        row_data,
        max_length=MAX_LEN,
        truncation=True,
        add_special_tokens=True,
        # Replaces the deprecated `pad_to_max_length=True`; pads every
        # excerpt up to `max_length`.
        padding='max_length',
        return_attention_mask=True,
        return_token_type_ids=False,
        return_tensors='pt',
    )

    return pd.Series([encoding['input_ids'], encoding['attention_mask']])

In [None]:
%%time

# Encode every excerpt; encode_excerpt returns a two-element Series per row,
# which fills the input_ids and attention_mask columns respectively.
df_train[['input_ids', 'attention_mask']] = df_train['excerpt'].apply(encode_excerpt)

In [None]:
df_train.head(1)

## **Embeddings**

#### Model definition

In [None]:
# Load the pretrained encoder named by PRE_TRAINED_MODEL; it is used below
# only for inference (inside torch.no_grad()), never trained in this notebook.
model = RobertaModel.from_pretrained(PRE_TRAINED_MODEL)

#### Extract the first-token embedding of each excerpt

In [None]:
def find_last_hidden_states(input_ids, attention_mask):
    """Return the first-token embedding of an encoded excerpt as a flat list.

    Runs the module-level ``model`` without gradient tracking, takes the
    vector at sequence position 0 of the last hidden state for every row in
    the batch, and concatenates those vectors into one Python list.

    Args:
        input_ids: Token-id tensor produced by the tokenizer.
        attention_mask: Matching attention-mask tensor.

    Returns:
        A flat list with the values of the position-0 hidden vector(s).
    """
    with torch.no_grad():
        # return_dict=False makes the model return a plain tuple, unpacked
        # here as (last_hidden_state, pooled_output).
        last_hidden_state, _pooled_output = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False,
        )
        # Only the position-0 vector is used; the pooled output is an
        # alternative feature that is currently not used.
        first_token = last_hidden_state[:, 0, :].numpy()
        flattened = list(itertools.chain.from_iterable(first_token))

    return flattened

# Output shapes for one excerpt padded to MAX_LEN (currently 320):
# last_hidden_state: torch.Size([1, MAX_LEN, 768])
# pooled_output: torch.Size([1, 768])

In [None]:
%%time

# Run the model once per row (one excerpt at a time) and store the resulting
# flat embedding list in the last_hidden_states column.
df_train['last_hidden_states'] = df_train[['input_ids', 'attention_mask']].apply(lambda row: find_last_hidden_states(row['input_ids'], row['attention_mask']), axis=1)

In [None]:
df_train.head(1)

## **Save and load new data**

#### Save the training data

In [None]:
# Persist the frame (including tensors and embeddings) in the working directory.
df_train.to_pickle("./train_data_roberta_embed.pkl")

#### Load the training data with embeddings

In [None]:
# Load a previously saved frame with embeddings.
# NOTE(review): this path differs from the one written by the save cell above
# ("./train_data_roberta_embed.pkl"); presumably the saved file was uploaded as
# a separate input dataset — confirm both refer to the same data.
# NOTE: unpickling executes arbitrary code; only load pickles you trust.
filename = "../input/commonlit-readability/train_data_roberta_embed.pkl"
df_bert_embed = pd.read_pickle(filename)

In [None]:
df_bert_embed.head(1)

In [None]:
# Length of one stored embedding vector, i.e. the number of features.
# `.iloc[0]` selects the first row by position. The previous `.head(1)[0]` was
# a label lookup that only works while a row with index label 0 exists, which
# is not guaranteed because rows are dropped from the training frame without
# resetting the index.
input_size = len(df_bert_embed['last_hidden_states'].iloc[0])
input_size

In [None]:
# Scale every target by (1 + MULTIPLIER).
# NOTE(review): the train/validation split happens after this step, so y_val is
# scaled too and the RMSE values reported below are on the scaled targets.
df_bert_embed['target'] = df_bert_embed['target'].mul(1 + MULTIPLIER)

In [None]:
df_bert_embed.head(1)

## **Train-validation data features and labels**

In [None]:
def split_train_val(full_data, split_ratio):
    """Split a frame into training and validation parts and report their shapes.

    The split is seeded with the module-level ``RANDOM_SEED``.

    Args:
        full_data: Data to split.
        split_ratio: Value passed to ``train_test_split`` as ``test_size``.

    Returns:
        Tuple ``(training part, validation part)``.
    """
    train_part, val_part = train_test_split(
        full_data, test_size=split_ratio, random_state=RANDOM_SEED
    )

    for label, part in (("training data:", train_part), ("validation data:", val_part)):
        print(label, part.shape)

    return train_part, val_part

In [None]:
# Hold out SPLIT_RATIO of the embedded rows as the validation set.
df_train_set, df_val_set = split_train_val(df_bert_embed, SPLIT_RATIO)

In [None]:
# Stack the per-row embedding lists into a 2-D feature matrix (rows x features).
X_train = np.array(list(df_train_set['last_hidden_states']))

In [None]:
# Training labels as a NumPy array.
y_train = df_train_set['target'].to_numpy()

In [None]:
# Stack the per-row embedding lists into a 2-D feature matrix (rows x features).
X_val = np.array(list(df_val_set['last_hidden_states']))

In [None]:
# Validation labels as a NumPy array.
y_val = df_val_set['target'].to_numpy()

## **Loss function**

In [None]:
def rmse_loss(y_val, y_pred):
    """Return the root-mean-squared error between targets and predictions.

    Args:
        y_val: True target values.
        y_pred: Predicted values, aligned with ``y_val``.

    Returns:
        Square root of the mean squared error.
    """
    return np.sqrt(mean_squared_error(y_val, y_pred))

## **Ridge / Lasso Regression**

#### Define model

In [None]:
# Linear model with L2 regularisation; the commented line is the Lasso
# alternative that was tried instead.
#reg_model = Lasso(alpha=0)
reg_model = Ridge(alpha=0.28)

#### Evaluate the model

In [None]:
# 5-fold cross-validation repeated 3 times with a fixed seed (15 fits per metric).
cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=1)

# Both scorers are negated errors, so the raw scores are <= 0.
scores_mse = cross_val_score(
    reg_model, X_train, y_train,
    scoring='neg_mean_squared_error', cv=cv, n_jobs=-1,
)
scores_mae = cross_val_score(
    reg_model, X_train, y_train,
    scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1,
)

# The MSE is reported as a positive mean; the MAE scores are printed raw.
print('Mean MSE: %.5f' % np.abs(scores_mse).mean())
print('\n')
print(scores_mae)

In [None]:
# Fit the regression model on the full training split.
reg_model.fit(X_train, y_train)

#### Predict

In [None]:
# Predict the held-out validation rows with the fitted regression model.
y_pred = reg_model.predict(X_val)

In [None]:
# Validation RMSE of the regression model's predictions.
rmse = rmse_loss(y_val, y_pred)
rmse

#### Tuning

In [None]:
# Candidate regularisation strengths: 0.00, 0.01, ..., 0.99.
# NOTE(review): the grid includes alpha=0, i.e. no regularisation at all —
# confirm that candidate is wanted.
grid = dict()
grid['alpha'] = np.arange(0, 1, 0.01)

# Exhaustive search over the grid, reusing the repeated k-fold splitter `cv`.
search = GridSearchCV(reg_model, grid, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)

# Run the search on the training split.
results = search.fit(X_train, y_train)

# With scoring='neg_mean_squared_error', best_score_ is the negated MSE;
# flip the sign so the value printed under the label "MSE" is the actual MSE.
print('MSE: %.3f' % (-results.best_score_))
print('Config: %s' % results.best_params_)

## **SVR Regression**

#### Tuning and training

In [None]:
# Hyperparameter candidates for the SVR search: 4 x 4 x 3 x 2 = 96 combinations.
param_grid = {
    'C': [0.01, 0.1, 1, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'epsilon': [0, 0.1, 0.2],
    'kernel': ['rbf', 'poly'],
}

In [None]:
%%time

# 5-fold grid search over param_grid; refit=True retrains the best candidate on
# the whole training split so `grid` can be used directly for prediction.
# No `scoring` is given, so candidates are ranked by the estimator's default
# `score` method rather than by RMSE.
grid = GridSearchCV(
    estimator=SVR(),
    param_grid=param_grid,
    cv=5,
    refit=True,
    n_jobs=-1,
    verbose=1,
)
grid.fit(X_train, y_train)

In [None]:
print(grid.best_params_)

In [None]:
print(grid.best_estimator_)

#### Train the model with fixed hyperparameters

In [None]:
# Hyperparameters for the manually configured SVR built in the next cell.
# NOTE(review): presumably copied from the grid-search result — confirm.
KERNEL = 'rbf'
C_VALUE = 100
GAMMA = 0.01
EPSILON = 0.2
# Passed to SVR(verbose=...) to enable solver output.
VERBOSE=2

In [None]:
svr_rbf_reg = SVR(kernel=KERNEL, C=C_VALUE, gamma=GAMMA, epsilon=EPSILON, verbose=VERBOSE)

In [None]:
svr_rbf_reg.fit(X_train, y_train)

#### Predict

In [None]:
# Validation predictions from the refitted best estimator of the grid search.
grid_predictions = grid.predict(X_val)

In [None]:
# Validation predictions from the manually configured SVR
# (this overwrites the earlier regression-model y_pred).
y_pred = svr_rbf_reg.predict(X_val)

In [None]:
# Validation RMSE of the grid-search model.
rmse = rmse_loss(y_val, grid_predictions)
rmse

In [None]:
# Validation RMSE of the manually configured SVR.
rmse = rmse_loss(y_val, y_pred)
rmse