In [None]:
import keras
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [None]:
!pip install simpletransformers

In [None]:
import warnings
warnings.simplefilter('ignore')
import gc
from scipy.special import softmax

from simpletransformers.classification.classification_model import ClassificationModel
from sklearn.metrics import mean_squared_error as mse

In [None]:
# Load the full dataset from a CSV file in the working directory.
df = pd.read_csv('df.csv')

In [None]:
# Split by ROW position: the first `split_at` rows are the training part, the
# remainder is the test part. (The previous `df.iloc[:, :n]` form sliced
# columns, which left `test` without any columns.)
# NOTE(review): 11685610 is presumably the number of training rows - confirm.
split_at = 11685610
# .copy() makes both parts independent frames, so later column assignments
# do not act on a slice of `df`.
train = df.iloc[:split_at].copy()
test = df.iloc[split_at:].copy()

In [None]:
# Preview the first rows of the training split.
train.head()

In [None]:
# Summary statistics of the character length of each training text.
# .str.len() is vectorised and yields NaN for missing entries instead of
# raising the way len() does on a non-string value.
train['text'].str.len().describe()

In [None]:
# Frequency of each distinct value in the label column.
train['label'].value_counts()

In [None]:
# Any label outside {0, -1, 1} is overwritten with -1.
# A single .loc assignment writes into `train` itself; the chained form
# train['label'][mask] = ... may assign to a temporary and be lost.
train.loc[~train['label'].isin([0, -1, 1]), 'label'] = -1

In [None]:
# Number of missing values in each column of the training split.
train.isnull().sum()

In [None]:
# Preview the first rows of the test split.
test.head()

In [None]:
# Number of missing values in each column of the test split.
test.isnull().sum()

In [None]:
# First five texts whose label equals 1.
train.loc[train['label'] == 1, 'text'].values[:5]

In [None]:
# First five texts whose label equals 0.
train.loc[train['label'] == 0, 'text'].values[:5]

In [None]:
# First five texts whose label equals -1.
train.loc[train['label'] == -1, 'text'].values[:5]

In [None]:
def get_model(model_type, model_name, n_epochs = 2, train_batch_size = 112, eval_batch_size = 144, seq_len = 134, lr = 2e-5):
  """Build a ClassificationModel configured for single-output regression.

  Args:
    model_type: Architecture identifier passed to ClassificationModel (e.g. 'roberta').
    model_name: Checkpoint name passed to ClassificationModel (e.g. 'roberta-base').
    n_epochs: Value for the 'num_train_epochs' option.
    train_batch_size: Value for the 'train_batch_size' option.
    eval_batch_size: Value for the 'eval_batch_size' option.
    seq_len: Value for the 'max_seq_length' option.
    lr: Value for the 'learning_rate' option.

  Returns:
    The freshly constructed (untrained) ClassificationModel.
  """
  model_args = {
      'train_batch_size': train_batch_size,
      "eval_batch_size": eval_batch_size,
      'reprocess_input_data': True,
      'overwrite_output_dir': True,
      'fp16': False,
      'do_lower_case': False,
      'num_train_epochs': n_epochs,
      'max_seq_length': seq_len,
      # Regression mode goes together with num_labels=1 below.
      'regression': True,
      'manual_seed': 2,
      "learning_rate": lr,
      "save_eval_checkpoints": False,
      "save_model_every_epoch": False,
  }
  model = ClassificationModel(model_type, model_name, num_labels=1, args=model_args)
  # The return must sit inside the function body; at column 0 it is a SyntaxError.
  return model

In [None]:
# Two-column frame for training: 'text' plus the label column renamed to 'labels'.
tmp = train[['text', 'label']].rename(columns={'label': 'labels'})
# The test frame gets a constant placeholder in 'labels'; .copy() makes it an
# independent frame so the new column is not assigned onto a slice of `test`.
tmp_test = test[['text']].copy()
tmp_test['labels'] = 0
# Hold out 15% of the labelled rows for validation, with a fixed seed.
tmp_trn, tmp_val = train_test_split(tmp, test_size=0.15, random_state=2)

In [None]:
# Run 1: roberta-base, 3 epochs, remaining settings from get_model's defaults.
model = get_model(model_type='roberta', model_name='roberta-base', n_epochs=3)
model.train_model(tmp_trn)

# Validation predictions: element [1] of eval_model's result, limited to [-1, 1].
preds_val = np.clip(model.eval_model(tmp_val)[1], -1, 1)
print(f"RMSE: {mse(tmp_val['labels'], preds_val)**0.5}")

# Same procedure for the test frame (its 'labels' column is only a placeholder).
test_preds = np.clip(model.eval_model(tmp_test)[1], -1, 1)

# Keep this run's outputs for the ensemble.
pv_1, pt_1 = preds_val, test_preds

In [None]:
# Run 2: roberta-large, 1 epoch, batch size 16 for both training and evaluation.
model = get_model(
    model_type='roberta',
    model_name='roberta-large',
    n_epochs=1,
    train_batch_size=16,
    eval_batch_size=16,
)
model.train_model(tmp_trn)

# Validation predictions: element [1] of eval_model's result, limited to [-1, 1].
preds_val = np.clip(model.eval_model(tmp_val)[1], -1, 1)
print(f"RMSE: {mse(tmp_val['labels'], preds_val)**0.5}")

# Same procedure for the test frame (its 'labels' column is only a placeholder).
test_preds = np.clip(model.eval_model(tmp_test)[1], -1, 1)

# Keep this run's outputs for the ensemble.
pv_2, pt_2 = preds_val, test_preds

In [None]:
# Run 3: roberta-large, 2 epochs, batch size 16, learning rate 2e-5.
model = get_model(
    model_type='roberta',
    model_name='roberta-large',
    n_epochs=2,
    train_batch_size=16,
    eval_batch_size=16,
    lr=2e-5,
)
model.train_model(tmp_trn)

# Validation predictions: element [1] of eval_model's result, limited to [-1, 1].
preds_val = np.clip(model.eval_model(tmp_val)[1], -1, 1)
print(f"RMSE: {mse(tmp_val['labels'], preds_val)**0.5}")

# Same procedure for the test frame (its 'labels' column is only a placeholder).
test_preds = np.clip(model.eval_model(tmp_test)[1], -1, 1)

# Keep this run's outputs for the ensemble.
pv_3, pt_3 = preds_val, test_preds

In [None]:
# Run 4: roberta-large, 3 epochs, batch size 16, learning rate 1e-5.
model = get_model(
    model_type='roberta',
    model_name='roberta-large',
    n_epochs=3,
    train_batch_size=16,
    eval_batch_size=16,
    lr=1e-5,
)
model.train_model(tmp_trn)

# Validation predictions: element [1] of eval_model's result, limited to [-1, 1].
preds_val = np.clip(model.eval_model(tmp_val)[1], -1, 1)
print(f"RMSE: {mse(tmp_val['labels'], preds_val)**0.5}")

# Same procedure for the test frame (its 'labels' column is only a placeholder).
test_preds = np.clip(model.eval_model(tmp_test)[1], -1, 1)

# Keep this run's outputs for the ensemble.
pv_4, pt_4 = preds_val, test_preds

In [None]:
# Nested weighted blend of the four validation prediction sets, built up one
# step at a time (operation order kept, so the numbers are unchanged).
# Effective weights: run 1 = 0.0585, run 2 = 0.1365, run 3 = 0.455, run 4 = 0.35.
pv_12 = pv_1 * 0.3 + pv_2 * 0.7
pv_123 = pv_12 * 0.3 + pv_3 * 0.7
pv = pv_123 * 0.65 + pv_4 * 0.35
print(f"RMSE: {mse(tmp_val['labels'], pv)**0.5}")

In [None]:
# Nested weighted blend of the four test prediction sets, built up one step at
# a time (operation order kept, so the numbers are unchanged).
# Effective weights: run 1 = 0.0585, run 2 = 0.1365, run 3 = 0.455, run 4 = 0.35.
pt_12 = pt_1 * 0.3 + pt_2 * 0.7
pt_123 = pt_12 * 0.3 + pt_3 * 0.7
tp = pt_123 * 0.65 + pt_4 * 0.35

In [None]:
# Summary statistics of the blended test predictions.
pd.Series(tp).describe()