In [None]:
!pip install simpletransformers --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.5/250.5 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m56.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 kB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m67.3 MB/s[0m eta [

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import *
from sklearn.model_selection import *

from tqdm import tqdm
import warnings
warnings.simplefilter('ignore')
import gc
from scipy.special import softmax

from simpletransformers.classification.classification_model import ClassificationModel
from sklearn.metrics import mean_squared_error as mse

In [None]:
# Input file names; they are relative, so they resolve against whatever the
# working directory is at the moment the CSVs are read.
TRAIN_PATH = 'Train.csv'
TEST_PATH = 'Test.csv'
SAMPLE_SUB_PATH = 'SampleSubmission.csv'

In [None]:
# Mount Google Drive at /content/drive (only available inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Switch the working directory to the Drive folder so relative file paths resolve there.
%cd "/content/drive/My Drive/Colab Notebooks"

/content/drive/My Drive/Colab Notebooks


In [None]:
# Read the three competition CSVs in one pass.
train, test, sample_sub = (
    pd.read_csv(csv_path)
    for csv_path in (TRAIN_PATH, TEST_PATH, SAMPLE_SUB_PATH)
)

# The sample submission has exactly two columns: the row id and the target.
ID_COL, TARGET_COL = sample_sub.columns.tolist()

In [None]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,tweet_id,safe_text,label,agreement
0,CL1KWCMY,Me &amp; The Big Homie meanboy3000 #MEANBOY #M...,0.0,1.0
1,E3303EME,I'm 100% thinking of devoting my career to pro...,1.0,1.0
2,M4IVFSMS,"#whatcausesautism VACCINES, DO NOT VACCINATE Y...",-1.0,1.0
3,1DR6ROZ4,I mean if they immunize my kid with something ...,-1.0,1.0
4,J77ENIIE,Thanks to <user> Catch me performing at La Nui...,0.0,1.0


In [None]:
# Summary statistics of tweet length in characters.
train['safe_text'].apply(len).describe()

count    10001.000000
mean        99.902810
std         29.893888
min          1.000000
25%         79.000000
50%        107.000000
75%        122.000000
max        153.000000
Name: safe_text, dtype: float64

In [None]:
# Count how often each distinct label value occurs.
train['label'].value_counts()

 0.000000    4908
 1.000000    4053
-1.000000    1038
 0.666667       1
Name: label, dtype: int64

In [None]:
# Overwrite every label that is not one of the three valid classes {-1, 0, 1} with -1.
# A single .loc call is used on purpose: chained indexing such as
# `train['label'][mask] = value` may assign into a temporary object, and under
# pandas copy-on-write it never modifies `train` at all.
# NOTE(review): the stray value seen in the data is 0.666667; confirm that -1
# (rather than 1 or dropping the row) is the intended replacement.
train.loc[~train['label'].isin([0, -1, 1]), 'label'] = -1

# Missing values per column after the label clean-up.
train.isnull().sum()

tweet_id     0
safe_text    0
label        0
agreement    2
dtype: int64

In [None]:
# Drop every training row that has a missing value in any column,
# then re-check the per-column null counts.
train = train.dropna()
train.isnull().sum()

tweet_id     0
safe_text    0
label        0
agreement    0
dtype: int64

In [None]:
# Preview the first rows of the test frame.
test.head()

Unnamed: 0,tweet_id,safe_text
0,00BHHHP1,<user> <user> ... &amp; 4 a vaccine given 2 he...
1,00UNMD0E,Students starting school without whooping coug...
2,01AXPTJF,"I'm kinda over every ep of <user> being ""rippe..."
3,01HOEQJW,How many innocent children die for lack of vac...
4,01JUKMAO,"CDC eyeing bird flu vaccine for humans, though..."


In [None]:
# Missing values per column in the test frame.
test.isnull().sum()

tweet_id     0
safe_text    1
dtype: int64

In [None]:
# Test rows cannot be dropped (each id needs a prediction), so a missing tweet
# is replaced with a placeholder string instead.
test['safe_text'] = test['safe_text'].fillna('xxxxxx')

# Confirm that no missing values remain.
test.isnull().sum()

tweet_id     0
safe_text    0
dtype: int64

## Let's look at some tweets of each kind

### 1. Positive

In [None]:
# train[train['label'] == 1]['safe_text'].values[:5]

### 2. Neutral

In [None]:
# train[train['label'] == 0]['safe_text'].values[:5]

### 3. Negative

In [None]:
# train[train['label'] == -1]['safe_text'].values[:5]

### Let's move on to the modelling part. Simple Transformers is extremely simple to use, and switching architectures only requires changing the model type and model name arguments.

In [None]:
def get_model(model_type, model_name, n_epochs=2, train_batch_size=112,
              eval_batch_size=144, seq_len=134, lr=2e-5):
    """Create a ClassificationModel configured as a single-output regressor.

    Args:
        model_type: Architecture identifier passed to ClassificationModel
            (e.g. 'roberta').
        model_name: Checkpoint name passed to ClassificationModel
            (e.g. 'roberta-large').
        n_epochs: Value for the 'num_train_epochs' setting.
        train_batch_size: Value for the 'train_batch_size' setting.
        eval_batch_size: Value for the 'eval_batch_size' setting.
        seq_len: Value for the 'max_seq_length' setting.
        lr: Value for the 'learning_rate' setting.

    Returns:
        The constructed (untrained) ClassificationModel instance.
    """
    model_args = {
        'train_batch_size': train_batch_size,
        'eval_batch_size': eval_batch_size,
        'reprocess_input_data': True,
        'overwrite_output_dir': True,
        'fp16': False,
        'do_lower_case': False,
        'num_train_epochs': n_epochs,
        'max_seq_length': seq_len,
        # Regression mode together with num_labels=1: one continuous output.
        'regression': True,
        # Fixed seed, identical for every model built by this helper.
        'manual_seed': 2,
        'learning_rate': lr,
        'save_eval_checkpoints': False,
        'save_model_every_epoch': False,
    }
    return ClassificationModel(model_type, model_name, num_labels=1, args=model_args)

In [None]:
# Training frame with the two column names the model is fed: 'text' and 'labels'.
tmp = pd.DataFrame({'text': train['safe_text'], 'labels': train['label']})

# Test frame in the same layout; the labels are dummy zeros.
tmp_test = (
    test[['safe_text']]
    .rename(columns={'safe_text': 'text'})
    .assign(labels=0)
)

# Hold out 25% of the training rows for validation (fixed seed).
tmp_trn, tmp_val = train_test_split(tmp, test_size=0.25, random_state=2)

### Model B: RoBERTa Large, 2 Epochs

In [None]:
# roberta-large, 2 epochs, learning rate 1e-5.
model = get_model(
    'roberta',
    'roberta-large',
    n_epochs=2,
    train_batch_size=16,
    eval_batch_size=16,
    lr=1e-5,
)
model.train_model(tmp_trn)

# Element [1] of eval_model's return value is used as the predictions;
# they are clipped into the label range [-1, 1] before scoring.
preds_val = np.clip(model.eval_model(tmp_val)[1], -1, 1)
print(f"RMSE: {mse(tmp_val['labels'], preds_val)**0.5}")

# tmp_test only carries dummy labels, so just the outputs are kept.
test_preds = np.clip(model.eval_model(tmp_test)[1], -1, 1)

# Keep this model's predictions for the blend.
pv_1, pt_1 = preds_val, test_preds

Downloading (…)lve/main/config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.out_proj.bias', 'classi

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

  0%|          | 0/7499 [00:00<?, ?it/s]

### Model C: RoBERTa Large, 4 Epochs

In [None]:
# roberta-large, 4 epochs, learning rate 1.1e-5.
model = get_model(
    'roberta',
    'roberta-large',
    n_epochs=4,
    train_batch_size=16,
    eval_batch_size=16,
    lr=1.1e-5,
)
model.train_model(tmp_trn)

# Element [1] of eval_model's return value is used as the predictions;
# they are clipped into the label range [-1, 1] before scoring.
preds_val = np.clip(model.eval_model(tmp_val)[1], -1, 1)
print(f"RMSE: {mse(tmp_val['labels'], preds_val)**0.5}")

# tmp_test only carries dummy labels, so just the outputs are kept.
test_preds = np.clip(model.eval_model(tmp_test)[1], -1, 1)

# Keep this model's predictions for the blend.
pv_2, pt_2 = preds_val, test_preds

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifie

  0%|          | 0/7499 [00:00<?, ?it/s]

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 0 of 4:   0%|          | 0/469 [00:00<?, ?it/s]

Running Epoch 1 of 4:   0%|          | 0/469 [00:00<?, ?it/s]

Running Epoch 2 of 4:   0%|          | 0/469 [00:00<?, ?it/s]

Running Epoch 3 of 4:   0%|          | 0/469 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/157 [00:00<?, ?it/s]

RMSE: 0.4765242627080429


  0%|          | 0/5177 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/324 [00:00<?, ?it/s]

### Model D: RoBERTa Large, 5 Epochs

In [None]:
# roberta-large, 5 epochs, learning rate 10e-6.
model = get_model(
    'roberta',
    'roberta-large',
    n_epochs=5,
    train_batch_size=16,
    eval_batch_size=16,
    lr=10e-6,
)
model.train_model(tmp_trn)

# Element [1] of eval_model's return value is used as the predictions;
# they are clipped into the label range [-1, 1] before scoring.
preds_val = np.clip(model.eval_model(tmp_val)[1], -1, 1)
print(f"RMSE: {mse(tmp_val['labels'], preds_val)**0.5}")

# tmp_test only carries dummy labels, so just the outputs are kept.
test_preds = np.clip(model.eval_model(tmp_test)[1], -1, 1)

# Keep this model's predictions for the blend.
pv_3, pt_3 = preds_val, test_preds

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifie

  0%|          | 0/7499 [00:00<?, ?it/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 0 of 5:   0%|          | 0/469 [00:00<?, ?it/s]

Running Epoch 1 of 5:   0%|          | 0/469 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/469 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/469 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/469 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/157 [00:00<?, ?it/s]

RMSE: 0.4851251386739668


  0%|          | 0/5177 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/324 [00:00<?, ?it/s]

### Blending of Different Models

In [None]:
# Weighted blend of the validation predictions: 0.575 for the first model,
# 0.425 for the second; the third model has weight 0 and so does not contribute.
# NOTE(review): if these weights were tuned on tmp_val, the RMSE printed here is
# optimistic -- confirm how they were chosen.
pv = pv_1 * 0.575 + pv_2 * 0.425  + pv_3*0.0
print(f"RMSE: {mse(tmp_val['labels'], pv)**0.5}")

RMSE: 0.4566385542968621


In [None]:
# Blend the test predictions with the same weights used for the validation blend
# (0.575 / 0.425 / 0); keep the two sets of weights in sync when changing either.
tp = pt_1 * 0.575 + pt_2 * 0.425  + pt_3*0

In [None]:
final_test_preds = tp

# One prediction is required per test row; fail early with a clear message otherwise.
assert len(final_test_preds) == len(test), "number of predictions does not match number of test rows"

# Take an explicit copy of the id column so the target column is added to an
# independent frame instead of a slice of `test` (avoids SettingWithCopyWarning).
preds_df_final = test[[ID_COL]].copy()
preds_df_final[TARGET_COL] = final_test_preds

# Write the submission without the DataFrame index.
SUB_FILE_NAME = 'roberta_ensemble_e2e4e5_tune.csv'
preds_df_final.to_csv(SUB_FILE_NAME, index=False)


In [None]:
# Trigger a browser download of the submission file (only available inside Google Colab).
from google.colab import files
files.download(SUB_FILE_NAME)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>