In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
!cp /content/drive/'My Drive'/kaggle.json .

In [0]:
!ls

cache_dir  kaggle.json	runs	     sample_submission.csv  test.csv
drive	   outputs	sample_data  sub.csv		    train.csv


In [0]:
!mkdir ~/.kaggle
!mv kaggle.json ~/.kaggle

mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [0]:
!kaggle competitions download -c nlp-getting-started

test.csv: Skipping, found more recently modified local copy (use --force to force download)
sample_submission.csv: Skipping, found more recently modified local copy (use --force to force download)
train.csv: Skipping, found more recently modified local copy (use --force to force download)


In [0]:
!pip install simpletransformers==0.19.2



In [0]:
import os, re, string
import random

import numpy as np
import pandas as pd
import sklearn
from scipy.special import softmax
import gc

import torch

from simpletransformers.classification import ClassificationModel
from sklearn.model_selection import StratifiedKFold

Let's have a look at our data

In [0]:
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")
train_data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


During experiments I find that removing stopwords, lemmatization, removing urls, removing emojis just leads to deterioration of the model's performance. So I will use lowercasing and will stay just letters: 

In [0]:
train_data['text']=train_data['text'].str.lower()
test_data['text']=test_data['text'].str.lower()

In [0]:
train_data['text']=train_data['text'].str.replace("[^a-z]", " ")
test_data['text']=test_data['text'].str.replace("[^a-z]", " ")

Leave only the necessary columns

In [0]:
train_data = train_data[['text', 'target']]
test_data = test_data[['text']]

Get all default arguments:

In [0]:
bert_uncased = ClassificationModel('bert', 'bert-base-uncased') 

bert_uncased.args

{'adam_epsilon': 1e-08,
 'best_model_dir': 'outputs/best_model',
 'cache_dir': 'cache_dir/',
 'do_lower_case': False,
 'early_stopping_delta': 0,
 'early_stopping_patience': 3,
 'eval_batch_size': 8,
 'evaluate_during_training': False,
 'evaluate_during_training_steps': 2000,
 'evaluate_during_training_verbose': False,
 'fp16': True,
 'fp16_opt_level': 'O1',
 'gradient_accumulation_steps': 1,
 'learning_rate': 4e-05,
 'logging_steps': 50,
 'max_grad_norm': 1.0,
 'max_seq_length': 128,
 'model_name': 'bert-base-uncased',
 'model_type': 'bert',
 'n_gpu': 1,
 'no_cache': False,
 'num_train_epochs': 1,
 'output_dir': 'outputs/',
 'overwrite_output_dir': False,
 'process_count': 1,
 'regression': False,
 'reprocess_input_data': False,
 'save_eval_checkpoints': True,
 'save_model_every_epoch': True,
 'save_steps': 2000,
 'silent': False,
 'sliding_window': False,
 'stride': 0.8,
 'tensorboard_dir': None,
 'tie_value': 1,
 'train_batch_size': 8,
 'use_cached_eval_features': True,
 'use_early_

Let's change some of them to train faster and get better score:

In [0]:
custom_args = {'fp16': False, # not using mixed precision 
               'train_batch_size': 16,
               'gradient_accumulation_steps': 2,
               'learning_rate': 1e-05,
               'overwrite_output_dir': True,
               'num_train_epochs': 3,
               'use_early_stopping': True,
               'max_seq_length': 50,
               'evaluate_during_training_verbose': True}

In [0]:
def model(features, n_folds = 5):

    k_fold = StratifiedKFold(n_splits=n_folds, shuffle=True)
    results = []

    out_of_fold = np.zeros(features.to_numpy().shape[0])

    for train_index, val_index in k_fold.split(features, features['target']):
      
        train_df = features.iloc[train_index]
        val_df = features.iloc[val_index]
    
        model = ClassificationModel('bert', 'bert-base-uncased', args=custom_args) 
        model.train_model(train_df)

        result, model_outputs, _ = model.eval_model(val_df, f1=sklearn.metrics.f1_score)
        val_proba = softmax(model_outputs, axis=1)

        out_of_fold[val_index] = val_proba[:, 1]

        results.append(result['f1'])

        gc.enable()
        del train_df, val_df
        gc.collect()

    return out_of_fold, results

In [0]:
predictions, f1 = model(train_data)

Features loaded from cache at cache_dir/cached_train_bert_50_2_6090


  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


HBox(children=(IntProgress(value=0, description='Epoch', max=3, style=ProgressStyle(description_width='initial…

HBox(children=(IntProgress(value=0, description='Current iteration', max=381, style=ProgressStyle(description_…

Running loss: 0.450439



Running loss: 0.449704

HBox(children=(IntProgress(value=0, description='Current iteration', max=381, style=ProgressStyle(description_…

Running loss: 0.456449

HBox(children=(IntProgress(value=0, description='Current iteration', max=381, style=ProgressStyle(description_…

Running loss: 0.083468Training of bert model complete. Saved to outputs/.
Features loaded from cache at cache_dir/cached_dev_bert_50_2_1523


  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


HBox(children=(IntProgress(value=0, max=191), HTML(value='')))

{'mcc': 0.6600609549357257, 'tp': 507, 'tn': 763, 'fp': 101, 'fn': 152, 'f1': 0.8003157063930546, 'eval_loss': 0.39832949029837605}
Features loaded from cache at cache_dir/cached_train_bert_50_2_6090


HBox(children=(IntProgress(value=0, description='Epoch', max=3, style=ProgressStyle(description_width='initial…

HBox(children=(IntProgress(value=0, description='Current iteration', max=381, style=ProgressStyle(description_…

Running loss: 0.440049

HBox(children=(IntProgress(value=0, description='Current iteration', max=381, style=ProgressStyle(description_…

Running loss: 0.322550

HBox(children=(IntProgress(value=0, description='Current iteration', max=381, style=ProgressStyle(description_…

Running loss: 0.094372Training of bert model complete. Saved to outputs/.
Features loaded from cache at cache_dir/cached_dev_bert_50_2_1523


HBox(children=(IntProgress(value=0, max=191), HTML(value='')))

{'mcc': 0.676585874630077, 'tp': 504, 'tn': 778, 'fp': 86, 'fn': 155, 'f1': 0.8070456365092074, 'eval_loss': 0.38327673794360373}
Features loaded from cache at cache_dir/cached_train_bert_50_2_6090


HBox(children=(IntProgress(value=0, description='Epoch', max=3, style=ProgressStyle(description_width='initial…

HBox(children=(IntProgress(value=0, description='Current iteration', max=381, style=ProgressStyle(description_…

Running loss: 0.595385

HBox(children=(IntProgress(value=0, description='Current iteration', max=381, style=ProgressStyle(description_…

Running loss: 0.641631

HBox(children=(IntProgress(value=0, description='Current iteration', max=381, style=ProgressStyle(description_…

Running loss: 0.173651Training of bert model complete. Saved to outputs/.
Features loaded from cache at cache_dir/cached_dev_bert_50_2_1523


HBox(children=(IntProgress(value=0, max=191), HTML(value='')))

{'mcc': 0.667040365917132, 'tp': 501, 'tn': 774, 'fp': 90, 'fn': 158, 'f1': 0.8016000000000001, 'eval_loss': 0.3938724779821347}
Features loaded from cache at cache_dir/cached_train_bert_50_2_6091


HBox(children=(IntProgress(value=0, description='Epoch', max=3, style=ProgressStyle(description_width='initial…

HBox(children=(IntProgress(value=0, description='Current iteration', max=381, style=ProgressStyle(description_…

Running loss: 0.387800

HBox(children=(IntProgress(value=0, description='Current iteration', max=381, style=ProgressStyle(description_…

Running loss: 0.325634

HBox(children=(IntProgress(value=0, description='Current iteration', max=381, style=ProgressStyle(description_…

Running loss: 0.551014Training of bert model complete. Saved to outputs/.
Features loaded from cache at cache_dir/cached_dev_bert_50_2_1522


HBox(children=(IntProgress(value=0, max=191), HTML(value='')))

{'mcc': 0.6475000895480004, 'tp': 504, 'tn': 757, 'fp': 122, 'fn': 139, 'f1': 0.7943262411347518, 'eval_loss': 0.4360057632804541}
Features loaded from cache at cache_dir/cached_train_bert_50_2_6091


HBox(children=(IntProgress(value=0, description='Epoch', max=3, style=ProgressStyle(description_width='initial…

HBox(children=(IntProgress(value=0, description='Current iteration', max=381, style=ProgressStyle(description_…

Running loss: 0.366732

HBox(children=(IntProgress(value=0, description='Current iteration', max=381, style=ProgressStyle(description_…

Running loss: 0.414414

HBox(children=(IntProgress(value=0, description='Current iteration', max=381, style=ProgressStyle(description_…

Running loss: 0.366755Training of bert model complete. Saved to outputs/.
Features loaded from cache at cache_dir/cached_dev_bert_50_2_1522


HBox(children=(IntProgress(value=0, max=191), HTML(value='')))

{'mcc': 0.6458147107424473, 'tp': 511, 'tn': 748, 'fp': 131, 'fn': 132, 'f1': 0.795330739299611, 'eval_loss': 0.4312085686787885}


Let's look again at the F1:

In [0]:
for i, result in enumerate(f1, 1):
    print("Fold {}: {:4f}".format(i, result))
    
print("\nMean F1: {:4f}".format(np.mean(f1)))

Fold 1: 0.800316
Fold 2: 0.807046
Fold 3: 0.801600
Fold 4: 0.794326
Fold 5: 0.795331

Mean F1: 0.799724


In [0]:
prediction = train_data.copy()

prediction['bert_pred'] = predictions
prediction = prediction.drop(columns = ['target', 'text'])

In [0]:
prediction.head()

Unnamed: 0,bert_pred
0,0.97546
1,0.966836
2,0.821777
3,0.981626
4,0.967243


In [0]:
model = ClassificationModel('bert', 'bert-base-uncased', args=custom_args)
model.train_model(train_data)
_, raw_outputs = model.predict(test_data['text'])

Features loaded from cache at cache_dir/cached_train_bert_50_2_7613


  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


HBox(children=(IntProgress(value=0, description='Epoch', max=3, style=ProgressStyle(description_width='initial…

HBox(children=(IntProgress(value=0, description='Current iteration', max=476, style=ProgressStyle(description_…

Running loss: 0.595285



Running loss: 0.294684

HBox(children=(IntProgress(value=0, description='Current iteration', max=476, style=ProgressStyle(description_…

Running loss: 0.107539

HBox(children=(IntProgress(value=0, description='Current iteration', max=476, style=ProgressStyle(description_…

Running loss: 0.371855Training of bert model complete. Saved to outputs/.
Features loaded from cache at cache_dir/cached_dev_bert_50_2_3263


HBox(children=(IntProgress(value=0, max=408), HTML(value='')))

In [0]:
test_proba = softmax(raw_outputs, axis=1)

In [0]:
test_predictions = test_proba[:, 1]

In [0]:
test_prediction = test_data.copy()

test_prediction['bert_pred'] = test_predictions

test_prediction = test_prediction.drop(columns = ['text'])

In [0]:
test_prediction.head()

Unnamed: 0,bert_pred
0,0.972604
1,0.981176
2,0.982866
3,0.989683
4,0.988967


In [0]:
prediction.to_csv('/content/drive/My Drive/tweeter/train_predictions_bert.csv', index_label='idx')
test_prediction.to_csv('/content/drive/My Drive/tweeter/test_predictions_bert.csv', index_label='idx')

тренуэ на всих даних

In [0]:
model = ClassificationModel('bert', 'bert-base-uncased', args=custom_args) 
model.train_model(train_data)

Converting to features started. Cache is not used.


  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


HBox(children=(IntProgress(value=0, max=7613), HTML(value='')))




HBox(children=(IntProgress(value=0, description='Epoch', max=3, style=ProgressStyle(description_width='initial…

HBox(children=(IntProgress(value=0, description='Current iteration', max=476, style=ProgressStyle(description_…

Running loss: 0.690791



Running loss: 0.558720

HBox(children=(IntProgress(value=0, description='Current iteration', max=476, style=ProgressStyle(description_…

Running loss: 0.613014

HBox(children=(IntProgress(value=0, description='Current iteration', max=476, style=ProgressStyle(description_…

Running loss: 0.577653
Training of bert model complete. Saved to outputs/.


In [0]:
predictions, raw_outputs = model.predict(test_data['text'])

sample_submission = pd.read_csv("sample_submission.csv")
sample_submission["target"] = predictions
sample_submission.to_csv("sub.csv", index=False)

Features loaded from cache at cache_dir/cached_dev_bert_50_2_3263


HBox(children=(IntProgress(value=0, max=408), HTML(value='')))




In [0]:
!kaggle competitions submit -c nlp-getting-started -f sub.csv -m "Message"

100% 22.2k/22.2k [00:00<00:00, 23.5kB/s]
Successfully submitted to Real or Not? NLP with Disaster Tweets