In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


## Installing SimpleTransformers

In [2]:
!pip install git+git://github.com/AndLen/simpletransformers.git --quiet

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
dask-cudf 21.8.3 requires cupy-cuda114, which is not installed.
distributed 2021.7.1 requires dask==2021.07.1, but you have dask 2021.9.1 which is incompatible.
dask-cudf 21.8.3 requires dask<=2021.07.1,>=2021.6.0, but you have dask 2021.9.1 which is incompatible.
dask-cudf 21.8.3 requires pandas<1.3.0dev0,>=1.0, but you have pandas 1.3.3 which is incompatible.
allennlp 2.7.0 requires transformers<4.10,>=4.1, but you have transformers 4.11.3 which is incompatible.[0m


In [3]:
import csv
import os
import torch
from transformers import pipeline
import gc
import seaborn as sns
from matplotlib import pyplot as plt
from scipy.special import softmax
from simpletransformers.classification import (ClassificationModel, ClassificationArgs)
import sklearn
from sklearn.model_selection import train_test_split

## Loading Data

In [4]:
# Labelled data used for fitting, and the unlabelled data to predict on.
training = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
test = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")

In [5]:
# Number of missing values in the "text" column of the training data.
training["text"].isnull().sum()

0

In [6]:
# Keep only the text and its label, renaming "target" to "labels" so the frame
# has the ("text", "labels") columns that is passed to the model below.
training_df = training[["text", "target"]].rename(columns={"target": "labels"})

In [7]:
# Sanity check: (rows, columns) of the two-column training frame.
training_df.shape

(7613, 2)

In [8]:
# Shuffle all rows once and renumber the index from 0.
# A fixed random_state (same value as model_args.manual_seed) makes the row
# order identical on every Restart & Run All; without it each run shuffles
# differently and results are not reproducible.
shuffled_training = training_df.sample(frac=1, random_state=42).reset_index(drop=True)

## Train, test, eval splitting with 70:15:15 proportions

The split is used for model selection only; the final classifier is trained on the whole training set.

In [9]:
# Stratified hold-out split used during model selection (disabled for the final run).
# NOTE(review): as written, test_size=0.15 followed by a 50/50 split gives roughly
# 85% train / 7.5% eval / 7.5% test, not the 70:15:15 stated in the heading;
# test_size=0.30 in the first call would give 70:15:15.
#train_df, test_df = train_test_split(training_df, test_size=0.15, random_state=42, stratify=training_df["labels"])
#eval_df, test_df = train_test_split(test_df, test_size=0.50, random_state=42, stratify=test_df["labels"])

In [10]:
#train_df.shape ,test_df.shape, eval_df.shape

In [11]:
# This cleans ram and vram during re-runs
gc.collect()
torch.cuda.empty_cache()

## Loading Bertweet from Huggingface

In [12]:
# ---- Training configuration -------------------------------------------------
model_args = ClassificationArgs(num_train_epochs=2,
                                overwrite_output_dir=True)

# Reproducibility and input handling
model_args.manual_seed = 42
model_args.reprocess_input_data = True
model_args.normalization = True  # this enables the built-in BERTweet custom tokenizer

# Output locations: checkpoints go to the temporary directory, the best model
# to the working directory.
model_args.output_dir = "/kaggle/temp/output"
model_args.best_model_dir = "/kaggle/working/best_model"

# Batch sizes
model_args.train_batch_size = 80
model_args.eval_batch_size = 80

# Evaluation during training is switched off for the final run on the whole
# training set (there is no held-out eval frame to pass to train_model).
#model_args.evaluate_during_training = True
#model_args.evaluate_during_training_verbose = True

# Early stopping on Matthews correlation (higher is better), checked per epoch.
# NOTE(review): early stopping presumably only triggers when evaluation during
# training is enabled, so these settings likely have no effect here - confirm.
model_args.use_early_stopping = True
model_args.early_stopping_metric = "mcc"
model_args.early_stopping_metric_minimize = False
model_args.early_stopping_consider_epochs = True
model_args.early_stopping_patience = 1

# ---- Model ------------------------------------------------------------------
# Binary classifier built on the pretrained vinai/bertweet-base checkpoint.
model = ClassificationModel('bertweet',
                            'vinai/bertweet-base',
                            num_labels=2,
                            args=model_args)

Downloading:   0%|          | 0.00/558 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/517M [00:00<?, ?B/s]

Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: 

Downloading:   0%|          | 0.00/824k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.03M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [13]:
# Fit on the full shuffled training frame, passing accuracy and F1 scorers
# under the keyword names "acc" and "f1".
train_metrics = {
    "acc": sklearn.metrics.accuracy_score,
    "f1": sklearn.metrics.f1_score,
}
model.train_model(shuffled_training, **train_metrics)

  0%|          | 0/7613 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/96 [00:00<?, ?it/s]

Running Epoch 1 of 2:   0%|          | 0/96 [00:00<?, ?it/s]

(192, 0.4022546214982867)

In [14]:
# Score the fitted model on the same frame it was trained on, so the numbers
# are training-set metrics, not a held-out estimate.
eval_metrics = {
    "acc": sklearn.metrics.accuracy_score,
    "f1": sklearn.metrics.f1_score,
}
result, model_outputs, wrong_predictions = model.eval_model(shuffled_training, **eval_metrics)

  0%|          | 0/7613 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/96 [00:00<?, ?it/s]

In [15]:
# NOTE: `result` comes from eval_model(shuffled_training), i.e. the data the model
# was trained on, so the metrics shown here are training-set metrics.
# The 0.85 test-set accuracy presumably refers to the earlier run with the
# train/eval/test split enabled - TODO confirm.
result

{'mcc': 0.7856130535876131,
 'tp': 2747,
 'tn': 4067,
 'fp': 275,
 'fn': 524,
 'auroc': 0.9401777776901574,
 'auprc': 0.941405197747371,
 'acc': 0.8950479443057927,
 'f1': 0.8730335293182903,
 'eval_loss': 0.2854788162900756}

## Predictions on new data

In [16]:
# Predict a label for every text in the test data; raw_outputs holds the
# model outputs returned alongside the predicted labels.
test_texts = test["text"].tolist()
predictions, raw_outputs = model.predict(test_texts)

  0%|          | 0/3263 [00:00<?, ?it/s]

  0%|          | 0/41 [00:00<?, ?it/s]

In [17]:
# Submission frame: the test ids next to the predicted label for each row.
mypreds = test[["id"]].assign(target=predictions)

In [18]:
# Write the submission file to the current directory, leaving out the DataFrame index.
mypreds.to_csv("submission.csv", index=False)