In [1]:
import numpy as np 
import pandas as pd 
import os

In [2]:
df = pd.read_csv('tweets.csv', usecols = ['text', 'target'])
df.head()

Unnamed: 0,text,target
0,"Communal violence in Bhainsa, Telangana. ""Ston...",1
1,Telangana: Section 144 has been imposed in Bha...,1
2,Arsonist sets cars ablaze at dealership https:...,1
3,Arsonist sets cars ablaze at dealership https:...,1
4,"""Lord Jesus, your love brings freedom and pard...",0


In [3]:
df.isnull().sum()

text      0
target    0
dtype: int64

In [4]:
df.sample(frac=1).reset_index(drop=True)

Unnamed: 0,text,target
0,"めちゃくちゃ (adj-na,n) (1) absurd; unreasonable; pr...",0
1,"As Taal volcano erupts in the Philippines, als...",0
2,Remember when Brie Larson said we needed more ...,0
3,It was a holy place I see. Very sacred https:/...,0
4,Sullis held Hindu women down while their Barba...,1
...,...,...
11365,"""Once the [scrapping] sale contract has been s...",0
11366,There is no greater harm than that of time was...,0
11367,I am sick of reporters saying Trump is once ag...,0
11368,Unbelievable!! Arundhati Roy is smiling where ...,0


In [5]:
df['target'].value_counts()

target
0    9256
1    2114
Name: count, dtype: int64

In [6]:
df = df.rename(columns={'target':'label'})

In [7]:
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
dataset = Dataset.from_pandas(df)

In [9]:
dataset = dataset.train_test_split(test_size=0.2)
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 9096
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2274
    })
})

In [10]:
id2label = {0:'general',1:'disaster'}
label2id = {'general':0,'disaster':1}

In [11]:
from transformers import AutoTokenizer
import torch

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model_ckpt = 'huawei-noah/TinyBERT_General_4L_312D'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt, use_fast=True)

In [12]:
tokenizer(dataset['train'][0]['text'])

def tokenize(batch):
    temp = tokenizer(batch['text'], padding=True, truncation=True, max_length=300)
    return temp

dataset = dataset.map(tokenize, batched=True, batch_size=None)

Map:   0%|          | 0/9096 [00:00<?, ? examples/s]

Map: 100%|██████████| 9096/9096 [00:01<00:00, 5546.59 examples/s]
Map: 100%|██████████| 2274/2274 [00:00<00:00, 7085.63 examples/s]


In [13]:
dataset['train'][0].keys()

dict_keys(['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'])

In [14]:
import evaluate
import numpy as np

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [15]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=len(label2id), label2id=label2id, id2label=id2label)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
args = TrainingArguments(
    output_dir='train_dir',
    overwrite_output_dir=True,
    num_train_epochs=10,
    learning_rate=2e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    eval_strategy='epoch',
    logging_dir="logs",
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>=0.26.0'`

In [34]:
import os
os.environ["WANDB_DISAIBLED"]="true"

In [35]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.262932,0.8927
2,No log,0.349535,0.849604
3,No log,0.251482,0.903694
4,0.210800,0.268715,0.891381
5,0.210800,0.266778,0.897537
6,0.210800,0.277093,0.899296
7,0.142400,0.287526,0.89314
8,0.142400,0.285712,0.901935
9,0.142400,0.321775,0.889182
10,0.142400,0.328125,0.886104


TrainOutput(global_step=1430, training_loss=0.1552101161930111, metrics={'train_runtime': 87.5767, 'train_samples_per_second': 1038.632, 'train_steps_per_second': 16.329, 'total_flos': 193603334641920.0, 'train_loss': 0.1552101161930111, 'epoch': 10.0})

In [36]:
trainer.evaluate()

{'eval_loss': 0.32812485098838806,
 'eval_accuracy': 0.886103781882146,
 'eval_runtime': 0.7568,
 'eval_samples_per_second': 3004.908,
 'eval_steps_per_second': 47.571,
 'epoch': 10.0}

In [38]:
trainer.save_model('tinybert-disaster-tweet')