This is an attempt at the Kaggle competition "Natural Language Processing with Disaster Tweets":<br>
https://www.kaggle.com/competitions/nlp-getting-started/overview<br>
I used "Getting started with NLP for absolute beginners" (https://www.kaggle.com/code/jhoward/getting-started-with-nlp-for-absolute-beginners)<br>
as a guide, adapting it to do classification instead of regression.<br>
The submission scores 0.823, which is around rank 320 on the leaderboard.<br>
The model has not been tuned or iterated on at all.


In [1]:
# Download the competition data from Kaggle, unless it is already extracted locally.
from pathlib import Path
path = Path('nlp-getting-started')
if not path.exists():
    # Imported lazily: these modules are only needed when the data is missing.
    import zipfile,kaggle
    kaggle.api.competition_download_cli(str(path))
    # The context manager closes the archive handle once extraction is done
    # (the bare ZipFile(...).extractall(...) form left it open).
    with zipfile.ZipFile(f'{path}.zip') as archive:
        archive.extractall(path)

In [2]:
# Cursory look at the training data: load train.csv into a DataFrame and display it.
import pandas as pd
df = pd.read_csv(path/'train.csv')
df

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [3]:
# Build a single 'input' string per row that combines the tweet text with the
# keyword and location metadata; missing keyword/location become empty strings.
df['input'] = (
    'TEXT: ' + df['text']
    + '; KEY: ' + df['keyword'].fillna('')
    + '; LOC: ' + df['location'].fillna('')
)
df['input'].head()

0    TEXT: Our Deeds are the Reason of this #earthq...
1    TEXT: Forest fire near La Ronge Sask. Canada; ...
2    TEXT: All residents asked to 'shelter in place...
3    TEXT: 13,000 people receive #wildfires evacuat...
4    TEXT: Just got sent this photo from Ruby #Alas...
Name: input, dtype: object

In [4]:
from datasets import Dataset, DatasetDict

# Convert the pandas DataFrame into a Hugging Face Dataset and display its summary.
ds = Dataset.from_pandas(df)
ds

Dataset({
    features: ['id', 'keyword', 'location', 'text', 'target', 'input'],
    num_rows: 7613
})

In [5]:
# Choose a model: this checkpoint name is used for both the tokenizer and the classifier.
model_nm = 'microsoft/deberta-v3-small'

In [6]:
# Load the tokenizer that belongs to the chosen checkpoint.
from transformers import AutoModelForSequenceClassification, AutoTokenizer
tokz = AutoTokenizer.from_pretrained(model_nm)



In [7]:
# Quick tokenizer sanity check: uncommon words get split into several subword pieces.
tokz.tokenize("A platypus is an ornithorhynchus anatinus.")

['▁A',
 '▁platypus',
 '▁is',
 '▁an',
 '▁or',
 'ni',
 'tho',
 'rhynch',
 'us',
 '▁an',
 'at',
 'inus',
 '.']

In [8]:
def tok_func(x):
    """Tokenize the ``input`` field of a dataset row (or batch of rows) with ``tokz``.

    Returns whatever the tokenizer call returns; no truncation or padding
    options are passed here.
    """
    texts = x["input"]
    return tokz(texts)

In [9]:
# Tokenize the dataset; batched=True hands tok_func batches of rows instead of single rows.
tok_ds = ds.map(tok_func, batched=True)

Map:   0%|          | 0/7613 [00:00<?, ? examples/s]

In [10]:
# Inspect the first tokenized row: the raw input string next to its token ids.
row = tok_ds[0]
row['input'], row['input_ids']

('TEXT: Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all; KEY: ; LOC: ',
 [1,
  54453,
  294,
  581,
  65453,
  281,
  262,
  18037,
  265,
  291,
  953,
  117831,
  903,
  4924,
  17018,
  43632,
  381,
  305,
  346,
  29908,
  294,
  2600,
  57615,
  294,
  2])

In [11]:
# Transformers expects the label column to be called 'labels', so rename 'target'.
tok_ds = tok_ds.rename_columns({'target':'labels'})

In [12]:
# Load the competition test set (it has no target column) and display it.
eval_df = pd.read_csv(path/'test.csv')
eval_df

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,,Storm in RI worse than last hurricane. My city...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...


In [13]:
# Hold out 25% of the labelled rows for validation; the fixed seed makes the split repeatable.
# Note: the held-out part lives under the 'test' key of the resulting DatasetDict.
dds = tok_ds.train_test_split(0.25, seed=42)
dds

DatasetDict({
    train: Dataset({
        features: ['id', 'keyword', 'location', 'text', 'labels', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5709
    })
    test: Dataset({
        features: ['id', 'keyword', 'location', 'text', 'labels', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1904
    })
})

In [14]:
# Apply the same preprocessing as for the training data: build the combined
# 'input' string, then convert to a Dataset and tokenize it with tok_func.
eval_df['input'] = (
    'TEXT: ' + eval_df['text']
    + '; KEY: ' + eval_df['keyword'].fillna('')
    + '; LOC: ' + eval_df['location'].fillna('')
)
eval_ds = Dataset.from_pandas(eval_df)
eval_ds = eval_ds.map(tok_func, batched=True)

Map:   0%|          | 0/3263 [00:00<?, ? examples/s]

In [15]:
import numpy as np
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
def f1_d(eval_pred):
    """Compute the F1 score for a ``(predictions, labels)`` evaluation pair.

    The predicted class of each row is the index of its largest score
    (argmax over axis 1). Returns a dict with the single key ``'f1'``.
    """
    scores, gold = eval_pred
    hard_preds = np.argmax(scores, axis=1)
    f1 = f1_score(gold, hard_preds)
    return {'f1': f1}

In [16]:
from transformers import TrainingArguments,Trainer

In [17]:
# Training batch size per device (evaluation uses twice this) and number of training epochs.
bs = 128
epochs = 4

In [18]:
# Learning rate passed to TrainingArguments.
lr = 8e-5

In [19]:
# Training configuration: warm-up followed by a cosine schedule, mixed-precision
# training, and an evaluation pass at the end of every epoch.
args = TrainingArguments(
    'outputs',
    learning_rate=lr,
    warmup_ratio=0.1,
    lr_scheduler_type='cosine',
    fp16=True,
    eval_strategy="epoch",
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs*2,  # evaluation runs with twice the training batch size
    num_train_epochs=epochs,
    weight_decay=0.01,
    report_to='none',
)

In [20]:
# Load the pretrained checkpoint with a 2-class classification head
# (num_labels=2, matching the 0/1 values of the renamed 'labels' column).
model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=2)
# Pass the tokenizer as `processing_class`: the `tokenizer` keyword of Trainer is
# deprecated and emits a warning on every construction.
trainer = Trainer(model, args, train_dataset=dds['train'], eval_dataset=dds['test'],
                  processing_class=tokz, compute_metrics=f1_d)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(model, args, train_dataset=dds['train'], eval_dataset=dds['test'],


In [21]:
# Fine-tune the model; validation loss and F1 are reported at the end of each epoch.
trainer.train()

Epoch,Training Loss,Validation Loss,F1
1,No log,0.457657,0.787074
2,No log,0.390234,0.805178
3,No log,0.402882,0.794003
4,No log,0.454617,0.798267


TrainOutput(global_step=180, training_loss=0.36426730685763886, metrics={'train_runtime': 33.0735, 'train_samples_per_second': 690.462, 'train_steps_per_second': 5.442, 'total_flos': 461031687564084.0, 'train_loss': 0.36426730685763886, 'epoch': 4.0})

Training doesn't look great: validation loss bottoms out at epoch 2 and rises afterwards, so the model likely starts overfitting quickly.<br>
Could experiment with a smaller learning rate or stronger weight decay (currently 0.01).<br>
This is mostly a test of applying this workflow to a slightly different problem, so I won't iterate for now.

In [22]:
# Predictions on the competition test set: one row per tweet with two raw class scores.
preds = trainer.predict(eval_ds).predictions.astype(float)
preds

array([[-2.90625   ,  2.64257812],
       [-1.63671875,  1.64941406],
       [-2.828125  ,  2.5703125 ],
       ...,
       [-3.04492188,  2.65039062],
       [-1.25097656,  1.27441406],
       [-2.44335938,  2.27539062]])

In [23]:
# Turn the two per-class scores into a hard 0/1 label by picking the larger one.
# NOTE(review): this overwrites `preds`, so re-running this cell on its own fails
# (the array is 1-D after the first run); re-run the prediction cell first.
preds = np.argmax(preds, axis=1)
preds

array([1, 1, 1, ..., 1, 1, 1])

In [24]:
import datasets

# Assemble the submission table: the test-set ids alongside the predicted labels.
submission_df = eval_df[['id']].assign(target=preds)
submission_df

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [25]:
# Write the submission file; index=False leaves the DataFrame index out of the CSV.
submission_df.to_csv('submission.csv', index=False)