In [1]:
%%html
<style>
.dataframe th {
    font-size: 17px;
}
.dataframe td {
    font-size: 16px;
}
</style>

In [None]:
# for kaggle
# !pip install evaluate -q
# import evaluate

# for local venv
# !pip install -r requirements.txt

In [2]:
import pandas as pd
import numpy as np

In [3]:
from sklearn.model_selection import train_test_split
from datasets import Dataset
import re
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding

In [73]:
import torch
torch.cuda.is_available()

True

In [7]:
df = pd.read_csv('train.csv')
# for kaggle
# df = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv') 

In [8]:
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


### peprocessing

In [10]:
'|'.join(set(''.join(df.text.tolist()))) # уникальные символы

"ü|z|-|ã|r|(|$|å|Z|]|D|>|G|K|^|X|w|¨|U|ª|\\|C|Ñ|7|g|+|S|{|¡|Û|~|Q|I|c|A|«|i|.|â|||Ò|\n|8|F|Ç|2|W|H|?|Â|0|1|P|\x9d|`|R|m|'|Ï|Y|4|£|[|Ó|p|÷| |h|V|a|´|M|E|B|@|9|s|=|%|¢|¬|O|;|&|f|x|Ì|}|j|l|N|k|©|_|L|)|b|/|,|u|T|v|¤|e|*|¼|J|t|5|y|:|!|Ê|È|q|n|#|6|d|\x89|o|3"

In [11]:
def text_cleaner(text):
    '''
    чистим от ссылок, спец символов и символов, которых нет в ascii
    '''
    text=text.lower()
    text = re.sub(r'''((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*''', "", text) # почистим url-ы
    text = text.encode("ascii", "ignore").decode() 
    text = re.sub(r'''[-$>^\*+\}\{\[\]\'\`''=~;_&%@\)\(\/\|\\#:]+''','', text)
    text=" ".join(text.split())
    return text

df['text'] = df['text'].apply(text_cleaner) 

In [12]:
df.target.value_counts(normalize=True) 
# соотношение меток приемлимое, можно обойтись без даунсемплинга и пр.

0    0.57034
1    0.42966
Name: target, dtype: float64

In [13]:
df.rename(columns={'target': 'label'}, inplace=True)

In [57]:
train, valid = train_test_split(df, test_size=0.2)

In [58]:
train.shape, valid.shape

((6090, 5), (1523, 5))

In [59]:
train.reset_index(drop=True, inplace=True)
valid.reset_index(drop=True, inplace=True)

In [60]:
train_d = Dataset.from_pandas(train[['text', 'label']])
valid_d = Dataset.from_pandas(valid[['text', 'label']])

### Model

In [61]:
MODEL_NAME = "bert-base-uncased"
# можно взять "distilbert-base-uncased", качество не сильно пострадает

In [62]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [63]:
tokenized_train = train_d.map(lambda x: tokenizer(x["text"], truncation=True), batched=True)
tokenized_valid = valid_d.map(lambda x: tokenizer(x["text"], truncation=True), batched=True)

Map:   0%|          | 0/6090 [00:00<?, ? examples/s]

Map:   0%|          | 0/1523 [00:00<?, ? examples/s]

In [64]:
tokenized_train = tokenized_train.remove_columns('text')

In [65]:
tokenized_valid = tokenized_valid.remove_columns('text')

In [66]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [67]:
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return f1_metric.compute(predictions=predictions, references=labels)

In [68]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [69]:
training_args = TrainingArguments(
    output_dir="model_bert_2e-6",
    learning_rate=2e-6,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
    remove_unused_columns=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [70]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.4748,0.750383
2,0.551800,0.423271,0.783912
3,0.416600,0.416983,0.788491
4,0.389900,0.412062,0.792453
5,0.389900,0.411978,0.794326


TrainOutput(global_step=1905, training_loss=0.43424471957789945, metrics={'train_runtime': 138.6699, 'train_samples_per_second': 219.586, 'train_steps_per_second': 13.738, 'total_flos': 560215853517000.0, 'train_loss': 0.43424471957789945, 'epoch': 5.0})

### Inference

In [28]:
from transformers import pipeline

In [30]:
label2id = {"LABEL_0": 0, "LABEL_1": 1}

In [31]:
test = pd.read_csv('test.csv')
# for kaggle
# test = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv') 

test['text'] = test['text'].apply(text_cleaner)

In [72]:
model_path = trainer.state.best_model_checkpoint
model_path

'model_bert_2e-6\\checkpoint-1905'

In [42]:
classifier = pipeline("sentiment-analysis", model=model_path)
test['target'] = test.text.apply(lambda x: label2id[classifier(x)[0]['label']])

In [None]:
test.to_csv('submission_bert.csv', columns=['id', 'target'], index=False)