In [1]:
# Install the third-party packages imported below (transformers, datasets, evaluate)
# and upgrade pandas. No versions are pinned, so each run installs whatever is current.
!pip install transformers
!pip install datasets
!pip install --upgrade pandas
!pip install evaluate



In [2]:
import torch

In [3]:
# Notebook configuration. The `#@param` annotations expose these values as form fields.
# Input JSON-lines file, and the DataFrame columns holding the model text and the class label.
data_path = "/content/jutsus.jsonl" #@param {type:"string"}
text_column_name = "text" #@param {type:"string"}
label_column_name = "jutsu" #@param {type:"string"}

# Pretrained checkpoint, fraction of rows held out for testing, and number of target
# classes (three: the families returned by simplify_justu).
model_name = "distilbert-base-uncased" #@param {type:"string"}
test_size = 0.2 #@param {type:"number"}
num_labels = 3 #@param {type:"number"}

# Use the GPU when PyTorch reports one is available, otherwise the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [4]:
import pandas as pd
# Read the JSON-lines file (one record per line) into a DataFrame and preview the first rows.
df = pd.read_json(data_path, lines=True)
df.head()

Unnamed: 0,jutsu_name,jutsu_type,jutsu_description
0,,Taijutsu,Lars punches the opponent before striking them...
1,,"Ninjutsu, Barrier Ninjutsu, Fūinjutsu",Iruka applies three simultaneous sealing formu...
2,,"Ninjutsu, Barrier Ninjutsu","This is a type of trap ninjutsu (トラップ忍術, Torap..."
3,,"Ninjutsu, Cooperation Ninjutsu, Barrier Ninjutsu",A technique used by the shinobi from the Allie...
4,,"Ninjutsu, Barrier Ninjutsu",This technique requires a scroll on which a bl...


In [5]:
# Column dtypes and non-null counts of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2670 entries, 0 to 2669
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   jutsu_name         2670 non-null   object
 1   jutsu_type         2670 non-null   object
 2   jutsu_description  2670 non-null   object
dtypes: object(3)
memory usage: 62.7+ KB


In [6]:
def simplify_justu(jutsu):
    """Reduce a jutsu type description to a single family name.

    The families are checked in a fixed priority order (Genjutsu, then
    Taijutsu, then Ninjutsu) and the first one contained in ``jutsu`` wins,
    so a value mentioning several families maps to the highest-priority one.

    Args:
        jutsu: Value supporting ``in`` membership tests, e.g. a string such
            as ``"Ninjutsu, Barrier Ninjutsu"``.

    Returns:
        ``'Genjutsu'``, ``'Taijutsu'`` or ``'Ninjutsu'``; ``None`` when none
        of the three is contained in ``jutsu``.
    """
    # Tuple order encodes the priority between families.
    for family in ('Genjutsu', 'Taijutsu', 'Ninjutsu'):
        if family in jutsu:
            return family

    return None

In [7]:
# Collapse each jutsu_type value (possibly several comma-separated types) into one of the
# three families; values matching none of them become None.
df['jutsu_type_simplified'] = df['jutsu_type'].apply(simplify_justu)

In [8]:
# Rows per simplified family (value_counts skips the None entries by default).
df['jutsu_type_simplified'].value_counts()

jutsu_type_simplified
Ninjutsu    1847
Taijutsu     578
Genjutsu      97
Name: count, dtype: int64

In [9]:
# Model input text: technique name and description joined by '. '.
# NOTE(review): the later df.head() preview shows texts beginning with '. ', which suggests
# jutsu_name is empty for those rows — confirm against the source data.
df['text'] = df['jutsu_name']+'. '+df['jutsu_description']

In [10]:
# Copy the simplified family into the 'jutsu' column, the name configured as label_column_name.
df['jutsu'] = df['jutsu_type_simplified']

In [11]:
# Keep only the model input text and its target label; the raw columns are dropped from `df`.
df= df[['text','jutsu']]

In [12]:
# Drop rows with a missing value in either column, e.g. rows whose type simplify_justu
# mapped to None.
df = df.dropna()

In [13]:
from bs4 import BeautifulSoup

In [14]:
class Cleaner():
  """Turns HTML-flavoured text into plain text, one line per ``</p>``-terminated paragraph."""

  def __init__(self):
    # Stateless helper: there is nothing to initialise.
    pass

  def put_line_breaks(self,text):
    """Return ``text`` with a newline inserted after every ``</p>`` closing tag."""
    return '</p>\n'.join(text.split('</p>'))

  def remove_html_tags(self,text):
    """Parse ``text`` with BeautifulSoup's "lxml" parser and return only its text content."""
    soup = BeautifulSoup(text, "lxml")
    return soup.text

  def clean(self,text):
    """Insert paragraph line breaks first, then strip the markup."""
    return self.remove_html_tags(self.put_line_breaks(text))

In [15]:
# Strip HTML markup from the configured text column and store the result in 'text_cleaned'.
# NOTE(review): the recorded output echoes the BeautifulSoup(...) line, which looks like a
# warning raised during parsing — worth checking which inputs trigger it.
cleaner = Cleaner()
df['text_cleaned'] = df[text_column_name].apply(cleaner.clean)

  cleantext = BeautifulSoup(text, "lxml").text


In [16]:
# Class distribution of the rows that remain after dropping missing values.
df['jutsu'].value_counts()

jutsu
Ninjutsu    1847
Taijutsu     578
Genjutsu      97
Name: count, dtype: int64

label encoder


In [17]:
from sklearn import preprocessing

In [18]:
# Encode the string class names in the label column as integer ids, stored in 'label'.
# The fitted encoder `le` stays available for mapping ids back to names.
le = preprocessing.LabelEncoder()
df['label'] = le.fit_transform(df[label_column_name].tolist())


In [19]:
# Preview the frame with the cleaned text and integer label columns added.
df.head()

Unnamed: 0,text,jutsu,text_cleaned,label
0,. Lars punches the opponent before striking th...,Taijutsu,. Lars punches the opponent before striking th...,2
1,. Iruka applies three simultaneous sealing for...,Ninjutsu,. Iruka applies three simultaneous sealing for...,1
2,". This is a type of trap ninjutsu (トラップ忍術, Tor...",Ninjutsu,". This is a type of trap ninjutsu (トラップ忍術, Tor...",1
3,. A technique used by the shinobi from the All...,Ninjutsu,. A technique used by the shinobi from the All...,1
4,. This technique requires a scroll on which a ...,Ninjutsu,. This technique requires a scroll on which a ...,1


In [20]:
from sklearn.utils.class_weight import compute_class_weight

In [21]:
# "balanced" class weights, one per encoded label in ascending id order; rarer classes
# receive larger weights. They feed the weighted loss in CustomTrainer.
# `classes` is passed as a sorted NumPy array (Series.unique() returns an ndarray) because
# scikit-learn documents this parameter as an ndarray; newer releases reportedly reject a
# plain Python list.
class_weights = compute_class_weight(
    'balanced',
    classes=df['label'].sort_values().unique(),
    y=df['label'].to_numpy(),
).tolist()


In [22]:
# Show the computed weights (position i belongs to encoded label i).
class_weights

[8.666666666666666, 0.4551524995488179, 1.4544405997693195]

train test split

In [23]:
from sklearn.model_selection import train_test_split

In [24]:
# Stratified hold-out split so both parts keep the class proportions of `df`.
# A fixed random_state makes the split, and every metric computed from it, reproducible
# across kernel restarts.
df_train, df_test = train_test_split(
    df,
    test_size=test_size,
    stratify=df['label'],
    random_state=42,
)

In [25]:
# (rows, columns) of the train and test frames.
print(df_train.shape)
print(df_test.shape)

(2017, 4)
(505, 4)


Converting to Hugging Face datasets

In [26]:
from datasets import Dataset

In [27]:
# Wrap both pandas frames as `datasets.Dataset` objects.
# The printed features below show the pandas index carried over as an extra
# '__index_level_0__' column alongside the frame's own columns.
train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)

In [28]:
# Feature names and row counts of the two datasets.
print(train_dataset)
print(test_dataset)

Dataset({
    features: ['text', 'jutsu', 'text_cleaned', 'label', '__index_level_0__'],
    num_rows: 2017
})
Dataset({
    features: ['text', 'jutsu', 'text_cleaned', 'label', '__index_level_0__'],
    num_rows: 505
})


tokenizer

In [29]:
from transformers import AutoTokenizer

In [30]:
# Tokenizer that belongs to the configured checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess_function(examples):
    """Tokenize the 'text_cleaned' field of ``examples`` with truncation enabled.

    No padding is requested here. Intended for ``Dataset.map``; returns
    whatever the tokenizer call returns.
    """
    cleaned_texts = examples["text_cleaned"]
    return tokenizer(cleaned_texts, truncation=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [31]:
# Tokenize the training split; batched=True hands preprocess_function batches of examples.
tokenized_train = train_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/2017 [00:00<?, ? examples/s]

In [32]:
# Tokenize the test split the same way as the training split.
tokenized_test = test_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/505 [00:00<?, ? examples/s]

model initialization

In [33]:
from transformers import AutoModelForSequenceClassification

In [34]:
# Load the pretrained checkpoint with a sequence-classification head for `num_labels` classes.
# Per the message printed below, the classifier layers are newly initialised and must be trained.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train model

In [35]:
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer
import evaluate
import numpy as np
import torch
from torch import nn

In [36]:
# Batch collator built from the tokenizer. preprocess_function only truncates, so padding
# is presumably applied per batch by this collator (as its name suggests) — confirm.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [37]:
# Accuracy metric from the `evaluate` library; used by compute_metrics.
# NOTE(review): the recorded reports show about 0.85 accuracy while class 0 has zero
# recall, so accuracy alone hides the minority-class failure.
metric = evaluate.load("accuracy")


In [38]:
def compute_metrics(eval_pred):
    """Score one evaluation pass with the notebook-level ``metric``.

    Args:
        eval_pred: Pair that unpacks into ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` for the arg-max class of each row
        of logits against the reference labels.
    """
    scores, references = eval_pred
    # The predicted class id is the index of the largest logit along the last axis.
    predicted_ids = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_ids, references=references)


In [39]:
class CustomTrainer(Trainer):
    """Trainer that replaces the default loss with class-weighted cross-entropy.

    The weights come from the notebook-level ``class_weights`` list, one
    entry per encoded label.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the class-weighted cross-entropy loss for one batch.

        Args:
            model: Model to run the forward pass with.
            inputs: Batch mapping passed to the model; must contain "labels".
            return_outputs: When true, return ``(loss, outputs)`` instead of
                only the loss.
            num_items_in_batch: Accepted and ignored. Newer Trainer releases
                pass this keyword, and without it the override would raise a
                TypeError; older releases simply never supply it.

        Returns:
            The scalar loss tensor, or ``(loss, outputs)`` when
            ``return_outputs`` is true.
        """
        labels = inputs.get("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Build the weight tensor on the logits' own device rather than a notebook-level
        # device string, so the loss stays on whatever device the Trainer put the model on.
        weight = torch.tensor(class_weights, dtype=torch.float, device=logits.device)
        loss_fct = nn.CrossEntropyLoss(weight=weight)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss


In [40]:
# Upgrade accelerate and transformers (with the torch extra).
# NOTE(review): transformers has already been imported and used by earlier cells, so the
# running kernel presumably keeps the previously loaded version until it is restarted;
# consider moving this into the install cell at the top of the notebook.
!pip install --upgrade accelerate transformers[torch]



In [41]:
# Training configuration: 5 epochs, batch size 8 per device, evaluation and logging once
# per epoch, outputs written under /content/results.
# NOTE(review): in the recorded run the training loss stays around 0.9 and class 0 is never
# predicted; a smaller learning_rate than 2e-4 may be worth trying — confirm experimentally.
# NOTE(review): `evaluation_strategy` may have been renamed in newer transformers
# releases — check against the installed version.
training_args = TrainingArguments(
    output_dir="/content/results",
    learning_rate=2e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    evaluation_strategy = "epoch",
    logging_strategy="epoch"
)

# Trainer with the class-weighted loss. The test split doubles as the per-epoch
# evaluation set, so it is not an untouched hold-out for the final report.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics

)



In [42]:
# Fine-tune the model; per training_args, evaluation and logging run once per epoch.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.9954,0.858543,0.839604
2,0.9285,0.921101,0.857426
3,0.939,0.946848,0.837624
4,0.9153,0.947787,0.841584
5,0.872,0.954934,0.851485


TrainOutput(global_step=1265, training_loss=0.9300241462797987, metrics={'train_runtime': 257.6887, 'train_samples_per_second': 39.136, 'train_steps_per_second': 4.909, 'total_flos': 563004504982062.0, 'train_loss': 0.9300241462797987, 'epoch': 5.0})

In [43]:
from sklearn.metrics import classification_report

In [44]:
# Per-class precision/recall/F1 on the training split.
train_output = trainer.predict(tokenized_train)
# The prediction output's first field holds the logits; take the arg-max class per row.
preds = np.argmax(train_output.predictions, axis=1)
# Ground truth comes from the pandas frame the dataset was built from (same row order).
GT = df_train['label'].tolist()
print(classification_report(GT, preds))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        78
           1       0.87      0.95      0.91      1477
           2       0.79      0.72      0.75       462

    accuracy                           0.86      2017
   macro avg       0.55      0.55      0.55      2017
weighted avg       0.82      0.86      0.84      2017



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [45]:
# Per-class precision/recall/F1 on the test split.
test_output = trainer.predict(tokenized_test)
# The prediction output's first field holds the logits; take the arg-max class per row.
preds = np.argmax(test_output.predictions, axis=1)
# Ground truth comes from the pandas frame the dataset was built from (same row order).
GT = df_test['label'].tolist()
print(classification_report(GT, preds))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        19
           1       0.87      0.95      0.91       370
           2       0.78      0.69      0.73       116

    accuracy                           0.85       505
   macro avg       0.55      0.55      0.55       505
weighted avg       0.82      0.85      0.83       505



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
