In [39]:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from datasets import Dataset

In [40]:
# Read the jutsu records into a DataFrame; lines=True treats the file as
# JSON Lines (one JSON object per line).
data_path = "../data/jutsus.jsonl"
df = pd.read_json(path_or_buf=data_path, lines=True)
df.head()

Unnamed: 0,jutsu_name,jutsu_type,jutsu_description
0,10 Hit Combo,Taijutsu,Lars punches the opponent before striking them...
1,All Weapons Above Heaven,Ninjutsu,This technique raises all the status boosts (S...
2,Alder,"Kekkei Genkai, Taijutsu","Kimimaro rushes at his opponent, and using the..."
3,All is Suffering,"Kekkei Genkai, Ninjutsu","Using the giant statue as a medium, each of th..."
4,All Directions Shuriken,"Ninjutsu, Shurikenjutsu, Clone Techniques, Kin...",After using the Multiple Shadow Clone Techniqu...


In [41]:
def simplify_jutsu_type(jutsu):
    """Collapse a raw jutsu type value into one of three coarse classes.

    A raw value may name several types at once (for example
    "Kekkei Genkai, Taijutsu"). The candidates "Genjutsu", "Ninjutsu" and
    "Taijutsu" are checked in that order and the first one found wins.

    Args:
        jutsu: Raw type value, normally a string.

    Returns:
        "Genjutsu", "Ninjutsu" or "Taijutsu"; None when the value contains
        none of them or is missing (None / NaN).
    """
    try:
        for family in ("Genjutsu", "Ninjutsu", "Taijutsu"):
            if family in jutsu:
                return family
    except TypeError:
        # Missing values (None, float NaN) do not support the `in` operator.
        # Treat them as "no label" rather than aborting the whole .apply().
        return None
    return None

In [42]:
# Derive the coarse class for every row from its raw type value
df['jutsu_type_simplified'] = df['jutsu_type'].map(simplify_jutsu_type)
df

Unnamed: 0,jutsu_name,jutsu_type,jutsu_description,jutsu_type_simplified
0,10 Hit Combo,Taijutsu,Lars punches the opponent before striking them...,Taijutsu
1,All Weapons Above Heaven,Ninjutsu,This technique raises all the status boosts (S...,Ninjutsu
2,Alder,"Kekkei Genkai, Taijutsu","Kimimaro rushes at his opponent, and using the...",Taijutsu
3,All is Suffering,"Kekkei Genkai, Ninjutsu","Using the giant statue as a medium, each of th...",Ninjutsu
4,All Directions Shuriken,"Ninjutsu, Shurikenjutsu, Clone Techniques, Kin...",After using the Multiple Shadow Clone Techniqu...,Ninjutsu
...,...,...,...,...
2924,100 Metre Punch,Taijutsu,"A shorter version of the 1000 Metre Punch, the...",Taijutsu
2925,100% Single Punch,Taijutsu,Tsunade gathers large amounts of chakra in her...,Taijutsu
2926,Absolute: Fang Passing Fang,"Taijutsu, Collaboration Techniques",Kiba and Akamaru perform the Fang Passing Fang...,Taijutsu
2927,1000 Metre Punch,Taijutsu,The user focuses a large amount of chakra into...,Taijutsu


In [43]:
# Class balance of the simplified labels (value_counts leaves out missing values by default)
df.loc[:, 'jutsu_type_simplified'].value_counts()

jutsu_type_simplified
Ninjutsu    2262
Taijutsu     398
Genjutsu     101
Name: count, dtype: int64

In [44]:
# Model input is the jutsu name followed by its description; the target is the
# simplified type.
df = df.assign(
    text=df['jutsu_name'] + ". " + df['jutsu_description'],
    jutsu=df['jutsu_type_simplified'],
)
# Keep only the two modelling columns and discard rows where either is missing
df = df[['text', 'jutsu']].dropna()
df.head()

Unnamed: 0,text,jutsu
0,10 Hit Combo. Lars punches the opponent before...,Taijutsu
1,All Weapons Above Heaven. This technique raise...,Ninjutsu
2,"Alder. Kimimaro rushes at his opponent, and us...",Taijutsu
3,All is Suffering. Using the giant statue as a ...,Ninjutsu
4,All Directions Shuriken. After using the Multi...,Ninjutsu


In [45]:
from bs4 import BeautifulSoup
class Cleaner():
    """Turn HTML-flavoured text into plain text, keeping paragraph breaks."""

    def __init__(self):
        """No configuration is needed; the cleaner is stateless."""
        pass

    def put_line_breaks(self, text):
        """Append a newline after every closing paragraph tag.

        Args:
            text: Raw text that may contain ``</p>`` tags.

        Returns:
            The text with ``</p>`` replaced by ``</p>`` plus a newline, so
            paragraphs stay separated once the tags are stripped.
        """
        # The closing tag uses a forward slash. A backslash here would never
        # match real markup and is an invalid string escape as well.
        return text.replace('</p>', '</p>\n')

    def remove_html_tags(self, text):
        """Parse the text with BeautifulSoup (lxml parser) and return its text content.

        Args:
            text: Text that may contain HTML markup.

        Returns:
            The ``.text`` of the parsed document.
        """
        clean_text = BeautifulSoup(text, 'lxml').text
        return clean_text

    def clean(self, text):
        """Run the full pipeline: mark paragraph ends, strip tags, trim whitespace.

        Args:
            text: Raw text to clean.

        Returns:
            The cleaned text with leading and trailing whitespace removed.
        """
        text = self.put_line_breaks(text)
        text = self.remove_html_tags(text).strip()
        return text
        

In [46]:
# Names of the input-text column and the target column used by the cells below
text_column_name, label_column_name = 'text', 'jutsu'

In [47]:
# Run every text through the cleaner and store the result in a new column
cleaner = Cleaner()
df['text_cleaned'] = df[text_column_name].map(cleaner.clean)
df.head()

  clean_text = BeautifulSoup(text, 'lxml').text


Unnamed: 0,text,jutsu,text_cleaned
0,10 Hit Combo. Lars punches the opponent before...,Taijutsu,10 Hit Combo. Lars punches the opponent before...
1,All Weapons Above Heaven. This technique raise...,Ninjutsu,All Weapons Above Heaven. This technique raise...
2,"Alder. Kimimaro rushes at his opponent, and us...",Taijutsu,"Alder. Kimimaro rushes at his opponent, and us..."
3,All is Suffering. Using the giant statue as a ...,Ninjutsu,All is Suffering. Using the giant statue as a ...
4,All Directions Shuriken. After using the Multi...,Ninjutsu,All Directions Shuriken. After using the Multi...


In [48]:
# Fit the encoder on the label column so each class name gets an integer id
le = preprocessing.LabelEncoder()
le.fit(df[label_column_name].to_list())

In [49]:
# Map each integer id back to its class name. `classes_` is the encoder's
# public fitted attribute, so there is no need to reach into `__dict__`.
label_dict = dict(enumerate(le.classes_.tolist()))
label_dict

{0: 'Genjutsu', 1: 'Ninjutsu', 2: 'Taijutsu'}

In [50]:
# Store the integer-encoded class next to its string label
df['label'] = le.transform(df[label_column_name].to_list())

In [51]:
# Preview the frame now that the integer `label` column has been added
df.head()

Unnamed: 0,text,jutsu,text_cleaned,label
0,10 Hit Combo. Lars punches the opponent before...,Taijutsu,10 Hit Combo. Lars punches the opponent before...,2
1,All Weapons Above Heaven. This technique raise...,Ninjutsu,All Weapons Above Heaven. This technique raise...,1
2,"Alder. Kimimaro rushes at his opponent, and us...",Taijutsu,"Alder. Kimimaro rushes at his opponent, and us...",2
3,All is Suffering. Using the giant statue as a ...,Ninjutsu,All is Suffering. Using the giant statue as a ...,1
4,All Directions Shuriken. After using the Multi...,Ninjutsu,All Directions Shuriken. After using the Multi...,1


In [52]:
# Hold out 20% of the rows for evaluation. Stratifying on the encoded label
# keeps the class proportions the same in both splits, and the fixed seed makes
# the split identical on every Restart & Run All.
test_size = 0.2
split_seed = 42
df_train, df_test = train_test_split(df,
                                     test_size=test_size,
                                     stratify=df['label'],
                                     random_state=split_seed,)

In [53]:
# Class distribution of the training split
df_train.loc[:, 'jutsu'].value_counts()

jutsu
Ninjutsu    1809
Taijutsu     318
Genjutsu      81
Name: count, dtype: int64

In [54]:
# Checkpoint identifier that is passed to AutoTokenizer.from_pretrained
model_name = "distilbert/distilbert-base-uncased"

In [55]:
# Load the tokenizer for the checkpoint named by `model_name`
tokenizer = AutoTokenizer.from_pretrained(model_name)



In [56]:
def preprocess_function(tokenizer, examples):
    """Tokenize the cleaned text of a batch of examples.

    Args:
        tokenizer: Callable invoked as ``tokenizer(texts, truncation=True)``.
        examples: Mapping that holds the texts under the "text_cleaned" key.

    Returns:
        Whatever the tokenizer call returns.
    """
    cleaned_texts = examples["text_cleaned"]
    encoded = tokenizer(cleaned_texts, truncation=True)
    return encoded

In [57]:
# Wrap both pandas splits as Dataset objects (train first, then test)
train_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (df_train, df_test)
)

# Tokenize each split in batches with the shared tokenizer
tokenized_train, tokenized_test = (
    split.map(lambda examples: preprocess_function(tokenizer, examples), batched=True)
    for split in (train_dataset, test_dataset)
)

Map:   0%|          | 0/2208 [00:00<?, ? examples/s]

Map: 100%|██████████| 2208/2208 [00:00<00:00, 10177.38 examples/s]
Map: 100%|██████████| 553/553 [00:00<00:00, 9785.92 examples/s]
