In [28]:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from datasets import Dataset


# Load Dataset

In [3]:
# Relative path to the JSON Lines file holding the jutsu records.
data_path = "../data/jutsus.jsonl"
# lines=True makes pandas parse the file as one JSON object per line.
df = pd.read_json(data_path, lines=True)
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,jutsu_name,jutsu_type,jutsu_description
0,10 Hit Combo,Taijutsu,Lars punches the opponent before striking them...
1,Assassination Technique,"Kenjutsu, Fighting Style",A sword technique used by Root members. Using ...
2,Assimilation: Rock Blizzard,Ninjutsu,After performing the Assimilate All Creation T...
3,Assimilation: Rock Tank,Taijutsu,This technique is a copied version of the Akim...
4,Asura Attack,"Kekkei Genkai, Ninjutsu, Dōjutsu","With the body modifications of the Asura Path,..."


In [4]:
def simplify_jutsu(jutsu):
    """Collapse a raw jutsu type value into one coarse class label.

    The raw value may name several types at once (for example
    "Kekkei Genkai, Ninjutsu"). The categories are tested in a fixed
    priority order -- Genjutsu, then Ninjutsu, then Taijutsu -- and the
    first one contained in ``jutsu`` is returned.

    Args:
        jutsu: Raw type value, normally a string. Missing values such as
            ``None`` or ``NaN`` are tolerated and treated as "no match".

    Returns:
        "Genjutsu", "Ninjutsu" or "Taijutsu", or ``None`` when none of
        them is contained in ``jutsu``.
    """
    try:
        for category in ("Genjutsu", "Ninjutsu", "Taijutsu"):
            if category in jutsu:
                return category
    except TypeError:
        # `in` is undefined for None / float NaN; a missing type simply has
        # no category instead of aborting the whole DataFrame.apply call.
        return None
    return None

In [5]:
# Derive one coarse label per row; rows whose type contains none of the
# three categories get None from simplify_jutsu.
df['jutsu_type_simplified'] = df['jutsu_type'].apply(simplify_jutsu)

In [6]:
# Check the new column next to the raw jutsu_type values.
df.head()

Unnamed: 0,jutsu_name,jutsu_type,jutsu_description,jutsu_type_simplified
0,10 Hit Combo,Taijutsu,Lars punches the opponent before striking them...,Taijutsu
1,Assassination Technique,"Kenjutsu, Fighting Style",A sword technique used by Root members. Using ...,
2,Assimilation: Rock Blizzard,Ninjutsu,After performing the Assimilate All Creation T...,Ninjutsu
3,Assimilation: Rock Tank,Taijutsu,This technique is a copied version of the Akim...,Taijutsu
4,Asura Attack,"Kekkei Genkai, Ninjutsu, Dōjutsu","With the body modifications of the Asura Path,...",Ninjutsu


In [7]:
# Inspect how many rows fall into each simplified class (class balance).
df['jutsu_type_simplified'].value_counts()

jutsu_type_simplified
Ninjutsu    2259
Taijutsu     398
Genjutsu     101
Name: count, dtype: int64

In [8]:
# Build the modelling frame in one chain:
#   text   - technique name and description joined as "<name>. <description>"
#   jutsus - the simplified class label
# Only these two columns are kept, and rows with a missing value in either
# one are dropped.
df = (
    df.assign(
        text=df['jutsu_name'] + ". " + df['jutsu_description'],
        jutsus=df['jutsu_type_simplified'],
    )
    .loc[:, ['text', 'jutsus']]
    .dropna()
)

In [9]:
# Confirm only the text/jutsus columns remain and rows without a label are gone.
df.head()

Unnamed: 0,text,jutsus
0,10 Hit Combo. Lars punches the opponent before...,Taijutsu
2,Assimilation: Rock Blizzard. After performing ...,Ninjutsu
3,Assimilation: Rock Tank. This technique is a c...,Taijutsu
4,Asura Attack. With the body modifications of t...,Ninjutsu
5,Assimilate All Creation Technique. The Assimil...,Ninjutsu


In [10]:
from bs4 import BeautifulSoup
class Cleaner():
    """Turn HTML-flavoured text into plain text, keeping paragraph breaks."""

    def __init__(self):
        # Stateless helper: there is nothing to configure.
        pass

    def put_line_breaks(self, text):
        """Append a newline after every closing paragraph tag.

        Args:
            text: Input string, possibly containing HTML.

        Returns:
            ``text`` with each ``</p>`` followed by a newline, so that
            paragraphs stay separated once the tags are stripped.
        """
        # The closing tag is spelled with a forward slash. A backslash here
        # would be an invalid string escape and would never match real markup.
        return text.replace("</p>", "</p>\n")

    def remove_html_tags(self, text):
        """Return the text content of ``text`` with HTML tags removed.

        Parsing is delegated to BeautifulSoup using the "lxml" parser.
        """
        clean_text = BeautifulSoup(text, "lxml").text
        return clean_text

    def clean(self, text):
        """Run the full cleaning pipeline on one string.

        Steps, in order: mark paragraph ends with newlines, strip HTML
        tags, then trim leading and trailing whitespace.

        Args:
            text: Raw input string.

        Returns:
            The cleaned plain-text string.
        """
        text = self.put_line_breaks(text)
        text = self.remove_html_tags(text)
        text = text.strip()
        return text

In [11]:
# Name of the column holding the raw input text.
text_column_name = 'text'
# Name of the column holding the string class label.
label_column_name = "jutsus"

In [12]:
# Clean Text
# Apply Cleaner.clean to every row and store the result in a new column,
# leaving the original text column untouched.
cleaner = Cleaner()
df['text_cleaned'] = df[text_column_name].apply(cleaner.clean)

  clean_text = BeautifulSoup(text, "lxml").text


In [13]:
# Compare raw and cleaned text side by side on the first two rows.
df.head(2)

Unnamed: 0,text,jutsus,text_cleaned
0,10 Hit Combo. Lars punches the opponent before...,Taijutsu,10 Hit Combo. Lars punches the opponent before...
2,Assimilation: Rock Blizzard. After performing ...,Ninjutsu,Assimilation: Rock Blizzard. After performing ...


In [15]:
# Encode Labels
# Fit a LabelEncoder on the string labels so each class gets an integer id.
le = preprocessing.LabelEncoder()
le.fit(df[label_column_name].tolist())

In [16]:
# Map each integer class id to its label name. `classes_` is the fitted
# encoder's public attribute, so it is read directly instead of through
# the instance __dict__.
label_dict = dict(enumerate(le.classes_.tolist()))
label_dict

{0: 'Genjutsu', 1: 'Ninjutsu', 2: 'Taijutsu'}

In [17]:
# Add the integer class id for each row using the fitted encoder.
df['label'] = le.transform(df[label_column_name].tolist())

In [18]:
# Check that the integer label column lines up with the string labels.
df.head()

Unnamed: 0,text,jutsus,text_cleaned,label
0,10 Hit Combo. Lars punches the opponent before...,Taijutsu,10 Hit Combo. Lars punches the opponent before...,2
2,Assimilation: Rock Blizzard. After performing ...,Ninjutsu,Assimilation: Rock Blizzard. After performing ...,1
3,Assimilation: Rock Tank. This technique is a c...,Taijutsu,Assimilation: Rock Tank. This technique is a c...,2
4,Asura Attack. With the body modifications of t...,Ninjutsu,Asura Attack. With the body modifications of t...,1
5,Assimilate All Creation Technique. The Assimil...,Ninjutsu,Assimilate All Creation Technique. The Assimil...,1


In [20]:
# Fraction of rows held out for evaluation.
test_size = 0.2
# Fixed seed so the same rows land in train/test on every kernel restart;
# without it the split is re-randomised on each run.
split_seed = 42
# Stratify on the label so both splits keep the same class proportions.
df_train, df_test = train_test_split(df,
                                     test_size=test_size,
                                     stratify=df['label'],
                                     random_state=split_seed)

In [22]:
# Check the class distribution of the training split.
df_train['jutsus'].value_counts()

jutsus
Ninjutsu    1807
Taijutsu     318
Genjutsu      81
Name: count, dtype: int64

In [23]:
# Identifier of the pretrained checkpoint passed to AutoTokenizer.from_pretrained.
model_name = "distilbert/distilbert-base-uncased"

In [26]:
# Load the tokenizer that belongs to model_name.
tokenizer = AutoTokenizer.from_pretrained(model_name)



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [27]:
def preprocess_function(tokenizer,examples):
    """Tokenize the ``text_cleaned`` entry of ``examples``.

    Args:
        tokenizer: Callable invoked with the texts and ``truncation=True``.
        examples: Mapping that provides the texts under ``'text_cleaned'``.

    Returns:
        Whatever ``tokenizer`` returns for those texts.
    """
    cleaned_texts = examples['text_cleaned']
    encoded = tokenizer(cleaned_texts, truncation=True)
    return encoded

In [29]:
# Convert the pandas splits to Hugging Face datasets
train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)


def _tokenize_batch(examples):
    """Tokenize one batch of examples with the notebook-level tokenizer."""
    return preprocess_function(tokenizer, examples)


# Tokenize both splits batch-wise with the same function
tokenized_train = train_dataset.map(_tokenize_batch, batched=True)
tokenized_test = test_dataset.map(_tokenize_batch, batched=True)

Map:   0%|          | 0/2206 [00:00<?, ? examples/s]

Map:   0%|          | 0/552 [00:00<?, ? examples/s]