<a href="https://colab.research.google.com/github/head1ton/Analysis_System/blob/main/text_classification/jutsu_classifier_development.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.4.1-py3-none-any.whl (487 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading x

In [24]:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from datasets import Dataset

In [None]:
# Load the dataset

In [3]:
# Location of the jutsu records; the file is read as JSON Lines (one record per line).
data_path = "/content/data/jutsus.jsonl"
df = pd.read_json(path_or_buf=data_path, lines=True)
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,jutus_name,jutsu_type,jutsu_description
0,100 Metre Punch,Taijutsu,"A shorter version of the 1000 Metre Punch, the..."
1,10 Hit Combo,Taijutsu,Lars punches the opponent before striking them...
2,Anātman,Ninjutsu,Gaara sends a wave of sand towards his opponen...
3,Amputation Punishment,"Bukijutsu, Taijutsu",Hidan attacks the enemy many times with his sc...
4,Animal Path,"Kekkei Genkai, Ninjutsu, Dōjutsu",This article is about the ability. For other ...


In [4]:
def simplify_jutsu(jutsu):
    """Collapse a jutsu type value into a single coarse class.

    A record may list several types at once (e.g. "Bukijutsu, Taijutsu").
    The first class found, checked in the order Genjutsu, Ninjutsu,
    Taijutsu, wins.

    Args:
        jutsu: The raw type value, normally a string; may be None or a
            float NaN for records with no type.

    Returns:
        "Genjutsu", "Ninjutsu" or "Taijutsu", or None when the value is
        missing or mentions none of the three classes.
    """
    # A missing value is None or a float NaN; the substring test below
    # would raise TypeError on it, so treat it as "no class".
    if jutsu is None or isinstance(jutsu, float):
        return None

    # Tuple order encodes the priority used when several classes appear.
    for jutsu_class in ("Genjutsu", "Ninjutsu", "Taijutsu"):
        if jutsu_class in jutsu:
            return jutsu_class

    # None of the three target classes is present.
    return None

In [5]:
# Derive one coarse class per row from the raw (possibly multi-valued) type.
df['jutsu_type_simplified'] = df['jutsu_type'].map(simplify_jutsu)

In [6]:
# Preview the first rows to check the new jutsu_type_simplified column.
df.head()

Unnamed: 0,jutus_name,jutsu_type,jutsu_description,jutsu_type_simplified
0,100 Metre Punch,Taijutsu,"A shorter version of the 1000 Metre Punch, the...",Taijutsu
1,10 Hit Combo,Taijutsu,Lars punches the opponent before striking them...,Taijutsu
2,Anātman,Ninjutsu,Gaara sends a wave of sand towards his opponen...,Ninjutsu
3,Amputation Punishment,"Bukijutsu, Taijutsu",Hidan attacks the enemy many times with his sc...,Taijutsu
4,Animal Path,"Kekkei Genkai, Ninjutsu, Dōjutsu",This article is about the ability. For other ...,Ninjutsu


In [7]:
# Rows per simplified class; the counts shown below are heavily skewed toward Ninjutsu.
df['jutsu_type_simplified'].value_counts()

Unnamed: 0_level_0,count
jutsu_type_simplified,Unnamed: 1_level_1
Ninjutsu,2269
Taijutsu,398
Genjutsu,101


In [9]:
# Build the model input ("<name>. <description>") and the target column,
# keep only those two columns, and drop rows where either one is missing
# (this includes rows whose type matched none of the simplified classes).
df = (
    df.assign(
        text=df['jutus_name'] + ". " + df['jutsu_description'],
        jutsus=df['jutsu_type_simplified'],
    )[['text', 'jutsus']]
    .dropna()
)

In [10]:
# Preview the reduced frame, which now holds only the text and jutsus columns.
df.head()

Unnamed: 0,text,jutsus
0,100 Metre Punch. A shorter version of the 1000...,Taijutsu
1,10 Hit Combo. Lars punches the opponent before...,Taijutsu
2,Anātman. Gaara sends a wave of sand towards hi...,Ninjutsu
3,Amputation Punishment. Hidan attacks the enemy...,Taijutsu
4,Animal Path. This article is about the ability...,Ninjutsu


In [11]:
from bs4 import BeautifulSoup

class Cleaner:
    """Strip HTML markup from text while keeping paragraph boundaries."""

    def put_line_breaks(self, text):
        """Return ``text`` with a newline inserted after every ``</p>`` tag."""
        closing_tag = "</p>"
        return text.replace(closing_tag, closing_tag + "\n")

    def remove_html_tags(self, text):
        """Parse ``text`` with the lxml parser and return only its text content."""
        return BeautifulSoup(text, "lxml").get_text()

    def clean(self, text):
        """Run the full pipeline: add line breaks, strip tags, trim whitespace."""
        # Line breaks go in first so paragraphs stay separated once tags are removed.
        with_breaks = self.put_line_breaks(text)
        return self.remove_html_tags(with_breaks).strip()

In [12]:
# Names of the DataFrame columns holding the input text and the class label.
text_column_name, label_column_name = "text", "jutsus"

In [13]:
# Run every text through the HTML cleaner and store the result in a new column.
cleaner = Cleaner()
df['text_cleaned'] = df[text_column_name].map(cleaner.clean)

In [14]:
# Preview the frame to compare the raw text column with text_cleaned.
df.head()

Unnamed: 0,text,jutsus,text_cleaned
0,100 Metre Punch. A shorter version of the 1000...,Taijutsu,100 Metre Punch. A shorter version of the 1000...
1,10 Hit Combo. Lars punches the opponent before...,Taijutsu,10 Hit Combo. Lars punches the opponent before...
2,Anātman. Gaara sends a wave of sand towards hi...,Ninjutsu,Anātman. Gaara sends a wave of sand towards hi...
3,Amputation Punishment. Hidan attacks the enemy...,Taijutsu,Amputation Punishment. Hidan attacks the enemy...
4,Animal Path. This article is about the ability...,Ninjutsu,Animal Path. This article is about the ability...


In [15]:
# Encode labels: fit an encoder on the class-name strings found in the label column.
le = preprocessing.LabelEncoder()
le.fit(df[label_column_name].to_list())

In [16]:
# Map each integer id to its class name, following the fitted encoder's class order.
label_dict = dict(enumerate(le.classes_.tolist()))
label_dict

{0: 'Genjutsu', 1: 'Ninjutsu', 2: 'Taijutsu'}

In [17]:
# Add the integer class id for each row using the fitted encoder.
df['label'] = le.transform(df[label_column_name].to_list())

In [18]:
# Preview the frame to check the integer label column against the jutsus names.
df.head()

Unnamed: 0,text,jutsus,text_cleaned,label
0,100 Metre Punch. A shorter version of the 1000...,Taijutsu,100 Metre Punch. A shorter version of the 1000...,2
1,10 Hit Combo. Lars punches the opponent before...,Taijutsu,10 Hit Combo. Lars punches the opponent before...,2
2,Anātman. Gaara sends a wave of sand towards hi...,Ninjutsu,Anātman. Gaara sends a wave of sand towards hi...,1
3,Amputation Punishment. Hidan attacks the enemy...,Taijutsu,Amputation Punishment. Hidan attacks the enemy...,2
4,Animal Path. This article is about the ability...,Ninjutsu,Animal Path. This article is about the ability...,1


In [26]:
# Fraction of rows held out for evaluation.
test_size = 0.2
# Fixed seed so the split is the same on every Restart & Run All; without it
# each run produces a different train/test partition.
random_state = 42
df_train, df_test = train_test_split(
    df,
    test_size=test_size,
    random_state=random_state,
    # Stratify on the encoded label so both splits keep the class proportions.
    stratify=df['label'],
)

In [27]:
# Rows per class in the training split (the split above was stratified on label).
df_train['jutsus'].value_counts()

Unnamed: 0_level_0,count
jutsus,Unnamed: 1_level_1
Ninjutsu,1815
Taijutsu,318
Genjutsu,81


In [36]:
# Preview the first rows of the training split.
df_train.head()

Unnamed: 0,text,jutsus,text_cleaned,label
2726,Piston Fist: Style One. The user uses their tr...,Ninjutsu,Piston Fist: Style One. The user uses their tr...,1
2301,"Strong Fist. Strong Fist (剛拳, Gōken) is Guy's ...",Taijutsu,"Strong Fist. Strong Fist (剛拳, Gōken) is Guy's ...",2
615,Heaven and Earth Exploding Funeral. Tobi rushe...,Ninjutsu,Heaven and Earth Exploding Funeral. Tobi rushe...,1
2389,Six Paths Chakra: Rebirth. Using Six Paths Cha...,Ninjutsu,Six Paths Chakra: Rebirth. Using Six Paths Cha...,1
2674,Puppet: Demon Castle Mask. Kankurō has Karasu ...,Ninjutsu,Puppet: Demon Castle Mask. Kankurō has Karasu ...,1


In [37]:
# Preview the first rows of the test split.
df_test.head()

Unnamed: 0,text,jutsus,text_cleaned,label
332,Lightning Release: Electromagnetic Murder. A L...,Ninjutsu,Lightning Release: Electromagnetic Murder. A L...,1
2496,"Seasickness Fist. Similar to the Drunken Fist,...",Taijutsu,"Seasickness Fist. Similar to the Drunken Fist,...",2
602,Heavenly Insertion Hammering Star. Obito throw...,Ninjutsu,Heavenly Insertion Hammering Star. Obito throw...,1
2564,Sage Art: Lava Release Rasenshuriken. A powerf...,Ninjutsu,Sage Art: Lava Release Rasenshuriken. A powerf...,1
1822,Wind Release: Pressure Damage. A powerful wind...,Ninjutsu,Wind Release: Pressure Damage. A powerful wind...,1


In [28]:
# Identifier of the pretrained checkpoint; used below to load the matching tokenizer.
model_name = "distilbert/distilbert-base-uncased"

In [32]:
# Load the tokenizer that belongs to the checkpoint named in model_name.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [33]:
def preprocess_function(tokenizer, examples):
    """Tokenize the ``text_cleaned`` field of ``examples``.

    Args:
        tokenizer: Callable invoked as ``tokenizer(texts, truncation=True)``.
        examples: Mapping that has a ``"text_cleaned"`` entry.

    Returns:
        Whatever ``tokenizer`` returns for the cleaned texts.
    """
    cleaned_texts = examples["text_cleaned"]
    encoded = tokenizer(cleaned_texts, truncation=True)
    return encoded

In [34]:
# Convert the pandas splits to Hugging Face datasets
train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)


def _tokenize_batch(examples):
    """Tokenize one batch of examples with the notebook-level tokenizer."""
    return preprocess_function(tokenizer, examples)


# Tokenize both splits in batches
tokenized_train = train_dataset.map(_tokenize_batch, batched=True)
tokenized_test = test_dataset.map(_tokenize_batch, batched=True)

Map:   0%|          | 0/2214 [00:00<?, ? examples/s]

Map:   0%|          | 0/554 [00:00<?, ? examples/s]

In [35]:
# Display the tokenized training set's feature names and row count.
tokenized_train

Dataset({
    features: ['text', 'jutsus', 'text_cleaned', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 2214
})