In [65]:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from datasets import Dataset

### Loading the Data

In [27]:
# Location of the jutsu JSON file, relative to the notebook's working directory.
data_path = "../data/jutsus.json"
# Read the JSON file into a DataFrame and preview the first rows.
df = pd.read_json(data_path)
df.head()


Unnamed: 0,jutsu_name,jutsu_type,jutsu_description
0,Antlion Ninja Arts: Ephemeral,"Ninjutsu, Kinjutsu",This article is about the anime-only kinjutsu ...
1,Ankorodō Hiden Ninjutsu: Mochi Release: Infini...,"Ninjutsu, Hiden",The user causes the dumplings near them to gro...
2,Apricot Blossom Droop,"Taijutsu, Ninjutsu",Kurenai attacks her opponent with a combinatio...
3,Annihilation,"Kekkei Genkai, Ninjutsu, Space–Time Ninjutsu, ...",Obito lands a series of punches and kicks to h...
4,Animal Path,"Kekkei Genkai, Ninjutsu, Dōjutsu",This article is about the ability. For other ...


Next, we create a function that simplifies each jutsu's type list to one of three classes: Ninjutsu, Genjutsu, or Taijutsu.

In [28]:
def simplify_jutsus(jutsu):
    """Collapse a raw jutsu-type value into one of three coarse classes.

    The labels are checked in a fixed priority order (Genjutsu, then
    Ninjutsu, then Taijutsu), so a value mentioning several of them maps to
    the first one in that order.

    Args:
        jutsu: The raw type value, e.g. ``"Taijutsu, Ninjutsu"``.

    Returns:
        ``"Genjutsu"``, ``"Ninjutsu"`` or ``"Taijutsu"``; ``None`` when none
        of the three labels is present or when ``jutsu`` does not support
        membership tests (e.g. ``None`` or ``NaN`` for a missing value).
    """
    priority = ("Genjutsu", "Ninjutsu", "Taijutsu")
    try:
        return next((label for label in priority if label in jutsu), None)
    except TypeError:
        # Missing values (None / float NaN) cannot be searched with `in`;
        # treat them as "no recognised class" instead of crashing `.apply`.
        return None

In [29]:
# Derive the simplified class for every row; rows matching none of the three labels get None.
df['jutsu_type_simplified'] = df['jutsu_type'].apply(simplify_jutsus)

In [31]:
# Preview the first rows, now including the jutsu_type_simplified column.
df.head()

Unnamed: 0,jutsu_name,jutsu_type,jutsu_description,jutsu_type_simplified
0,Antlion Ninja Arts: Ephemeral,"Ninjutsu, Kinjutsu",This article is about the anime-only kinjutsu ...,Ninjutsu
1,Ankorodō Hiden Ninjutsu: Mochi Release: Infini...,"Ninjutsu, Hiden",The user causes the dumplings near them to gro...,Ninjutsu
2,Apricot Blossom Droop,"Taijutsu, Ninjutsu",Kurenai attacks her opponent with a combinatio...,Ninjutsu
3,Annihilation,"Kekkei Genkai, Ninjutsu, Space–Time Ninjutsu, ...",Obito lands a series of punches and kicks to h...,Ninjutsu
4,Animal Path,"Kekkei Genkai, Ninjutsu, Dōjutsu",This article is about the ability. For other ...,Ninjutsu


In [32]:
# Count how many rows fall into each simplified class to check the class balance.
df['jutsu_type_simplified'].value_counts()

jutsu_type_simplified
Ninjutsu    2271
Taijutsu     398
Genjutsu     101
Name: count, dtype: int64

Here Ninjutsu has far more samples than the other two classes, which could skew the model's decisions toward the majority class in the dataset. We will deal with this by penalizing the loss function more heavily for the classes with fewer samples.

In [33]:
# Build the model input ("<name>. <description>") and the target label, keep
# only those two columns, and discard rows where either one is missing.
df = (
    df.assign(
        text=lambda frame: frame['jutsu_name'] + ". " + frame['jutsu_description'],
        jutsu=lambda frame: frame['jutsu_type_simplified'],
    )[['text', 'jutsu']]
    .dropna()
)
df.head()


Unnamed: 0,text,jutsu
0,Antlion Ninja Arts: Ephemeral. This article is...,Ninjutsu
1,Ankorodō Hiden Ninjutsu: Mochi Release: Infini...,Ninjutsu
2,Apricot Blossom Droop. Kurenai attacks her opp...,Ninjutsu
3,Annihilation. Obito lands a series of punches ...,Ninjutsu
4,Animal Path. This article is about the ability...,Ninjutsu


Here I create a general-purpose helper class to clean our text and remove any noise (such as HTML markup) from the JSON file. It could be reused in any other project as well.

In [45]:
from bs4 import BeautifulSoup
class cleaner():
    """Helper that strips HTML markup from a piece of text.

    The main entry point is ``clean_text``; the other methods are the
    individual steps it applies in order.
    """

    def __init__(self):
        pass

    def put_line_breaks(self, text):
        """Insert a newline after every closing paragraph tag.

        This keeps paragraphs on separate lines once the tags themselves are
        removed by ``remove_html_tag``.
        """
        # The previous pattern used a backslash instead of a forward slash,
        # which is an invalid escape sequence (SyntaxWarning) and never
        # matched a real closing paragraph tag.
        return text.replace("</p>", "</p>\n")

    def remove_html_tag(self, text):
        """Return ``text`` with all HTML tags removed, parsed by BeautifulSoup."""
        clean_text = BeautifulSoup(text, "html.parser").get_text()
        return clean_text

    # Optional step, currently disabled:
    # def remove_special_characters(self, text):
    #     return text.replace("\n", " ").replace("\r", "")

    def clean_text(self, text):
        """Run the full cleaning pipeline on ``text``.

        Steps: add line breaks after paragraphs, strip HTML tags, then trim
        leading and trailing whitespace.
        """
        text = self.put_line_breaks(text)
        text = self.remove_html_tag(text)
        # text = self.remove_special_characters(text)
        text = text.strip()
        return text

  return text.replace("<\p>", "<\p>\n")
  return text.replace("<\p>", "<\p>\n")


In [52]:
# Name of the DataFrame column that holds the raw input text.
text_column_name = 'text'
# Name of the DataFrame column that holds the target class.
# NOTE(review): the identifier is misspelled ("columdn"); it is left as-is
# because later cells reference it by this exact name.
label_columdn_name = 'jutsu'


In [46]:
# Cleaning the text
# Bind the instance to its own name: rebinding `cleaner` would shadow the
# class, and re-running this cell would then fail because an instance is not callable.
text_cleaner = cleaner()
df['text_cleaned'] = df[text_column_name].apply(text_cleaner.clean_text)

In [47]:
# Preview the first rows, now including the text_cleaned column.
df.head()

Unnamed: 0,text,jutsu,text_cleaned
0,Antlion Ninja Arts: Ephemeral. This article is...,Ninjutsu,Antlion Ninja Arts: Ephemeral. This article is...
1,Ankorodō Hiden Ninjutsu: Mochi Release: Infini...,Ninjutsu,Ankorodō Hiden Ninjutsu: Mochi Release: Infini...
2,Apricot Blossom Droop. Kurenai attacks her opp...,Ninjutsu,Apricot Blossom Droop. Kurenai attacks her opp...
3,Annihilation. Obito lands a series of punches ...,Ninjutsu,Annihilation. Obito lands a series of punches ...
4,Animal Path. This article is about the ability...,Ninjutsu,Animal Path. This article is about the ability...


Encoding the labels and tokenizing the input text so the model can consume them.

In [53]:
# Encode labels: fit a LabelEncoder on the class names found in the label column.
le = preprocessing.LabelEncoder()
le.fit(df[label_columdn_name].tolist())

In [54]:
# Map each integer class id to its label name. Use the encoder's public
# `classes_` attribute rather than reaching into its `__dict__`.
label_dict = dict(enumerate(le.classes_.tolist()))
label_dict

{0: 'Genjutsu', 1: 'Ninjutsu', 2: 'Taijutsu'}

In [55]:
# Add the integer-encoded class as a `label` column, using the fitted encoder.
df['label'] = le.transform(df[label_columdn_name].tolist())
df.head()

Unnamed: 0,text,jutsu,text_cleaned,label
0,Antlion Ninja Arts: Ephemeral. This article is...,Ninjutsu,Antlion Ninja Arts: Ephemeral. This article is...,1
1,Ankorodō Hiden Ninjutsu: Mochi Release: Infini...,Ninjutsu,Ankorodō Hiden Ninjutsu: Mochi Release: Infini...,1
2,Apricot Blossom Droop. Kurenai attacks her opp...,Ninjutsu,Apricot Blossom Droop. Kurenai attacks her opp...,1
3,Annihilation. Obito lands a series of punches ...,Ninjutsu,Annihilation. Obito lands a series of punches ...,1
4,Animal Path. This article is about the ability...,Ninjutsu,Animal Path. This article is about the ability...,1


In [59]:
# Hold out 20% of the rows for testing. Stratifying on the label column keeps
# the class proportions similar in both splits; the fixed random_state makes
# the split reproducible.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42, stratify=df[label_columdn_name])
# Show the per-class counts of each split to confirm the stratification.
print(df_train['jutsu'].value_counts())
print(df_test['jutsu'].value_counts())

jutsu
Ninjutsu    1817
Taijutsu     318
Genjutsu      81
Name: count, dtype: int64
jutsu
Ninjutsu    454
Taijutsu     80
Genjutsu     20
Name: count, dtype: int64


In [61]:
# Identifier of the pretrained checkpoint; passed to AutoTokenizer.from_pretrained.
model_name = "distilbert/distilbert-base-uncased"

In [63]:
# Load the tokenizer associated with the checkpoint named by model_name.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [64]:
def preprocess_funtion(tokenizer, example):
    """Tokenize the cleaned text of ``example`` with ``tokenizer``.

    Args:
        tokenizer: Callable invoked with the text plus the tokenization
            options listed below.
        example: Mapping with a ``'text_cleaned'`` entry holding the text
            to tokenize.

    Returns:
        Whatever ``tokenizer`` returns for that text and those options.
    """
    # Truncate and pad to a fixed length of 512, requesting "pt" tensors.
    tokenizer_options = {
        "truncation": True,
        "padding": 'max_length',
        "max_length": 512,
        "return_tensors": "pt",
    }
    cleaned_text = example['text_cleaned']
    return tokenizer(cleaned_text, **tokenizer_options)

In [66]:
#Convert Pandas to Hugging face dataset
train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)

#Tokenize the datatset
tokenizer_train = train_dataset.map(lambda examples: preprocess_funtion(tokenizer, examples), batched=True)
tokenizer_test = test_dataset.map(lambda examples: preprocess_funtion(tokenizer, examples), batched=True)

Map: 100%|██████████| 2216/2216 [00:00<00:00, 5162.37 examples/s]
Map: 100%|██████████| 554/554 [00:00<00:00, 5972.60 examples/s]
