In [86]:
import numpy as np
import pandas as pd
import nltk
import json

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from luwiji.text_proc import illustration, demo

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation

import warnings

# Fetch the NLTK stop-word corpus (skipped by NLTK when already up to date)
nltk.download("stopwords")

# Tokens to drop during vectorization: Indonesian stop words, every ASCII
# punctuation character, and the "``" token
sw_indo = [*stopwords.words("indonesian"), *punctuation, "``"]

# Suppress UserWarning messages raised from sklearn modules
warnings.filterwarnings(
    "ignore",
    category=UserWarning,
    module="sklearn",
)


[nltk_data] Downloading package stopwords to /Users/izzal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Import Data

In [87]:
# Load the intents definition. The encoding is passed explicitly so the text
# decodes the same way on every platform instead of following the locale
# default (assumes the JSON file is UTF-8 encoded — TODO confirm).
with open("../assets/intents.json", encoding="utf-8") as intents_file:
    data = json.load(intents_file)

# Flatten the nested JSON: one {"tag", "patterns"} record per pattern string
selected_data = [
    {"tag": intent["tag"], "patterns": pattern}
    for intent in data["intents"]
    for pattern in intent["patterns"]
]

# Build the DataFrame from the flattened records
df = pd.DataFrame(selected_data)
df.head()

Unnamed: 0,tag,patterns
0,sapaan,Halo
1,sapaan,Hai
2,sapaan,Selamat pagi
3,sapaan,Selamat siang
4,sapaan,Selamat sore


### Dataset Splitting

In [88]:
# Features are the pattern strings; labels are their intent tags
X = df["patterns"]
y = df["tag"]

# Stratified hold-out split (23% test) with a fixed seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.23,
    stratify=y,
    random_state=42,
)

# Show the shape of each resulting split
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((267,), (80,), (267,), (80,))

In [89]:
# Number of non-null patterns available for each tag, shown as a two-column table
patterns_per_tag = df.groupby("tag")["patterns"].count()
patterns_per_tag.reset_index()

Unnamed: 0,tag,patterns
0,EyeU,5
1,alergi mata,4
2,alkohol,3
3,anak-anak katarak,4
4,aplikasi,4
...,...,...
75,tidak,10
76,tidur,3
77,usia,3
78,vitamin,4


### Training

In [90]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from jcopml.tuning import random_search_params as rsp


#### KNN Classifier

In [98]:
# Two-step pipeline: TF-IDF features ("prep") feeding a k-nearest-neighbours
# classifier ("algo"). The step names are what the grid keys below refer to.
pipeline = Pipeline(
    steps=[
        (
            "prep",
            TfidfVectorizer(
                tokenizer=word_tokenize,
                stop_words=sw_indo,
                ngram_range=(1, 3),
            ),
        ),
        ("algo", KNeighborsClassifier()),
    ]
)

# Search space; the grid's ngram_range values replace the one set on the
# vectorizer above for each candidate.
params = dict(
    prep__ngram_range=[(1, 1), (1, 2), (1, 3)],
    algo__n_neighbors=range(1, 11, 2),
    algo__weights=["distance"],
    algo__metric=["manhattan", "euclidean"],
)

# Exhaustive search with 3-fold cross-validation on all available cores
model_knn = GridSearchCV(
    estimator=pipeline,
    param_grid=params,
    cv=3,
    n_jobs=-1,
    verbose=0,
)
model_knn.fit(X_train, y_train)

print(model_knn.best_params_)

# Report train score, best cross-validation score and held-out test score
train_score = model_knn.score(X_train, y_train)
cv_score = model_knn.best_score_
test_score = model_knn.score(X_test, y_test)
print(train_score, cv_score, test_score)



{'algo__metric': 'euclidean', 'algo__n_neighbors': 1, 'algo__weights': 'distance', 'prep__ngram_range': (1, 1)}
0.9138576779026217 0.6666666666666666 0.725


In [96]:
# Quick sanity check on a single query.
# NOTE(review): the execution counts suggest this cell was last run before the
# training cell above was re-run; re-execute to refresh the displayed output.
sample_queries = ["apa itu rabun jauh"]
model_knn.predict(sample_queries)

array(['sapaan'], dtype=object)

In [102]:
import pickle
from pathlib import Path

# Destination for the fitted grid-search object
model_path = Path("../assets/model/knn.pkl")

# Create the target folder if it is missing so saving does not fail with
# FileNotFoundError on a fresh checkout
model_path.parent.mkdir(parents=True, exist_ok=True)

# The context manager closes the file even if pickling raises
with model_path.open("wb") as model_file:
    pickle.dump(model_knn, model_file)