In [27]:
import numpy as np
import pandas as pd
import nltk
import json

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from luwiji.text_proc import illustration, demo

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation

nltk.download("stopwords")

sw_indo = stopwords.words("indonesian") + list(punctuation)

[nltk_data] Downloading package stopwords to /Users/izzal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Import Data

In [28]:
with open('../assets/intents.json') as file:
    data = json.load(file)

# Ekstrak kolom "tag" dan "patterns" dari JSON bersarang
selected_data = []

for intent in data["intents"]:
    tag = intent["tag"]
    patterns = intent["patterns"]
    
    for pattern in patterns:
        selected_data.append({"tag": tag, "patterns": pattern})

# Buat DataFrame dari data yang diambil
df = pd.DataFrame(selected_data)
df.head()

Unnamed: 0,tag,patterns
0,notfound,
1,notfound,
2,sapaan,Halo
3,sapaan,Hai
4,sapaan,Selamat pagi


### Dataset Splitting

In [35]:
X = df.patterns
y = df.tag

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.23, stratify=y, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((268,), (81,), (268,), (81,))

In [36]:
df.groupby('tag')['patterns'].count().reset_index()

Unnamed: 0,tag,patterns
0,EyeU,5
1,alergi mata,4
2,alkohol,3
3,anak-anak katarak,4
4,aplikasi,4
...,...,...
76,tidak,10
77,tidur,3
78,usia,3
79,vitamin,4


### Training

In [45]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import RandomizedSearchCV
from jcopml.tuning import random_search_params as rsp


In [46]:
pipeline = Pipeline([
    ('prep', TfidfVectorizer(tokenizer=word_tokenize, stop_words=sw_indo)),
    ('algo', LogisticRegression(solver='lbfgs',n_jobs=-1, random_state=42)),
])

# params = {
#     "algo__fit_intercept": [True, False],
#     "algo__C": Real(-3, 3, prior='log-uniform')
# }

model_logreg = RandomizedSearchCV(pipeline, rsp.logreg_params, cv=3, n_iter=50, n_jobs=-1, verbose=1)
model_logreg.fit(X_train, y_train)

print(model_logreg.best_params_)
print(model_logreg.score(X_train, y_train), model_logreg.best_score_, model_logreg.score(X_test,y_test))

Fitting 3 folds for each of 50 candidates, totalling 150 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

{'algo__C': 14.167564636160838, 'algo__fit_intercept': True}
0.917910447761194 0.6977944236371202 0.7037037037037037


In [48]:
model_logreg.predict(["Gejala retinopati diabetik"])

array(['retinopati_diabetik'], dtype=object)