# In [None]:
import pandas as pd
import os
import pickle
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline
from scipy.stats import randint


# Location of the extracted-feature table. The pickled artefacts written
# later in this script are placed in the same directory as this file.
csv_file_path = r'C:\Users\PC_ASUS\Prediksi Genre_final\feature ekstrasi finished.csv'
data = pd.read_csv(csv_file_path)

# Every column except 'label' is used as a feature; 'label' is the target.
X = data.drop(labels=['label'], axis=1)
y = data['label']


# Encode the target labels as integer class codes.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Persist the fitted encoder in the CSV's directory -- presumably so that
# predicted codes can be mapped back to the original labels at inference
# time (the consumer is not visible in this script).
output_dir = os.path.dirname(csv_file_path)
label_encoder_file_path = os.path.join(output_dir, 'label_encoder.pkl')
with open(label_encoder_file_path, mode='wb') as le_file:
    pickle.dump(label_encoder, le_file)


# Hold out 30 % of the samples for the final evaluation.
# Stratifying on the encoded labels keeps the class proportions of the train
# and test partitions in line with the full data set, matching the
# StratifiedKFold used for cross-validation. An unstratified split can leave
# minority classes under-represented (or entirely absent) in the test set,
# which makes the reported test accuracy unreliable.
# NOTE: stratification requires at least two samples per class.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.3,
    random_state=200,
    stratify=y_encoded,
)


# Scaling, oversampling and the classifier are chained in an imblearn
# Pipeline so that all three are fitted together inside each CV fold.
pipeline_steps = [
    ('scaler', StandardScaler()),
    ('oversampler', RandomOverSampler(random_state=200)),
    ('classifier', DecisionTreeClassifier(random_state=200)),
]
pipeline = Pipeline(steps=pipeline_steps)

# Search space for the decision tree. scipy's randint(low, high) draws
# integers from low up to, but not including, high.
param_dist = dict(
    classifier__criterion=['gini', 'entropy'],
    classifier__max_depth=randint(3, 13),         # 3 .. 12
    classifier__min_samples_split=randint(4, 7),  # 4 .. 6
    classifier__min_samples_leaf=randint(2, 4),   # 2 .. 3
)

# Shuffled, stratified 5-fold cross-validation with a fixed seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=200)

# Randomised search: 100 sampled configurations scored by accuracy,
# evaluated in parallel on all available cores.
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=100,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    verbose=1,
    random_state=200,
)
random_search.fit(X_train, y_train)

# With RandomizedSearchCV's default refit=True, best_estimator_ is the
# best-scoring pipeline refitted on the whole of X_train.
best_model = random_search.best_estimator_

# Store the fitted pipeline next to the source CSV.
model_file_path = os.path.join(os.path.dirname(csv_file_path), 'final_model.pkl')
with open(model_file_path, 'wb') as model_file:
    pickle.dump(best_model, model_file)

# The label encoder is intentionally not pickled again here: it was already
# written to label_encoder_file_path right after fitting and has not been
# modified since, so a second dump would only rewrite an identical file.


# Report the hyper-parameter combination selected by the search.
print("Hyperparameters terbaik:", random_search.best_params_)

# Accuracy on the data the model was fitted on (an optimistic figure).
y_pred_train = best_model.predict(X_train)
accuracy_train = accuracy_score(y_true=y_train, y_pred=y_pred_train)
print("Akurasi pada data training:", accuracy_train)

# Accuracy on the held-out test partition.
y_pred_test = best_model.predict(X_test)
accuracy_test = accuracy_score(y_true=y_test, y_pred=y_pred_test)
print("Akurasi pada data testing:", accuracy_test)