<a href="https://colab.research.google.com/github/ilham-mukti/Machine-Learning/blob/main/0_Workflow_Machine_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install jcopml

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import confusion_matrix

from sklearn.svm import SVR, SVC
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.neighbors import KNeighborsRegressor

from skopt import BayesSearchCV
from xgboost import XGBRegressor, XGBClassifier

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from jcopml.pipeline import num_pipe, cat_pipe
from jcopml.tuning import grid_search_params as gsp, random_search_params as rsp
from jcopml.feature_importance import mean_score_decrease, mean_loss_decrease
from jcopml.plot import plot_missing_value, plot_confusion_matrix, plot_correlation_matrix
from jcopml.tuning.space import Integer, Real

# Apply seaborn's default theme so the plots drawn later in the notebook share one style.
sns.set()

In [None]:
# 1. Prepare the raw features X and target y. Do NOT impute or scale them
#    beforehand: those steps are fitted inside the pipeline so they only ever
#    learn from the training portion.

# Hold out 20% of the rows for testing, keeping the class proportions of y
# identical in both parts; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
X_train.shape, X_test.shape

####

# Numeric columns: fill missing values with the column mean, then rescale
# every feature to the [0, 1] range.
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler())
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode.
# handle_unknown='ignore' encodes a category that was not seen during fit as
# an all-zero row instead of raising. Without it, a rare category that is
# missing from one CV training fold (or from the training split) makes
# scoring / prediction crash.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Route each group of columns (selected by name) to its own sub-pipeline.
preprocessor = ColumnTransformer([
    ('cat_data', categorical_pipeline, ['Location', 'Fuel_Type', 'Transmission', 'Owner_Type']),
    ('num_data', numerical_pipeline, ['Year', 'Kilometers_Driven', 'Mileage_kmpl', 'Engine_CC', 'Power_bhp', 'Seats'])
])

# Full model: preprocessing followed by the estimator. The step names
# ('prep', 'algo') are the prefixes used for hyper-parameter keys.
pipeline = Pipeline([
    ('prep', preprocessor),
    ('algo', SVC())
])

####

# Hyper-parameter grid for the 'algo' step of the pipeline
# (keys follow the '<step name>__<parameter>' convention).
# NOTE: 'algo__random_state' was removed from the grid. SVC only uses
# random_state when probability=True, so searching over it just fitted every
# candidate twice with identical results.
param = {
    'algo__kernel': ['poly', 'rbf'],
    'algo__C': [7.00],
    'algo__gamma': ['scale', 'auto']
}

# Exhaustive search over the grid with 3-fold cross-validation; after fitting,
# `model` behaves like the best pipeline refitted on all of X_train.
model = GridSearchCV(pipeline, param_grid=param, cv=3)
model.fit(X_train, y_train)
model.best_params_

### Score
# Compare the tuned model's score on the training data vs. the held-out test
# data; a large gap between the two suggests overfitting.
model.score(X_train, y_train), model.score(X_test, y_test)


### Evaluation
# X_test is used directly: it is already a slice of X carrying X's column
# names, so re-wrapping it in a new DataFrame is unnecessary.
y_predict = model.predict(X_test)

# Rows of the confusion matrix are the true classes, columns the predicted ones.
cm = confusion_matrix(y_test, y_predict)
# fmt='d' prints the cell counts as plain integers; the default '.2g' format
# would render counts >= 100 in scientific notation (e.g. 1.2e+02).
sns.heatmap(cm, annot=True, fmt='d')


### Predict
# Hand-written sample rows to score with the tuned model.
# NOTE(review): each row holds only 2 values, while the preprocessor in this
# cell selects 10 named columns. pd.DataFrame(..., columns=X.columns) raises
# unless X has exactly 2 columns, so this looks like a leftover from another
# dataset -- supply one value per column of X before running.
data = ([16, 1],
       [47, 1])

# Label the rows and give them the same column names as the training data so
# the ColumnTransformer can select columns by name.
coba = pd.DataFrame(data, index=['coba1', 'coba2'], columns=X.columns)
y_predict = model.predict(coba)
y_predict


In [None]:
# Same preprocessing idea, built with jcopml's shortcut factories instead of
# hand-assembled sklearn Pipelines.
preprocessor = ColumnTransformer(
    transformers=[
        (
            'cat_data',
            cat_pipe(impute='most_frequent', encoder='onehot'),
            ['Location', 'Fuel_Type', 'Transmission', 'Owner_Type'],
        ),
        (
            'num_data',
            num_pipe(impute='median', scaling='minmax', transform='yeo-johnson', poly=2),
            ['Year', 'Kilometers_Driven', 'Mileage_kmpl', 'Engine_CC', 'Power_bhp', 'Seats'],
        ),
    ]
)