# Praktikum 2: Masalah Regresi dengan SVM     
## NRP:2372061     
## Nama: Laura

Mari kita import library-library yang dibutuhkan.

In [1]:
# DO NOT CHANGE THE CODE BELOW:
import pandas as pd
import numpy as np

from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer

from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.compose import ColumnTransformer

from sklearn.metrics.pairwise import rbf_kernel

Mari kita read dataset housing, yaitu `housing_dataset.csv`

In [4]:
# Load the housing feature table from a CSV file given by relative path.
housing = pd.read_csv(filepath_or_buffer="housing_dataset.csv")

Mari kita read juga label atau kelasnya, yaitu `housing_labels.csv`

In [5]:
# Load the target values, which are stored in their own CSV file (relative path).
housing_labels = pd.read_csv(filepath_or_buffer="housing_labels.csv")

Kita ubah bentuk `housing_labels` dari DataFrame menjadi Series, yaitu dari bentuk `(16512, 1)` menjadi `(16512,)`.

In [6]:
# DO NOT CHANGE THE CODE
# Reduce the labels DataFrame to the "median_house_value" Series.
# The isinstance guard keeps this cell idempotent: on a re-run, housing_labels
# is already a Series, and indexing it with the column name again would raise
# a KeyError.
if isinstance(housing_labels, pd.DataFrame):
    housing_labels = housing_labels["median_house_value"]

Mari kita buat _pipeline_ untuk memproses kolom yang bertipe _categorical_.

In [7]:
# DO NOT CHANGE THE CODE
# Categorical branch: fill missing values with the most frequent category, then
# one-hot encode. The step names are spelled out and match the lowercase class
# names that make_pipeline would generate automatically.
cat_pipeline = Pipeline(
    steps=[
        ("simpleimputer", SimpleImputer(strategy="most_frequent")),
        ("onehotencoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

Kita buat kelas `ClusterSimilarity` untuk memproses _latitude_ dan _longitude_.

In [8]:
# DO NOT CHANGE THE CODE
from sklearn.cluster import KMeans

class ClusterSimilarity(BaseEstimator, TransformerMixin):
    """Transformer that maps samples to RBF similarities with k-means centers.

    ``fit`` runs k-means on the input, and ``transform`` returns the RBF-kernel
    value between each sample and every fitted cluster center.
    """

    def __init__(self, n_clusters=10, gamma=1.0, random_state=None):
        # Only store the hyperparameters here; the KMeans estimator is built in fit().
        self.n_clusters = n_clusters
        self.gamma = gamma
        self.random_state = random_state

    def fit(self, X, y=None, sample_weight=None):
        """Fit k-means on ``X`` (``y`` is not used) and return ``self``."""
        kmeans_options = dict(n_init=10, random_state=self.random_state)
        self.kmeans_ = KMeans(self.n_clusters, **kmeans_options)
        self.kmeans_.fit(X, sample_weight=sample_weight)
        return self

    def transform(self, X):
        """Return the RBF kernel between ``X`` and the fitted cluster centers."""
        centers = self.kmeans_.cluster_centers_
        return rbf_kernel(X, centers, gamma=self.gamma)

    def get_feature_names_out(self, names=None):
        """Return one output name per cluster; ``names`` is ignored."""
        feature_names = []
        for cluster_index in range(self.n_clusters):
            feature_names.append(f"Cluster {cluster_index} similarity")
        return feature_names

Kita juga buat tiga fungsi berikut untuk memproses kolom-kolom yang dapat dijadikan fitur baru dengan operasi ratio.

In [9]:
# DO NOT CHANGE THE CODE
def column_ratio(X):
    """Divide the first column of ``X`` by its second column, element-wise.

    Both columns are selected with list indices (``[0]`` and ``[1]``), so the
    result keeps a second axis of length one instead of being flattened.
    """
    numerator = X[:, [0]]
    denominator = X[:, [1]]
    return numerator / denominator

def ratio_name(function_transformer, feature_names_in):
    """Return the output feature names for the ratio transformer.

    Both arguments are ignored; the result is always the single name "ratio".
    """
    feature_names_out = ["ratio"]
    return feature_names_out

def ratio_pipeline():
    """Build a new pipeline: median imputation, column ratio, standard scaling.

    A fresh pipeline object is constructed on every call.
    """
    imputer = SimpleImputer(strategy="median")
    ratio_step = FunctionTransformer(column_ratio, feature_names_out=ratio_name)
    scaler = StandardScaler()
    return make_pipeline(imputer, ratio_step, scaler)

Kita buat juga dua pipeline, yaitu `log_pipeline` untuk transformasi `log` dan `default_num_pipeline` untuk memproses kolom numerik secara default, serta `cluster_simil` untuk memproses _latitude_ dan _longitude_.

In [10]:
# DO NOT CHANGE THE CODE
# Log branch: median imputation, element-wise natural log, then standardization.
# "one-to-one" makes the output feature names equal to the input feature names.
log_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    FunctionTransformer(func=np.log, feature_names_out="one-to-one"),
    StandardScaler(),
)

# Default numeric branch: median imputation followed by standardization.
default_num_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler(),
)

# Geographic branch: similarity to 10 k-means cluster centers, with a fixed seed.
cluster_simil = ClusterSimilarity(n_clusters=10, gamma=1.0, random_state=42)

Terakhir, kita gabung semua pipeline dalam satu kelas `ColumnTransformer` yang dinamakan `preprocessing`.

In [11]:
# DO NOT CHANGE THE CODE
# Route each group of columns to its own transformer. Columns not listed in any
# entry go through default_num_pipeline (the original note says that is the
# single column housing_median_age).
preprocessing = ColumnTransformer(
    transformers=[
        ("bedrooms", ratio_pipeline(), ["total_bedrooms", "total_rooms"]),
        ("rooms_per_house", ratio_pipeline(), ["total_rooms", "households"]),
        ("people_per_house", ratio_pipeline(), ["population", "households"]),
        (
            "log",
            log_pipeline,
            [
                "total_bedrooms",
                "total_rooms",
                "population",
                "households",
                "median_income",
            ],
        ),
        ("geo", cluster_simil, ["latitude", "longitude"]),
        ("cat", cat_pipeline, make_column_selector(dtype_include=object)),
    ],
    remainder=default_num_pipeline,
)

Mari kita set `param_grid` untuk _hyperparameter tuning_ sebagai berikut:

In [12]:
# DO NOT CHANGE THE CODE
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

# Two search spaces for the pipeline step named "svr":
#   1. a linear kernel swept over C;
#   2. an RBF kernel swept over C and gamma.
param_grid = [
    {
        'svr__kernel': ['linear'],
        'svr__C': [10.0, 30.0, 100.0, 300.0, 1000.0, 3000.0, 10000.0, 30000.0],
    },
    {
        'svr__kernel': ['rbf'],
        'svr__C': [1.0, 3.0, 10.0, 30.0, 100.0, 300.0, 1000.0],
        'svr__gamma': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0],
    },
]

Selanjutnya, kita buat `Pipeline` dengan `preprocessing` dan `SVR()`:

In [13]:
# Chain the shared preprocessing with a default-configured SVR. The step name
# "svr" is what the "svr__" prefixes in param_grid refer to.
svr_pipeline = Pipeline(
    steps=[
        ("preprocessing", preprocessing),
        ("svr", SVR()),
    ]
)

Selanjutnya, kita gunakan `GridSearchCV` dengan `svr_pipeline`, `param_grid`, 3-fold cross-validation, dan `scoring` adalah `neg_root_mean_squared_error`.

In [14]:
# YOUR CODE IS HERE
# svr_pipeline is already built in the previous cell, so it is reused here
# rather than being constructed a second time with identical steps.
# Scoring is the negated RMSE, so best_score_ must be sign-flipped to be read
# as an RMSE.
grid_search = GridSearchCV(
    estimator=svr_pipeline,
    param_grid=param_grid,
    cv=3,
    scoring='neg_root_mean_squared_error',
)

Kita latih `grid_search` tersebut pada 6000 data pertama.

In [15]:
# YOUR CODE is HERE
# Run the search on the first 6000 rows of the features and of the labels.
grid_search.fit(housing.head(6000), housing_labels.head(6000))


Kita tampilkan score terbaik dari hasil `grid_search` berdasarkan RMSE.

In [16]:
# The search is scored with 'neg_root_mean_squared_error', so best_score_ is a
# negated RMSE; flipping the sign gives the RMSE itself.
svr_grid_search_rmse = -grid_search.best_score_
svr_grid_search_rmse  # last expression: displayed as the cell output

70754.47631870436

Kita tampilkan best hyperparameter dari hasil `grid_search` sbb:

In [17]:
# YOUR CODE is HERE
# Show the parameter combination that achieved the best cross-validated score.
grid_search.best_params_

{'svr__C': 10000.0, 'svr__kernel': 'linear'}

<h1>
    <center>The End</center>
</h1>

Praktikum 2 - Kecerdasan Mesin
2372061 - Laura Puspa Ameliana

1. Bagaimana (performance) kinerja dari prediktor SVR?
SVR berhasil mendapatkan RMSE terbaik sebesar 70754.47631870436 (hasil 3-fold cross-validation pada 6000 data pertama).

2. Seperti apakah setting hyperparameter yang terbaik yang diperoleh?
Berdasarkan output `{'svr__C': 10000.0, 'svr__kernel': 'linear'}`, maka hyperparameter terbaik adalah kernel `linear` dengan nilai `C = 10000.0`.
