# Machine Learning HW2

## Preprocessing the Dataset

In [325]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# read csv
dataset = pd.read_csv('./train.csv')
# print(len(dataset))
data_x = dataset[['age_of_car', 'make', 'population_density']]
data_y = dataset['is_claim']

# divide_dataset
train_x, test_x, train_y, test_y = train_test_split(data_x, data_y, test_size = 0.2, random_state = 587)
train_x = train_x.values.reshape(-1, len(data_x.columns))
test_x = test_x.values.reshape(-1, len(data_x.columns))

## Question 1-i



In [323]:
class NaiveBayes:
    def fit(self, X, y):
        self.n_samples, self.n_features = X.shape
        self.n_classes = len(np.unique(y))

        self.mean = np.zeros((self.n_classes, self.n_features))
        self.variance = np.zeros((self.n_classes, self.n_features))
        self.priors = np.zeros(self.n_classes)

        for c in range(self.n_classes):
            X_c = X[y == c]

            self.mean[c, :] = np.mean(X_c, axis=0)
            self.variance[c, :] = np.var(X_c, axis=0)
            self.priors[c] = X_c.shape[0] / self.n_samples

    def predict(self, X):
        y_hat = [self.get_class_probability(x) for x in X]
        return np.array(y_hat)

    def get_class_probability(self, x):
        posteriors = list()

        for c in range(self.n_classes):
            mean = self.mean[c]
            variance = self.variance[c]
            prior = np.log(self.priors[c])

            posterior = np.sum(np.log(self.gaussian_density(x, mean, variance)))
            posterior = prior + posterior
            posteriors.append(posterior)

        return np.argmax(posteriors)

    def gaussian_density(self, x, mean, var):
        const = 1 / np.sqrt(var * 2 * np.pi)
        proba = np.exp(-0.5 * ((x - mean) ** 2 / var))

        return const * proba
        
def get_accuracy(y_true, y_pred):
    accuracy = np.sum(y_true == y_pred) / len(y_true)
    return accuracy

In [326]:
nb = NaiveBayes()
nb.fit(train_x, train_y)
pred_y = nb.predict(test_x)

print('Accuracy_score = ', get_accuracy(pred_y, test_y)*100, '%')

Accuracy_score =  93.54893762266406 %


## Question 1-ii


In [314]:
from sklearn.tree import DecisionTreeClassifier

class RandomForest:
    def __init__(self, n_trees=10, max_depth=10, min_samples_split=2, n_feature=None):
        self.n_trees = n_trees
        self.max_depth=max_depth
        self.min_samples_split=min_samples_split
        self.n_features=n_feature
        self.trees = []

    def fit(self, X, y):
        self.trees = []
        for _ in range(self.n_trees):
            tree = DecisionTreeClassifier()
            X_sample, y_sample = self._bootstrap_samples(X, y)
            tree.fit(X_sample, y_sample)
            self.trees.append(tree)

    def _bootstrap_samples(self, X, y):
        n_samples = X.shape[0]
        idxs = np.random.choice(n_samples, n_samples, replace=True)
        return X[idxs], y[idxs]

    def _most_common_label(self, y):
        counter = Counter(y)
        most_common = counter.most_common(1)[0][0]
        return most_common

    def predict(self, X):
        predictions = np.array([tree.predict(X) for tree in self.trees])
        tree_preds = np.swapaxes(predictions, 0, 1)
        predictions = np.array([self._most_common_label(pred) for pred in tree_preds])
        return predictions

# model train and fit
model = RandomForest(n_trees=2)
train_y_1 = train_y.values.reshape(len(train_y),)
model.fit(train_x, train_y_1)
pred_y = model.predict(test_x)

print('Accuracy_score = ', get_accuracy(pred_y, test_y)*100, '%')

Accuracy_score =  89.92722178562886 %


## Question 1-iii


In [315]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


randomforest = RandomForestClassifier(n_estimators=100)
randomforest.fit(train_x, train_y)
pred_y = randomforest.predict(test_x)

print(f'Accuracy_score = {accuracy_score(pred_y, test_y)*100} %')

Accuracy_score = 92.33266135921004 %


## Question 1-iv
I choose to use XGboost


In [None]:
!pip install catboost

In [317]:
from xgboost import XGBClassifier

xgboostModel = XGBClassifier(n_estimators=100, learning_rate= 0.3)
xgboostModel.fit(train_x, train_y)
pred_y = xgboostModel.predict(test_x)

print(f'xgboost accuracy_score = {accuracy_score(pred_y, test_y)*100} %')

import lightgbm as lgb
from lightgbm import LGBMClassifier

classifier = lgb.LGBMClassifier(objective = 'binary', 
        learning_rate = 0.05, 
        n_estimators = 100, 
        random_state=0)
classifier.fit(train_x, train_y)
pred_y = classifier.predict(test_x)
print(f'lightgbm accuracy_score = {accuracy_score(pred_y, test_y)*100} %')

xgboost accuracy_score = 92.72047015409848 %
lightgbm accuracy_score = 93.60371749752281 %


## Question 2-i

In [318]:
from sklearn.model_selection import cross_val_score
# K = 3
data_x_2 = data_x.values.reshape(-1, len(data_x.columns))
data_y_2 = data_y.values.reshape((len(data_y)))
scores = cross_val_score(xgboostModel, data_x_2, data_y_2, cv=3, scoring='accuracy')
print(scores)
print(scores.mean())
# K = 5
scores = cross_val_score(xgboostModel, data_x_2, data_y_2, cv=5, scoring='accuracy')
print(scores)
print(scores.mean())
# K = 10
scores = cross_val_score(xgboostModel, data_x_2, data_y_2, cv=10, scoring='accuracy')
print(scores)
print(scores.mean())

[0.93605038 0.93599918 0.9359959 ]
0.9360151553240786
[0.93600137 0.93600137 0.93608124 0.93608124 0.9359959 ]
0.9360322238823928
[0.93600683 0.93600683 0.93616658 0.93616658 0.9359959  0.9359959
 0.9359959  0.9359959  0.93582523 0.9359959 ]
0.9360151559369878


## Question 2-ii

In [319]:
import time
# xgboosting time compute
print('xgboosting:')
for i in [3, 5, 10]:
  start_time = time.time()
  scores = cross_val_score(xgboostModel, data_x_2, data_y_2, cv=i, scoring='accuracy')
  print(f'mean_score = {scores.mean()}')
  end_time = time.time()
  print(f'run_time : {end_time - start_time} sec')
  print('----------------------')
# forest time compute
print('RandomForest:')
for i in [3, 5, 10]:
  start_time = time.time()
  scores = cross_val_score(randomforest, data_x_2, data_y_2, cv=i, scoring='accuracy')
  print(f'mean_score = {scores.mean()}')
  end_time = time.time()
  print(f'run_time : {end_time - start_time} sec')
  print('----------------------')  

xgboosting:
mean_score = 0.9360151553240786
run_time : 4.724787950515747 sec
----------------------
mean_score = 0.9360322238823928
run_time : 26.38509488105774 sec
----------------------
mean_score = 0.9360151559369878
run_time : 15.970160484313965 sec
----------------------
RandomForest:
mean_score = 0.9352983269235513
run_time : 9.045294284820557 sec
----------------------
mean_score = 0.9353324719787783
run_time : 18.104450941085815 sec
----------------------
mean_score = 0.9352642065793008
run_time : 24.760231256484985 sec
----------------------


## Question 2-iii

In [320]:
# randomforest
scores = cross_val_score(randomforest, data_x_2, data_y_2, cv=5, scoring='accuracy')
print(scores)
print(scores.mean())
# xgboost
scores = cross_val_score(xgboostModel, data_x_2, data_y_2, cv=5, scoring='accuracy')
print(scores)
print(scores.mean())

scores = cross_val_score(classifier, data_x_2, data_y_2, cv=5, scoring='accuracy')
print(scores)
print(scores.mean())


[0.93489206 0.93531871 0.93582523 0.93565455 0.93548387]
0.9354348829020509
[0.93600137 0.93600137 0.93608124 0.93608124 0.9359959 ]
0.9360322238823928
[0.93600137 0.93600137 0.93608124 0.93608124 0.9359959 ]
0.9360322238823928
