In [1]:
import numpy as np
import pandas as pd
RANDOM_STATE = 11
pd.set_option('display.max_columns', 0)

In [2]:
train_X = pd.read_csv("../output/train_X.csv")
train_y = pd.read_csv("../output/train_y.csv").squeeze()

test_X = pd.read_csv("../output/test_X.csv")
test_y = pd.read_csv("../output/test_y.csv").squeeze()

### validation data

In [3]:
from toolbox import train_test_split

In [4]:
train_X, val_X, train_y, val_y = train_test_split(train_X, train_y, train_size=0.8, random_state=RANDOM_STATE)
print('{:<15} :'.format('train_X shape'), train_X.shape)
print('{:<15} :'.format('train_y shape'), train_y.shape)
print('-'*20)
print('{:<15} :'.format('val_X shape'), val_X.shape)
print('{:<15} :'.format('val_y shape'), val_y.shape)
print('-'*20)
print('{:<15} :'.format('test_X shape'), test_X.shape)
print('{:<15} :'.format('test_y shape'), test_y.shape)

train_X shape   : (37499, 55)
train_y shape   : (37499,)
--------------------
val_X shape     : (9375, 55)
val_y shape     : (9375,)
--------------------
test_X shape    : (11718, 55)
test_y shape    : (11718,)


In [5]:
from toolbox import evaluate

# Classification Task

In [6]:
# cat_cols = ["airbags", "is_esc", "is_adjustable_steering", "is_tpms",
#             "is_parking_sensors", "is_parking_camera", "is_front_fog_lights",
#             "is_rear_window_wiper", "is_rear_window_washer", "is_rear_window_defogger", "is_brake_assist", ""]
# num_cols = ["policy_tenure", "age_of_car", "age_of_policyholder", "area_cluster",
#             "population_density", "model", "max_torque", "max_power",
#             "engine_type", "displacement", "gear_box", "displacement",
#             "turning_radius", "length", "width", "height",
#             "gross_weight", ]

### Naive Bayes
<pre>
若是類別資料，計算兩種狀況各自的條件機率
若是數值資料，透過 guassian MLE 來計算可能性
</pre>

In [7]:
class NaiveBayesClassifier:
    def __init__(self):
        pass

    def _divide_cat_num_cols(self):
        num_cols = []
        cat_cols = []
        for col in self.X.columns:
            if len(self.X[col].unique()) > 5:
                num_cols.append(col)
            else:
                cat_cols.append(col)
        return num_cols, cat_cols

    def _likelihood(self, x, mean, var):
        eps = 1e-4
        # print((1 / np.sqrt(2 * np.pi * var + eps)) * np.exp( -1 * (x-mean)**2 / (2 * var + eps)))
        return (1 / np.sqrt(2 * np.pi * var + eps)) * np.exp( -1 * (x-mean)**2 / (2 * var + eps))

    def fit(self, X, y):
        self.X = X
        self.y = y
        self.num_cols, self.cat_cols = self._divide_cat_num_cols()
        self.y_classes = y.unique()
        self.parameter = dict()

        for c in self.y_classes:
            mask = (y == c)
            c_X = X[mask]
            self.parameter[c] = dict()
            for col in self.X.columns:
                if col in self.num_cols:
                    # 數值特徵紀錄 mean, var
                    self.parameter[c][col] = [
                        round(c_X[col].mean(), 2),
                        round(c_X[col].var(), 2)
                    ]
                elif col in self.cat_cols:
                    # 類別特徵直接計算各類別的機率
                    self.parameter[c][col] = dict()
                    for cat in c_X[col].unique():
                        mask = (c_X[col] == cat)
                        self.parameter[c][col][cat] = round(len(c_X.loc[mask] ) / len(c_X), 2)
        return self.parameter
    def _calc_prob(self, X):
        all_prob = []
        for c in self.y_classes:
            c_prob = 1
            for key, value in X.items():
                if key in self.num_cols:
                    mean, var = self.parameter[c][key]
                    c_prob *= self._likelihood(value, mean, var)
                elif key in self.cat_cols:
                    c_prob *= self.parameter[c][key][value]
            all_prob.append(c_prob)
        return all_prob

    def predict(self, test_X):
        pred_y = np.array([self._calc_prob(X) for i, X in test_X.iterrows()])
        return pred_y.argmax(axis=1)

    def evaluate(self):
        pass
    

nb_clf = NaiveBayesClassifier()
nb_clf.fit(train_X, train_y)
pred_y = nb_clf.predict(test_X)
evaluate(test_y, pred_y)

Accuracy        : 0.46944871138419525
Precision       : 0.06671936758893281
Recall          : 0.5733695652173914
F1              : 0.11952981164141055
              precision    recall  f1-score   support

           0       0.94      0.46      0.62     10982
           1       0.07      0.57      0.12       736

    accuracy                           0.47     11718
   macro avg       0.50      0.52      0.37     11718
weighted avg       0.89      0.47      0.59     11718



### Random Forest Classifier

In [8]:
class Node:
    '''
        特徵：data, 剩餘 features, left_child, right_child
    '''
    @staticmethod
    def _calc_gini(y):
        total_len = len(y)
        if total_len == 0:
            return 0
        pos_len = len(y[y==1])
        neg_len = total_len - pos_len
        return 1 - (pos_len / total_len)**2 - (neg_len / total_len)**2

    def __init__(self, X, y, feat_candidate):
        self.X = X
        self.y = y
        self.feat_candidate = feat_candidate
        self.gini = self._calc_gini(y)
        self.l_child = None
        self.r_child = None

class DecisionTreeClassifier:
    '''
        * 二元類別：
        * 多元類別：
        * 數值特徵：sort -> 中位數切分
    '''
    @staticmethod
    def _calc_gini(y):
        total_len = len(y)
        pos_len = len(y[y==1])
        neg_len = total_len - pos_len
        return 1 - (pos_len / total_len)**2 - (neg_len / total_len)**2

    def __init__(self):
        self.min_num = 15
        pass

    def choose_best_feat(self, node):
        best_feat = node.feat_candidate[0]
        best_gini_gain = 0
        best_mask = None
        feat_del = []
        for feat in node.feat_candidate:
            uniq = node.X[feat].unique()
            if len(uniq) == 1:
                # 類別，單獨值，此類別無用處
                feat_del.append(feat)
                continue
            elif len(uniq) == 2:
                # 類別
                mask = (node.X[feat] == uniq[0])
            else:
                # 數值
                median = np.median(node.X[feat].to_numpy())
                mask = (node.X[feat] <= median)

            # print('---choose feature')
            gini_gain = (node.gini
                - len(node.X[mask]) / len(node.X) * self._calc_gini(node.y[mask])
                - len(node.X[~mask]) / len(node.X) * self._calc_gini(node.y[~mask]))
            if gini_gain > best_gini_gain:
                best_gini_gain = gini_gain
                best_feat = feat
                best_mask = mask
        return best_feat, best_mask, feat_del

    def divide_branch(self, node):
        if node.gini == 0 or len(node.feat_candidate) == 0 or len(node.X) < self.min_num:
            # 乾淨 node / 沒特徵可用 / 小於最小數量
            return None
        
        best_feat, best_mask, feat_del = self.choose_best_feat(node)
        node.feat_candidate = [feat for feat in node.feat_candidate if feat not in feat_del]

        # print('--new branch')
        
        
        if best_mask is not None:
            # 左邊分支確認有找到
            node.l_child = Node(
                node.X[best_mask], node.y[best_mask],
                feat_candidate=[feat for feat in node.feat_candidate if feat != best_feat])
            # print('yes')
            self.divide_branch(node.l_child)
            

        if len(node.y[~best_mask]) != 0:
            # 右邊分支先確認數量
            node.r_child = Node(
                node.X[~best_mask], node.y[~best_mask],
                feat_candidate=[feat for feat in node.feat_candidate if feat != best_feat])
            self.divide_branch(node.r_child)
            
    def fit(self, X, y):
        self.root = Node(X, y, list(X.columns))
        self.divide_branch(self.root)

    def predict(self, X):
        pass

dt_clf = DecisionTreeClassifier()
# dt_clf.fit(train_X, train_y)

In [9]:
class RandomForestClassifier:
    def __init__(self):
        pass

    def fit(self):
        pass

    def evaluate(self):
        pass
    def predict(self):
        pass

### Random Forest Classifier: sklearn version

In [10]:
from sklearn.ensemble import RandomForestClassifier

In [11]:
rf_clf = RandomForestClassifier(
    n_estimators=100, max_depth=5,
    class_weight='balanced' ,random_state=RANDOM_STATE)

# rf_clf = RandomForestClassifier(
#     n_estimators=100, max_depth=2, 
#     class_weight={0:1, 1:17} ,random_state=RANDOM_STATE)
rf_clf.fit(train_X, train_y)
pred_y = rf_clf.predict(test_X[train_X.columns])
evaluate(test_y, pred_y)

Accuracy        : 0.5640040962621607
Precision       : 0.09199477514461653
Recall          : 0.6698369565217391
F1              : 0.16177194421657096
              precision    recall  f1-score   support

           0       0.96      0.56      0.71     10982
           1       0.09      0.67      0.16       736

    accuracy                           0.56     11718
   macro avg       0.53      0.61      0.43     11718
weighted avg       0.91      0.56      0.67     11718



### XGBoost, Catboost, LightGBM

In [12]:
# from xgboost
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
lgb_clf = LGBMClassifier(class_weight="balanced")
xgb_clf = XGBClassifier(scale_pos_weight=99)
cat_clf = CatBoostClassifier(auto_class_weights='Balanced')

In [13]:
# lgb_clf.fit(train_X, train_y)
# pred_y = lgb_clf.predict(test_X)
# evaluate(test_y, pred_y)

In [14]:
# xgb_clf.fit(train_X, train_y)
# pred_y = xgb_clf.predict(test_X[train_X.columns])
# evaluate(test_y, pred_y)

In [15]:
# cat_clf.fit(train_X, train_y, verbose=0)
# pred_y = cat_clf.predict(test_X)
# evaluate(test_y, pred_y)

# Cross-Validation
<pre>
k = 3, 5, 10
</pre>

In [16]:
from toolbox import cross_validation
from sklearn.metrics import accuracy_score, f1_score

In [17]:
result = []
for k in [3, 5, 10]:
    cv = cross_validation(train_X, train_y, k=k)
    all_acc = dict()
    for train_X, val_X, train_y, val_y, i in cv:
        nb_clf = NaiveBayesClassifier()
        rf_clf = RandomForestClassifier(
                n_estimators=100, max_depth=5,
                class_weight='balanced' ,random_state=RANDOM_STATE)
        lgb_clf = LGBMClassifier(class_weight="balanced")
        xgb_clf = XGBClassifier(scale_pos_weight=99)
        cat_clf = CatBoostClassifier(auto_class_weights='Balanced', verbose=0)
        nb_clf.fit(train_X, train_y)
        rf_clf.fit(train_X, train_y)
        lgb_clf.fit(train_X, train_y)
        xgb_clf.fit(train_X, train_y)
        cat_clf.fit(train_X, train_y)

        all_acc['nb'] = f1_score(val_y, nb_clf.predict(val_X))
        all_acc['rf'] = f1_score(val_y, rf_clf.predict(val_X))
        all_acc['lgb'] = f1_score(val_y, lgb_clf.predict(val_X))
        all_acc['xgb'] = f1_score(val_y, xgb_clf.predict(val_X))
        all_acc['cat'] = f1_score(val_y, cat_clf.predict(val_X))
    result.append([
        k, all_acc['nb'], all_acc['rf'], all_acc['lgb'], all_acc['xgb'], all_acc['cat']
    ])

        

In [18]:
pd.DataFrame(result, columns=['k', 'nb_f1', 'rf_f1', 'lgb_f1', 'xgb_f1', 'cat_f1'])

Unnamed: 0,k,nb_f1,rf_f1,lgb_f1,xgb_f1,cat_f1
0,3,0.128443,0.163924,0.168973,0.140397,0.152752
1,5,0.130027,0.165946,0.169072,0.151739,0.160593
2,10,0.104712,0.14128,0.122667,0.114613,0.13245
