In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of
# every file found, one per line.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nycu-dilab-2023-onboard/data_train.csv
/kaggle/input/nycu-dilab-2023-onboard/data_test.csv
/kaggle/input/nycu-dilab-2023-onboard/answer_train.csv
/kaggle/input/nycu-dilab-2023-onboard/answer_sample.csv


# Requirement
* Submit your predictions on Kaggle. Your score should beat ‘week3_medium.csv’, and it is not hard to beat ‘week_3_hard’ as well.
* Improve over last week’s performance by using ensemble models or dimension reduction methods (RF, XGBoost, Lightgbm, Catboost).

In [2]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd

from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score,accuracy_score
from sklearn.metrics import precision_score

import csv


def generate_csv(ypredict, filename = 'result.csv'):
    """Write predictions to a two-column CSV file.

    The file starts with the header row ``index,default.payment.next.month``
    and is followed by one row per prediction, where ``index`` is the
    zero-based position of the value in ``ypredict``. Any existing file at
    ``filename`` is overwritten.

    Args:
        ypredict: Iterable of prediction values, written in iteration order.
        filename: Path of the CSV file to create.
    """
    # The file is opened once and every row goes through the same writer, so
    # writing N predictions costs a single open/close instead of N + 1.
    with open(filename, 'w', newline='') as fd:
        writer = csv.writer(fd)
        writer.writerow(['index', 'default.payment.next.month'])
        # enumerate() yields (index, prediction) pairs, one CSV row each.
        writer.writerows(enumerate(ypredict))
            
def show_result(ground_truth, prediction):
    """Print accuracy, precision and recall, and plot the confusion matrix.

    Args:
        ground_truth: True labels.
        prediction: Predicted labels, aligned with ``ground_truth``.

    Returns:
        The accuracy score computed by ``accuracy_score``.
    """
    # Imported locally so the helper does not depend on `plt` / `sns` having
    # been defined by some other cell before it is called.
    import matplotlib.pyplot as plt
    import seaborn as sns

    # Computed once and reused for both the printout and the return value.
    accuracy = accuracy_score(ground_truth, prediction)
    print("Accuracy: ", accuracy)
    print("Precision: ",precision_score(ground_truth, prediction))
    print("Recall: ",recall_score(ground_truth, prediction))

    plt.figure(figsize=(4,3))
    cm = confusion_matrix(ground_truth, prediction)
    # NOTE(review): tick labels are hard-coded for two classes named 0 and 1.
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=["0", "1"], yticklabels=["0", "1"])
    plt.xlabel("Predicted Labels")
    plt.ylabel("True Labels")
    plt.show()
    return accuracy

# Load Data

In [3]:
# Training features: every column except the first and the last one.
train_x = pd.read_csv('/kaggle/input/nycu-dilab-2023-onboard/data_train.csv').iloc[:, 1:-1]
# Training labels: only the last column of the answer file.
train_y = pd.read_csv('/kaggle/input/nycu-dilab-2023-onboard/answer_train.csv').iloc[:, -1]
# Test features: same column selection as the training features.
test_x = pd.read_csv('/kaggle/input/nycu-dilab-2023-onboard/data_test.csv').iloc[:, 1:-1]
# Features and labels side by side, keeping only the row labels both share.
rawdata = pd.concat(
    [train_x, train_y],
    axis=1,
    join='inner',
)

# PCA

In [4]:
# Number of principal components kept after the projection.
n_components = 5
# random_state pins the result for the case where PCA's automatic solver
# selection uses a randomized SVD, so reruns give the same components.
pca = PCA(n_components=n_components, random_state=42)
# NOTE(review): the features are passed to PCA as loaded, without scaling.
# If the columns have very different ranges, the leading components will be
# dominated by the largest-variance columns - consider standardizing first.
# Fit on the training features only, then apply the same projection to both sets.
pca.fit(train_x)
train_x_pca = pca.transform(train_x)
test_x_pca = pca.transform(test_x)

# Ensemble

In [5]:
# Random forest trained on the original (non-reduced) features.
# random_state is fixed so that rerunning the cell rebuilds the same forest
# and therefore writes the same submission file.
model = RandomForestClassifier(n_estimators=200, random_state=42)
model.fit(train_x, train_y)
# Keep the second probability column (the class at model.classes_[1];
# presumably the positive "default" label - confirm against the data).
ypredict = model.predict_proba(test_x)[:, 1]
generate_csv(ypredict, 'Original.csv')

In [6]:
# Random forest trained on the PCA-projected features.
# random_state is fixed so that rerunning the cell rebuilds the same forest
# and therefore writes the same submission file.
model = RandomForestClassifier(n_estimators=200, random_state=42)
model.fit(train_x_pca, train_y)
# Keep the second probability column (the class at model.classes_[1];
# presumably the positive "default" label - confirm against the data).
ypredict = model.predict_proba(test_x_pca)[:, 1]
generate_csv(ypredict, 'PCA.csv')

In [7]:
import xgboost as xgb

# Gradient-boosted trees (XGBoost) on the original features.
# The seed is set explicitly so the run does not depend on library defaults.
model = xgb.XGBClassifier(n_estimators=100, random_state=42)
model.fit(train_x, train_y)
# Keep the second probability column (presumably the positive "default"
# label - confirm against model.classes_).
ypredict = model.predict_proba(test_x)[:, 1]
generate_csv(ypredict, 'XGB.csv')

In [8]:
import lightgbm as lgb

# Gradient-boosted trees (LightGBM) on the original features.
# The seed is set explicitly so the run does not depend on library defaults.
model = lgb.LGBMClassifier(n_estimators=100, random_state=42)
model.fit(train_x, train_y)
# Keep the second probability column (presumably the positive "default"
# label - confirm against model.classes_).
ypredict = model.predict_proba(test_x)[:, 1]
generate_csv(ypredict, 'LGB.csv')

In [9]:
import catboost as cb

# Gradient-boosted trees (CatBoost) on the original features.
# The seed is set explicitly so the run does not depend on library defaults.
model = cb.CatBoostClassifier(n_estimators=100, logging_level='Silent', random_seed=42)
model.fit(train_x, train_y)
# Keep the second probability column (presumably the positive "default"
# label - confirm against model.classes_).
ypredict = model.predict_proba(test_x)[:, 1]
generate_csv(ypredict, 'CB.csv')

## Analyze
* After applying dimensionality reduction the score dropped by 10%; it seems the data loses too much information under PCA.
* Test performance improved by 3–5% compared with traditional models such as decision trees and SVMs.
* Test score: RF > LGB > CB > XGB