# BT5126 Assignment 2
Student Name: Hai-hsin HUANG

Student Number: A0231906J

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import seaborn as sn
import lightgbm as ltb
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.compose import make_column_selector
from sklearn import set_config
set_config(display="diagram")
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv(r'C:\Users\Joyce Huang\Downloads\BT5126-HW2\Q1_train.csv')
df.sic = df.sic.astype('object')
X = df.drop(['bankrupt','gvkey','datadate','conm'],axis=1)
y = df.bankrupt
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

## Split up Categorical and numeric features

In [3]:
cat_selector = make_column_selector(dtype_include=object)
num_selector = make_column_selector(dtype_include=np.number)

num_selector(X)
cat_selector(X)

['sic']

## Custom Transformer for SIC grouping

In [4]:
class sic_grouping(BaseEstimator, TransformerMixin):
    def __init__(self):
        super().__init__()

    # Return self nothing else to do here
    def fit(self, X, y=None):
        return self

    # Helper function that converts values to Binary depending on input
    def create_group(self, obj):
        if 999>obj>100:
            return '1'
        elif 1000<obj<1499:
            return '2'
        elif 1500<obj<1799:
            return '3'
        elif 1800<obj<1999:
            return np.nan
        elif 2000<obj<3999:
            return '4'
        elif 4000<obj<4999:
            return '5'
        elif 5000<obj<5199:
            return '6'
        elif 5200<obj<5999:
            return '7'
        elif 6000<obj<6799:
            return '8'
        elif 7000<obj<8999:
            return '9'
        elif 9100<obj<9729:
            return '10'
        elif 9900<obj<9999:
            return '11'
        else:
            return np.nan

    # Transformer method for this transformer
    def transform(self, X, y=None):
        # Categorical features to pass down the categorical pipeline
        return X[['sic']].values

## Adding interaction terms

In [5]:
interaction_terms = PolynomialFeatures(interaction_only=True,include_bias = False)

## Processor 01

In [6]:
from sklearn import preprocessing
cat_processor = sic_grouping()

preprocessor = make_column_transformer(
    (interaction_terms, num_selector), (cat_processor, cat_selector)
)

preprocessor

## Processor 2: Standard Scaler and One hot encoding

In [7]:
# Standardization
standard_scaler = StandardScaler()

In [8]:
# MinMax Scaling
# minmax_scaler = MinMaxScaler()

In [9]:
cat_onehot = make_pipeline(cat_processor,OneHotEncoder(handle_unknown="ignore"))

standard_processor = make_pipeline(interaction_terms,standard_scaler)

# minmax_processor = make_pipeline(interaction_terms,minmax_scaler)

second_preprocessor = make_column_transformer(
    (standard_processor, num_selector),(cat_onehot, cat_selector)
)

second_preprocessor

## Stacking Level 0

In [10]:
import lightgbm as lgb
lightgbm_pipeline = make_pipeline(second_preprocessor, lgb.LGBMClassifier())
lightgbm_pipeline

In [11]:
from xgboost import XGBClassifier
xgboost_pipeline = make_pipeline(second_preprocessor, XGBClassifier(objective='binary:logistic',use_label_encoder=False))
xgboost_pipeline

In [12]:
from sklearn.svm import SVC
svm_pipeline = make_pipeline(second_preprocessor, SVC(kernel ='rbf'))
svm_pipeline

## Stacking Level 1

In [13]:
from sklearn.ensemble import StackingClassifier

estimators = [
    ("LightGBM", lightgbm_pipeline),
    ("XGBoost", xgboost_pipeline),
    ("SVM-RBF", svm_pipeline),
]

stacking_classifier = StackingClassifier(estimators=estimators, final_estimator=lgb.LGBMClassifier())
stacking_classifier

## Performance on Stacking model without tuning

In [14]:
stacking_classifier.fit(X_train,y_train)
# Accuracy = stacking_classifier.score(X,y)
# print('The accuracy score of stacking without tuning is', Accuracy)



In [15]:
y_pred = stacking_classifier.predict(X_test)

In [16]:
import sklearn.metrics as skmetrics

def auc_pr(y_true, y_prob):
  precisions, recalls, thresholds = skmetrics.precision_recall_curve(y_true, y_prob)
  return skmetrics.auc(recalls, precisions)

In [17]:
print('PR-AUC Score =', auc_pr(y_test, y_pred),'\n')
print(classification_report(y_test, y_pred))

PR-AUC Score = 0.3111297539149888 

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      7992
           1       0.45      0.17      0.24        54

    accuracy                           0.99      8046
   macro avg       0.72      0.58      0.62      8046
weighted avg       0.99      0.99      0.99      8046



## Stacking model tuning

In [18]:
from sklearn import preprocessing
cat_processor = sic_grouping()

preprocessor = make_column_transformer((interaction_terms, num_selector), (cat_processor, cat_selector))

standard_scaler = StandardScaler()
standard_processor = make_pipeline(interaction_terms,standard_scaler)

second_preprocessor = make_column_transformer((standard_processor, num_selector),(cat_onehot, cat_selector))
lightgbm_pipeline = make_pipeline(second_preprocessor, lgb.LGBMClassifier(max_depth=10, num_leaves=50, learning_rate
=0.1, min_child_samples=10))
xgboost_pipeline = make_pipeline(second_preprocessor, XGBClassifier(n_estimators=250, learning_rate=0.1, objective='binary:logistic',use_label_encoder=False))
svm_pipeline = make_pipeline(second_preprocessor, SVC(C=1, random_state=0, kernel ='rbf'))

estimators = [
    ("LightGBM", lightgbm_pipeline),
    ("XGBoost", xgboost_pipeline),
    ("SVM-RBF", svm_pipeline),
]

stacking_classifier_tuned = StackingClassifier(estimators=estimators, final_estimator=lgb.LGBMClassifier())
stacking_classifier_tuned

In [19]:
stacking_classifier_tuned.fit(X_train,y_train)



In [20]:
y_pred_2 = stacking_classifier_tuned.predict(X_test)

print('PR-AUC Score =', auc_pr(y_test, y_pred_2),'\n')
print(classification_report(y_test, y_pred_2))

PR-AUC Score = 0.3229718591781467 

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      7992
           1       0.47      0.17      0.25        54

    accuracy                           0.99      8046
   macro avg       0.73      0.58      0.62      8046
weighted avg       0.99      0.99      0.99      8046



## Test data prediction

In [23]:
df_test = pd.read_csv(r'C:\Users\Joyce Huang\Downloads\BT5126-HW2\Q1_test.csv')
df_test.sic = df_test.sic.astype('object')
X_test = df_test.drop(['id','bankrupt'],axis=1)
y_test = df_test.bankrupt

In [24]:
y_pred = stacking_classifier_tuned.predict(X_test)
df_test.bankrupt = y_pred
df_test.to_csv(r'C:\Users\Joyce Huang\Downloads\BT5126-HW2\q3_pred.csv', index=False)