### Import libraries and dataset

In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix

In [2]:
df_original = pd.read_csv('../data/UCI_Credit_Card.csv')

In [3]:
df = df_original.copy()

### Inspect Data

In [4]:
df.shape

(30000, 25)

In [5]:
df.dtypes

ID                              int64
LIMIT_BAL                     float64
SEX                             int64
EDUCATION                       int64
MARRIAGE                        int64
AGE                             int64
PAY_0                           int64
PAY_2                           int64
PAY_3                           int64
PAY_4                           int64
PAY_5                           int64
PAY_6                           int64
BILL_AMT1                     float64
BILL_AMT2                     float64
BILL_AMT3                     float64
BILL_AMT4                     float64
BILL_AMT5                     float64
BILL_AMT6                     float64
PAY_AMT1                      float64
PAY_AMT2                      float64
PAY_AMT3                      float64
PAY_AMT4                      float64
PAY_AMT5                      float64
PAY_AMT6                      float64
default.payment.next.month      int64
dtype: object

In [6]:
df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,1,20000.0,2,2,1,24,2,2,-1,-1,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,2,120000.0,2,2,2,26,-1,2,0,0,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,3,90000.0,2,2,2,34,0,0,0,0,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,4,50000.0,2,2,1,37,0,0,0,0,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,5,50000.0,1,2,1,57,-1,0,-1,0,...,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


In [7]:
df.describe()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
count,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,...,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0
mean,15000.5,167484.322667,1.603733,1.853133,1.551867,35.4855,-0.0167,-0.133767,-0.1662,-0.220667,...,43262.948967,40311.400967,38871.7604,5663.5805,5921.163,5225.6815,4826.076867,4799.387633,5215.502567,0.2212
std,8660.398374,129747.661567,0.489129,0.790349,0.52197,9.217904,1.123802,1.197186,1.196868,1.169139,...,64332.856134,60797.15577,59554.107537,16563.280354,23040.87,17606.96147,15666.159744,15278.305679,17777.465775,0.415062
min,1.0,10000.0,1.0,0.0,0.0,21.0,-2.0,-2.0,-2.0,-2.0,...,-170000.0,-81334.0,-339603.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,7500.75,50000.0,1.0,1.0,1.0,28.0,-1.0,-1.0,-1.0,-1.0,...,2326.75,1763.0,1256.0,1000.0,833.0,390.0,296.0,252.5,117.75,0.0
50%,15000.5,140000.0,2.0,2.0,2.0,34.0,0.0,0.0,0.0,0.0,...,19052.0,18104.5,17071.0,2100.0,2009.0,1800.0,1500.0,1500.0,1500.0,0.0
75%,22500.25,240000.0,2.0,2.0,2.0,41.0,0.0,0.0,0.0,0.0,...,54506.0,50190.5,49198.25,5006.0,5000.0,4505.0,4013.25,4031.5,4000.0,0.0
max,30000.0,1000000.0,2.0,6.0,3.0,79.0,8.0,8.0,8.0,8.0,...,891586.0,927171.0,961664.0,873552.0,1684259.0,896040.0,621000.0,426529.0,528666.0,1.0


### Handle missing values

In [8]:
df.isnull().sum()

ID                            0
LIMIT_BAL                     0
SEX                           0
EDUCATION                     0
MARRIAGE                      0
AGE                           0
PAY_0                         0
PAY_2                         0
PAY_3                         0
PAY_4                         0
PAY_5                         0
PAY_6                         0
BILL_AMT1                     0
BILL_AMT2                     0
BILL_AMT3                     0
BILL_AMT4                     0
BILL_AMT5                     0
BILL_AMT6                     0
PAY_AMT1                      0
PAY_AMT2                      0
PAY_AMT3                      0
PAY_AMT4                      0
PAY_AMT5                      0
PAY_AMT6                      0
default.payment.next.month    0
dtype: int64

No missing values were found

### Handle outliers

In [9]:
# Function to detect outliers using IQR method
def detect_outliers_iqr(data, column):
    Q1 = data[column].quantile(0.25)
    Q3 = data[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = data[(data[column] < lower_bound) | (data[column] > upper_bound)]
    return len(outliers), lower_bound, upper_bound

# Check outliers in key columns
numerical_features = ['LIMIT_BAL', 'AGE', 'BILL_AMT1', 'BILL_AMT2', 
                      'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6',
                      'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 
                      'PAY_AMT5', 'PAY_AMT6']

for col in numerical_features:
    n_outliers, lower, upper = detect_outliers_iqr(df, col)
    print(f"{col}: {n_outliers} outliers ({n_outliers/len(df)*100:.2f}%)")

print()

# Cap outliers at 1st and 99th percentile for bill and payment amounts
bill_payment_cols = ['BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 
                     'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 
                     'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']

for col in bill_payment_cols:
    lower_bound = df[col].quantile(0.01)
    upper_bound = df[col].quantile(0.99)
    df[col] = df[col].clip(lower=lower_bound, upper=upper_bound)
    print(f"Capped {col} at [{lower_bound:.2f}, {upper_bound:.2f}]")

# Remove clearly erroneous values (e.g., invalid ages)
print(f"\nRecords before age filtering: {len(df)}")
df = df[(df['AGE'] >= 18) & (df['AGE'] <= 100)]
print(f"Records after age filtering: {len(df)}")

LIMIT_BAL: 167 outliers (0.56%)
AGE: 272 outliers (0.91%)
BILL_AMT1: 2400 outliers (8.00%)
BILL_AMT2: 2395 outliers (7.98%)
BILL_AMT3: 2469 outliers (8.23%)
BILL_AMT4: 2622 outliers (8.74%)
BILL_AMT5: 2725 outliers (9.08%)
BILL_AMT6: 2693 outliers (8.98%)
PAY_AMT1: 2745 outliers (9.15%)
PAY_AMT2: 2714 outliers (9.05%)
PAY_AMT3: 2598 outliers (8.66%)
PAY_AMT4: 2994 outliers (9.98%)
PAY_AMT5: 2945 outliers (9.82%)
PAY_AMT6: 2958 outliers (9.86%)

Capped BILL_AMT1 at [-81.00, 350110.68]
Capped BILL_AMT2 at [-200.00, 337495.28]
Capped BILL_AMT3 at [-200.00, 325030.39]
Capped BILL_AMT4 at [-212.02, 304997.27]
Capped BILL_AMT5 at [-232.01, 285868.33]
Capped BILL_AMT6 at [-331.03, 279505.06]
Capped PAY_AMT1 at [0.00, 66522.18]
Capped PAY_AMT2 at [0.00, 76651.02]
Capped PAY_AMT3 at [0.00, 70000.00]
Capped PAY_AMT4 at [0.00, 67054.44]
Capped PAY_AMT5 at [0.00, 65607.56]
Capped PAY_AMT6 at [0.00, 82619.05]

Records before age filtering: 30000
Records after age filtering: 30000


### Clean categorical variables

In [10]:
# Check current distribution of categorical variables
print("EDUCATION distribution (before cleaning):")
print(df['EDUCATION'].value_counts().sort_index())

print("\nMARRIAGE distribution (before cleaning):")
print(df['MARRIAGE'].value_counts().sort_index())

print("\nSEX distribution:")
print(df['SEX'].value_counts().sort_index())

# Clean EDUCATION: recode 0, 5, 6 as 4 (others)
df['EDUCATION'] = df['EDUCATION'].replace({0: 4, 5: 4, 6: 4})
print("\nEDUCATION distribution (after cleaning):")
print(df['EDUCATION'].value_counts().sort_index())

# Clean MARRIAGE: recode 0 as 3 (others)
df['MARRIAGE'] = df['MARRIAGE'].replace({0: 3})
print("\nMARRIAGE distribution (after cleaning):")
print(df['MARRIAGE'].value_counts().sort_index())

# Verify SEX values
print(f"\nSEX unique values: {sorted(df['SEX'].unique())}")
assert df['SEX'].isin([1, 2]).all(), "Invalid SEX values found"
print("✓ SEX values are valid (1 or 2)")

# Validate payment status variables
pay_cols = ['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']
print("\nPayment status ranges:")
for col in pay_cols:
    print(f"{col}: min={df[col].min()}, max={df[col].max()}, unique values={len(df[col].unique())}")

EDUCATION distribution (before cleaning):
EDUCATION
0       14
1    10585
2    14030
3     4917
4      123
5      280
6       51
Name: count, dtype: int64

MARRIAGE distribution (before cleaning):
MARRIAGE
0       54
1    13659
2    15964
3      323
Name: count, dtype: int64

SEX distribution:
SEX
1    11888
2    18112
Name: count, dtype: int64

EDUCATION distribution (after cleaning):
EDUCATION
1    10585
2    14030
3     4917
4      468
Name: count, dtype: int64

MARRIAGE distribution (after cleaning):
MARRIAGE
1    13659
2    15964
3      377
Name: count, dtype: int64

SEX unique values: [np.int64(1), np.int64(2)]
✓ SEX values are valid (1 or 2)

Payment status ranges:
PAY_0: min=-2, max=8, unique values=11
PAY_2: min=-2, max=8, unique values=11
PAY_3: min=-2, max=8, unique values=11
PAY_4: min=-2, max=8, unique values=11
PAY_5: min=-2, max=8, unique values=10
PAY_6: min=-2, max=8, unique values=10


### Feature engineering

In [11]:
# Create Gender-Marriage combined feature
def create_gender_marriage_category(row):
    sex = row['SEX']
    marriage = row['MARRIAGE']
    
    if sex == 1:  # Male
        if marriage == 1:
            return 1  # Married man
        elif marriage == 2:
            return 2  # Single man
        else:  # marriage == 3 (others/divorced)
            return 3  # Divorced man
    else:  # Female (sex == 2)
        if marriage == 1:
            return 4  # Married woman
        elif marriage == 2:
            return 5  # Single woman
        else:  # marriage == 3 (others/divorced)
            return 6  # Divorced woman

df['GENDER_MARRIAGE'] = df.apply(create_gender_marriage_category, axis=1)

print("Gender-Marriage category distribution:")
print(df['GENDER_MARRIAGE'].value_counts().sort_index())
print(f"\nCategory labels:")
print("1: Married man")
print("2: Single man")
print("3: Divorced man")
print("4: Married woman")
print("5: Single woman")
print("6: Divorced woman")

Gender-Marriage category distribution:
GENDER_MARRIAGE
1    5190
2    6553
3     145
4    8469
5    9411
6     232
Name: count, dtype: int64

Category labels:
1: Married man
2: Single man
3: Divorced man
4: Married woman
5: Single woman
6: Divorced woman


#### Exclude divorced women

In [12]:
# Count divorced women before exclusion
divorced_women_count = (df['GENDER_MARRIAGE'] == 6).sum()
print(f"\nDivorced women records: {divorced_women_count}")
print(f"Dataset size before exclusion: {len(df)}")

# Exclude divorced women (category 6)
df = df[df['GENDER_MARRIAGE'] != 6].copy()

print(f"Dataset size after exclusion: {len(df)}")
print(f"Records removed: {divorced_women_count}")

print("\nRemaining Gender-Marriage categories:")
print(df['GENDER_MARRIAGE'].value_counts().sort_index())


Divorced women records: 232
Dataset size before exclusion: 30000
Dataset size after exclusion: 29768
Records removed: 232

Remaining Gender-Marriage categories:
GENDER_MARRIAGE
1    5190
2    6553
3     145
4    8469
5    9411
Name: count, dtype: int64


### Prepare features and target

In [13]:
# Remove ID column if it exists
if 'ID' in df.columns:
    df = df.drop('ID', axis=1)
    print("ID column removed")

# Define target variable
target_col = 'default.payment.next.month'

print(f"\nTarget variable: {target_col}")
print("Distribution:")
print(df[target_col].value_counts())
print(f"Default rate: {df[target_col].mean():.1%}")

# Separate features and target
y = df[target_col].copy()

# Define feature columns (all except target)
feature_cols = ['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE',
                'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6',
                'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 
                'BILL_AMT5', 'BILL_AMT6',
                'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 
                'PAY_AMT5', 'PAY_AMT6',
                'GENDER_MARRIAGE']

X = df[feature_cols].copy()

print(f"\nFeature matrix shape: {X.shape}")
print(f"Target vector shape: {y.shape}")
print(f"\nFeatures being used: {list(X.columns)}")

# Check for missing values
print(f"\nMissing values in features: {X.isnull().sum().sum()}")
print(f"Missing values in target: {y.isnull().sum()}")

ID column removed

Target variable: default.payment.next.month
Distribution:
default.payment.next.month
0    23179
1     6589
Name: count, dtype: int64
Default rate: 22.1%

Feature matrix shape: (29768, 24)
Target vector shape: (29768,)

Features being used: ['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6', 'GENDER_MARRIAGE']

Missing values in features: 0
Missing values in target: 0


### Train-test split

In [14]:
# Train-test split with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
    stratify=y
)

print("=" * 50)
print("TRAIN-TEST SPLIT RESULTS")
print("=" * 50)

print(f"\nTraining set size: {X_train.shape[0]} ({X_train.shape[0]/len(X)*100:.1f}%)")
print(f"Test set size: {X_test.shape[0]} ({X_test.shape[0]/len(X)*100:.1f}%)")

print(f"\nTraining set default rate: {y_train.mean():.3%}")
print(f"Test set default rate: {y_test.mean():.3%}")

# Verify stratification worked
print("\n" + "-" * 50)
print("Class Distribution Comparison")
print("-" * 50)
print("\nOriginal dataset:")
print(y.value_counts(normalize=True).sort_index())
print("\nTraining set:")
print(y_train.value_counts(normalize=True).sort_index())
print("\nTest set:")
print(y_test.value_counts(normalize=True).sort_index())

# Check if distributions are similar (difference should be < 1%)
train_default_rate = y_train.mean()
test_default_rate = y_test.mean()
overall_default_rate = y.mean()

diff_train = abs(train_default_rate - overall_default_rate)
diff_test = abs(test_default_rate - overall_default_rate)

if diff_train < 0.01 and diff_test < 0.01:
    print("\n✓ Stratification successful - distributions are similar")
else:
    print("\n⚠ Warning: Distributions differ more than expected")

TRAIN-TEST SPLIT RESULTS

Training set size: 20837 (70.0%)
Test set size: 8931 (30.0%)

Training set default rate: 22.134%
Test set default rate: 22.136%

--------------------------------------------------
Class Distribution Comparison
--------------------------------------------------

Original dataset:
default.payment.next.month
0    0.778655
1    0.221345
Name: proportion, dtype: float64

Training set:
default.payment.next.month
0    0.778663
1    0.221337
Name: proportion, dtype: float64

Test set:
default.payment.next.month
0    0.778636
1    0.221364
Name: proportion, dtype: float64

✓ Stratification successful - distributions are similar


### Train a model

In [None]:
# Parameters for grid search
param_grid = {
    'max_depth': [3, 5, 7, 10, 15, 20, None],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 5, 10],
    'criterion': ['gini', 'entropy'],
    'class_weight': [None, 'balanced'] + [{0: 1, 1: w} for w in [1, 2, 3, 5, 10]]
}

model = DecisionTreeClassifier(random_state=42)
grid_search = GridSearchCV(
    model, 
    param_grid, 
    cv=5, 
    scoring='f1',
    n_jobs=-1,
    verbose=1
)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

print(f"Best parameters: {grid_search.best_params_}")
print(f"\nPerformance Metrics:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.3f}")
print(f"Precision: {precision_score(y_test, y_pred):.3f}")
print(f"Recall: {recall_score(y_test, y_pred):.3f}")
print(f"F1-score: {f1_score(y_test, y_pred):.3f}")
print(f"ROC-AUC: {roc_auc_score(y_test, y_pred_proba):.3f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['No Default', 'Default']))

print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Export the best model
joblib.dump(best_model, '../models/best_decision_tree_model.pkl')
print("\n✓ Best model saved as 'best_decision_tree_model.pkl'")

Fitting 10 folds for each of 1568 candidates, totalling 15680 fits


In [None]:
# 特徴量重要度を取得
importance = best_model.feature_importances_

# 見やすく表示
pd.DataFrame({
    'Feature': feature_cols,
    'Importance': importance
}).sort_values('Importance', ascending=False)