<a href="https://colab.research.google.com/github/lazy-scribe/ssu_ml/blob/main/final_term_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**0. Import**

In [None]:
try:
    import google.colab
    IN_COLAB = True
except:
    IN_COLAB = False
# print(IN_COLAB)

if IN_COLAB:
    !git clone https://github.com/ssuai/ISLP.git
    #!pip install ISLP # This takes too much time

Cloning into 'ISLP'...
remote: Enumerating objects: 63, done.[K
remote: Counting objects: 100% (63/63), done.[K
remote: Compressing objects: 100% (55/55), done.[K
remote: Total 63 (delta 14), reused 53 (delta 6), pack-reused 0 (from 0)[K
Receiving objects: 100% (63/63), 2.34 MiB | 15.64 MiB/s, done.
Resolving deltas: 100% (14/14), done.


In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import matplotlib.pyplot as plt
from matplotlib.pyplot import subplots, cm

import sklearn.model_selection as skm
import sklearn.linear_model as skl
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import RocCurveDisplay

from ISLP.utils import load_data, confusion_table
from ISLP.models import summarize, poly, ModelSpec as MS
from ISLP.svm import plot as plot_svm
from functools import partial

import warnings
warnings.filterwarnings('ignore', category=UserWarning, module='sklearn.linear_model._coordinate_descent')

!pip install kaggle



In [None]:
# Mount Google Drive at /content/drive (Colab only: `google.colab` must be importable).
from google.colab import drive
drive.mount('/content/drive')

# Copy the Kaggle API token stored on Drive into ~/.kaggle and restrict it to
# owner read/write (mode 600). The token itself is never embedded in the notebook.
# NOTE(review): assumes kaggle.json exists at MyDrive/data/ — confirm for your account.
!mkdir -p ~/.kaggle
!cp "/content/drive/MyDrive/data/kaggle.json" ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Download the archive for the `playground-series-s4e1` competition with the
# Kaggle CLI; the zip file is written to the current working directory.
!kaggle competitions download -c playground-series-s4e1

Downloading playground-series-s4e1.zip to /content
  0% 0.00/6.81M [00:00<?, ?B/s]
100% 6.81M/6.81M [00:00<00:00, 494MB/s]


**1. Load Data**

In [None]:
!unzip playground-series-s4e1.zip

df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

Archive:  playground-series-s4e1.zip
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


**2. Preprocessing**

In [None]:
# Remove the `id` and `Surname` columns from both splits.
unused_cols = ['id', 'Surname']

df_train = df_train.drop(columns=unused_cols)
df_test = df_test.drop(columns=unused_cols)

In [None]:
# One-hot encode the categorical columns as 0/1 integer indicator columns.
categorical_cols = ['Geography', 'Gender']

df_train = pd.get_dummies(df_train, columns=categorical_cols, dtype=int)
df_test = pd.get_dummies(df_test, columns=categorical_cols, dtype=int)

# The two splits are encoded independently, so a category that appears in only
# one of them would produce mismatched columns. Force the test frame onto the
# training layout (every training column except the target): indicator columns
# missing from the test split are added as all-zero, and columns unknown to the
# training split are dropped.
feature_layout = df_train.columns.drop('Exited')
df_test = df_test.reindex(columns=feature_layout, fill_value=0)

df_train.head()

Unnamed: 0,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
0,15674932,668,33.0,3,0.0,2,1.0,0.0,181449.97,0,1,0,0,0,1
1,15749177,627,33.0,1,0.0,2,1.0,1.0,49503.5,0,1,0,0,0,1
2,15694510,678,40.0,10,0.0,2,1.0,0.0,184866.69,0,1,0,0,0,1
3,15741417,581,34.0,2,148882.54,1,1.0,1.0,84560.88,0,1,0,0,0,1
4,15766172,716,33.0,5,0.0,2,1.0,1.0,15068.83,0,0,0,1,0,1


In [None]:
# Min-max scaling of the numeric columns.
# The minimum and range are taken from the training split only and then applied
# to both splits, so the test split is scaled with training statistics.

scale_cols = ['Age','CreditScore', 'Balance','EstimatedSalary', 'Tenure', 'NumOfProducts']

train_min = df_train[scale_cols].min()
train_range = df_train[scale_cols].max() - train_min

df_train[scale_cols] = (df_train[scale_cols] - train_min) / train_range
df_test[scale_cols] = (df_test[scale_cols] - train_min) / train_range

df_train.head()

Unnamed: 0,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
0,15674932,0.636,0.202703,0.3,0.0,0.333333,1.0,0.0,0.907279,0,1,0,0,0,1
1,15749177,0.554,0.202703,0.1,0.0,0.333333,1.0,1.0,0.247483,0,1,0,0,0,1
2,15694510,0.656,0.297297,1.0,0.0,0.333333,1.0,0.0,0.924364,0,1,0,0,0,1
3,15741417,0.462,0.216216,0.2,0.593398,0.0,1.0,1.0,0.422787,0,1,0,0,0,1
4,15766172,0.732,0.202703,0.5,0.0,0.333333,1.0,1.0,0.075293,0,0,0,1,0,1


**3. Model Selection**

In [None]:
# Forward stepwise selection — inputs.
# X holds every training column except the target and `CustomerId`;
# Y is the `Exited` target column.
target_col = 'Exited'
non_feature_cols = [target_col, 'CustomerId']

X = df_train.drop(columns=non_feature_cols)
Y = df_train[target_col]

X.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
0,0.636,0.202703,0.3,0.0,0.333333,1.0,0.0,0.907279,1,0,0,0,1
1,0.554,0.202703,0.1,0.0,0.333333,1.0,1.0,0.247483,1,0,0,0,1
2,0.656,0.297297,1.0,0.0,0.333333,1.0,0.0,0.924364,1,0,0,0,1
3,0.462,0.216216,0.2,0.593398,0.0,1.0,1.0,0.422787,1,0,0,0,1
4,0.732,0.202703,0.5,0.0,0.333333,1.0,1.0,0.075293,0,0,1,0,1


In [None]:
from sklearn.linear_model import LogisticRegression
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

# Forward (non-floating) sequential feature selection around a logistic
# regression, scored by 5-fold cross-validated ROC AUC on all available cores.
# `k_features` is given as a (min, max) range spanning 1..all columns of X.
lr = LogisticRegression(solver='liblinear', random_state=42)
n_candidate_features = X.shape[1]

sfs = SFS(
    estimator=lr,
    k_features=(1, n_candidate_features),
    forward=True,
    floating=False,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
)
sfs.fit(X, Y)

print('Selected features (k_features=best):', sfs.k_feature_names_)

# One row per subset size, built from the selector's `subsets_` dictionary.
sfs_results = pd.DataFrame(sfs.subsets_).T
print("\nForward Stepwise Selection Results:", sfs_results, sep="\n")

Selected features (k_features=best): ('CreditScore', 'Age', 'Tenure', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Geography_France', 'Geography_Germany', 'Geography_Spain', 'Gender_Female', 'Gender_Male')

Forward Stepwise Selection Results:
                                   feature_idx  \
1                                         (1,)   
2                                       (1, 6)   
3                                    (1, 4, 6)   
4                                 (1, 4, 6, 9)   
5                             (1, 4, 6, 9, 12)   
6                          (1, 4, 5, 6, 9, 12)   
7                       (0, 1, 4, 5, 6, 9, 12)   
8                    (0, 1, 2, 4, 5, 6, 9, 12)   
9                 (0, 1, 2, 4, 5, 6, 7, 9, 12)   
10             (0, 1, 2, 4, 5, 6, 7, 8, 9, 12)   
11         (0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 12)   
12     (0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12)   
13  (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)   

                                 

In [None]:
# Features passed to the models below.
# Deliberately left out: 'Balance', 'Geography_Spain', 'Geography_France',
# 'Geography_Germany', 'Gender_Male', 'Gender_Female'.
selected_features = [
    'CreditScore', 'Age', 'Tenure', 'NumOfProducts',
    'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
]

**4. Models**

In [None]:
# SVM
roc_curve = RocCurveDisplay.from_estimator # shorthand

# Restrict both splits to the hand-picked feature subset.
X_train = df_train[selected_features]
Y_train = df_train['Exited']

X_test = df_test[selected_features]

# Linear-kernel SVM with balanced class weights. `probability=True` is required
# because the submission cell calls `predict_proba`; `random_state` is fixed so
# the fit is reproducible. The model is constructed exactly once: the earlier
# duplicate without `random_state` was overwritten before use and is removed.
svm_linear = SVC(C=10, kernel='linear', class_weight='balanced', probability=True, random_state=42)
svm_linear.fit(X_train, Y_train)

In [None]:
# The submission must be keyed by the `id` column of test.csv. That column was
# dropped from `df_test` during preprocessing, and `CustomerId` is a different
# field, so the ids are re-read from the original file.
test_id = pd.read_csv('test.csv', usecols=['id'])['id']
assert len(test_id) == len(X_test), "test ids and test features are out of sync"

# Probability of the positive class (`Exited` == 1) for every test row.
predicted_probabilities = svm_linear.predict_proba(X_test)[:, 1]
submission_df = pd.DataFrame({
    'id': test_id,
    'Exited': predicted_probabilities
})

submission_df.to_csv('submission.csv', index=False)
print("submission complete")

submission complete


In [None]:
# Preview the submission and store a copy on Google Drive.
print(submission_df.head())

drive_submission_path = '/content/drive/MyDrive/data/submission.csv'
submission_df.to_csv(drive_submission_path, index=False)
# Report the path that was actually written (the previous message named a
# file, 'submission_svm_linear.csv', that this cell never creates).
print(f"\n'{drive_submission_path}' 파일이 성공적으로 생성되었습니다!")

         id    Exited
0  15773898  0.021071
1  15782418  0.524222
2  15807120  0.117816
3  15808905  0.270063
4  15607314  0.300689

'submission_svm_linear.csv' 파일이 성공적으로 생성되었습니다!
