## Exercise 9: Choosing the best performing model on a dataset

Instructions:

- Use the Dataset File to train your model
- Use the Test File to generate your results
- Use the Sample Submission file to generate the same format
- Use all classification models

Submit your results to:
https://www.kaggle.com/competitions/playground-series-s4e10/overview



In [1]:
import pandas as pd
import seaborn as sns
import numpy as np

from matplotlib import pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

## Dataset File

In [2]:
dataset_url = 'https://github.com/robitussin/CCMACLRL_EXERCISES/blob/main/datasets/loan_approval/train.csv?raw=true'
df = pd.read_csv(dataset_url)

## Test File

In [52]:
test_url = 'https://github.com/robitussin/CCMACLRL_EXERCISES/blob/main/datasets/loan_approval/test.csv?raw=true'
dt=pd.read_csv(test_url)

In [4]:
dt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39098 entries, 0 to 39097
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   id                          39098 non-null  int64  
 1   person_age                  39098 non-null  int64  
 2   person_income               39098 non-null  int64  
 3   person_home_ownership       39098 non-null  object 
 4   person_emp_length           39098 non-null  float64
 5   loan_intent                 39098 non-null  object 
 6   loan_grade                  39098 non-null  object 
 7   loan_amnt                   39098 non-null  int64  
 8   loan_int_rate               39098 non-null  float64
 9   loan_percent_income         39098 non-null  float64
 10  cb_person_default_on_file   39098 non-null  object 
 11  cb_person_cred_hist_length  39098 non-null  int64  
dtypes: float64(3), int64(5), object(4)
memory usage: 3.6+ MB


## Sample Submission File

In [5]:
sample_submission_url ='https://github.com/robitussin/CCMACLRL_EXERCISES/blob/main/datasets/loan_approval/sample_submission.csv?raw=true'

sf=pd.read_csv(sample_submission_url)

In [6]:
sf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39098 entries, 0 to 39097
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           39098 non-null  int64  
 1   loan_status  39098 non-null  float64
dtypes: float64(1), int64(1)
memory usage: 611.0 KB


## Data Preprocessing

In [7]:
df.head()

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
0,0,37,35000,RENT,0.0,EDUCATION,B,6000,11.49,0.17,N,14,0
1,1,22,56000,OWN,6.0,MEDICAL,C,4000,13.35,0.07,N,2,0
2,2,29,28800,OWN,8.0,PERSONAL,A,6000,8.9,0.21,N,10,0
3,3,30,70000,RENT,14.0,VENTURE,B,12000,11.11,0.17,N,5,0
4,4,22,60000,RENT,2.0,MEDICAL,A,6000,6.92,0.1,N,3,0


In [8]:
dt.head()

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,58645,23,69000,RENT,3.0,HOMEIMPROVEMENT,F,25000,15.76,0.36,N,2
1,58646,26,96000,MORTGAGE,6.0,PERSONAL,C,10000,12.68,0.1,Y,4
2,58647,26,30000,RENT,5.0,VENTURE,E,4000,17.19,0.13,Y,2
3,58648,33,50000,RENT,4.0,DEBTCONSOLIDATION,A,7000,8.9,0.14,N,7
4,58649,26,102000,MORTGAGE,8.0,HOMEIMPROVEMENT,D,15000,16.32,0.15,Y,4


In [9]:
df.drop(['id'], axis=1, inplace=True)

### Applying label encoder for columns `loan_intent`, `loan_grade`, and `person_home_ownership`

In [10]:
from sklearn.preprocessing import LabelEncoder
pd_dummies = pd.get_dummies(df['cb_person_default_on_file'], prefix='cb_person_default_on_file')
le = LabelEncoder()
df['person_home_ownership'] = le.fit_transform(df['person_home_ownership'])
df['loan_intent'] = le.fit_transform(df['loan_intent'])
df['loan_grade'] = le.fit_transform(df['loan_grade'])

### Applying dummies to `cb_person_default_on_file`

In [11]:
df = pd.concat([df, pd_dummies], axis=1)
df['cb_person_default_on_file_N'] = df['cb_person_default_on_file'].map({'N': 1, 'Y': 0})
df['cb_person_default_on_file_Y'] = df['cb_person_default_on_file'].map({'Y': 1, 'N': 0})
df.drop('cb_person_default_on_file', axis=1, inplace=True)


In [12]:
df.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,loan_status,cb_person_default_on_file_N,cb_person_default_on_file_Y
0,37,35000,3,0.0,1,1,6000,11.49,0.17,14,0,1,0
1,22,56000,2,6.0,3,2,4000,13.35,0.07,2,0,1,0
2,29,28800,2,8.0,4,0,6000,8.9,0.21,10,0,1,0
3,30,70000,3,14.0,5,1,12000,11.11,0.17,5,0,1,0
4,22,60000,3,2.0,3,0,6000,6.92,0.1,3,0,1,0


### Splitting the dataset to train and test

In [13]:
X = df.drop('loan_status', axis=1)
y = df['loan_status']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Applying smote

In [14]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [15]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_smote_scaled = scaler.fit_transform(X_train_smote)
X_test_scaled = scaler.transform(X_test)

In [16]:
X_train_smote_scaled

array([[ 1.5371648 , -0.22511535,  0.76641975, ...,  2.48988162,
        -1.53037024,  2.49372642],
       [-0.0451018 , -0.42688216,  0.76641975, ...,  0.90591497,
        -1.53037024,  2.49372642],
       [-0.39671661,  0.36831644, -1.44661969, ..., -0.67805168,
         0.65343665, -0.4010063 ],
       ...,
       [-0.57252401,  1.01583954,  0.02873994, ..., -0.41405724,
        -1.53037024,  2.49372642],
       [ 0.306513  , -0.0767574 ,  0.76641975, ..., -0.1500628 ,
         0.65343665, -0.4010063 ],
       [ 0.8339352 ,  0.07160055,  0.76641975, ...,  0.90591497,
        -1.53037024, -0.4010063 ]])

In [17]:
X_test_scaled

array([[-0.74833141, -0.52183124,  0.76641975, ..., -0.41405724,
         0.65343665, -0.4010063 ],
       [-0.39671661,  0.07160055, -1.44661969, ..., -0.41405724,
         0.65343665, -0.4010063 ],
       [ 0.8339352 ,  0.25853156,  0.76641975, ...,  0.11393164,
         0.65343665, -0.4010063 ],
       ...,
       [ 0.1307056 , -0.16577217,  0.76641975, ...,  0.11393164,
         0.65343665, -0.4010063 ],
       [-0.0451018 , -0.58117442,  0.76641975, ..., -0.1500628 ,
         0.65343665, -0.4010063 ],
       [-0.92413881,  0.36819775,  0.76641975, ..., -0.41405724,
         0.65343665, -0.4010063 ]])

## 1. Train a KNN Classifier

In [19]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_smote_scaled, y_train_smote)

# Evaluate the model
knn_score = knn.score(X_test_scaled, y_test)
print(f"KNN Classifier Accuracy: {knn_score}")

KNN Classifier Accuracy: 0.8760337624690937


- Perform cross validation

In [20]:
from sklearn.model_selection import cross_val_score, KFold

# Define the KNN model
knn = KNeighborsClassifier(n_neighbors=5)

# Set up K-Fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)  # 5 folds

# Perform cross-validation
cv_scores = cross_val_score(knn, X_train_smote_scaled, y_train_smote, cv=kf)

# Print the accuracy for each fold and the mean accuracy
print(f"K-Fold Accuracy for each fold: {cv_scores}")
print(f"Mean K-Fold Accuracy: {np.mean(cv_scores)}")


K-Fold Accuracy for each fold: [0.90506093 0.90325188 0.90008083 0.90312753 0.90524156]
Mean K-Fold Accuracy: 0.9033525454090642


## 2. Train a Logistic Regression Classifier

In [21]:
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_smote_scaled, y_train_smote)

# Evaluate the model
log_reg_score = log_reg.score(X_test_scaled, y_test)
print(f"Logistic Regression Accuracy: {log_reg_score}")

Logistic Regression Accuracy: 0.8253048000682071


- Perform cross validation

In [22]:
log_reg = LogisticRegression(max_iter=1000)

# Set up K-Fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)  # 5 folds

# Perform cross-validation
cv_scores = cross_val_score(log_reg, X_train_smote_scaled, y_train_smote, cv=kf)

# Print the accuracy for each fold and the mean accuracy
print(f"K-Fold Accuracy for each fold (Logistic Regression): {cv_scores}")
print(f"Mean K-Fold Accuracy (Logistic Regression): {np.mean(cv_scores)}")

K-Fold Accuracy for each fold (Logistic Regression): [0.83841084 0.83802773 0.8372816  0.83647329 0.8403283 ]
Mean K-Fold Accuracy (Logistic Regression): 0.8381043535305219


## 3. Train a Naive Bayes Classifier

In [23]:
nb = GaussianNB()
nb.fit(X_train_smote_scaled, y_train_smote)

nb_score = nb.score(X_test_scaled, y_test)
print(f"Naive Bayes Accuracy: {nb_score}")

Naive Bayes Accuracy: 0.7837837837837838


- Perform cross validation

In [24]:
nb = GaussianNB()

# Set up K-Fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)  # 5 folds

# Perform cross-validation
cv_scores = cross_val_score(nb, X_train_smote_scaled, y_train_smote, cv=kf)

# Print the accuracy for each fold and the mean accuracy
print(f"K-Fold Accuracy for each fold (Naive Bayes): {cv_scores}")
print(f"Mean K-Fold Accuracy (Naive Bayes): {np.mean(cv_scores)}")

K-Fold Accuracy for each fold (Naive Bayes): [0.81665009 0.81564385 0.81477336 0.81583038 0.81421377]
Mean K-Fold Accuracy (Naive Bayes): 0.8154222887510161


## 4. Train a SVM Classifier

In [25]:
svm = SVC()
svm.fit(X_train_smote_scaled, y_train_smote)

# Evaluate the model
svm_score = svm.score(X_test_scaled, y_test)
print(f"SVM Classifier Accuracy: {svm_score}")

SVM Classifier Accuracy: 0.8982010401568761


- Perform cross validation

In [26]:
svm = SVC()

# Set up K-Fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)  # 5 folds

# Perform cross-validation
cv_scores = cross_val_score(svm, X_train_smote_scaled, y_train_smote, cv=kf)

# Print the accuracy for each fold and the mean accuracy
print(f"K-Fold Accuracy for each fold (SVM): {cv_scores}")
print(f"Mean K-Fold Accuracy (SVM): {np.mean(cv_scores)}")

K-Fold Accuracy for each fold (SVM): [0.89598359 0.89268171 0.89125163 0.88957284 0.89597712]
Mean K-Fold Accuracy (SVM): 0.8930933783052145


## 5. Train a Decision Tree Classifier

In [27]:
dtree = DecisionTreeClassifier(random_state=42)
dtree.fit(X_train_smote_scaled, y_train_smote)

# Evaluate the model
dtree_score = dtree.score(X_test_scaled, y_test)
print(f"Decision Tree Classifier Accuracy: {dtree_score}")

Decision Tree Classifier Accuracy: 0.8870321425526473


- Perform cross validation

In [28]:
dtree = DecisionTreeClassifier(random_state=42)

# Set up K-Fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)  # 5 folds

# Perform cross-validation
cv_scores = cross_val_score(dtree, X_train_smote_scaled, y_train_smote, cv=kf)

# Print the accuracy for each fold and the mean accuracy
print(f"K-Fold Accuracy for each fold (Decision Tree): {cv_scores}")
print(f"Mean K-Fold Accuracy (Decision Tree): {np.mean(cv_scores)}")

K-Fold Accuracy for each fold (Decision Tree): [0.92197215 0.92520052 0.9208481  0.91805012 0.91624697]
Mean K-Fold Accuracy (Decision Tree): 0.9204635705758101


## 6. Train a Random Forest Classifier

In [35]:
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_smote_scaled, y_train_smote)

# Evaluate the model
rf_score = rf.score(X_test_scaled, y_test)
print(f"Random Forest Classifier Accuracy: {rf_score}")

Random Forest Classifier Accuracy: 0.9325603205729389


- Perform cross validation

In [36]:
rf1 = RandomForestClassifier(random_state=42)

# Set up K-Fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)  # 5 folds

# Perform cross-validation
cv_scores = cross_val_score(rf1, X_train_smote_scaled, y_train_smote, cv=kf)

# Print the accuracy for each fold and the mean accuracy
print(f"K-Fold Accuracy for each fold (Random Forest): {cv_scores}")
print(f"Mean K-Fold Accuracy (Random Forest): {np.mean(cv_scores)}")

K-Fold Accuracy for each fold (Random Forest): [0.94957722 0.9517503  0.95193683 0.94988497 0.95019586]
Mean K-Fold Accuracy (Random Forest): 0.9506690346674178


## 7. Compare all the performance of all classification models

In [31]:
model_performance = {
    'KNN': knn_score,
    'Logistic Regression': log_reg_score,
    'Naive Bayes': nb_score,
    'SVM': svm_score,
    'Decision Tree': dtree_score,
    'Random Forest': rf_score
}

performance_df = pd.DataFrame(list(model_performance.items()), columns=['Model', 'Accuracy'])

performance_df = performance_df.sort_values(by='Accuracy', ascending=False)

print(performance_df)

                 Model  Accuracy
5        Random Forest  0.932560
3                  SVM  0.898201
4        Decision Tree  0.887032
0                  KNN  0.876034
1  Logistic Regression  0.825305
2          Naive Bayes  0.783784


## Do Hyperparamer tuning for the greatest model

In [32]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],  # number of trees in the forest
    'max_features': ['auto', 'sqrt'],  # number of features to consider at each split
    'max_depth': [10, 20, 30, 40, 50, None],  # max depth of the tree
    'min_samples_split': [2, 5, 10],  # minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],  # minimum number of samples required to be at a leaf node
    'bootstrap': [True, False]  # whether bootstrap samples are used when building trees
}

rf = RandomForestClassifier(random_state=42)

random_search = RandomizedSearchCV(estimator=rf,
                                   param_distributions=param_grid,
                                   n_iter=100,  # number of parameter settings sampled
                                   scoring='accuracy',
                                   cv=5,  # 5-fold cross-validation
                                   verbose=2,  # to display progress
                                   random_state=42,
                                   n_jobs=-1)  # use all available CPU cores

random_search.fit(X_train_smote_scaled, y_train_smote)

best_rf = random_search.best_estimator_
print(f"Best Hyperparameters: {random_search.best_params_}")

y_pred = best_rf.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy after Hyperparameter Tuning: {accuracy}")
print(classification_report(y_test, y_pred))


Fitting 5 folds for each of 100 candidates, totalling 500 fits


KeyboardInterrupt: 

## sobrang tagal ng hyper params tuning ni hinto q na

## 9. Generate Submission File

Choose the model that has the best performance to generate a submission file.

## Doing preprocessing to test values

In [57]:
from sklearn.preprocessing import LabelEncoder
pd_dummies = pd.get_dummies(dt['cb_person_default_on_file'], prefix='cb_person_default_on_file')

dt.drop(['id'], axis=1, inplace=True)

le = LabelEncoder()
dt['person_home_ownership'] = le.fit_transform(dt['person_home_ownership'])
dt['loan_intent'] = le.fit_transform(dt['loan_intent'])
dt['loan_grade'] = le.fit_transform(dt['loan_grade'])

dt = pd.concat([dt, pd_dummies], axis=1)
dt['cb_person_default_on_file_N'] = dt['cb_person_default_on_file'].map({'N': 1, 'Y': 0})
dt['cb_person_default_on_file_Y'] = dt['cb_person_default_on_file'].map({'Y': 1, 'N': 0})
dt.drop('cb_person_default_on_file', axis=1, inplace=True)

scaler1 = StandardScaler()
dt = scaler1.fit_transform(dt)


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [56]:
# id = sf.pop('id')
y_pred = rf.predict(dt)
submission_df = pd.DataFrame({
    'id': id,
    'loan_status': y_pred
})

# Save the submission DataFrame to a CSV file
submission_df.to_csv('submission_file_randomforesttanign.csv', index=False)
print("Submission file created: submission_file_randomforesttanigna.csv")



Submission file created: submission_file_randomforesttanigna.csv
