<a href="https://colab.research.google.com/github/jallenrobern/CCMACLRL_EXERCISES_COM221ML/blob/main/Exercise9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Exercise 9: Choosing the best performing model on a dataset

Instructions:

- Use the Dataset File to train your model
- Use the Test File to generate your results
- Use the Sample Submission file to generate the same format
- Use all classification models

Submit your results to:
https://www.kaggle.com/competitions/playground-series-s4e10/overview



In [1]:
import pandas as pd
import seaborn as sns

from matplotlib import pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

## Dataset File

In [2]:
dataset_url = 'https://github.com/robitussin/CCMACLRL_EXERCISES/blob/main/datasets/loan_approval/train.csv?raw=true'
df = pd.read_csv(dataset_url)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58645 entries, 0 to 58644
Data columns (total 13 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   id                          58645 non-null  int64  
 1   person_age                  58645 non-null  int64  
 2   person_income               58645 non-null  int64  
 3   person_home_ownership       58645 non-null  object 
 4   person_emp_length           58645 non-null  float64
 5   loan_intent                 58645 non-null  object 
 6   loan_grade                  58645 non-null  object 
 7   loan_amnt                   58645 non-null  int64  
 8   loan_int_rate               58645 non-null  float64
 9   loan_percent_income         58645 non-null  float64
 10  cb_person_default_on_file   58645 non-null  object 
 11  cb_person_cred_hist_length  58645 non-null  int64  
 12  loan_status                 58645 non-null  int64  
dtypes: float64(3), int64(6), object

## Test File

In [6]:
test_url = 'https://github.com/robitussin/CCMACLRL_EXERCISES/blob/main/datasets/loan_approval/test.csv?raw=true'
dt=pd.read_csv(test_url)

In [7]:
dt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39098 entries, 0 to 39097
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   id                          39098 non-null  int64  
 1   person_age                  39098 non-null  int64  
 2   person_income               39098 non-null  int64  
 3   person_home_ownership       39098 non-null  object 
 4   person_emp_length           39098 non-null  float64
 5   loan_intent                 39098 non-null  object 
 6   loan_grade                  39098 non-null  object 
 7   loan_amnt                   39098 non-null  int64  
 8   loan_int_rate               39098 non-null  float64
 9   loan_percent_income         39098 non-null  float64
 10  cb_person_default_on_file   39098 non-null  object 
 11  cb_person_cred_hist_length  39098 non-null  int64  
dtypes: float64(3), int64(5), object(4)
memory usage: 3.6+ MB


## Sample Submission File

In [4]:
sample_submission_url ='https://github.com/robitussin/CCMACLRL_EXERCISES/blob/main/datasets/loan_approval/sample_submission.csv?raw=true'

sf=pd.read_csv(sample_submission_url)

In [5]:
sf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39098 entries, 0 to 39097
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           39098 non-null  int64  
 1   loan_status  39098 non-null  float64
dtypes: float64(1), int64(1)
memory usage: 611.0 KB


In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Map categorical columns to numerical values
mapping_dict = {
    'person_home_ownership': {"RENT": 1, "OWN": 0},
    'loan_intent': {"PERSONAL": 1, "MORTGAGE": 2, "MEDICAL": 3, "VENTURE": 4, "EDUCATION": 5},
    'loan_grade': {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5},
    'cb_person_default_on_file': {"Y": 1, "N": 0}
}

for column, mapping in mapping_dict.items():
    df[column] = df[column].map(mapping)

# Fill missing values with mode
columns_with_null = ['person_home_ownership', 'loan_intent', 'loan_grade']
for column in columns_with_null:
    df[column].fillna(df[column].mode()[0], inplace=True)

# Prepare features X and target y
X = df.drop(columns=['id', 'loan_status']).values
y = df['loan_status'].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Check if the mapping and missing value filling was successful
print(df.isnull().sum())


id                            0
person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
loan_status                   0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(df[column].mode()[0], inplace=True)


## 1. Train a KNN Classifier

In [9]:
df.sample(15)

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
32348,32348,30,50000,1.0,4.0,5.0,3.0,6000,13.98,0.12,1,9,0
17214,17214,27,70000,1.0,2.0,3.0,2.0,12000,11.89,0.17,0,10,0
9926,9926,37,100000,1.0,0.0,4.0,2.0,15000,10.99,0.15,0,14,0
25863,25863,26,69000,1.0,10.0,3.0,1.0,10000,8.94,0.14,0,4,0
2254,2254,22,36996,1.0,2.0,5.0,2.0,12000,11.58,0.33,0,2,1
30499,30499,25,35000,1.0,9.0,5.0,1.0,11000,7.88,0.31,0,3,0
3758,3758,28,110000,1.0,1.0,5.0,5.0,5000,17.51,0.05,0,8,0
5228,5228,24,69600,1.0,8.0,5.0,2.0,15000,11.99,0.22,0,4,0
46621,46621,35,50400,1.0,11.0,5.0,4.0,12000,15.2,0.24,0,6,0
29892,29892,23,24996,1.0,4.0,5.0,2.0,6000,10.99,0.24,0,2,0


In [10]:
score_list = {}

In [11]:
from sklearn.neighbors import KNeighborsClassifier

KNN = KNeighborsClassifier(n_neighbors=22)
KNN.fit(X_train,y_train)
knn_score = KNN.score(X_test,y_test)
print(f"Score is {knn_score}")

Score is 0.8936569284983518


- Perform cross validation

In [12]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(KNN, X, y, cv=10)
scores

array([0.89326513, 0.89462916, 0.89428815, 0.89360614, 0.89445865,
       0.89427012, 0.89222374, 0.89648704, 0.88864256, 0.89768076])

In [13]:
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))
score_list["KNN Classifier"] = scores.mean()

0.89 accuracy with a standard deviation of 0.00


## 2. Train a Logistic Regression Classifier

In [14]:
from sklearn.linear_model import LogisticRegression

LR = LogisticRegression()
LR.fit(X_train,y_train)

lr_score = LR.score(X_test,y_test)
print(f"Score is {lr_score}")

Score is 0.8802432647493463


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


- Perform cross validation

In [15]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(LR, X, y, cv=10)
scores

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

array([0.88098892, 0.8801364 , 0.88184143, 0.88354646, 0.88269395,
       0.87755798, 0.87943383, 0.88250341, 0.88045703, 0.88267394])

In [16]:
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))
score_list["Logistic Regression"] = scores.mean()

0.88 accuracy with a standard deviation of 0.00


## 3. Train a Naive Bayes Classifier

In [17]:
from sklearn.naive_bayes import GaussianNB

nbc = GaussianNB()
nbc.fit(X_train,y_train)
nbc_score = nbc.score(X_test,y_test)

print(f"Score is {nbc_score}")

Score is 0.8827441173127203


- Perform cross validation

In [18]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(nbc, X, y, cv=10)
scores

array([0.88218244, 0.87536232, 0.88439898, 0.88150043, 0.88422847,
       0.88011596, 0.88369714, 0.88540246, 0.88489086, 0.88096862])

In [19]:
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))
score_list["GaussianNBC"] = scores.mean()

0.88 accuracy with a standard deviation of 0.00


## 4. Train a SVM Classifier

In [20]:
from sklearn.svm import SVC

svc = SVC()
svc.fit(X_train,y_train)
svc_score = svc.score(X_test,y_test)

print(f"Score is {svc_score}")

Score is 0.8578492667954984


- Perform cross validation

In [21]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(svc, X, y, cv=5)
scores

array([0.85761787, 0.85761787, 0.85761787, 0.85753261, 0.85761787])

## 5. Train a Decision Tree Classifier

In [22]:
from sklearn.tree import DecisionTreeClassifier

dtc = DecisionTreeClassifier()
dtc.fit(X_train,y_train)
dtc_score = dtc.score(X_test,y_test)

print(f"Score is {dtc_score}")

Score is 0.8867795839490735


- Perform cross validation

In [23]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(dtc, X, y, cv=10)
scores

array([0.89343564, 0.88371697, 0.88661552, 0.8855925 , 0.88746803,
       0.88625512, 0.88591405, 0.88693724, 0.87841064, 0.88472033])

## 6. Train a Random Forest Classifier

In [24]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators=50,random_state=1)
rfc.fit(X_train,y_train)
rfc_score = rfc.score(X_test,y_test)
score_list["RFC"]=rfc_score

print(f"Score is {rfc_score}")

Score is 0.928782539502103


## 7. Compare all the performance of all classification models

In [25]:
score_list = list(score_list.items())
for alg, score in score_list:
    print(f"{alg} Score is {str(score)[:4]} ")

KNN Classifier Score is 0.89 
Logistic Regression Score is 0.88 
GaussianNBC Score is 0.88 
RFC Score is 0.92 


In [26]:
# Define the mapping dictionary
mapping_dict_dt = {
    'person_home_ownership': {"RENT": 1, "OWN": 0},
    'loan_intent': { "PERSONAL": 1, "MORTGAGE": 2, "MEDICAL": 3,
        "VENTURE": 4, "EDUCATION": 5, "HOMEIMPROVEMENT": 6,
        "DEBTCONSOLIDATION": 7 },
    'loan_grade': {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6},
    'cb_person_default_on_file': {"Y": 1, "N": 0}
}

# Apply the mappings
for column, mapping in mapping_dict_dt.items():
    dt[column] = dt[column].map(mapping)

## 9. Generate Submission File

Choose the model that has the best performance to generate a submission file.

In [27]:
id = dt.pop('id')
y_pred = rfc.predict(dt)

# Create a submission DataFrame
submission_df = pd.DataFrame({
    'id': id,
    'loan_status': y_pred
})

# Save the submission DataFrame to a CSV file
submission_df.to_csv('submission_file.csv', index=False)
print("Submission file created: submission_file.csv")



Submission file created: submission_file.csv
