In [67]:
import pandas as pd
from sklearn.discriminant_analysis import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

from lazypredict.Supervised import LazyClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)  


df = pd.read_csv('D:\Hutson\learning-materials\AI_ML\AIMLDLCV_advance\Class\Datasets\Loan Data.csv',delimiter=';')

In [68]:
df.head()

Unnamed: 0,YOB,NKID,DEP,PHON,SINC,AES,DAINC,RES,DHVAL,DMORT,DOUTM,DOUTL,DOUTHP,DOUTCC,BAD
0,19.0,4.0,0.0,1,0.0,R,0.0,O,14464.0,4.0,0.0,0.0,0.0,0.0,0.0
1,41.0,2.0,0.0,1,0.0,P,36000.0,O,0.0,0.0,280.0,664.0,0.0,80.0,0.0
2,66.0,0.0,0.0,1,0.0,N,30000.0,N,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,51.0,2.0,0.0,1,0.0,P,464.0,O,24928.0,8464.0,584.0,320.0,0.0,60.0,0.0
4,65.0,0.0,0.0,1,0.0,P,15000.0,P,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [69]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1225 entries, 0 to 1224
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   YOB     1225 non-null   float64
 1   NKID    1225 non-null   float64
 2   DEP     1225 non-null   float64
 3   PHON    1225 non-null   int64  
 4   SINC    1225 non-null   float64
 5   AES     1225 non-null   object 
 6   DAINC   1225 non-null   float64
 7   RES     1225 non-null   object 
 8   DHVAL   1225 non-null   float64
 9   DMORT   1225 non-null   float64
 10  DOUTM   1225 non-null   float64
 11  DOUTL   1225 non-null   float64
 12  DOUTHP  1225 non-null   float64
 13  DOUTCC  1225 non-null   float64
 14  BAD     1225 non-null   float64
dtypes: float64(12), int64(1), object(2)
memory usage: 143.7+ KB


In [70]:
target = 'BAD'

x = df.drop(target, axis=1)
y = df[target]

In [71]:
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

In [72]:
nom_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('scaler', OneHotEncoder())
])

In [73]:
df.select_dtypes(include=['int64','float64']).columns.tolist()

['YOB',
 'NKID',
 'DEP',
 'PHON',
 'SINC',
 'DAINC',
 'DHVAL',
 'DMORT',
 'DOUTM',
 'DOUTL',
 'DOUTHP',
 'DOUTCC',
 'BAD']

In [74]:
from sklearn.compose import ColumnTransformer


preprocessor = ColumnTransformer(transformers=[
    ("num_features", num_transformer, ['YOB',
 'NKID',
 'DEP',
 'PHON',
 'SINC',
 'DAINC',
 'DHVAL',
 'DMORT',
 'DOUTM',
 'DOUTL',
 'DOUTHP',
 'DOUTCC'] ),
    ("nom_features", nom_transformer,df.select_dtypes(include=['object']).columns.tolist())
    ]
    )

In [75]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [76]:
# clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
# models,predictions = clf.fit(x_train, x_test, y_train, y_test)
# models

In [77]:
from sklearn.ensemble import RandomForestClassifier


reg = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', RandomForestClassifier())
                    ])



# processed_data = reg.fit_transform(x_train)
# pd.DataFrame(processed_data)
reg.fit(x_train, y_train)
y_pred = reg.predict(x_test)
# for i, j in zip(y_pred, y_test):
#     print("Predicted: ", i, "Actual: ", j)

In [78]:
from sklearn.metrics import classification_report


print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.76      0.92      0.83       181
         1.0       0.46      0.19      0.27        64

    accuracy                           0.73       245
   macro avg       0.61      0.56      0.55       245
weighted avg       0.68      0.73      0.69       245

