In [1]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

### Data Preprocessing

In [2]:
loan_data = pd.read_csv(r'D:\FCI\Semster 1\Data Visualization\Fall 2025\Lectures\Lab6\Loan_Default.csv')
loan_data.drop(['ID', 'year'], axis=1, inplace =True)

In [3]:
categorical_features = loan_data.select_dtypes(include=['object']).columns.tolist()

In [4]:
high_missing_cols = ['rate_of_interest', 'Interest_rate_spread', 'Upfront_charges', 'property_value', 'LTV', 'dtir1']
Cols_to_be_imputed = ['term', 'income', 'age', 'loan_limit_freq', 'approv_in_adv_freq', 'loan_purpose_freq', 'Neg_ammortization_freq', 'submission_of_application_freq']
loan_data =  loan_data.drop(high_missing_cols, axis=1)

In [5]:
Ordinal_features = ['age']
Nominal_features = categorical_features.copy()
Nominal_features.remove('age')

In [6]:
enc = OrdinalEncoder()
loan_data[Ordinal_features] = enc.fit_transform(loan_data[Ordinal_features])

In [7]:
for c in Nominal_features:
    frequency = loan_data[c].value_counts(normalize=True)
    loan_data[c+'_freq'] = loan_data[c].map(frequency)

loan_data =  loan_data.drop(Nominal_features, axis=1)

In [8]:
for c in (Cols_to_be_imputed):
    loan_data[c].fillna(loan_data[c].mean(), inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  loan_data[c].fillna(loan_data[c].mean(), inplace = True)


### Feature Scaling

In [9]:
sc = StandardScaler()
trans_loan_data1 = pd.DataFrame(sc.fit_transform(loan_data), columns=loan_data.columns)  #fit and transforming StandardScaler the dataframe
trans_loan_data1

Unnamed: 0,loan_amount,term,income,Credit_Score,age,Status,loan_limit_freq,Gender_freq,approv_in_adv_freq,loan_type_freq,...,lump_sum_payment_freq,construction_type_freq,occupancy_type_freq,Secured_by_freq,total_units_freq,credit_type_freq,co-applicant_credit_type_freq,submission_of_application_freq,Region_freq,Security_Type_freq
0,-1.166980,0.425737,-0.829008,0.502357,-1.474340,1.748627,0.274622,-0.086254,0.432241,0.559687,...,0.152617,0.0149,0.275199,0.0149,0.122273,-0.032010,0.999233,0.741826,-0.099392,0.0149
1,-0.677607,0.425737,-0.314189,-1.275413,0.500233,1.748627,0.274622,0.773193,0.432241,-1.722629,...,-6.552344,0.0149,0.275199,0.0149,0.122273,-2.826994,-1.000767,0.741826,0.556135,0.0149
2,0.409890,0.425737,0.400838,1.158234,-0.816149,-0.571877,0.274622,0.773193,-2.327742,0.559687,...,0.152617,0.0149,0.275199,0.0149,0.122273,-0.032010,0.999233,0.741826,-0.099392,0.0149
3,0.681764,0.425737,0.782185,-0.973365,-0.157958,-0.571877,0.274622,0.773193,0.432241,0.559687,...,0.152617,0.0149,0.275199,0.0149,0.122273,-0.032010,0.999233,-1.349842,0.556135,0.0149
4,1.986759,0.425737,0.553377,-0.843916,-1.474340,-0.571877,0.274622,0.599543,-2.327742,0.559687,...,0.152617,0.0149,0.275199,0.0149,0.122273,0.245329,-1.000767,-1.349842,0.556135,0.0149
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148665,0.573014,-2.656410,0.143428,-0.352008,0.500233,-0.571877,0.274622,-0.086254,0.432241,0.559687,...,0.152617,0.0149,0.275199,0.0149,0.122273,0.701940,-1.000767,0.741826,-0.099392,0.0149
148666,1.388636,0.425737,0.029024,-1.128704,-1.474340,-0.571877,0.274622,0.773193,0.432241,0.559687,...,0.152617,0.0149,-3.595669,0.0149,-8.223628,0.701940,0.999233,-1.349842,-0.099392,0.0149
148667,0.627389,-2.656410,-0.009111,0.019080,-0.157958,-0.571877,0.274622,0.773193,0.432241,0.559687,...,0.152617,0.0149,0.275199,0.0149,0.122273,0.701940,-1.000767,-1.349842,0.556135,0.0149
148668,-0.731981,-2.656410,0.029024,0.321128,0.500233,-0.571877,0.274622,-1.992000,0.432241,0.559687,...,0.152617,0.0149,0.275199,0.0149,0.122273,-0.032010,-1.000767,0.741826,0.556135,0.0149


In [10]:
sc = MinMaxScaler()
trans_loan_data2 = pd.DataFrame(sc.fit_transform(loan_data), columns=loan_data.columns)  #fit and transforming StandardScaler the dataframe
trans_loan_data2

Unnamed: 0,loan_amount,term,income,Credit_Score,age,Status,loan_limit_freq,Gender_freq,approv_in_adv_freq,loan_type_freq,...,lump_sum_payment_freq,construction_type_freq,occupancy_type_freq,Secured_by_freq,total_units_freq,credit_type_freq,co-applicant_credit_type_freq,submission_of_application_freq,Region_freq,Security_Type_freq
0,0.028090,1.000000,0.003007,0.6450,0.000000,1.0,1.0,0.689191,1.0,1.000000,...,1.0,1.0,1.000000,1.0,1.0,0.792019,1.0,1.0,0.854314,1.0
1,0.053371,1.000000,0.008607,0.1300,0.500000,1.0,1.0,1.000000,1.0,0.061226,...,0.0,1.0,1.000000,1.0,1.0,0.000000,0.0,1.0,1.000000,1.0
2,0.109551,1.000000,0.016385,0.8350,0.166667,0.0,1.0,1.000000,0.0,1.000000,...,1.0,1.0,1.000000,1.0,1.0,0.792019,1.0,1.0,0.854314,1.0
3,0.123596,1.000000,0.020533,0.2175,0.333333,0.0,1.0,1.000000,1.0,1.000000,...,1.0,1.0,1.000000,1.0,1.0,0.792019,1.0,0.0,1.000000,1.0
4,0.191011,1.000000,0.018044,0.2550,0.000000,0.0,1.0,0.937202,0.0,1.000000,...,1.0,1.0,1.000000,1.0,1.0,0.870609,0.0,0.0,1.000000,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148665,0.117978,0.318182,0.013585,0.3975,0.500000,0.0,1.0,0.689191,1.0,1.000000,...,1.0,1.0,1.000000,1.0,1.0,1.000000,0.0,1.0,0.854314,1.0
148666,0.160112,1.000000,0.012341,0.1725,0.000000,0.0,1.0,1.000000,1.0,1.000000,...,1.0,1.0,0.031176,1.0,0.0,1.000000,1.0,0.0,0.854314,1.0
148667,0.120787,0.318182,0.011926,0.5050,0.333333,0.0,1.0,1.000000,1.0,1.000000,...,1.0,1.0,1.000000,1.0,1.0,1.000000,0.0,0.0,1.000000,1.0
148668,0.050562,0.318182,0.012341,0.5925,0.500000,0.0,1.0,0.000000,1.0,1.000000,...,1.0,1.0,1.000000,1.0,1.0,0.792019,0.0,1.0,1.000000,1.0


In [11]:
sc = MaxAbsScaler()
trans_loan_data1 = pd.DataFrame(sc.fit_transform(loan_data), columns=loan_data.columns)  #fit and transforming StandardScaler the dataframe
trans_loan_data1

Unnamed: 0,loan_amount,term,income,Credit_Score,age,Status,loan_limit_freq,Gender_freq,approv_in_adv_freq,loan_type_freq,...,lump_sum_payment_freq,construction_type_freq,occupancy_type_freq,Secured_by_freq,total_units_freq,credit_type_freq,co-applicant_credit_type_freq,submission_of_application_freq,Region_freq,Security_Type_freq
0,0.032574,1.000000,0.003007,0.842222,0.000000,1.0,1.0,0.889317,1.000000,1.000000,...,1.000000,1.0,1.000000,1.0,1.000000,0.858095,1.000000,1.000000,0.856722,1.0
1,0.057738,1.000000,0.008607,0.613333,0.500000,1.0,1.0,1.000000,1.000000,0.183454,...,0.023292,1.0,1.000000,1.0,1.000000,0.317702,0.998468,1.000000,1.000000,1.0
2,0.113659,1.000000,0.016385,0.926667,0.166667,0.0,1.0,1.000000,0.185691,1.000000,...,1.000000,1.0,1.000000,1.0,1.000000,0.858095,1.000000,1.000000,0.856722,1.0
3,0.127639,1.000000,0.020533,0.652222,0.333333,0.0,1.0,1.000000,1.000000,1.000000,...,1.000000,1.0,1.000000,1.0,1.000000,0.858095,1.000000,0.549565,1.000000,1.0
4,0.194743,1.000000,0.018044,0.668889,0.000000,0.0,1.0,0.977637,0.185691,1.000000,...,1.000000,1.0,1.000000,1.0,1.000000,0.911717,0.998468,0.549565,1.000000,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148665,0.122047,0.500000,0.013585,0.732222,0.500000,0.0,1.0,0.889317,1.000000,1.000000,...,1.000000,1.0,1.000000,1.0,1.000000,1.000000,0.998468,1.000000,0.856722,1.0
148666,0.163987,1.000000,0.012341,0.632222,0.000000,0.0,1.0,1.000000,1.000000,1.000000,...,1.000000,1.0,0.053111,1.0,0.002185,1.000000,1.000000,0.549565,0.856722,1.0
148667,0.124843,0.500000,0.011926,0.780000,0.333333,0.0,1.0,1.000000,1.000000,1.000000,...,1.000000,1.0,1.000000,1.0,1.000000,1.000000,0.998468,0.549565,1.000000,1.0
148668,0.054942,0.500000,0.012341,0.818889,0.500000,0.0,1.0,0.643886,1.000000,1.000000,...,1.000000,1.0,1.000000,1.0,1.000000,0.858095,0.998468,1.000000,1.000000,1.0


In [12]:
sc = RobustScaler()
trans_loan_data1 = pd.DataFrame(sc.fit_transform(loan_data), columns=loan_data.columns)  #fit and transforming StandardScaler the dataframe
trans_loan_data1

Unnamed: 0,loan_amount,term,income,Credit_Score,age,Status,loan_limit_freq,Gender_freq,approv_in_adv_freq,loan_type_freq,...,lump_sum_payment_freq,construction_type_freq,occupancy_type_freq,Secured_by_freq,total_units_freq,credit_type_freq,co-applicant_credit_type_freq,submission_of_application_freq,Region_freq,Security_Type_freq
0,-0.750000,0.0,-0.959459,0.293532,-1.0,1.0,0.0,-0.797952,0.00000,0.000000,...,0.000000,0.0,0.000000,0.0,0.000000,-0.377872,0.0,0.0,-1.0,0.0
1,-0.375000,0.0,-0.229730,-0.731343,0.5,1.0,0.0,0.202048,0.00000,-0.621585,...,-0.954476,0.0,0.000000,0.0,0.000000,-4.186009,-1.0,0.0,0.0,0.0
2,0.458333,0.0,0.783784,0.671642,-0.5,0.0,0.0,0.202048,-0.68678,0.000000,...,0.000000,0.0,0.000000,0.0,0.000000,-0.377872,0.0,0.0,-1.0,0.0
3,0.666667,0.0,1.324324,-0.557214,0.0,0.0,0.0,0.202048,0.00000,0.000000,...,0.000000,0.0,0.000000,0.0,0.000000,-0.377872,0.0,-1.0,0.0,0.0
4,1.666667,0.0,1.000000,-0.482587,-1.0,0.0,0.0,0.000000,-0.68678,0.000000,...,0.000000,0.0,0.000000,0.0,0.000000,0.000000,-1.0,-1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148665,0.583333,-180.0,0.418919,-0.199005,0.5,0.0,0.0,-0.797952,0.00000,0.000000,...,0.000000,0.0,0.000000,0.0,0.000000,0.622128,-1.0,0.0,-1.0,0.0
148666,1.208333,0.0,0.256757,-0.646766,-1.0,0.0,0.0,0.202048,0.00000,0.000000,...,0.000000,0.0,-0.880211,0.0,-0.983117,0.622128,0.0,-1.0,-1.0,0.0
148667,0.625000,-180.0,0.202703,0.014925,0.0,0.0,0.0,0.202048,0.00000,0.000000,...,0.000000,0.0,0.000000,0.0,0.000000,0.622128,-1.0,-1.0,0.0,0.0
148668,-0.416667,-180.0,0.256757,0.189055,0.5,0.0,0.0,-3.015362,0.00000,0.000000,...,0.000000,0.0,0.000000,0.0,0.000000,-0.377872,-1.0,0.0,0.0,0.0


In [13]:
for col in loan_data.columns:
    col_mean = loan_data[col].mean()
    col_max  = loan_data[col].max()
    col_min  = loan_data[col].min()
    
    loan_data[col] = (loan_data[col] - col_mean) / (col_max - col_min)
loan_data

Unnamed: 0,loan_amount,term,income,Credit_Score,age,Status,loan_limit_freq,Gender_freq,approv_in_adv_freq,loan_type_freq,...,lump_sum_payment_freq,construction_type_freq,occupancy_type_freq,Secured_by_freq,total_units_freq,credit_type_freq,co-applicant_credit_type_freq,submission_of_application_freq,Region_freq,Security_Type_freq
0,-0.060286,0.094180,-0.009017,0.145527,-0.373331,0.753555,0.068659,-0.031193,0.15661,0.230213,...,0.022762,0.000222,0.068879,0.000222,0.014651,-0.009071,0.499617,0.354658,-0.022089,0.000222
1,-0.035005,0.094180,-0.003418,-0.369473,0.126669,0.753555,0.068659,0.279616,0.15661,-0.708560,...,-0.977238,0.000222,0.068879,0.000222,0.014651,-0.801090,-0.500383,0.354658,0.123597,0.000222
2,0.021175,0.094180,0.004360,0.335527,-0.206665,-0.246445,0.068659,0.279616,-0.84339,0.230213,...,0.022762,0.000222,0.068879,0.000222,0.014651,-0.009071,0.499617,0.354658,-0.022089,0.000222
3,0.035220,0.094180,0.008508,-0.281973,-0.039998,-0.246445,0.068659,0.279616,0.15661,0.230213,...,0.022762,0.000222,0.068879,0.000222,0.014651,-0.009071,0.499617,-0.645342,0.123597,0.000222
4,0.102635,0.094180,0.006019,-0.244473,-0.373331,-0.246445,0.068659,0.216818,-0.84339,0.230213,...,0.022762,0.000222,0.068879,0.000222,0.014651,0.069519,-0.500383,-0.645342,0.123597,0.000222
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148665,0.029602,-0.587639,0.001560,-0.101973,0.126669,-0.246445,0.068659,-0.031193,0.15661,0.230213,...,0.022762,0.000222,0.068879,0.000222,0.014651,0.198910,-0.500383,0.354658,-0.022089,0.000222
148666,0.071737,0.094180,0.000316,-0.326973,-0.373331,-0.246445,0.068659,0.279616,0.15661,0.230213,...,0.022762,0.000222,-0.899946,0.000222,-0.985349,0.198910,0.499617,-0.645342,-0.022089,0.000222
148667,0.032411,-0.587639,-0.000099,0.005527,-0.039998,-0.246445,0.068659,0.279616,0.15661,0.230213,...,0.022762,0.000222,0.068879,0.000222,0.014651,0.198910,-0.500383,-0.645342,0.123597,0.000222
148668,-0.037814,-0.587639,0.000316,0.093027,0.126669,-0.246445,0.068659,-0.720384,0.15661,0.230213,...,0.022762,0.000222,0.068879,0.000222,0.014651,-0.009071,-0.500383,0.354658,0.123597,0.000222


### Data Resampling

In [14]:
Insurance_data = pd.read_csv(r'D:\FCI\Semster 1\Data Visualization\Fall 2025\Lectures\Lab6\Insurance.csv')

In [15]:
Insurance_data['Response'].value_counts() 

Response
0    319553
1     62601
Name: count, dtype: int64

In [16]:
X=Insurance_data.drop('Response', axis=1)
y=Insurance_data['Response']

In [17]:
# define oversampling strategy
Insurance_data2 = Insurance_data.copy()
oversample = RandomOverSampler(sampling_strategy=0.5) # ensure that the minority class was oversampled to have half the number of examples as the majority class
X_over, y_over = oversample.fit_resample(X, y)
Insurance_data2 = pd.DataFrame(X_over)
Insurance_data2['Response'] = y_over
Insurance_data2['Response'].value_counts()

Response
0    319553
1    159776
Name: count, dtype: int64

In [18]:
# define undersampling strategy
Insurance_data3 = Insurance_data.copy()
undersample = RandomUnderSampler(sampling_strategy=0.5)
X_under, y_under = undersample.fit_resample(X, y)
Insurance_data3 = pd.DataFrame(X_under)
Insurance_data3['Response'] = y_under
Insurance_data3['Response'].value_counts()

Response
0    125202
1     62601
Name: count, dtype: int64

In [19]:
# define oversampling strategy
Insurance_data4 = Insurance_data.copy()

smote = SMOTE(sampling_strategy='minority')
X_sm, y_sm = smote.fit_resample(X, y)
Insurance_data4 = pd.DataFrame(X_sm)
Insurance_data4['Response'] = y_sm
Insurance_data4['Response'].value_counts()

ValueError: could not convert string to float: 'Male'

In [20]:
categorical_features = Insurance_data.select_dtypes(include=['object']).columns.tolist()
Nominal_features = categorical_features.copy()
Nominal_features.remove('Vehicle_Age')

In [21]:
enc = OrdinalEncoder()
Insurance_data[['Vehicle_Age']] = enc.fit_transform(Insurance_data[['Vehicle_Age']])

In [22]:
for c in Nominal_features:
    frequency = Insurance_data[c].value_counts(normalize=True)
    Insurance_data[c+'_freq'] = Insurance_data[c].map(frequency)

Insurance_data =  Insurance_data.drop(Nominal_features, axis=1)

In [23]:
X=Insurance_data.drop('Response', axis=1)
y=Insurance_data['Response']
Insurance_data4 = Insurance_data.copy()

smote = SMOTE(sampling_strategy='minority')
X_sm, y_sm = smote.fit_resample(X, y)
Insurance_data4 = pd.DataFrame(X_sm)
Insurance_data4['Response'] = y_sm
Insurance_data4['Response'].value_counts()

Response
0    319553
1    319553
Name: count, dtype: int64