In [2]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")


from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder, OrdinalEncoder


from sklearn.metrics import accuracy_score, classification_report 
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression

In [3]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler


In [None]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=RANDOM_STATE)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)


In [6]:
train_data = pd.read_csv('loan_train.csv')
test_data = pd.read_csv('loan_test.csv')


train_df = train_data.copy()
test_df = test_data.copy()


TARGET = "Status"

In [7]:
print(f"The train dataset has {train_df.shape[0]} rows and {train_df.shape[1]} columns.")
print(f"The test dataset has {test_df.shape[0]} rows and {test_df.shape[1]} columns.")

The train dataset has 614 rows and 12 columns.
The test dataset has 367 rows and 11 columns.


In [9]:
test_df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Applicant_Income,Coapplicant_Income,Loan_Amount,Term,Credit_History,Area
0,Male,Yes,0,Graduate,No,572000,0,11000000,360.0,1.0,Urban
1,Male,Yes,1,Graduate,No,307600,150000,12600000,360.0,1.0,Urban
2,Male,Yes,2,Graduate,No,500000,180000,20800000,360.0,1.0,Urban
3,Male,Yes,2,Graduate,No,234000,254600,10000000,360.0,,Urban
4,Male,No,0,Not Graduate,No,327600,0,7800000,360.0,1.0,Urban


In [10]:
train_df.isna().sum()

Gender                13
Married                3
Dependents            15
Education              0
Self_Employed         32
Applicant_Income       0
Coapplicant_Income     0
Loan_Amount            0
Term                  14
Credit_History        50
Area                   0
Status                 0
dtype: int64

In [11]:
def select_columns(df: pd.DataFrame, threshold: int = 10) -> list[str]:
    
    
   
    discrete_columns = []
    continuous_columns = []

    for column in df.columns:
       
        if df[column].nunique() <= threshold:
            
            discrete_columns.append(column)
        
        else:
            continuous_columns.append(column)


    return discrete_columns, continuous_columns

In [12]:
discrete_columns, continuous_columns = select_columns(train_df)

In [13]:
discrete_columns, continuous_columns

(['Gender',
  'Married',
  'Dependents',
  'Education',
  'Self_Employed',
  'Term',
  'Credit_History',
  'Area',
  'Status'],
 ['Applicant_Income', 'Coapplicant_Income', 'Loan_Amount'])

In [14]:
def replace_outliers(df: pd.DataFrame, inner_fence_multiplier: float = 1.5, outer_fence_multiplier: float = 3.0) -> pd.DataFrame:

    for column in continuous_columns:
        
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1

        inner_fence_low = Q1 - inner_fence_multiplier * IQR
        inner_fence_high = Q3 + inner_fence_multiplier * IQR
        outer_fence_low = Q1 - outer_fence_multiplier * IQR
        outer_fence_high = Q3 + outer_fence_multiplier * IQR

       
        outliers = (df[column] < inner_fence_low) | (df[column] > inner_fence_high) | \
                   (df[column] < outer_fence_low) | (df[column] > outer_fence_high)

       
        df.loc[outliers, column] = df[column].mean()
        return df

In [15]:
train_df = replace_outliers(train_df)
test_df = replace_outliers(test_df)

In [16]:
def replace_missing_values(df: pd.DataFrame) -> pd.DataFrame:
    
    subset_discrete_columns = [column for column in discrete_columns if column != TARGET]

    for column_discrete in subset_discrete_columns:
        
        if df[column_discrete].isnull().any():
            
            df[column_discrete] = df[column_discrete].fillna(df[column_discrete].mode()[0])

    for column_continuous in continuous_columns:
        
        if df[column_continuous].isnull().any():
            df[column_continuous] = df[column_continuous].fillna(df[column_continuous].mean())


    
    return df

In [17]:
train_df = replace_missing_values(train_df)
test_df = replace_missing_values(test_df)

In [18]:
label_encoder = LabelEncoder()
ordinal_encoder = OrdinalEncoder()


label_columns_train = ["Gender", "Married", "Education", "Self_Employed", "Area", "Status"]
label_columns_test = [column for column in label_columns_train if column != TARGET]
ordinal_columns = ["Dependents"]


train_df[label_columns_train] = train_df[label_columns_train].apply(label_encoder.fit_transform)
test_df[label_columns_test] = test_df[label_columns_test].apply(label_encoder.fit_transform)

train_df[ordinal_columns] = ordinal_encoder.fit_transform(train_df[ordinal_columns])
test_df[ordinal_columns] = ordinal_encoder.fit_transform(test_df[ordinal_columns])

In [19]:
X = train_df.drop(columns=[TARGET], axis=1)
y = train_df[TARGET]

In [20]:

RANDOM_STATE = 2


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=RANDOM_STATE)


print(f"Size of training data: {X_train.shape[0]} rows and {X_train.shape[1]} columns.")
print(f"Size of testing data: {X_test.shape[0]} rows and {X_test.shape[1]} columns.")

Size of training data: 429 rows and 11 columns.
Size of testing data: 185 rows and 11 columns.


In [21]:
scaler = StandardScaler()


X_train_scaled = scaler.fit_transform(X_train)

X_test_scaled = scaler.transform(X_test)

In [22]:
oversampler = RandomOverSampler(random_state=RANDOM_STATE)


X_train_oversampled, y_train_oversampled = oversampler.fit_resample(X_train_scaled, y_train)


undersampler = RandomUnderSampler(random_state=RANDOM_STATE)


X_train_undersampled, y_train_undersampled = undersampler.fit_resample(X_train_scaled, y_train)

In [29]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=RANDOM_STATE)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)


In [30]:
X_train_oversampled

array([[ 0.49343516,  0.75592895,  1.22292747, ...,  2.0824315 ,
         0.41873914, -0.02961563],
       [ 0.49343516, -1.32287566, -0.74568748, ...,  0.27656307,
         0.41873914,  1.24089471],
       [ 0.49343516,  0.75592895, -0.74568748, ...,  0.27656307,
         0.41873914, -0.02961563],
       ...,
       [ 0.49343516,  0.75592895,  1.22292747, ...,  0.27656307,
         0.41873914, -1.30012596],
       [-2.02660871, -1.32287566, -0.74568748, ...,  0.27656307,
         0.41873914, -0.02961563],
       [ 0.49343516,  0.75592895,  1.22292747, ...,  0.27656307,
         0.41873914, -1.30012596]], shape=(590, 11))

In [None]:
#model = LogisticRegression(solver = 'saga', max_iter = 500, random_state = 1)

In [None]:
#lr = LogisticRegression(solver = 'saga', max_iter = 500, random_state = 1)
#lr.fit(X_train_oversampled, y_train_oversampled)

In [31]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
rf.fit(X_train_resampled, y_train_resampled)


In [32]:
y_pred = rf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.31      1.00      0.48        58
           1       0.00      0.00      0.00       127

    accuracy                           0.31       185
   macro avg       0.16      0.50      0.24       185
weighted avg       0.10      0.31      0.15       185



In [34]:
example=[1,0,0.0,1,0,584900.0,0.0,15000000.0,360.0,1.0,1]
example_reshaped= np.array(example).reshape(1,-1)
sol= rf.predict(example_reshaped)
print(sol)

[0]


In [28]:
test_df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Applicant_Income,Coapplicant_Income,Loan_Amount,Term,Credit_History,Area
0,1,1,0.0,0,0,572000.0,0,11000000,360.0,1.0,2
1,1,1,1.0,0,0,307600.0,150000,12600000,360.0,1.0,2
2,1,1,2.0,0,0,500000.0,180000,20800000,360.0,1.0,2
3,1,1,2.0,0,0,234000.0,254600,10000000,360.0,1.0,2
4,1,0,0.0,1,0,327600.0,0,7800000,360.0,1.0,2
