In [1]:
from typing import List
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
class DataCleaner(BaseEstimator, TransformerMixin):

    def __init__(self, age_column:str=None, age_threshold:int=None, column_to_drop=None, emp_length_column:str=None, emp_length_threshold:int=None):
        self.age_column = age_column
        self.age_threshold = age_threshold
        self.column_to_drop = column_to_drop
        self.emp_length_column = emp_length_column
        self.emp_length_threshold = emp_length_threshold

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if not isinstance(X, pd.DataFrame):
            raise ValueError("Input must be a pandas DataFrame.")

        # Check for nulls before dropping them
        if X.isnull().values.any():
            X = X.dropna()

        # Filter by age threshold if the column and threshold are set
        if self.age_column and self.age_threshold is not None:
            X[self.age_column] = X[self.age_column].astype(int)
            max_age = X[self.age_column].max()
            if max_age > self.age_threshold:
                X = X[X[self.age_column] <= self.age_threshold]

        # Filter by employment length
        if self.emp_length_column and self.emp_length_threshold is not None:
            X = X[X[self.emp_length_column] <= self.emp_length_threshold]

        # Drop the specified column
        if self.column_to_drop and self.column_to_drop in X.columns:
            X = X.drop(columns=[self.column_to_drop])

        # Reset the index of the DataFrame
        X = X.reset_index(drop=True)

        return X

In [2]:
df = pd.read_csv("../Data/credit_risk_dataset.csv")
df.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


In [3]:
data = df.dropna()

In [4]:
data.isnull().sum()

person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_status                   0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
dtype: int64

In [5]:
data_cleaner = DataCleaner(
                                        age_column='person_age',
                                        age_threshold=80,
                                        column_to_drop='index',
                                        emp_length_column='person_emp_length',
                                        emp_length_threshold=60
                                    )

In [6]:
data_transformed = data_cleaner.fit_transform(data)
data_transformed

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[self.age_column] = X[self.age_column].astype(int)


Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.10,N,2
1,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
2,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
3,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4
4,21,9900,OWN,2.0,VENTURE,A,2500,7.14,1,0.25,N,2
...,...,...,...,...,...,...,...,...,...,...,...,...
28626,57,53000,MORTGAGE,1.0,PERSONAL,C,5800,13.16,0,0.11,N,30
28627,54,120000,MORTGAGE,4.0,PERSONAL,A,17625,7.49,0,0.15,N,19
28628,65,76000,RENT,3.0,HOMEIMPROVEMENT,B,35000,10.99,1,0.46,N,28
28629,56,150000,MORTGAGE,5.0,PERSONAL,B,15000,11.48,0,0.10,N,26


In [8]:
max(data['person_age'])

144

In [7]:
max(data_transformed['person_age'])

80

In [10]:
max(data['person_emp_length'])

123.0

In [11]:
max(data_transformed['person_emp_length'])

41.0

In [12]:
class AgeGroupCategorizer(BaseEstimator, TransformerMixin):
    def __init__(self, input_col:str=None, output_col:str=None, bins=None, labels=None):
        self.input_col = input_col
        self.output_col = output_col
        self.bins = bins
        self.labels = labels

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if not isinstance(X, pd.DataFrame):
            raise ValueError("Input must be a pandas DataFrame.")
        
        # Ensure bins and labels are provided
        if self.bins is None or self.labels is None:
            raise ValueError("Both bins and labels must be provided.")
        
        # Check that the number of labels is one less than the number of bins
        if len(self.labels) != len(self.bins) - 1:
            raise ValueError("The number of labels must be equal to len(bins) - 1.")

        # Use pandas cut to assign age groups
        X[self.output_col] = pd.cut(X[self.input_col], bins=self.bins, labels=self.labels, right=False, include_lowest=True)
        X[self.output_col] = X[self.output_col].astype(str)

        return X

In [14]:
AGE_BINS = [20, 26, 36, 46, 56, 66, 80]
AGE_LABELS = ['20-25', '26-35', '36-45', '46-55', '56-65', '66-80']

age_group_categorizer = AgeGroupCategorizer(input_col='person_age', output_col='age_group', bins=AGE_BINS, labels=AGE_LABELS)

In [15]:
data_transformed = age_group_categorizer.fit_transform(data_transformed)
data_transformed.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,age_group
0,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2,20-25
1,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3,20-25
2,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2,20-25
3,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4,20-25
4,21,9900,OWN,2.0,VENTURE,A,2500,7.14,1,0.25,N,2,20-25


In [16]:
import numpy as np

In [17]:
class IncomeGroupCategorizer(BaseEstimator, TransformerMixin):
    def __init__(self, input_col:str=None, output_col:str=None):
        self.input_col = input_col
        self.output_col = output_col

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # Check if the input is a valid pandas DataFrame or Series
        if not isinstance(X, (pd.DataFrame, pd.Series)):
            raise ValueError("Input must be a pandas DataFrame or Series.")
        
        # If X is a scalar value (a single income value), wrap it in a DataFrame
        if isinstance(X, pd.Series) or np.isscalar(X):
            X = pd.DataFrame({self.input_col: [X]})
        
        # Create a new column for the income group categorization
        X[self.output_col] = np.select(
            [
                X[self.input_col].between(0, 25000),
                X[self.input_col].between(25001, 50000),
                X[self.input_col].between(50001, 75000),
                X[self.input_col].between(75001, 100000)
            ],
            [
                'low',
                'low-middle',
                'middle',
                'high-middle'
            ],
            default='high'
        )
        
        return X

In [19]:
income_group_categorizer = IncomeGroupCategorizer(input_col='person_income', output_col='income_group')
data_transformed  =income_group_categorizer.fit_transform(data_transformed)
data_transformed.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,age_group,income_group
0,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2,20-25,low
1,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3,20-25,low
2,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2,20-25,middle
3,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4,20-25,middle
4,21,9900,OWN,2.0,VENTURE,A,2500,7.14,1,0.25,N,2,20-25,low


In [21]:
class LoanAmountCategorizer(BaseEstimator, TransformerMixin):
    def __init__(self, input_col:str = None, output_col:str=None):
        self.input_col = input_col
        self.output_col = output_col

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # Check if the input is a valid pandas DataFrame
        if not isinstance(X, pd.DataFrame):
            raise ValueError("Input must be a pandas DataFrame.")

        # Create a new column for the loan amount group categorization
        X[self.output_col] = np.select(
            [
                X[self.input_col].between(0, 5000),
                X[self.input_col].between(5001, 10000),
                X[self.input_col].between(10001, 15000)
            ],
            [
                'small',
                'medium',
                'high'
            ],
            default='very_high'
        )
        
        return X

In [22]:
loan_amount_categorizer = LoanAmountCategorizer(input_col='loan_amnt',output_col='loan_amount_group')
data_transformed = loan_amount_categorizer.fit_transform(data_transformed)
data_transformed.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,age_group,income_group,loan_amount_group
0,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2,20-25,low,small
1,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3,20-25,low,medium
2,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2,20-25,middle,very_high
3,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4,20-25,middle,very_high
4,21,9900,OWN,2.0,VENTURE,A,2500,7.14,1,0.25,N,2,20-25,low,small


In [26]:
class RatioFeatureGenerator(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass  # No parameters needed for this transformer

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # Check if the input is a valid pandas DataFrame
        if not isinstance(X, pd.DataFrame):
            raise ValueError("Input must be a pandas DataFrame.")

        # Ensure required columns exist
        required_columns = ['loan_amnt', 'person_income', 'person_emp_length', 'loan_int_rate']
        for col in required_columns:
            if col not in X.columns:
                raise ValueError(f"Column '{col}' is not in the DataFrame.")

        # Create new ratio columns 
        X['loan_to_income_ratio'] = (X['loan_amnt'] / X['person_income'])
        X['loan_to_emp_length_ratio'] = (X['person_emp_length'] / X['loan_amnt'])
        X['int_rate_to_loan_amt_ratio'] = (X['loan_int_rate'] / X['loan_amnt'])

        return X

In [27]:
ratio_feature_generator = RatioFeatureGenerator()
data_transformed = ratio_feature_generator.fit_transform(data_transformed)
data_transformed.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,age_group,income_group,loan_amount_group,loan_to_income_ratio,loan_to_emp_length_ratio,int_rate_to_loan_amt_ratio
0,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2,20-25,low,small,0.104167,0.005,0.01114
1,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3,20-25,low,medium,0.572917,0.000182,0.00234
2,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2,20-25,middle,very_high,0.534351,0.000114,0.000435
3,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4,20-25,middle,very_high,0.643382,0.000229,0.000408
4,21,9900,OWN,2.0,VENTURE,A,2500,7.14,1,0.25,N,2,20-25,low,small,0.252525,0.0008,0.002856


In [None]:
from sklearn.preprocessing import OneHotEncoder
from 