In [1]:
# lets load the data from train.csv and test.csv files
import pandas as pd
import numpy as np

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')


  train = pd.read_csv('train.csv')


In [2]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 28 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   ID                        100000 non-null  object 
 1   Customer_ID               100000 non-null  object 
 2   Month                     100000 non-null  object 
 3   Name                      90015 non-null   object 
 4   Age                       100000 non-null  object 
 5   SSN                       100000 non-null  object 
 6   Occupation                100000 non-null  object 
 7   Annual_Income             100000 non-null  object 
 8   Monthly_Inhand_Salary     84998 non-null   float64
 9   Num_Bank_Accounts         100000 non-null  int64  
 10  Num_Credit_Card           100000 non-null  int64  
 11  Interest_Rate             100000 non-null  int64  
 12  Num_of_Loan               100000 non-null  object 
 13  Type_of_Loan              88592 non-null   ob

In [3]:
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd

class NumericCleaner(BaseEstimator, TransformerMixin):
   def __init__(self):
      self.columns_to_clean =  ['Age', 'Annual_Income', 'Num_of_Loan', 'Num_of_Delayed_Payment',
       'Changed_Credit_Limit', 'Outstanding_Debt', 'Amount_invested_monthly', 'Monthly_Balance']

   def fit(self, X, y=None):
      # No fitting necessary for this transformer
      return self

   def transform(self, X):
      # Make a copy of the DataFrame to avoid modifying the original data
      X = X.copy()
            
      # Apply cleaning function to each specified column
      for column in self.columns_to_clean:
         X[column] = pd.to_numeric(X[column].str.replace(r'[^0-9.-]', '', regex=True), errors='coerce')

        
      return X

In [4]:
columns_to_clean = [
    'Age', 'Annual_Income', 'Num_of_Loan', 'Num_of_Delayed_Payment',
    'Changed_Credit_Limit', 'Outstanding_Debt', 'Amount_invested_monthly', 'Monthly_Balance'
]
# Create an instance of the transformer
numeric_cleaner = NumericCleaner()

train= numeric_cleaner.fit_transform(train)



In [5]:
hi_lo_bounds = {
    # Numerical Columns
    'Age': {'lo': 0, 'hi': 100},
    'Annual_Income': {'lo': 5000, 'hi': 500000},
    'Monthly_Inhand_Salary': {'lo': 500, 'hi': 30000},
    'Num_Bank_Accounts': {'lo': 0, 'hi': 20},
    'Num_Credit_Card': {'lo': 0, 'hi': 15},
    'Interest_Rate': {'lo': 0, 'hi': 50},
    'Delay_from_due_date': {'lo': 0, 'hi': 100},
    'Num_Credit_Inquiries': {'lo': 0, 'hi': 30},
    'Changed_Credit_Limit': {'lo': 0, 'hi': 30},  # Bounds after converting to numeric
    'Num_of_Loan': {'lo': 0, 'hi': 10},  # Bounds after converting to numeric
    
    # Categorical Columns (ordered)
    'Credit_Mix': {'lo': 0, 'hi': 2},  # Assuming "Bad" = 0, "Standard" = 1, "Good" = 2

    # Categorical Column (nominal)
    'Occupation': {'lo': 0, 'hi': 14}  # No bounds, only impute missing values
}


In [6]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.utils.validation import check_is_fitted
import pandas as pd

class CreditHistoryTransformer(BaseEstimator, TransformerMixin):
   def __init__(self):
      self.column = 'Credit_History_Age'
        

   def fit(self, X, y=None): 
      assert isinstance(X, pd.DataFrame) and self.column in X.columns
      #set number of features
      self._n_features = X.shape[1]
      return self
   
   def transform(self, X):
      check_is_fitted(self, '_n_features')
      assert isinstance(X, pd.DataFrame) and self.column in X.columns   
      #now lets transformt the data 
      # we need to convert the verbal description to numerical values
      # it's in this form '22 Years and 1 Months'
      # first we change the column to string type 
      #tthen process the string to get the numerical values
      # then multiply the years by 12 and add the months
      X[self.column] = X[self.column].astype(str)

      years = X[self.column].str.extract(r'(\d+)\s*Years?')[0].fillna(0).astype(int)
      months = X[self.column].str.extract(r'(\d+)\s*Months?')[0].fillna(0).astype(int)

    # Calculate total months
      X[self.column] = years * 12 + months
      return X

      

   
     


In [7]:
creditHistory = CreditHistoryTransformer()
creditHistory.fit(train)
traim = creditHistory.transform(train)


In [8]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.utils.validation import check_is_fitted
import pandas as pd

class PaymentBehaviourTransformer(BaseEstimator, TransformerMixin):
   def __init__(self):
      self.charge_mapping = {'Low_spent': 0, 'Medium_spent': 1, 'High_spent': 2}
      self.payment_mapping = {'Small_value_payments': 0, 'Medium_value_payments': 1, 'Large_value_payments': 2}
      self.columns = 'Payment_Behaviour'

   def fit(self, X, y=None):
      assert isinstance(X, pd.DataFrame)
      'Payment_Behaviour' in X.columns

      return self
   
   def transform(self, X):
      charge, payment = X['Payment_Behaviour'].apply(self.split_payment_behavior)
      X['Charge'] = charge
      X['Payment'] = payment
      X.drop('Payment_Behaviour', axis=1, inplace=True)
      return X
   
   def split_payment_behavior(self, value):
      if isinstance(value, str):
         # Split the value into components based on the underscore
         parts = value.split('_')
         
         # The first part corresponds to charge, and the last two parts correspond to payment
         charge_part = '_'.join(parts[:2])  # Join the first two parts for charge
         payment_part = '_'.join(parts[2:])  # Join the rest for payment
         
         # Get the corresponding numerical values from the mappings
         charge_value = self.charge_mapping.get(charge_part, np.nan)
         payment_value = self.payment_mapping.get(payment_part, np.nan)
         
         return pd.Series([charge_value, payment_value])
      else:
         return pd.Series([np.nan, np.nan])  # Return NaN for non-string values

   # Assuming X is your DataFrame


           

In [9]:
PaymentBehaviourTransformer  = PaymentBehaviourTransformer()
PaymentBehaviourTransformer.fit(train)
train= PaymentBehaviourTransformer.transform(train)

In [10]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.utils.validation import check_is_fitted
import pandas as pd

class LoanType(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.ml_binarizer = MultiLabelBinarizer()

    def fit(self, X, y=None):
        assert isinstance(X, pd.DataFrame) and 'Type_of_Loan' in X.columns
        # Ensure the column is treated as a string and handle NaN values
        loan_types = X['Type_of_Loan'].fillna('Not Specified').astype(str).str.split(', ')
        # Clean up the loan types to remove unwanted phrases
        loan_types = loan_types.apply(lambda x: [item.strip() for item in x if 'and' not in item])
        self.ml_binarizer.fit(loan_types)      
        self._n_features = len(self.ml_binarizer.classes_)
        return self 

    def transform(self, X):
        # Check if the transformer has been fitted
        check_is_fitted(self, '_n_features')
        assert isinstance(X, pd.DataFrame) and 'Type_of_Loan' in X.columns
        # Ensure the column is treated as a string and handle NaN values
        loan_types = X['Type_of_Loan'].fillna('Not Specified').astype(str).str.split(', ')
        # Clean up the loan types to remove unwanted phrases
        loan_types = loan_types.apply(lambda x: [item.strip() for item in x if 'and' not in item])
        loan_type_dummies = self.ml_binarizer.transform(loan_types)
        
        # Create a DataFrame for the binary columns
        loan_type_df = pd.DataFrame(loan_type_dummies, columns=self.ml_binarizer.classes_, index=X.index)
        
        # Concatenate the new binary columns with the original DataFrame
        X = pd.concat([X, loan_type_df], axis=1)
        X.drop('Type_of_Loan', axis=1, inplace=True)
        return X



In [11]:
train['Type_of_Loan'].info()



<class 'pandas.core.series.Series'>
RangeIndex: 100000 entries, 0 to 99999
Series name: Type_of_Loan
Non-Null Count  Dtype 
--------------  ----- 
88592 non-null  object
dtypes: object(1)
memory usage: 781.4+ KB


In [12]:
Loan = LoanType()
Loan.fit(train)
train=Loan.transform(train)


In [13]:
from numpy import NAN
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted

class HiLoImputer(BaseEstimator, TransformerMixin):
    def __init__(self, lo, hi, columnName):
        self.hi = hi
        self.lo = lo
        self.columnName = columnName
        self.isCategory = False

    def fit(self, X, y=None):
        assert isinstance(X, pd.DataFrame) and len(X.columns) > 1
        assert self.columnName in X.columns
        
        # Check if the column is categorical
        self.isCategory = X[self.columnName].dtype.name == 'object' or X[self.columnName].dtype.name == 'category'
        if self.isCategory:
            #convert to category

            self.categories_ = X[self.columnName].astype('category').cat.categories

        # Store the number of features
        self._n_features = X.shape[1]
        return self

    def transform(self, X):
        check_is_fitted(self, '_n_features')
        assert isinstance(X, pd.DataFrame) and X.shape[1] == self._n_features

        if self.isCategory:
            # Convert categories to codes for hi/lo processing
            X[self.columnName] = X[self.columnName].astype('category').cat.codes

            # Apply hi/lo bounds, setting out-of-bounds to NaN
            X[self.columnName] = np.where(
                (X[self.columnName] > self.hi) | (X[self.columnName] < self.lo), 
                np.nan, X[self.columnName]
            )
            
            # Replace NaN values with the mode per Customer_ID group
            X[self.columnName] = X.groupby('Customer_ID', group_keys=False)[self.columnName].apply(
                lambda x: x.fillna(x.mode().iloc[0] if not x.mode().empty else x)
            )

            # Convert back to categories
            X[self.columnName] = pd.Categorical.from_codes(
                X[self.columnName].fillna(-1).astype(int), categories=self.categories_, ordered=True
            ).remove_unused_categories()
            X[self.columnName] = X[self.columnName].astype('category')

        else:
            # Apply hi/lo bounds for numerical data
            X[self.columnName] = np.where(
                (X[self.columnName] > self.hi) | (X[self.columnName] < self.lo), 
                np.nan, X[self.columnName]
            )

            # Replace NaN values with the mode per Customer_ID group
            X[self.columnName] = X.groupby('Customer_ID', group_keys=False)[self.columnName].apply(
                lambda x: x.fillna(x.mode().iloc[0] if not x.mode().empty else x)
            )

        return X


In [14]:
train['Occupation'].value_counts()

Occupation
_______          7062
Lawyer           6575
Architect        6355
Engineer         6350
Scientist        6299
Mechanic         6291
Accountant       6271
Developer        6235
Media_Manager    6232
Teacher          6215
Entrepreneur     6174
Doctor           6087
Journalist       6085
Manager          5973
Musician         5911
Writer           5885
Name: count, dtype: int64

In [15]:

# print row 31 
print(train.loc[38])

ID                               0x1638
Customer_ID                  CUS_0x1cdb
Month                              July
Name                             Deepaa
Age                                  21
SSN                         615-06-7821
Occupation                    Developer
Annual_Income                  35547.71
Monthly_Inhand_Salary       2853.309167
Num_Bank_Accounts                     7
Num_Credit_Card                       5
Interest_Rate                         5
Num_of_Loan                           0
Delay_from_due_date                  10
Num_of_Delayed_Payment             15.0
Changed_Credit_Limit               2.58
Num_Credit_Inquiries                4.0
Credit_Mix                     Standard
Outstanding_Debt                 943.86
Credit_Utilization_Ratio      26.263823
Credit_History_Age                  374
Payment_of_Min_Amount               Yes
Total_EMI_per_month                 0.0
Amount_invested_monthly      181.011983
Monthly_Balance              394.318934


In [16]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 37 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   ID                        100000 non-null  object 
 1   Customer_ID               100000 non-null  object 
 2   Month                     100000 non-null  object 
 3   Name                      90015 non-null   object 
 4   Age                       100000 non-null  int64  
 5   SSN                       100000 non-null  object 
 6   Occupation                100000 non-null  object 
 7   Annual_Income             100000 non-null  float64
 8   Monthly_Inhand_Salary     84998 non-null   float64
 9   Num_Bank_Accounts         100000 non-null  int64  
 10  Num_Credit_Card           100000 non-null  int64  
 11  Interest_Rate             100000 non-null  int64  
 12  Num_of_Loan               100000 non-null  int64  
 13  Delay_from_due_date       100000 non-null  in

In [17]:
for column, bounds in hi_lo_bounds.items():
    lo = bounds['lo']
    hi = bounds['hi']
    
    # Apply HiLoImputer only for imputation if bounds are None
    if lo is None and hi is None:
        #HiLoImputer(0,14, 'Occupation')
        hi_lo_imputer = HiLoImputer(lo=float('-inf'), hi=float('inf'), columnName=column)
    else:
        hi_lo_imputer = HiLoImputer(lo=lo, hi=hi, columnName=column)
    data = train
    print(column)
    train = hi_lo_imputer.fit_transform(train)
    print(train[column].isnull().sum())   


Age
0
Annual_Income
0
Monthly_Inhand_Salary
664
Num_Bank_Accounts
0
Num_Credit_Card
0
Interest_Rate
0
Delay_from_due_date
0
Num_Credit_Inquiries
0
Changed_Credit_Limit
0
Num_of_Loan
0
Credit_Mix
0
Occupation
0


In [26]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 33 columns):
 #   Column                    Non-Null Count   Dtype   
---  ------                    --------------   -----   
 0   Month                     100000 non-null  category
 1   Age                       100000 non-null  float64 
 2   Occupation                100000 non-null  category
 3   Annual_Income             100000 non-null  float64 
 4   Monthly_Inhand_Salary     99336 non-null   float64 
 5   Num_Bank_Accounts         100000 non-null  float64 
 6   Num_Credit_Card           100000 non-null  float64 
 7   Interest_Rate             100000 non-null  float64 
 8   Num_of_Loan               100000 non-null  float64 
 9   Delay_from_due_date       100000 non-null  float64 
 10  Num_of_Delayed_Payment    92998 non-null   float64 
 11  Changed_Credit_Limit      100000 non-null  float64 
 12  Num_Credit_Inquiries      100000 non-null  float64 
 13  Credit_Mix                1000

In [19]:
# lets drop the columns that are not needed
# customerid, id, name, SSN
train.drop(['Customer_ID', 'ID', 'Name', 'SSN'], axis=1, inplace=True)

In [22]:
#change month to categorical
train['Month'] = train['Month'].astype('category')

In [28]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pandas as pd

# Drop unnecessary columns and set up target
X = train.drop(columns=['Credit_Score'])
y = train['Credit_Score']

# One-hot encode all categorical columns
X_encoded = pd.get_dummies(X, drop_first=True)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Train a simple RandomForest model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Get feature importances
feature_importances = pd.Series(model.feature_importances_, index=X_train.columns).sort_values(ascending=False)
print(feature_importances.head(12))  # Display top 12 features to see if 'Month' columns are significant


Outstanding_Debt            0.096615
Interest_Rate               0.065865
Delay_from_due_date         0.055931
Credit_History_Age          0.051530
Changed_Credit_Limit        0.051009
Credit_Mix_Good             0.045025
Credit_Mix_Standard         0.044412
Monthly_Balance             0.043631
Amount_invested_monthly     0.043478
Num_Credit_Inquiries        0.041201
Credit_Utilization_Ratio    0.039304
Num_Credit_Card             0.038572
dtype: float64


In [33]:
# lets check model accuracyon the tes data

from sklearn.metrics import accuracy_score 
X_test_encoded = pd.get_dummies(X_test, drop_first=True)
y_pred = model.predict(X_test_encoded)
accuracy = accuracy_score(y_test, y_pred)
print(f'Model accuracy: {accuracy:.2%}')


Model accuracy: 81.16%


In [None]:
#lets check on the actual test data

In [30]:
from scipy.stats import f_oneway

# Group data by Month and check variance in credit scores
month_groups = [group['Credit_Score'] for name, group in train.groupby('Month')]
anova_result = f_oneway(*month_groups)
print(f'ANOVA F-test result: F={anova_result.statistic}, p-value={anova_result.pvalue}')


  month_groups = [group['Credit_Score'] for name, group in train.groupby('Month')]


TypeError: unsupported operand type(s) for /: 'str' and 'int'

In [32]:
# Calculate mean credit score for each month
month_mean_scores = train.groupby('Month')['Credit_Score'].mean()
print(month_mean_scores)


  month_mean_scores = train.groupby('Month')['Credit_Score'].mean()


TypeError: agg function failed [how->mean,dtype->object]

In [None]:
# now we drop columns ssn, name, 

In [41]:
# so the data has some realy high unrealeastic values in age, num_of_bank_accounts, num_of_credit_card, num_of_loan
# lets get a feel and get a right range for these values
# age hi value is 100, low value is 0
# num_of_bank_accounts hi value is 10, low value is 1
# num_of_credit_card hi value is 10, low value is 1
# num_of_loan hi value is 10, low value is 1
# annual income lo is 0
# monthly inhand salary lo is 0


# so now lets create a transformer



Age
38.0      2833
28.0      2829
31.0      2806
26.0      2792
32.0      2749
          ... 
5741.0       1
7178.0       1
5621.0       1
1908.0       1
1342.0       1
Name: count, Length: 1661, dtype: int64
Num_Bank_Accounts
6       13001
7       12823
8       12765
4       12186
5       12118
        ...  
1626        1
1470        1
887         1
211         1
697         1
Name: count, Length: 943, dtype: int64
Num_Credit_Card
5       18459
7       16615
6       16559
4       14030
3       13277
        ...  
791         1
1118        1
657         1
640         1
679         1
Name: count, Length: 1179, dtype: int64
Num_of_Loan
3.0       14386
2.0       14250
4.0       14016
0.0       10380
1.0       10083
          ...  
1444.0        1
392.0         1
841.0         1
1015.0        1
966.0         1
Name: count, Length: 399, dtype: int64
110.93450521244253
17.09128
22.47443
2.780339232263824
33.0
6.0
5.0
3.0
38.0
6
5
3.0
