In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import category_encoders as ce
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE



# 
def load_data(file_path):
    """Load data from a CSV file into a pandas DataFrame."""
    return pd.read_csv(file_path)

In [3]:
df = load_data('data/bank_data_train.csv')
print(df.shape)

# Define a threshold for missing values
TRESHOLD = 0.7

(355190, 116)


In [4]:
def explore_data(df):
    """Display basic information about the DataFrame and drop 'ID' if present. Returns the (possibly) modified DataFrame."""
    print("DataFrame Info:")
    df.info()
    
    # drop unnecessary column if it exists
    if 'ID' in df.columns:
        df = df.drop(columns=['ID'])
        print("\nDropped column: 'ID'")
    else:
        print("\nColumn 'ID' not found; skipping drop.")
    
    print("\nDataFrame Description (numeric):")
    display(df.describe())
    
    print("\nDataFrame Description (object):")
    display(df.describe(include=['object']))
    
    # optionally show missing values and plot in grid format
    print("\nMissing Values:")
    df_info = pd.DataFrame(df.isnull().sum(), columns=['Missing Values'])
    df_info['Percentage'] = (df_info['Missing Values'] / len(df)) * 100

    #siwich column into rows for better display

    df_info = df_info.transpose()
    display(df_info)
    # display(df.isnull().sum())
    
    # check the distribution of target column
    print(df['TARGET'].value_counts())
    return df

df = explore_data(df)

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 355190 entries, 0 to 355189
Columns: 116 entries, ID to TARGET
dtypes: float64(94), int64(9), object(13)
memory usage: 314.3+ MB

Dropped column: 'ID'

DataFrame Description (numeric):


Unnamed: 0,CR_PROD_CNT_IL,AMOUNT_RUB_CLO_PRC,PRC_ACCEPTS_A_EMAIL_LINK,APP_REGISTR_RGN_CODE,PRC_ACCEPTS_A_POS,PRC_ACCEPTS_A_TK,TURNOVER_DYNAMIC_IL_1M,CNT_TRAN_AUT_TENDENCY1M,SUM_TRAN_AUT_TENDENCY1M,AMOUNT_RUB_SUP_PRC,...,REST_DYNAMIC_CC_3M,MED_DEBT_PRC_YWZ,LDEAL_ACT_DAYS_PCT_TR3,LDEAL_ACT_DAYS_PCT_AAVG,LDEAL_DELINQ_PER_MAXYWZ,TURNOVER_DYNAMIC_CC_3M,LDEAL_ACT_DAYS_PCT_TR,LDEAL_ACT_DAYS_PCT_TR4,LDEAL_ACT_DAYS_PCT_CURR,TARGET
count,355190.0,316867.0,155163.0,60550.0,155163.0,155163.0,355190.0,77112.0,77112.0,316867.0,...,355190.0,95713.0,93448.0,98175.0,95713.0,355190.0,93448.0,93448.0,93448.0,355190.0
mean,0.105225,0.044045,0.0,50.947498,0.0,0.0,0.001305,0.416896,0.414572,0.085249,...,0.007309,0.055074,0.025707,0.049943,0.009252,0.004309,0.013938,0.013938,0.013938,0.081435
std,0.431372,0.108449,0.0,21.777855,0.0,0.0,0.029118,0.316493,0.338612,0.14231,...,0.066681,0.215909,0.115732,0.18583,0.092789,0.059852,0.097099,0.097099,0.097099,0.273503
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006944,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,33.0,0.0,0.0,0.0,0.166667,0.139645,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,54.0,0.0,0.0,0.0,0.3,0.285714,0.027117,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.036608,0.0,72.0,0.0,0.0,0.0,0.571429,0.661195,0.110005,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,11.0,1.0,0.0,89.0,0.0,0.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0



DataFrame Description (object):


Unnamed: 0,CLNT_TRUST_RELATION,APP_MARITAL_STATUS,APP_KIND_OF_PROP_HABITATION,CLNT_JOB_POSITION_TYPE,CLNT_JOB_POSITION,APP_DRIVING_LICENSE,APP_EDUCATION,APP_TRAVEL_PASS,APP_CAR,APP_POSITION_TYPE,APP_EMP_TYPE,APP_COMP_TYPE,PACK
count,69421,68234,59361,44781,210811,57257,68104,57257,57256,60545,67362,67362,355190
unique,21,13,5,4,19588,2,17,2,2,4,4,4,12
top,FRIEND,M,SO,SPECIALIST,ДИРЕКТОР,N,H,N,N,SPECIALIST,PRIVATE,PRIVATE,102
freq,24896,30724,28056,25123,11200,36332,42459,52750,32843,36622,59087,59087,116986



Missing Values:


Unnamed: 0,CR_PROD_CNT_IL,AMOUNT_RUB_CLO_PRC,PRC_ACCEPTS_A_EMAIL_LINK,APP_REGISTR_RGN_CODE,PRC_ACCEPTS_A_POS,PRC_ACCEPTS_A_TK,TURNOVER_DYNAMIC_IL_1M,CNT_TRAN_AUT_TENDENCY1M,SUM_TRAN_AUT_TENDENCY1M,AMOUNT_RUB_SUP_PRC,...,REST_DYNAMIC_CC_3M,MED_DEBT_PRC_YWZ,LDEAL_ACT_DAYS_PCT_TR3,LDEAL_ACT_DAYS_PCT_AAVG,LDEAL_DELINQ_PER_MAXYWZ,TURNOVER_DYNAMIC_CC_3M,LDEAL_ACT_DAYS_PCT_TR,LDEAL_ACT_DAYS_PCT_TR4,LDEAL_ACT_DAYS_PCT_CURR,TARGET
Missing Values,0.0,38323.0,200027.0,294640.0,200027.0,200027.0,0.0,278078.0,278078.0,38323.0,...,0.0,259477.0,261742.0,257015.0,259477.0,0.0,261742.0,261742.0,261742.0,0.0
Percentage,0.0,10.789437,56.315493,82.952786,56.315493,56.315493,0.0,78.289929,78.289929,10.789437,...,0.0,73.053014,73.690701,72.359864,73.053014,0.0,73.690701,73.690701,73.690701,0.0


TARGET
0    326265
1     28925
Name: count, dtype: int64


In [6]:
326265/df.shape[0]

0.918564711844365

In [7]:
from scipy.stats import chi2_contingency
import numpy as np

def cramers_v(x, y):
    """Measure association between categorical x and binary y (TARGET)"""
    confusion_matrix = pd.crosstab(x, y)
    chi2 = chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    return np.sqrt(phi2 / min(k-1, r-1))

In [8]:
# Annalyze data whitch to keep or drop
print("\nAnalyzing columns with >70% missing values for correlation with TARGET...")
print(f"df shape before dropping columns: {df.shape}")
high_missing = df.columns[df.isnull().sum() / len(df) > TRESHOLD]
for col in high_missing:
    if col == 'TARGET':
        continue
    # coerce to numeric where possible, compute correlation with TARGET
    
    if df[col].dtype == 'object':
        # categorical column
        corr = cramers_v(df[col].dropna(), df.loc[df[col].notnull(), 'TARGET'])
        if corr < 0.1:
            print(f"Dropping column {col} due to low correlation ({corr:.3f}) with TARGET")
            df = df.drop(columns=[col])
    else:
        # numerical column
        series_num = pd.to_numeric(df[col], errors='coerce')
        corr = series_num.corr(df['TARGET'])
        if pd.notnull(corr) and abs(corr) < 0.05:
            print(f"Dropping column {col} due to low correlation ({corr:.3f}) with TARGET")
            df = df.drop(columns=[col])
    
    
    # series_num = pd.to_numeric(df[col], errors='coerce')
    
    
    
    # corr = cramers_v(df[col].dropna(), df.loc[df[col].notnull(), 'TARGET'])
    # missing_pct = df[col].isnull().sum() / len(df) * 100
    # corr_str = f"{corr:.3f}" if pd.notnull(corr) else "N/A"
    # print(f"{col}: Missing={missing_pct:.1f}%, Correlation={corr_str}")
print(f"df shape after dropping columns: {df.shape}")


Analyzing columns with >70% missing values for correlation with TARGET...
df shape before dropping columns: (355190, 115)
Dropping column APP_REGISTR_RGN_CODE due to low correlation (-0.028) with TARGET
Dropping column CLNT_TRUST_RELATION due to low correlation (0.029) with TARGET
Dropping column APP_MARITAL_STATUS due to low correlation (0.030) with TARGET
Dropping column APP_KIND_OF_PROP_HABITATION due to low correlation (0.008) with TARGET
Dropping column CLNT_JOB_POSITION_TYPE due to low correlation (0.036) with TARGET
Dropping column APP_DRIVING_LICENSE due to low correlation (0.031) with TARGET
Dropping column APP_EDUCATION due to low correlation (0.058) with TARGET
Dropping column APP_TRAVEL_PASS due to low correlation (0.024) with TARGET
Dropping column APP_CAR due to low correlation (0.028) with TARGET
Dropping column APP_POSITION_TYPE due to low correlation (0.034) with TARGET
Dropping column APP_EMP_TYPE due to low correlation (0.030) with TARGET
Dropping column DEAL_YQZ_IR

In [9]:
for col in df:
    column_type = df[col].dtype
    if column_type != 'int64' and column_type != 'float64':
        print(f"{col}: {column_type}")

CLNT_JOB_POSITION: object
PACK: object


In [10]:
import numpy as np


def normalize_categorical_columns(df):
    """
    Normalize categorical columns by converting to lowercase/uppercase
    and stripping whitespace
    """
    cat_cols = df.select_dtypes(include=['object']).columns.tolist()
    
    for col in cat_cols:
        # Convert to string, lowercase, and strip whitespace
        df[col] = df[col].astype(str).str.upper()
        df[col] = df[col].str.strip()
        df[col] = df[col].replace('NAN', np.nan)  # Replace 'NA' strings with NaN
        df[col] = df[col].replace('', np.nan)  # Replace empty strings with NaN
    
    return df
df = normalize_categorical_columns(df)

In [11]:
for col in df:
    column_type = df[col].dtype
    if column_type != 'int64' and column_type != 'float64':
        print(f"{col}: {column_type}")


# Count non-null occurrences of each job position
existing_values = {}
for val in df['PACK']:
    if pd.isnull(val):
        continue
    existing_values[val] = existing_values.get(val, 0) + 1
corr = cramers_v(df['PACK'].dropna(), df.loc[df['PACK'].notnull(), 'TARGET'])
missing_pct = df['PACK'].isnull().sum() / len(df) * 100
print(f"PACK: Missing={missing_pct:.1f}%, Correlation={corr:.3f}")
print(f"existing_values length: {len(existing_values)}")
existing_values

CLNT_JOB_POSITION: object
PACK: object
PACK: Missing=0.0%, Correlation=0.071
existing_values length: 12


{'K01': 77083,
 '102': 116986,
 '105': 44936,
 'O01': 50478,
 '103': 24860,
 '101': 1816,
 '107': 27952,
 '301': 4208,
 '104': 6776,
 '108': 2,
 '109': 86,
 'M01': 7}

In [12]:
def fit_encoders(X_train, y_train):
    """
    Fit encoders on training data and return fitted encoders.
    """
    # Get object columns
    obj_cols = X_train.select_dtypes(include=['object']).columns.tolist()
    
    # Try numeric conversion
    numeric_conversions = {}
    for col in obj_cols:
        converted = pd.to_numeric(X_train[col], errors='coerce')
        if converted.notna().mean() > 0.9:
            numeric_conversions[col] = True
    print("numeric conversion", numeric_conversions)
    # Update object columns
    obj_cols = [col for col in obj_cols if col not in numeric_conversions]
    
    # Categorize by cardinality
    low_card_cols = []
    high_card_cols = []
    
    for col in obj_cols:
        if X_train[col].isnull().all():
            continue
        
        unique_count = X_train[col].nunique(dropna=True)
        
        if unique_count < 10:
            low_card_cols.append(col)
        else:
            high_card_cols.append(col)
    
    # Fit target encoder on training data
    target_encoder = None
    if high_card_cols:
        target_encoder = ce.TargetEncoder(cols=high_card_cols, smoothing=1.0)
        valid_mask = X_train[high_card_cols].notna().all(axis=1) & y_train['TARGET'].notna()
        target_encoder.fit(X_train.loc[valid_mask, high_card_cols], y_train.loc[valid_mask, 'TARGET'])
    
    return {
        'numeric_conversions': numeric_conversions,
        'low_card_cols': low_card_cols,
        'high_card_cols': high_card_cols,
        'target_encoder': target_encoder,
        'global_mean': y_train['TARGET'].mean()
    }

In [13]:

def transform_with_encoders(X, encoders_dict):
    """
    Transform data using pre-fitted encoders.
    """
    X = X.copy()
    
    # Apply numeric conversions
    for col in encoders_dict['numeric_conversions']:
        X[col] = pd.to_numeric(X[col], errors='coerce')
    
    # Drop all-null columns
    obj_cols = X.select_dtypes(include=['object']).columns
    for col in obj_cols:
        if X[col].isnull().all():
            X = X.drop(columns=[col])
    
    # Target encode high-cardinality columns
    if encoders_dict['high_card_cols'] and encoders_dict['target_encoder']:
        high_card_cols = encoders_dict['high_card_cols']
        print(f"\n\nTarget encoding: {high_card_cols}\n\n")
        for col in high_card_cols:
            if col in X.columns:
                # Transform using fitted encoder
                print(f"Encoding column: {col}")
                print()
                try:
                    # Transform using the encoder fitted on all high-cardinality cols,
                    # then take the encoded series for the current column
                    encoded = encoders_dict['target_encoder'].transform(X[encoders_dict['high_card_cols']])[col]
                    X[col] = encoded
                    # Fill unseen / missing encodings with global mean
                    X[col].fillna(encoders_dict['global_mean'], inplace=True)
                except Exception:
                    # If transform fails, fall back to global mean
                    X[col] = encoders_dict['global_mean']
        #         X = X.drop(columns=[col])
                
                
                
        #         X[f"{col}"] = encoded[col]
                
        #         # Fill nulls with global mean
        #         X[f"{col}"].fillna(encoders_dict['global_mean'], inplace=True)
        
        # Drop original columns
        # X = X.drop(columns=[col for col in high_card_cols if col in X.columns])
    
    # One-hot encode low-cardinality columns
    if encoders_dict['low_card_cols']:
        print(f"\nOne-hot encoding: {encoders_dict['low_card_cols']}")

        X = pd.get_dummies(X, columns=encoders_dict['low_card_cols'], drop_first=True)
    
    return X

In [14]:
# split data train test


y = pd.DataFrame()
y['TARGET'] = df['TARGET']
X = df.drop(columns=['TARGET'])
X_train, X_test, y_train, y_test = train_test_split(
                            X,y,
                            test_size=0.2,
                            random_state=42,
                            stratify=y)

print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")


Train shape: (284152, 87), Test shape: (71038, 87)


In [15]:
# Fit encoders on TRAINING data only
print("Fitting encoders on training data...")
print("="*80)
encoders = fit_encoders(X_train, y_train)
for key, value in encoders.items():
        print(f"{key}: {value}")

Fitting encoders on training data...
numeric conversion {}
numeric_conversions: {}
low_card_cols: []
high_card_cols: ['CLNT_JOB_POSITION', 'PACK']
target_encoder: TargetEncoder(cols=['CLNT_JOB_POSITION', 'PACK'], smoothing=1.0)
global_mean: 0.08143528815563501


In [16]:
print("\nTransforming training data...")
X_train_encoded = transform_with_encoders(X_train, encoders)

print("\nTransforming test data...")
X_test_encoded = transform_with_encoders(X_test, encoders)


print(f"\nAfter encoding - Train: {X_train_encoded.shape}, Test: {X_test_encoded.shape}")


Transforming training data...


Target encoding: ['CLNT_JOB_POSITION', 'PACK']


Encoding column: CLNT_JOB_POSITION

Encoding column: PACK


Transforming test data...


Target encoding: ['CLNT_JOB_POSITION', 'PACK']


Encoding column: CLNT_JOB_POSITION

Encoding column: PACK


After encoding - Train: (284152, 87), Test: (71038, 87)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[col].fillna(encoders_dict['global_mean'], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[col].fillna(encoders_dict['global_mean'], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which

In [17]:
# Just to make sure not needed in our case -------------------------------------
# Align columns (ensure train and test have same columns)
print("\nAligning train/test columns...")
train_cols = set(X_train_encoded.columns)
test_cols = set(X_test_encoded.columns)

# Add missing columns to test (fill with 0)
for col in train_cols - test_cols:
    X_test_encoded[col] = 0
    print(f"Added missing column to test: {col}")

# Remove extra columns from test
for col in test_cols - train_cols:
    X_test_encoded = X_test_encoded.drop(columns=[col])
    print(f"Removed extra column from test: {col}")
#-------------------------------------------------------------------------------------


Aligning train/test columns...


In [18]:
# Reorder test columns to match train

#! Mandatory step to ensure columns are in the same order after encoding

X_test_encoded = X_test_encoded[X_train_encoded.columns]

In [19]:
print(f"\nTrain nulls: {X_train_encoded.isnull().sum().sum()}")
print(f"Test nulls: {X_test_encoded.isnull().sum().sum()}")


Train nulls: 8813805
Test nulls: 2202381


In [20]:
imputation_values = X_train_encoded.mean()

for col in X_train_encoded.columns:
    if X_train_encoded[col].isnull().any():
        X_train_encoded[col].fillna(imputation_values[col], inplace=True)
    
    if X_test_encoded[col].isnull().any():
        X_test_encoded[col].fillna(imputation_values[col], inplace=True)
print(f"\nAfter fillna with mean\nTrain nulls: {X_train_encoded.isnull().sum().sum()}")
print(f"Test nulls: {X_test_encoded.isnull().sum().sum()}")


After fillna with mean
Train nulls: 0
Test nulls: 0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train_encoded[col].fillna(imputation_values[col], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test_encoded[col].fillna(imputation_values[col], inplace=True)


In [22]:
# Balence Dataset for training set only

print("Balancing training set using SMOTE...")
print(f"Before SMOTE: {y_train['TARGET'].value_counts()} | X shape: {X_train_encoded.shape}")
smote = SMOTE(sampling_strategy='minority', random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_encoded, y_train)
print(f"After SMOTE: {y_train_balanced['TARGET'].value_counts()} | X shape: {X_train_balanced.shape}")

Balancing training set using SMOTE...
Before SMOTE: TARGET
0    261012
1     23140
Name: count, dtype: int64 | X shape: (284152, 87)
After SMOTE: TARGET
0    261012
1    261012
Name: count, dtype: int64 | X shape: (522024, 87)


In [23]:
# Standerdize
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_balanced = pd.DataFrame(scaler.fit_transform(X_train_balanced), columns=X_train_balanced.columns)
X_test_encoded = pd.DataFrame(scaler.transform(X_test_encoded), columns=X_test_encoded.columns)

# Train with Basline Classifier 

In [26]:
def baseline_model(X_train, y_train, X_test, y_test):
    from sklearn.dummy import DummyClassifier
    from sklearn.metrics import classification_report, confusion_matrix
    
    dummy_clf = DummyClassifier(strategy='most_frequent')
    dummy_clf.fit(X_train, y_train)
    
    y_pred = dummy_clf.predict(X_test)
    
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    return dummy_clf

In [27]:
dummy_clf =  baseline_model(X_train_balanced, y_train_balanced, X_test_encoded, y_test)

Confusion Matrix:
[[65253     0]
 [ 5785     0]]

Classification Report:
              precision    recall  f1-score   support

           0       0.92      1.00      0.96     65253
           1       0.00      0.00      0.00      5785

    accuracy                           0.92     71038
   macro avg       0.46      0.50      0.48     71038
weighted avg       0.84      0.92      0.88     71038



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


# Train with Random Forest + Grid Search

In [22]:
print(f"x train shape: {X_train_balanced.shape}, y train shape: {y_train_balanced.shape}")
print(f"x test shape: {X_test_encoded.shape}, y test shape: {y_test.shape}")

x train shape: (522024, 87), y train shape: (522024, 1)
x test shape: (71038, 87), y test shape: (71038, 1)


In [24]:

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# param_grid = {
#     'n_estimators': [50, 100],           # Reduced from 3 to 2
#     'max_depth': [None, 20],             # Reduced from 4 to 2
#     'min_samples_split': [2, 10],        # Reduced from 3 to 2
# }

param_grid = {
    'n_estimators': [100],              # 100 is usually good enough
    'max_depth': [None, 20],            
    'min_samples_split': [2, 10],
}



def random_forest_model(X_train, y_train, X_test, y_test):
    # Initialize model
    rf_clf = RandomForestClassifier(random_state=42)
    
    # GridSearch with cross-validation
    grid_search = GridSearchCV(
        estimator=rf_clf,
        param_grid=param_grid,
        cv=3, # 
        scoring='accuracy',  # Consider 'f1', 'roc_auc', or 'balanced_accuracy' for imbalanced data
        n_jobs=-1,
        verbose=2 # Increased verbosity for more detailed output
    )
    
    # Fit - handle y_train whether it's Series or array
    grid_search.fit(X_train, y_train.ravel() if hasattr(y_train, 'ravel') else y_train)
    
    # Print best parameters
    print(f"Best parameters found: {grid_search.best_params_}")
    print(f"Best CV score: {grid_search.best_score_:.4f}")
    
    # Get best model
    best_rf_model = grid_search.best_estimator_
    
    # Evaluate on test set
    test_accuracy = best_rf_model.score(X_test, y_test)
    print(f"Test set accuracy: {test_accuracy:.4f}")
    
    # Return the trained model for later use
    return best_rf_model





# Train the model
best_rf_model = random_forest_model(X_train_balanced, y_train_balanced, 
                                  X_test_encoded, y_test)










Fitting 3 folds for each of 4 candidates, totalling 12 fits


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


[CV] END max_depth=20, min_samples_split=10, n_estimators=100; total time= 2.5min
[CV] END max_depth=20, min_samples_split=2, n_estimators=100; total time= 2.8min
[CV] END max_depth=20, min_samples_split=10, n_estimators=100; total time= 3.0min
[CV] END max_depth=20, min_samples_split=10, n_estimators=100; total time= 3.0min
[CV] END max_depth=None, min_samples_split=2, n_estimators=100; total time= 3.1min
[CV] END max_depth=20, min_samples_split=2, n_estimators=100; total time= 3.2min
[CV] END max_depth=20, min_samples_split=2, n_estimators=100; total time= 3.2min
[CV] END max_depth=None, min_samples_split=2, n_estimators=100; total time= 3.2min
[CV] END max_depth=None, min_samples_split=10, n_estimators=100; total time= 3.3min
[CV] END max_depth=None, min_samples_split=10, n_estimators=100; total time= 3.4min
[CV] END max_depth=None, min_samples_split=10, n_estimators=100; total time= 3.4min
[CV] END max_depth=None, min_samples_split=2, n_estimators=100; total time= 3.5min


  return fit_method(estimator, *args, **kwargs)


Best parameters found: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9372
Test set accuracy: 0.9043


In [28]:
from sklearn.metrics import confusion_matrix

# Test the model

y_pred = best_rf_model.predict(X_test_encoded)
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Confusion Matrix:
[[63576  1677]
 [ 5124   661]]


# Train with Scikit-learn MLPClassifier + Grid Search

In [29]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import numpy as np



def sckit_learn_mlp_model(X_train, y_train, X_test, y_test):
    from sklearn.neural_network import MLPClassifier
    from sklearn.metrics import classification_report, confusion_matrix
    
    mlp_clf = MLPClassifier(hidden_layer_sizes=(100, ), max_iter=300, random_state=42) #solver='adam' by default we can tune more parameters later with sgd, learning_rate, etc.
    mlp_clf.fit(X_train, y_train.values.ravel())
    
    return mlp_clf
mlp_model =  sckit_learn_mlp_model(X_train_balanced, y_train_balanced, X_test_encoded, y_test)



In [None]:
# param_grid = {
#     'hidden_layer_sizes': [(50,), (50, 50)],
#     'activation': ['relu', 'tanh'],
#     'solver': ['adam'],
#     'alpha': [0.001],
#     'learning_rate_init': [0.01],
#     'max_iter': [300]
# }



param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam'],
    'alpha': [0.0001, 0.001],
    'learning_rate_init': [0.001, 0.01],
    'max_iter': [300]
}

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV



def scikit_learn_mlp_model(X_train, y_train):
    mlp_clf = MLPClassifier(random_state=42)

    grid_search = GridSearchCV(
        estimator=mlp_clf,
        param_grid=param_grid,
        cv=3,
        scoring='accuracy',
        n_jobs=-1,
        verbose=2
    )

    grid_search.fit(X_train, y_train)

    return grid_search.best_estimator_

mlp_model = scikit_learn_mlp_model(X_train_balanced, y_train_balanced)


Fitting 3 folds for each of 4 candidates, totalling 12 fits


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END activation=tanh, alpha=0.001, hidden_layer_sizes=(50,), learning_rate_init=0.01, max_iter=300, solver=adam; total time= 1.5min
[CV] END activation=tanh, alpha=0.001, hidden_layer_sizes=(50, 50), learning_rate_init=0.01, max_iter=300, solver=adam; total time= 2.1min
[CV] END activation=tanh, alpha=0.001, hidden_layer_sizes=(50,), learning_rate_init=0.01, max_iter=300, solver=adam; total time= 2.1min
[CV] END activation=tanh, alpha=0.001, hidden_layer_sizes=(50,), learning_rate_init=0.01, max_iter=300, solver=adam; total time= 2.9min
[CV] END activation=tanh, alpha=0.001, hidden_layer_sizes=(50, 50), learning_rate_init=0.01, max_iter=300, solver=adam; total time= 3.0min
[CV] END activation=relu, alpha=0.001, hidden_layer_sizes=(50,), learning_rate_init=0.01, max_iter=300, solver=adam; total time= 3.6min
[CV] END activation=relu, alpha=0.001, hidden_layer_sizes=(50,), learning_rate_init=0.01, max_iter=300, solver=adam; total time= 3.7min
[CV] END activation=relu, alpha=0.001, hid

  y = column_or_1d(y, warn=True)


In [None]:
# mlp_model = scikit_learn_mlp_model(X_train_balanced, y_train_balanced)
mlp_grid_model = scikit_learn_mlp_model(X_train_balanced, y_train_balanced)

In [40]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score
y_pred = mlp_model.predict(X_test_encoded)
    
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

accuracy = accuracy_score(y_test, y_pred)
print(f"Test set accuracy: {accuracy:.4f}")



accuracy_with_grid_search = mlp_model.score(X_test_encoded, y_test)
print(f"Test set accuracy with Grid Search: {accuracy_with_grid_search:.4f}")

accuracy_with_grid_search2 = mlp_grid_model.score(X_test_encoded, y_test)
print(f"Test set accuracy with Grid Search 2: {accuracy_with_grid_search:.4f}")




Confusion Matrix:
[[59494  5759]
 [ 4022  1763]]

Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.91      0.92     65253
           1       0.23      0.30      0.26      5785

    accuracy                           0.86     71038
   macro avg       0.59      0.61      0.59     71038
weighted avg       0.88      0.86      0.87     71038

Test set accuracy: 0.8623
Test set accuracy with Grid Search: 0.8623
Test set accuracy with Grid Search 2: 0.8623


# Predict and Test all models

### Load Test dataset

In [31]:
from preprocessing import Preprocessor


def predict(model):
    preprocessor = Preprocessor()
    df_test = preprocessor.load_data('data/bank_data_test.csv')
    df_test = preprocessor.transform_with_encoders(df_test, encoders)


    df_test = preprocessor.normalize_categorical_columns(df_test)
    df_test_encoded = pd.DataFrame(scaler.transform(df_test), columns=df_test.columns)
    df = preprocessor.fill_missing_with_mean(df_test_encoded, imputation_values)
    
    predictions = model.predict(df)
    return predictions

predictions = predict(dummy_clf)



Target encoding: ['CLNT_JOB_POSITION', 'PACK']


Encoding column: CLNT_JOB_POSITION

Encoding column: PACK



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[col].fillna(encoders_dict['global_mean'], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[col].fillna(encoders_dict['global_mean'], inplace=True)


ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- APP_CAR
- APP_COMP_TYPE
- APP_DRIVING_LICENSE
- APP_EDUCATION
- APP_EMP_TYPE
- ...
