In [2]:
import pandas as pd
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder,StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [3]:
df=pd.read_csv('C:\\Users\\Hariprasad\\Documents\\bankchurn_prediction\\data\\interim\\train\\data.csv')

In [4]:
df.head()

Unnamed: 0,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Credit_group
0,France,Male,57,1,0.0,2,1,1,56562.57,0,Fair
1,France,Female,53,8,114233.18,1,1,1,51587.04,0,Very_Good
2,France,Male,32,0,0.0,2,0,1,878.87,0,Poor
3,France,Female,32,6,184686.41,2,1,0,14956.44,0,Poor
4,Spain,Female,51,3,154962.99,3,0,1,191932.27,1,Fair


In [5]:
df.isnull().sum()

Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
Credit_group       0
dtype: int64

In [6]:
df['Exited'].value_counts()

Exited
0    6324
1    1674
Name: count, dtype: int64

In [13]:
import pandas as pd

def preprocess(df):
    categorical_cols = ['Geography', 'Gender']
    numerical_cols = ['Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']
    ordinal_cols = ['Credit_group']
    categories = [['Poor', 'Fair', 'Good', 'Very_Good', 'Exceptional']]
    

    categorical = ColumnTransformer(
        transformers=[
            ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), [0,1])
        ],
        remainder='passthrough'
    )
    
    numerical = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(),[2,3,4,5,6,7,8])
        ],
        remainder='passthrough'
    )

    ordinal = ColumnTransformer(
        transformers=[
            ('ordinal', OrdinalEncoder(categories=categories, handle_unknown='use_encoded_value', unknown_value=-1),[10])
        ],remainder='passthrough'
    )
    
    pipeline = Pipeline([
       
        ('onehot', categorical), 
        ('num', numerical), 
        ('ordinal',ordinal)
    
        
        
        
    ])
    
    transformed_data = pipeline.fit_transform(df)

    
    return transformed_data



In [14]:
preprocess(df)

array([[-1.0, -0.5736460297447028, -0.9191428674536236, ..., 56562.57, 0,
        'Fair'],
       [-1.0, -0.5736460297447028, 1.0879701463281564, ..., 51587.04, 0,
        'Very_Good'],
       [-1.0, -0.5736460297447028, -0.9191428674536236, ..., 878.87, 0,
        'Poor'],
       ...,
       [-1.0, -0.5736460297447028, -0.9191428674536236, ..., 51980.25, 1,
        'Good'],
       [-1.0, -0.5736460297447028, -0.9191428674536236, ..., 27802.0, 0,
        'Poor'],
       [-1.0, -0.5736460297447028, -0.9191428674536236, ..., 187638.34,
        0, 'Fair']], dtype=object)

In [7]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder

from imblearn.over_sampling import SMOTE
resampler = SMOTE()

categorical_cols = ['Geography', 'Gender']
numerical_cols = ['Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']
ordinal_cols = ['Credit_group']

# Create the transformers
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

ordinal_transformer = Pipeline(steps=[
    ('ordinal', OrdinalEncoder(categories=[['Poor', 'Fair', 'Good', 'Very_Good', 'Exceptional']]))
])

# Apply transformers to appropriate columns
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_cols),
        ('num', numerical_transformer, numerical_cols),
        ('ord', ordinal_transformer, ordinal_cols)
    ])


# Create the preprocessing pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Apply preprocessing to the DataFrame
transformed_data = pipeline.fit_transform(df)


# Print the transformed data
print(transformed_data)


[[ 1.          0.          0.         ...  0.97774202 -0.75691549
   1.        ]
 [ 1.          0.          0.         ...  0.97774202 -0.8434816
   3.        ]
 [ 1.          0.          0.         ...  0.97774202 -1.7257211
   0.        ]
 ...
 [ 1.          0.          0.         ... -1.02276468 -0.83664039
   2.        ]
 [ 1.          0.          0.         ...  0.97774202 -1.25730253
   0.        ]
 [ 1.          0.          0.         ... -1.02276468  1.52358924
   1.        ]]


In [8]:
# Assuming preprocessor is your preprocessing pipeline or ColumnTransformer
preprocessor = pipeline.named_steps['preprocessor']

# Get the transformed column names
transformed_column_names = []

# For categorical columns
if 'cat' in preprocessor.named_transformers_:
    transformed_column_names.extend(preprocessor.named_transformers_['cat'].named_steps['onehot'].get_feature_names_out())

# For numerical columns
if 'num' in preprocessor.named_transformers_:
    transformed_column_names.extend(numerical_cols)  # Use the original numerical column names

# For ordinal columns
if 'ord' in preprocessor.named_transformers_:
    transformed_column_names.extend(ordinal_cols)  # Use the original ordinal column names

# Print the transformed column names
print(transformed_column_names)


['Geography_France', 'Geography_Germany', 'Geography_Spain', 'Gender_Female', 'Gender_Male', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Credit_group']
