In [None]:
import numpy as np
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt 
%matplotlib inline
from copy import deepcopy 

In [None]:
# Load the training split and keep an untouched copy in `data`; all later
# cleaning is applied to the independent deep copy `df`.
# NOTE(review): this is an absolute, machine-specific path; consider a
# configurable data directory so the notebook runs on other machines.
TRAIN_CSV = "D:/Projects/Projects/Churn_Modelling/artifacts/data/validation/valid/train.csv"
data = pd.read_csv(TRAIN_CSV)
df = data.copy(deep=True)
data.shape, df.shape

In [None]:
# Remove fully duplicated rows from the working frame (reassignment instead of in-place mutation).
df = df.drop_duplicates()

In [None]:
# Row/column count after de-duplication (compare with the shapes shown at load time).
df.shape 

In [None]:
# Preview the first rows of the de-duplicated frame.
df.head()

In [None]:
# Columns removed from the working frame in the next cell.
# NOTE(review): presumably row/customer identifiers with no predictive value; confirm against the data dictionary.
unnecessory_features = ["RowNumber", "CustomerId", "Surname"] 
unnecessory_features

In [None]:
# Drop the columns listed in `unnecessory_features`.
# Reassigning with errors="ignore" keeps this cell idempotent: the previous
# in-place drop raised KeyError when the cell was executed a second time,
# because the columns were already gone.
df = df.drop(columns=unnecessory_features, errors="ignore")

In [None]:
# Preview the frame after the columns in `unnecessory_features` were dropped.
df.head()

In [None]:
# Column dtypes and non-null counts of the working frame.
df.info()

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
# Name of the label column; later cells exclude it from the feature lists
# and use it to split the classes for upsampling.
target = 'Exited'
target

In [None]:
# Print a short profile of every column: a few sample values, dtype,
# cardinality, and the category the column falls into under the rule used
# below (non-object dtype with more than 15 distinct values => "numerical").
# The conditional is computed outside the f-string: nesting double quotes
# inside a double-quoted f-string is a SyntaxError before Python 3.12.
for feature in df.columns:
    column = df[feature]
    unique_values = column.unique()
    n_unique = len(unique_values)
    falling_category = "numerical" if column.dtype != "O" and n_unique > 15 else "categorical"
    print(feature)
    print("-" * len(feature))
    print(
        f"samples:{unique_values.tolist()[:5]}\n"
        f"dtype:{column.dtype}, count:{n_unique}, falling_category:{falling_category}"
    )
    print("=" * 140)

In [None]:
# Partition the predictors (every column except the target):
#   numerical   -> non-object dtype with more than 15 distinct values
#   categorical -> every remaining predictor
predictors = [column for column in df.columns if column != target]
numerical_features = [
    column
    for column in predictors
    if df[column].dtype != "O" and len(df[column].unique()) > 15
]
categorical_features = [
    column for column in predictors if column not in numerical_features
]
print(f"numerical:{numerical_features}, count:{len(numerical_features)}")
print(f"categorical:{categorical_features}, count:{len(categorical_features)}")

In [None]:
# Profile only the columns classified as numerical: sample values, dtype and
# cardinality. The conditional is computed outside the f-string because
# nesting double quotes inside a double-quoted f-string is a SyntaxError
# before Python 3.12.
for feature in numerical_features:
    column = df[feature]
    unique_values = column.unique()
    n_unique = len(unique_values)
    falling_category = "numerical" if column.dtype != "O" and n_unique > 15 else "categorical"
    print(feature)
    print("-" * len(feature))
    print(
        f"samples:{unique_values.tolist()[:5]}\n"
        f"dtype:{column.dtype}, count:{n_unique}, falling_category:{falling_category}"
    )
    print("=" * 140)

In [None]:
# Profile the columns classified as categorical: sample values, dtype and
# cardinality. The conditional is computed outside the f-string because
# nesting double quotes inside a double-quoted f-string is a SyntaxError
# before Python 3.12.
for feature in categorical_features:
    column = df[feature]
    unique_values = column.unique()
    n_unique = len(unique_values)
    falling_category = "numerical" if column.dtype != "O" and n_unique > 15 else "categorical"
    print(feature)
    print("-" * len(feature))
    print(
        f"samples:{unique_values.tolist()[:5]}\n"
        f"dtype:{column.dtype}, count:{n_unique}, falling_category:{falling_category}"
    )
    print("=" * 140)

In [None]:
# Profile only the object-dtype (string) categorical columns, i.e. the ones
# that will need encoding.
# Inside the `dtype == 'O'` guard the original conditional
# (`dtype != "O" and count > 15`) can never be true, so the category is always
# "categorical"; spelling it out also removes the nested double quotes that
# are a SyntaxError in f-strings before Python 3.12.
for feature in categorical_features:
    column = df[feature]
    if column.dtype == 'O':
        unique_values = column.unique()
        falling_category = "categorical"
        print(feature)
        print("-" * len(feature))
        print(
            f"samples:{unique_values.tolist()[:5]}\n"
            f"dtype:{column.dtype}, count:{len(unique_values)}, falling_category:{falling_category}"
        )
        print("=" * 140)

In [None]:
# Redefine `categorical_features` to only the object-dtype columns.
# Any other column that is not in `numerical_features` is no longer listed
# here, so the ColumnTransformer below hands it over unchanged through
# remainder='passthrough'.
categorical_features = [feature for feature in df.columns if df[feature].dtype == 'O' and feature not in numerical_features]
categorical_features

In [None]:
# Class distribution of the label column before any resampling.
# Uses the `target` variable (set to 'Exited' above) instead of repeating the
# literal, consistent with the later value_counts cells.
df[target].value_counts()

In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler 
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline 

In [None]:
# Per-type preprocessing pipelines:
#   numerical   -> standardise with StandardScaler
#   categorical -> one-hot encode with OneHotEncoder (default settings)
numerical_pipeline = Pipeline(steps=[("scaler", StandardScaler())])
categorical_pipeline = Pipeline(steps=[("encoder", OneHotEncoder())])

In [None]:
# Route each feature group to its pipeline; columns not listed in either group
# are passed through unchanged by remainder='passthrough'.
# sparse_threshold=0 forces a dense ndarray result. With the default (0.3) the
# combined output becomes a scipy sparse matrix whenever the one-hot block
# makes the overall density low enough, and the later
# `pd.DataFrame(transformed_data, columns=...)` call expects a dense array.
ct = ColumnTransformer(
    [
        ('numerical_pipeline', numerical_pipeline, numerical_features),
        ('categorical_pipeline', categorical_pipeline, categorical_features),
    ],
    remainder='passthrough',
    sparse_threshold=0,
    n_jobs=-1,
    verbose=True,
    verbose_feature_names_out=True,
)

In [None]:
# Fit the transformer on the working frame and transform it in one step.
# NOTE(review): `df` still contains the target column here; it is excluded from
# `numerical_features`, so presumably it is carried through by
# remainder='passthrough'. Confirm it is not object dtype.
transformed_data = ct.fit_transform(df)

In [None]:
# Output column names; with verbose_feature_names_out=True each name is
# prefixed with "<transformer name>__".
ct.get_feature_names_out()

In [None]:
# Output column names with the "<transformer name>__" prefix removed.
# maxsplit=1 keeps the whole remainder, so a column whose own name contains
# "__" is not truncated at its second separator.
[name.split('__', 1)[1] for name in ct.get_feature_names_out()]

In [None]:
# Rebuild a DataFrame from the transformed array, naming columns by the
# transformer output names with the "<transformer name>__" prefix removed.
# maxsplit=1 keeps the whole remainder, so a column whose own name contains
# "__" is not truncated at its second separator.
# NOTE: this overwrites `df`; re-running the fit_transform cell afterwards
# would operate on the already transformed frame.
df = pd.DataFrame(
    transformed_data,
    columns=[name.split('__', 1)[1] for name in ct.get_feature_names_out()],
)

# Upsampling: balance the classes by resampling the minority class with replacement

In [None]:
from sklearn.utils import resample 

In [None]:
# Class counts of the target in the transformed frame, before upsampling.
df[target].value_counts()

In [None]:
# Label of the least frequent class.
# value_counts() sorts by frequency in descending order, so the last index
# entry is the minority label. The previous count -> label dict inversion
# collapsed classes with identical counts into a single key.
df[target].value_counts().index[-1]

In [None]:
# Label of the most frequent class.
# value_counts() sorts by frequency in descending order, so the first index
# entry is the majority label. The previous count -> label dict inversion
# collapsed classes with identical counts into a single key.
df[target].value_counts().index[0]

In [None]:
# Split the frame into the rows of the most and the least frequent class.
# value_counts() is sorted by frequency (descending), so the first/last index
# entries are the majority/minority labels. Reading them from the sorted index
# replaces the count -> label dict inversion, which was recomputed four times
# and, when two classes had the same count, made both frames select the same
# class.
class_counts = df[target].value_counts()
majority_label = class_counts.index[0]
minority_label = class_counts.index[-1]
df_majority = df[df[target] == majority_label]
df_minurity = df[df[target] == minority_label]

In [None]:
# Preview the majority-class rows.
df_majority.head()

In [None]:
# Preview the minority-class rows.
df_minurity.head()

In [None]:
# Sizes of the two class subsets before resampling.
df_majority.shape, df_minurity.shape

In [None]:
# Number of majority-class rows; used as the resampling target size below.
len(df_majority)

In [None]:
# Draw minority-class rows with replacement until there are as many as in the
# majority class; the fixed random_state makes the draw reproducible.
df_minurity_resampled = resample(
    df_minurity,
    n_samples=df_majority.shape[0],
    replace=True,
    random_state=42,
)

In [None]:
# Shape of the upsampled minority subset.
df_minurity_resampled.shape

In [None]:
# Both subsets should now have the same number of rows.
df_majority.shape, df_minurity_resampled.shape

In [None]:
# Check the container type returned by resample.
type(df_minurity_resampled)

In [None]:
# Preview the upsampled minority rows.
df_minurity_resampled.head()

In [None]:
# Stack the majority rows and the upsampled minority rows into one balanced
# frame. ignore_index=True assigns a fresh 0..n-1 index: rows drawn with
# replacement keep their original index labels, so without it the result
# carries duplicate labels and label-based lookups return several rows.
# NOTE: rows stay ordered by class (majority first); shuffle before any
# order-sensitive use.
df = pd.concat([df_majority, df_minurity_resampled], axis=0, ignore_index=True)
df.shape

In [None]:
# Class counts after upsampling; the two classes should now be equal in size.
df[target].value_counts()

In [None]:
# Column names of the balanced frame.
df.columns

In [None]:
# First rows of the balanced frame (majority-class rows were concatenated first).
df.head()

In [None]:
# Last rows of the balanced frame (the resampled minority rows were concatenated last).
df.tail()