In [None]:
import pandas as pd
import numpy as np



def load_data(file_path):
    """Read the CSV at ``file_path`` and return its contents as a pandas DataFrame."""
    frame = pd.read_csv(file_path)
    return frame

In [None]:
# Load the training CSV into the notebook-wide `df`.
df = load_data('data/bank_data_train.csv')
# Share of rows whose TARGET is null, as a percentage of all rows.
missing_pct = df['TARGET'].isnull().sum() / len(df) * 100

print(df.shape)
print(f"Percentage of missing TARGET values: {missing_pct:.2f}%")

In [None]:
def explore_data(df, target_col='TARGET'):
    """Print a quick overview of ``df`` and return it without the 'ID' column.

    The overview consists of ``df.info()``, the numeric and object
    ``describe()`` tables, a transposed table of per-column missing counts and
    percentages, and the value counts of ``target_col``.

    Args:
        df: DataFrame to summarise. It is never modified; dropping 'ID'
            produces a new frame.
        target_col: Name of the label column whose distribution is printed.
            Defaults to 'TARGET'.

    Returns:
        ``df`` itself when it has no 'ID' column, otherwise a new DataFrame
        without that column.
    """
    # NOTE(review): `display` is not imported in the visible code; presumably
    # the notebook's built-in display function - confirm when reusing elsewhere.
    print("DataFrame Info:")
    df.info()

    # drop unnecessary column if it exists
    if 'ID' in df.columns:
        df = df.drop(columns=['ID'])
        print("\nDropped column: 'ID'")
    else:
        print("\nColumn 'ID' not found; skipping drop.")

    print("\nDataFrame Description (numeric):")
    display(df.describe())

    print("\nDataFrame Description (object):")
    # describe(include=['object']) raises ValueError when the frame has no
    # object columns, so only call it when there is something to describe.
    if df.select_dtypes(include=['object']).shape[1] > 0:
        display(df.describe(include=['object']))
    else:
        print("No object columns to describe.")

    # Per-column missing counts and their share of all rows.
    print("\nMissing Values:")
    df_info = pd.DataFrame(df.isnull().sum(), columns=['Missing Values'])
    df_info['Percentage'] = (df_info['Missing Values'] / len(df)) * 100

    # switch columns into rows for a more compact display
    df_info = df_info.transpose()
    display(df_info)

    # check the distribution of the target column, if the frame has one
    if target_col in df.columns:
        print(df[target_col].value_counts())
    else:
        print(f"\nColumn '{target_col}' not found; skipping target distribution.")
    return df

# Rebind df to the frame returned by explore_data (which drops 'ID' when present).
df = explore_data(df)


In [None]:
# Show the (rows, columns) shape of the current df.
df.shape

In [None]:
# Handle data types

# def convert_data_types(df):
#     # get columns with object data type
#     obj_cols = df.select_dtypes(include=['object']).columns.tolist()
#     print(f"Object columns to convert: {obj_cols}")

    
#     # Apply one-hot encoding to categorical columns
#     clean_df = pd.get_dummies(df,
#                         columns=obj_cols,
#                         drop_first=True).astype(float)
#     print("Converted object columns to numerical using one-hot encoding.")
#     return clean_df



# df = convert_data_types(df.copy())



In [None]:
# Distinct non-null values of APP_MARITAL_STATUS.
# dropna() keeps NaN out of the result; a per-value guard of the form
# `pd.notnull(val) or val not in existing_values` is true for every first-seen
# value and therefore lets NaN through.
existing_values = set(df['APP_MARITAL_STATUS'].dropna().unique())
existing_values

In [None]:
# For categorical columns, use Chi-Square test or Cramér's V
from scipy.stats import chi2_contingency
import numpy as np

def cramers_v(x, y):
    """Measure association between categorical x and binary y (TARGET).

    Cross-tabulates ``x`` against ``y`` with ``pd.crosstab``, runs
    ``chi2_contingency`` on the table and returns
    ``sqrt((chi2 / n) / min(k - 1, r - 1))``, where ``n`` is the table total
    and ``r`` / ``k`` are its row / column counts.

    Args:
        x: Categorical values, one per observation.
        y: Target values aligned with ``x``.

    Returns:
        The statistic as a float, or ``np.nan`` when it is undefined: no
        observations, or a table with fewer than two rows or two columns
        (the denominator ``min(k - 1, r - 1)`` would not be positive).
    """
    # Nothing to cross-tabulate: avoid chi2_contingency raising on an empty table.
    if len(x) == 0 or len(y) == 0:
        return np.nan

    confusion_matrix = pd.crosstab(x, y)
    r, k = confusion_matrix.shape
    min_dim = min(k - 1, r - 1)
    # A single row/column (or an empty table after NaN pairs are excluded)
    # makes the statistic undefined instead of a division by zero.
    if min_dim < 1:
        return np.nan

    chi2 = chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    phi2 = chi2 / n
    return np.sqrt(phi2 / min_dim)

# Association between APP_MARITAL_STATUS and TARGET, measured only on the
# rows where both values are present.
df_temp = df.dropna(subset=['APP_MARITAL_STATUS', 'TARGET'])[['APP_MARITAL_STATUS', 'TARGET']]
correlation = cramers_v(x=df_temp['APP_MARITAL_STATUS'], y=df_temp['TARGET'])
print(f"Cramér's V: {correlation:.3f}")


In [None]:
# For columns with >70% missing values, report the missing percentage and the
# association with TARGET: Cramér's V for object columns, Series.corr for the rest.
# The "drop when correlation is between -0.05 and 0.05" rule is only reported
# on here; nothing is dropped in this cell.
high_missing = df.columns[df.isnull().sum() / len(df) > 0.70]
for col in high_missing:
    if col == 'TARGET':
        continue

    if df[col].dtype == 'object':
        # categorical column: compare only the rows where the column is present
        present = df[col].notnull()
        corr = cramers_v(df.loc[present, col], df.loc[present, 'TARGET'])
    else:
        # numerical column: coerce to numeric, then correlate with TARGET
        series_num = pd.to_numeric(df[col], errors='coerce')
        corr = series_num.corr(df['TARGET'])

    missing_pct = df[col].isnull().sum() / len(df) * 100
    # corr is NaN when it cannot be computed; show that as "N/A"
    corr_str = f"{corr:.3f}" if pd.notnull(corr) else "N/A"
    print(f"{col}: Missing={missing_pct:.1f}%, Correlation={corr_str}")



In [None]:
# For categorical columns, use Chi-Square test or Cramér's V
from scipy.stats import chi2_contingency
import numpy as np

def cramers_v(x, y):
    """Measure association between categorical x and binary y (TARGET).

    Cross-tabulates ``x`` against ``y`` with ``pd.crosstab``, runs
    ``chi2_contingency`` on the table and returns
    ``sqrt((chi2 / n) / min(k - 1, r - 1))``, where ``n`` is the table total
    and ``r`` / ``k`` are its row / column counts.

    Args:
        x: Categorical values, one per observation.
        y: Target values aligned with ``x``.

    Returns:
        The statistic as a float, or ``np.nan`` when it is undefined: no
        observations, or a table with fewer than two rows or two columns
        (the denominator ``min(k - 1, r - 1)`` would not be positive).
    """
    # Nothing to cross-tabulate: avoid chi2_contingency raising on an empty table.
    if len(x) == 0 or len(y) == 0:
        return np.nan

    confusion_matrix = pd.crosstab(x, y)
    r, k = confusion_matrix.shape
    min_dim = min(k - 1, r - 1)
    # A single row/column (or an empty table after NaN pairs are excluded)
    # makes the statistic undefined instead of a division by zero.
    if min_dim < 1:
        return np.nan

    chi2 = chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    phi2 = chi2 / n
    return np.sqrt(phi2 / min_dim)

# Association between APP_MARITAL_STATUS and TARGET, measured only on the
# rows where both values are present.
df_temp = df.dropna(subset=['APP_MARITAL_STATUS', 'TARGET'])[['APP_MARITAL_STATUS', 'TARGET']]
correlation = cramers_v(x=df_temp['APP_MARITAL_STATUS'], y=df_temp['TARGET'])
print(f"Cramér's V: {correlation:.3f}")


In [None]:
# Reload the training CSV, replacing whatever df the earlier cells produced.
df = load_data('data/bank_data_train.csv')

In [None]:
import numpy as np


def normalize_categorical_columns(df):
    """
    Normalize the object-dtype columns of ``df`` by upper-casing their text
    and stripping surrounding whitespace.

    Values that were missing on input stay missing: they are never turned into
    text such as 'NONE' or '<NA>'. The literal text 'NAN' (after upper-casing)
    and empty / whitespace-only strings are converted to NaN as well.

    Args:
        df: DataFrame to normalize. Its object columns are overwritten in place.

    Returns:
        The same ``df`` object, for convenient reassignment by the caller.
    """
    cat_cols = df.select_dtypes(include=['object']).columns.tolist()

    for col in cat_cols:
        # Remember which entries were really present before stringifying,
        # because astype(str) would turn None / NaN into ordinary text.
        was_present = df[col].notna()
        text = df[col].astype(str).str.upper().str.strip()
        # Treat the 'NAN' token and empty strings as missing values
        text = text.replace(['NAN', ''], np.nan)
        df[col] = text.where(was_present, np.nan)

    return df
# Rebind df to the frame returned by normalize_categorical_columns.
df = normalize_categorical_columns(df)

In [None]:
print(df['APP_MARITAL_STATUS'].value_counts())
# count null values
print("null values",df['APP_MARITAL_STATUS'].isnull().sum())
print("*******************************")
# Distinct non-null values of the column. dropna() keeps NaN out of the set;
# nulls are already reported by the count printed above.
existing_values = set(df['APP_MARITAL_STATUS'].dropna().unique())
existing_values

In [None]:
# Distinct non-null values of APP_MARITAL_STATUS.
# A set is used because a list has no .add() method (calling it raises
# AttributeError), and dropna() keeps NaN out of the result.
existing_values = set(df['APP_MARITAL_STATUS'].dropna().unique())
existing_values

In [None]:
# Handle data types

def convert_data_types(df):
    """Convert object columns to numeric or one-hot columns.

    Steps, applied to a copy so the caller's frame is left untouched:
      1. Object columns where more than 90% of the rows parse as numbers are
         replaced by their numeric version (unparseable entries become NaN).
      2. Remaining object columns with fewer than 10 distinct values are
         one-hot encoded with ``drop_first=True``.
      3. Object columns with 10 or more distinct values are kept as they are
         and reported.

    Args:
        df: Input DataFrame.

    Returns:
        A new DataFrame with the conversions applied.
    """
    df = df.copy()  # do not write converted columns back into the caller's frame
    print("Original shape:", df.shape)

    # Get object columns
    obj_cols = df.select_dtypes(include=['object']).columns.tolist()
    print(f"Object columns: {obj_cols}")

    # First, try to convert object columns to numeric where possible
    for col in obj_cols:
        try:
            converted = pd.to_numeric(df[col], errors='coerce')
        except (TypeError, ValueError) as exc:
            # Leave the column as-is, but say so instead of hiding the error.
            print(f"Could not convert {col} to numeric: {exc}")
            continue
        # Use the numeric version only when more than 90% of rows converted
        if converted.notna().mean() > 0.9:
            df[col] = converted
            print(f"Converted {col} to numeric")

    # Update object columns list after numeric conversion
    obj_cols = df.select_dtypes(include=['object']).columns.tolist()

    # Only low-cardinality columns (fewer than 10 unique values) are one-hot encoded
    cat_cols_to_encode = []
    for col in obj_cols:
        unique_count = df[col].nunique()
        if unique_count < 10:
            cat_cols_to_encode.append(col)
        else:
            print(f"Skipping one-hot for {col}: {unique_count} unique values")

    print(f"Categorical columns to one-hot encode: {cat_cols_to_encode}")

    # Apply one-hot encoding only to selected categorical columns
    df = pd.get_dummies(df, columns=cat_cols_to_encode, drop_first=True)

    # High-cardinality categorical columns are left untouched here and need a
    # different encoding strategy.

    print("Final shape:", df.shape)
    return df



# Keep the converted frame under its own name, df_cleaned.
df_cleaned = convert_data_types(df)



In [None]:

# Print the df_cleaned columns whose name begins with 'APP_MARITAL_STATUS'.
for col in df_cleaned.columns:
    if not col.startswith('APP_MARITAL_STATUS'):
        continue
    print(col)
        

In [None]:
# check existence
print('CLNT_TRUST_RELATION' in df.columns)

# List the columns sharing the CLNT_TRUST_RELATION prefix (at most 20) and
# preview their first rows. Only a cell's final expression is echoed, so these
# mid-cell results must be passed to display() to be shown at all.
display([d for d in df.columns if d.startswith('CLNT_TRUST_RELATION')][:20])
display(df.filter(regex='^CLNT_TRUST_RELATION').head())

display(df.head())
obj_cols = df.select_dtypes(include=['object']).columns.tolist()
print(f"Object columns to convert: {obj_cols}")


# print(df.info())

In [None]:
# Report every column of df whose dtype is neither int64 nor float64.
for col in df:
    column_type = df[col].dtype
    if column_type == 'int64' or column_type == 'float64':
        continue
    print(f"{col}: {column_type}")

In [None]:
# Prints a fixed string; no other visible cell depends on this one.
print("hello")

In [None]:
# def clean_and_encode(df):
    
    # print("Initial data types:")
    # for col in df.columns:
    #     if df[col].dtype != 'int64' and df[col].dtype != 'float64':
    #         print(f"  {col}: {df[col].dtype}")
    
    # Separate columns by type
# Names of the object-dtype and bool-dtype columns of df; the encoding cell
# further down reads both lists.
object_cols = df.select_dtypes(include=['object']).columns.tolist()
bool_cols = df.select_dtypes(include=['bool']).columns.tolist()


In [None]:

# Summarise the two column lists: every object column, but only the first five boolean ones.
print(f"\nObject columns to encode ({len(object_cols)}): {object_cols}")
print(f"Boolean columns to convert to int ({len(bool_cols)}): {bool_cols[:5]}...")  # Show first 5



In [None]:
# Print each APP_MARITAL_STATUS value, skipping the null ones.
for row in df['APP_MARITAL_STATUS']:
    if pd.isnull(row):
        continue
    print(row)
# print(df['APP_TRAVEL_PASS'])

In [None]:
# One-hot encode the object columns collected earlier (first level dropped).
if len(object_cols) > 0:
    df = pd.get_dummies(df, columns=object_cols, drop_first=True, dtype=int)

# Turn boolean columns into integers.
if len(bool_cols) > 0:
    df[bool_cols] = df[bool_cols].astype(int)

# Check which columns, if any, are still outside the listed int/float dtypes.
print("\nFinal data types:")
numeric_cols = df.select_dtypes(include=['int64', 'int32', 'int', 'float64', 'float32', 'float']).columns
non_numeric = [name for name in df.columns if name not in numeric_cols]

if not non_numeric:
    print("All columns are now numeric!")
else:
    print(f"Warning: {len(non_numeric)} non-numeric columns remain: {non_numeric}")

print(f"Final shape: {df.shape}")
#     return df

# # Apply the cleaning
# df = clean_and_encode(df.copy())

In [None]:
# Reload the training CSV, replacing the df modified by the earlier encoding cells.
df = load_data('data/bank_data_train.csv')

def convert_data_types_fixed(df, target_col='TARGET'):
    """Return a converted copy of ``df`` with 'ID' and ``target_col`` left unencoded.

    'ID' and ``target_col`` are set aside when present and re-attached at the
    end ('ID' as the first column, the target as the last). Among the other
    columns, object columns where more than 90% of rows parse as numbers become
    numeric, object columns with fewer than 10 distinct values are one-hot
    encoded (``drop_first=True``, integer dummies), and object columns with 10
    or more distinct values are dropped with a printed notice.
    """
    work = df.copy()

    # Set aside the identifier and the label so they skip the conversion.
    has_id = 'ID' in work.columns
    has_target = target_col in work.columns
    ids = work['ID'].copy() if has_id else None
    y = work[target_col].copy() if has_target else None

    reserved = [name for name, present in (('ID', has_id), (target_col, has_target)) if present]
    if reserved:
        work = work.drop(columns=reserved)

    # Object columns that are mostly numeric text become numeric.
    for col in work.select_dtypes(include=['object']).columns.tolist():
        conv = pd.to_numeric(work[col], errors='coerce')
        if conv.notna().mean() > 0.9:
            work[col] = conv

    # Split what is still categorical by number of distinct values.
    cat_cols = work.select_dtypes(include=['object']).columns.tolist()
    cardinality = {c: work[c].nunique() for c in cat_cols}
    low_card = [c for c in cat_cols if cardinality[c] < 10]
    high_card = [c for c in cat_cols if cardinality[c] >= 10]

    if low_card:
        work = pd.get_dummies(work, columns=low_card, drop_first=True, dtype=int)

    if high_card:
        print(f"Dropping high cardinality columns: {high_card}")
        work = work.drop(columns=high_card)

    # Put the identifier and label back where they belong.
    if has_id:
        work.insert(0, 'ID', ids)
    if has_target:
        work[target_col] = y

    return work

# Apply to single dataframe: df is rebound to the converted copy the function returns.
df = convert_data_types_fixed(df)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Count plot of the rows per TARGET value.
sns.countplot(data=df, x='TARGET').set_title('Distribution of Churn Target Variable')
plt.show()





In [None]:
import category_encoders as ce
import pandas as pd

def encode_categorical_data(df, target):
    """
    Convert object columns to appropriate numeric representations.

    Works on a copy of ``df``; the caller's frame is never modified.
      * Object columns where more than 90% of rows parse as numbers become numeric.
      * Object columns that are entirely null are dropped.
      * Object columns with fewer than 10 distinct values are one-hot encoded
        (``drop_first=True``).
      * The remaining object columns are target-encoded into ``<col>_ENCODED``
        (rows without an encoding get the global TARGET mean) and the original
        columns are dropped.

    Note: For production, split train/test BEFORE calling this function
    and fit encoders only on training data.

    Args:
        df: Feature DataFrame.
        target: DataFrame with a 'TARGET' column, indexed like ``df``.

    Returns:
        A new DataFrame with the encodings applied.
    """
    # Work on a copy: writing converted or *_ENCODED columns into the caller's
    # frame would leak target-derived features into whatever uses it next.
    df = df.copy()
    print("Original shape:", df.shape)

    # Get object columns
    obj_cols = df.select_dtypes(include=['object']).columns.tolist()
    print(f"Object columns: {obj_cols}")

    # Try to convert object columns to numeric where possible
    for col in obj_cols:
        try:
            converted = pd.to_numeric(df[col], errors='coerce')
        except (TypeError, ValueError) as exc:
            print(f"Could not convert {col} to numeric: {exc}")
            continue
        if converted.notna().mean() > 0.9:
            df[col] = converted
            print(f"Converted {col} to numeric")

    # Update object columns list after numeric conversion
    obj_cols = df.select_dtypes(include=['object']).columns.tolist()

    # Separate columns by cardinality
    cat_cols_to_encode = []  # Low cardinality (one-hot)
    high_card_cols = []      # High cardinality (target encode)
    cols_to_drop = []        # All null columns

    for col in obj_cols:
        # Skip columns with all nulls
        if df[col].isnull().all():
            print(f"Skipping {col} (all null)")
            cols_to_drop.append(col)
            continue

        unique_count = df[col].nunique(dropna=True)

        # Categorize by cardinality
        if unique_count < 10:
            cat_cols_to_encode.append(col)
            print(f"Low cardinality {col}: {unique_count} unique values → One-hot encode")
        else:
            high_card_cols.append(col)
            print(f"High cardinality {col}: {unique_count} unique values → Target encode")

    # Drop all-null columns
    if cols_to_drop:
        df = df.drop(columns=cols_to_drop)
        print(f"Dropped all-null columns: {cols_to_drop}")

    # Target encode high-cardinality columns
    if high_card_cols:
        print(f"\nApplying Target Encoding to: {high_card_cols}")

        for col in high_card_cols:
            encoded_name = f"{col}_ENCODED"
            try:
                # Rows where both the category and the target are known
                valid_mask = df[col].notna() & target['TARGET'].notna()

                encoder = ce.TargetEncoder(cols=[col], smoothing=1.0)

                # Fit and transform on valid rows only
                df.loc[valid_mask, encoded_name] = encoder.fit_transform(
                    df.loc[valid_mask, [col]],
                    target.loc[valid_mask, 'TARGET']
                )[col]

                # Rows without an encoding get the global mean of the target.
                # Assigned back explicitly: a chained fillna(inplace=True) may
                # act on a temporary copy and leave the column unchanged.
                global_mean = target['TARGET'].mean()
                df[encoded_name] = df[encoded_name].fillna(global_mean)

                print(f"✓ Target encoded: {col}")

            except Exception as e:
                # Best effort: report the failure and continue with the next column.
                print(f"✗ Target encoding failed for {col}: {e}")
                continue

        # Drop original high-cardinality columns after encoding
        df = df.drop(columns=high_card_cols)

    # One-hot encode low-cardinality columns
    if cat_cols_to_encode:
        print(f"\nOne-hot encoding: {cat_cols_to_encode}")
        df = pd.get_dummies(df, columns=cat_cols_to_encode, drop_first=True)

    print(f"\nFinal shape: {df.shape}")
    return df


In [None]:
# Test conversion: the same frame is passed as both the features and the target source.
ds = encode_categorical_data(df, df)



# List any column of the result whose dtype is neither int64 nor float64.
for col in ds:
    column_type = ds[col].dtype
    if column_type != 'int64' and column_type != 'float64':
        print(f"{col}: {column_type}")


# Count how often each non-null value occurs in CLNT_JOB_POSITION_ENCODED.
# NOTE(review): this lookup needs encode_categorical_data to have produced a
# CLNT_JOB_POSITION_ENCODED column - confirm it exists for the df used here.
existing_values = {}
for val in ds['CLNT_JOB_POSITION_ENCODED']:
    if pd.isnull(val):
        continue
    existing_values[val] = existing_values.get(val, 0) + 1
# cramers_v cross-tabulates its inputs, so each distinct encoded value acts as its own category.
corr = cramers_v(ds['CLNT_JOB_POSITION_ENCODED'].dropna(), ds.loc[ds['CLNT_JOB_POSITION_ENCODED'].notnull(), 'TARGET'])
missing_pct = ds['CLNT_JOB_POSITION_ENCODED'].isnull().sum() / len(ds) * 100
print(f"CLNT_JOB_POSITION: Missing={missing_pct:.1f}%, Correlation={corr:.3f}")
print(f"existing_values length: {len(existing_values)}")
existing_values

In [None]:
import category_encoders as ce
import pandas as pd

def fit_encoders(X_train, y_train):
    """
    Learn the encoding plan from the training data and return it as a dict.

    The dict has the keys 'numeric_conversions' (object columns where more than
    90% of rows parse as numbers), 'low_card_cols' (fewer than 10 distinct
    values), 'high_card_cols' (10 or more), 'target_encoder' (a TargetEncoder
    fitted on the high-cardinality columns, or None when there are none) and
    'global_mean' (mean of ``y_train['TARGET']``).
    """
    object_columns = X_train.select_dtypes(include=['object']).columns.tolist()

    # Columns that are really numbers stored as text
    numeric_conversions = {
        col: True
        for col in object_columns
        if pd.to_numeric(X_train[col], errors='coerce').notna().mean() > 0.9
    }
    print("numeric conversion", numeric_conversions)

    # Bucket the remaining, not entirely null, columns by cardinality
    low_card_cols, high_card_cols = [], []
    for col in object_columns:
        if col in numeric_conversions or X_train[col].isnull().all():
            continue
        bucket = low_card_cols if X_train[col].nunique(dropna=True) < 10 else high_card_cols
        bucket.append(col)

    # The target encoder is fitted only on rows where every high-cardinality
    # column and the target are present
    target_encoder = None
    if high_card_cols:
        target_encoder = ce.TargetEncoder(cols=high_card_cols, smoothing=1.0)
        rows_ok = X_train[high_card_cols].notna().all(axis=1) & y_train['TARGET'].notna()
        target_encoder.fit(X_train.loc[rows_ok, high_card_cols], y_train.loc[rows_ok, 'TARGET'])

    return dict(
        numeric_conversions=numeric_conversions,
        low_card_cols=low_card_cols,
        high_card_cols=high_card_cols,
        target_encoder=target_encoder,
        global_mean=y_train['TARGET'].mean(),
    )


def transform_with_encoders(X, encoders_dict):
    """
    Transform data using pre-fitted encoders.

    Args:
        X: Feature DataFrame; it is copied, never modified.
        encoders_dict: Dict with the keys 'numeric_conversions',
            'low_card_cols', 'high_card_cols', 'target_encoder' and
            'global_mean', as returned by ``fit_encoders``.

    Returns:
        A new DataFrame where the listed numeric-like columns are numeric,
        entirely null object columns are dropped, high-cardinality columns are
        replaced by their target encoding (missing encodings filled with the
        global mean) and low-cardinality columns are one-hot encoded
        (``drop_first=True``).
    """
    X = X.copy()

    # Apply numeric conversions to the columns this frame actually has
    for col in encoders_dict['numeric_conversions']:
        if col in X.columns:
            X[col] = pd.to_numeric(X[col], errors='coerce')

    # Drop all-null object columns
    obj_cols = X.select_dtypes(include=['object']).columns
    all_null_cols = [col for col in obj_cols if X[col].isnull().all()]
    if all_null_cols:
        X = X.drop(columns=all_null_cols)

    # Target encode high-cardinality columns
    if encoders_dict['high_card_cols'] and encoders_dict['target_encoder']:
        high_card_cols = encoders_dict['high_card_cols']
        global_mean = encoders_dict['global_mean']
        print(f"\n\nTarget encoding: {high_card_cols}\n\n")

        # Run the fitted encoder once on the original values instead of once
        # per column on a partially overwritten frame.
        try:
            encoded = encoders_dict['target_encoder'].transform(X[high_card_cols])
        except Exception as exc:
            # Best effort: if the transform fails, every column falls back to the global mean.
            print(f"Target encoding failed ({exc}); falling back to the global mean")
            encoded = None

        for col in high_card_cols:
            if col not in X.columns:
                continue
            print(f"Encoding column: {col}")
            print()
            if encoded is None:
                X[col] = global_mean
            else:
                # Fill unseen / missing encodings with the global mean. Assigned
                # back explicitly: a chained fillna(inplace=True) may act on a
                # temporary copy and leave the column unchanged.
                X[col] = encoded[col].fillna(global_mean)

    # One-hot encode the low-cardinality columns still present (an all-null one
    # may have been dropped above, and get_dummies raises on unknown columns)
    low_card_cols = [col for col in encoders_dict['low_card_cols'] if col in X.columns]
    if low_card_cols:
        print(f"\nOne-hot encoding: {low_card_cols}")
        X = pd.get_dummies(X, columns=low_card_cols, drop_first=True)

    return X


In [None]:
# split data train test
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set, stratified on TARGET.
y = pd.DataFrame({'TARGET': df['TARGET']})
X = df.drop(columns=['TARGET'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [None]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# Split data
y = df[['TARGET']]
X = df.drop(columns=['TARGET'])

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")

# Fit encoders on TRAINING data only
print("\n" + "="*80)
print("Fitting encoders on training data...")
print("="*80)
encoders = fit_encoders(X_train, y_train)
for key, value in encoders.items():
    print(f"{key}: {value}")

# Transform both train and test using the SAME encoders
print("\nTransforming training data...")
X_train_encoded = transform_with_encoders(X_train, encoders)

print("\nTransforming test data...")
X_test_encoded = transform_with_encoders(X_test, encoders)


print(f"\nAfter encoding - Train: {X_train_encoded.shape}, Test: {X_test_encoded.shape}")

# Align columns (ensure train and test have the same columns, in the same order)
print("\nAligning train/test columns...")
train_cols = set(X_train_encoded.columns)
test_cols = set(X_test_encoded.columns)

# Report the differences in a stable (sorted) order
for col in sorted(train_cols - test_cols):
    print(f"Added missing column to test: {col}")
for col in sorted(test_cols - train_cols):
    print(f"Removed extra column from test: {col}")

# One reindex adds the missing columns (filled with 0), removes the extra ones
# and puts the test columns in the training order, so features line up by
# position as well as by name.
X_test_encoded = X_test_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)
#-------------------------------------------------------------------------------------

# # Reorder test columns to match train
# X_test_encoded = X_test_encoded[X_train_encoded.columns]

# print(f"\nAfter alignment - Train: {X_train_encoded.shape}, Test: {X_test_encoded.shape}")

# # Impute missing values in BOTH train and test
# print("\n" + "="*80)
# print("Imputing missing values...")
# print("="*80)

# # Calculate means from TRAINING data only
# imputation_values = X_train_encoded.mean()

# # Apply to both train and test
# for col in X_train_encoded.columns:
#     if X_train_encoded[col].isnull().any():
#         X_train_encoded[col].fillna(imputation_values[col], inplace=True)
#         print(f"Imputed train {col}: {imputation_values[col]:.4f}")
    
#     if X_test_encoded[col].isnull().any():
#         X_test_encoded[col].fillna(imputation_values[col], inplace=True)
#         print(f"Imputed test {col}: {imputation_values[col]:.4f}")

# # Verify no nulls remain
# print(f"\nTrain nulls: {X_train_encoded.isnull().sum().sum()}")
# print(f"Test nulls: {X_test_encoded.isnull().sum().sum()}")

# # Apply SMOTE to training data only
# print("\n" + "="*80)
# print("Applying SMOTE to training data...")
# print("="*80)
# print(f"Before SMOTE: {y_train['TARGET'].value_counts().to_dict()}")

# smote = SMOTE(sampling_strategy='minority', random_state=42)
# X_train_balanced, y_train_balanced = smote.fit_resample(X_train_encoded, y_train)

# print(f"After SMOTE: {y_train_balanced['TARGET'].value_counts().to_dict()}")
# print(f"Train shape: {X_train_balanced.shape}")

# print("\n" + "="*80)
# print("Data preprocessing complete!")
# print("="*80)
# print(f"Final shapes:")
# print(f"  X_train_balanced: {X_train_balanced.shape}")
# print(f"  y_train_balanced: {y_train_balanced.shape}")
# print(f"  X_test_encoded: {X_test_encoded.shape}")
# print(f"  y_test: {y_test.shape}")
