In [2]:
import pandas as pd

# Read the three pickled training tables in one pass.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
train_1, train_2_1, train_2_2 = (
    pd.read_pickle(path)
    for path in (
        'train_1_imputed.bin',
        'train_2_1_imputed.bin',
        'train_2_2_imputed.bin',
    )
)

In [17]:
# Labels come from a CSV; the test table itself is stored as a pickle.
# The two reads are independent of each other.
labels = pd.read_csv(filepath_or_buffer='test_labels.csv')
test = pd.read_pickle(filepath_or_buffer='test_1_imputed.bin')

In [None]:
# Display train_1 as loaded, before the rows of `test` are appended to it further down.
train_1

In [19]:
# Attach the labels to the test rows via the shared 'loan_id' key.
# The join type is pandas' default (inner), so test rows without a matching label are dropped.
# NOTE(review): `test` is rebound from itself, so re-running this cell merges the labels a second time.
test = pd.merge(test, labels, on='loan_id')

In [None]:
# Display test after the labels have been merged in on 'loan_id'.
test

In [22]:
# Stack the labelled test rows underneath train_1 (row-wise concatenation; original index values are kept).
# NOTE(review): train_1 is rebound from itself, so re-running this cell appends `test` again.
train_1 = pd.concat([train_1, test])

In [None]:
# Display train_1 now that the rows of `test` have been concatenated onto it.
train_1

In [24]:

# Step 1: collapse train_2_1 to one row per 'id'. Every column other than the
# key and 'add_431' is summarised with mean, sum, max and min.
adict = {
    name: ['mean', 'sum', 'max', 'min']
    for name in train_2_1.columns
    if name not in ('id', 'add_431')
}
agg_train_2_1 = train_2_1.groupby('id').agg(adict).reset_index()

# agg() yields two-level (column, statistic) labels; join each pair into one
# flat name. The key column's second level is empty, hence the strip('_').
agg_train_2_1.columns = ['_'.join(pair).strip('_') for pair in agg_train_2_1.columns]

# Same aggregation for train_2_2.
adict = {
    name: ['mean', 'sum', 'max', 'min']
    for name in train_2_2.columns
    if name not in ('id', 'add_431')
}
agg_train_2_2 = train_2_2.groupby('id').agg(adict).reset_index()

# Flatten as above, prefixing every name with 't2_' (the train_2_1 names carry
# no prefix), then give the key column its plain name back.
agg_train_2_2.columns = ['t2_' + '_'.join(pair).strip('_') for pair in agg_train_2_2.columns]
agg_train_2_2 = agg_train_2_2.rename(columns={'t2_id': 'id'})

In [None]:
# Display the per-'id' aggregates built from train_2_2.
agg_train_2_2

In [26]:
# Left-join both aggregate tables onto train_1 by 'id'; every train_1 row is kept,
# and ids absent from an aggregate table get NaN in that table's columns.
train_1 = (
    train_1
    .merge(agg_train_2_1, how='left', on='id')
    .merge(agg_train_2_2, how='left', on='id')
)

In [None]:
# Display train_1 after both aggregate tables have been left-joined on 'id'.
train_1

In [None]:
def drop_high_nan_columns(df, threshold=0.8):
    """Drop the columns whose share of NaN values exceeds ``threshold``.

    The previous implementation passed ``len(df) * threshold`` as ``thresh`` to
    ``DataFrame.dropna``. ``thresh`` is the minimum number of *non-NaN* values a
    column needs in order to be kept, so with the default of 0.8 every column
    that was more than 20% NaN was removed -- the opposite of what the function
    name and its comment describe. The NaN fraction is now compared directly.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified.
    threshold : float, default 0.8
        Largest NaN fraction (0..1) a column may have and still be kept.

    Returns
    -------
    pandas.DataFrame
        ``df`` restricted to the columns with a NaN fraction <= ``threshold``.
    """
    if len(df) == 0:
        # With no rows there is no NaN fraction to measure; keep every column.
        return df.copy()

    nan_fraction = df.isna().mean(axis=0)
    # Positional boolean mask, so duplicated column labels cannot confuse the selection.
    keep = (nan_fraction <= threshold).to_numpy()
    return df.loc[:, keep]

# Filter the columns of train_1 with drop_high_nan_columns (default threshold),
# then report the resulting shape.
train_1 = train_1.pipe(drop_high_nan_columns)

print(train_1.shape)

def drop_all_zero_columns(df):
    """Return ``df`` restricted to the columns holding at least one value that is not equal to 0.

    NaN compares as "not equal to 0", so a column made only of zeros and NaN is kept.
    """
    has_nonzero = df.ne(0).any(axis=0)
    return df.loc[:, has_nonzero]

# Remove the columns of train_1 that contain nothing but zeros, then report the resulting shape.
train_1 = train_1.pipe(drop_all_zero_columns)

print(train_1.shape)

In [29]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer  # Needed to enable IterativeImputer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
import pickle

def impute_numeric_categorical(data, max_iter=10, random_state=0):
    """Fill missing values in the numeric and category-dtype columns of ``data``.

    * Numeric columns (``select_dtypes(include=['number'])``) are filled with the
      column mean using ``SimpleImputer``. This covers *every* numeric column,
      including any identifier or target column present in ``data``.
    * Category-dtype columns are ordinal-encoded, filled with the most frequent
      code, and decoded back to their original values.
    * All other columns (e.g. object dtype) are left unchanged.

    Changes relative to the previous version:

    * The work is done on a copy, so the caller's frame is no longer mutated
      (this also avoids chained-assignment warnings when ``data`` is a slice of
      another frame).
    * Columns with no observed value at all are skipped. ``SimpleImputer``
      discards such columns from its output by default, which made the
      assignment back into the frame fail with a shape mismatch.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to impute.
    max_iter : int, default 10
        Accepted for backward compatibility; currently unused because the
        numeric imputer is a mean ``SimpleImputer``, not an ``IterativeImputer``.
    random_state : int, default 0
        Accepted for backward compatibility; currently unused for the same reason.

    Returns
    -------
    tuple
        ``(imputed_frame, imputers)`` where ``imputers`` is a dict with the keys
        ``'numerical'``, ``'categorical'`` and ``'encoder'`` holding the
        estimator objects (fitted only if matching columns existed).
    """
    # Work on a copy so the caller's frame is left untouched.
    data = data.copy()

    def _observed(cols):
        # Keep only the columns that contain at least one non-missing value.
        has_value = data[cols].notna().any(axis=0).to_numpy(dtype=bool)
        return cols[has_value]

    # Step 1: pick the numeric and category-dtype columns that can be imputed.
    num_cols = _observed(data.select_dtypes(include=['number']).columns)
    cat_cols = _observed(data.select_dtypes(include=['category']).columns)

    imputer_num = SimpleImputer(strategy='mean')
    encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    imputer_cat = SimpleImputer(strategy='most_frequent')

    # Step 2: mean-impute the numeric columns.
    if len(num_cols) > 0:
        data[num_cols] = imputer_num.fit_transform(data[num_cols])

    # Step 3: impute the categorical columns on their ordinal codes.
    if len(cat_cols) > 0:
        # Encode to numeric codes so SimpleImputer can operate on them.
        encoded_cat_data = encoder.fit_transform(data[cat_cols])

        # Replace missing codes with the most frequent code of each column.
        imputed_cat_data = imputer_cat.fit_transform(encoded_cat_data)

        # Map the codes back to the original category values.
        # NOTE(review): the decoded values are assigned from a plain array, so these
        # columns presumably no longer carry the 'category' dtype afterwards -- confirm.
        data[cat_cols] = encoder.inverse_transform(imputed_cat_data)

    imputers = {'numerical': imputer_num, 'categorical': imputer_cat, 'encoder': encoder}
    return data, imputers


# Impute the numeric and category-dtype columns of train_1.
# max_iter=8 is passed along, but impute_numeric_categorical never reads it.
imputed_data, imputers_train_1 = impute_numeric_categorical(data=train_1, max_iter=8)

# Persist the imputed table.
imputed_data.to_pickle(path='train_1_agg_imputed.bin')

# Persist the dict of imputer/encoder objects returned alongside it.
with open('imputers2.pkl', mode='wb') as f:
    pickle.dump(imputers_train_1, f)

In [30]:
# Target: the 'label' column.
# NOTE(review): if 'label' is numeric it went through the mean imputation above
# together with every other numeric column -- confirm this is intended.
y = imputed_data['label']
# Features: every column except 'id', 'label' and 'loan_id'.
X = imputed_data.drop(['id', 'label', 'loan_id'], axis=1)

In [None]:
# Display the feature frame X (imputed_data without 'id', 'label' and 'loan_id').
X

In [32]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

def preprocess_data(df):
    """One-hot encode the categorical columns and standard-scale the numeric columns of ``df``.

    Both transformers are fitted on ``df`` itself and returned, so the same
    fitted objects can be reused on other data later.

    Change relative to the previous version: numeric columns are selected with
    ``'number'`` instead of ``['int64', 'float64']``. The narrower selector
    silently left out numeric columns of any other width (int32, float32,
    unsigned ints, ...) from the output, and it disagreed with the ``'number'``
    selector used by ``impute_numeric_categorical``.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame. Columns that are neither object/category nor numeric
        (e.g. bool, datetime) do not appear in the result.

    Returns
    -------
    tuple
        ``(X, scaler, ohe)``: the processed frame (scaled numeric columns first,
        then the one-hot columns, indexed like ``df``), the ``StandardScaler``
        and the ``OneHotEncoder``. A transformer is fitted only if ``df`` has
        columns of the matching kind.
    """
    # Step 1: detect categorical and numerical columns.
    cat_cols = df.select_dtypes(include=['object', 'category']).columns
    # timedelta64 is a numpy number subtype; exclude it explicitly so only
    # columns that can be scaled as plain numbers are picked up.
    num_cols = df.select_dtypes(include='number', exclude='timedelta').columns

    scaler = StandardScaler()
    # handle_unknown='ignore': categories unseen at fit time encode as all zeros at transform time.
    ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

    # Step 2: one-hot encode the categorical columns.
    if len(cat_cols) > 0:
        encoded_cat_data = ohe.fit_transform(df[cat_cols])
        encoded_cat_df = pd.DataFrame(
            encoded_cat_data,
            columns=ohe.get_feature_names_out(cat_cols),
            index=df.index,
        )
    else:
        # Column-less placeholder that still carries the index, so the concat below lines up.
        encoded_cat_df = pd.DataFrame(index=df.index)

    # Step 3: standard-scale the numerical columns.
    if len(num_cols) > 0:
        scaled_num_data = scaler.fit_transform(df[num_cols])
        scaled_num_df = pd.DataFrame(scaled_num_data, columns=num_cols, index=df.index)
    else:
        scaled_num_df = pd.DataFrame(index=df.index)

    # Step 4: numeric block first, one-hot block second.
    X = pd.concat([scaled_num_df, encoded_cat_df], axis=1)

    return X, scaler, ohe

# Encode and scale the feature frame; preprocess_data also returns the fitted scaler and one-hot encoder.
# NOTE(review): X is rebound from itself, so re-running this cell feeds the already-processed frame back in.
X, scaler, ohe = preprocess_data(X)


In [None]:
# Display X as returned by preprocess_data.
X

In [34]:
# Pickle the scaler and the one-hot encoder returned by preprocess_data, one file each.
for filename, fitted in (('scaler.pkl', scaler), ('ohe.pkl', ohe)):
    with open(filename, 'wb') as f:
        pickle.dump(fitted, f)

In [35]:
# Pickle the processed feature frame and the target, one file each.
for filename, payload in (("X_train.pkl", X), ("y_train.pkl", y)):
    with open(filename, "wb") as f:
        pickle.dump(payload, f)