In [1]:
import pandas as pd

In [2]:
# Load the three pre-processed training frames.
# NOTE: read_pickle deserializes with pickle, so only load files from a trusted source.
train_1, train_2_1, train_2_2 = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        'train_1_proc.bin',
        'train_2_1_proc.bin',
        'train_2_2_proc.bin',
    )
)

In [None]:
def drop_high_nan_columns(df, threshold=0.8):
    """Drop columns whose fraction of NaN values exceeds ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified.
    threshold : float, default 0.8
        Maximum tolerated fraction of missing values per column. A column is
        dropped only when strictly more than this share of its rows is NaN.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the columns whose NaN fraction is
        ``<= threshold``, in their original order.
    """
    if len(df) == 0:
        # With no rows there is no NaN fraction to measure; keep every column.
        return df.copy()

    # ``dropna(thresh=...)`` counts required NON-NaN values, so passing
    # ``len(df) * threshold`` would drop columns with more than
    # ``1 - threshold`` NaNs. Measure the NaN share directly instead.
    nan_fraction = df.isna().mean(axis=0)
    return df.loc[:, nan_fraction <= threshold]

# Filter the high-NaN columns out of every training frame, one frame at a time.
train_1 = train_1.pipe(drop_high_nan_columns)
train_2_1 = train_2_1.pipe(drop_high_nan_columns)
train_2_2 = train_2_2.pipe(drop_high_nan_columns)

# Quick check of how many columns survived for train_2_1.
print(train_2_1.shape)

In [None]:
def drop_all_zero_columns(df):
    """Return ``df`` restricted to columns that contain at least one value != 0.

    The comparison is element-wise ``!= 0``, so NaN and non-numeric entries
    count as "not zero" and keep their column.
    """
    has_nonzero_entry = df.ne(0).any(axis=0)
    return df.loc[:, has_nonzero_entry]

# Remove the all-zero columns from every training frame, one frame at a time.
train_1 = train_1.pipe(drop_all_zero_columns)
train_2_1 = train_2_1.pipe(drop_all_zero_columns)
train_2_2 = train_2_2.pipe(drop_all_zero_columns)

# Quick check of how many columns survived for train_2_1.
print(train_2_1.shape)

In [5]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer  # Needed to enable IterativeImputer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
import pickle

def impute_numeric_categorical(data, max_iter=10, random_state=0):
    """Fill missing values in the numeric and categorical columns of ``data``.

    Numeric columns (``select_dtypes(include=['number'])``) are imputed with
    the column mean. Columns of ``category`` dtype are ordinal-encoded,
    imputed with the most frequent code, decoded back to their labels and
    restored to their original categorical dtype. Columns of any other dtype
    (e.g. ``object``) are returned unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to impute. It is not modified; a copy is imputed and returned.
    max_iter : int, default 10
        Currently unused. Kept so existing callers keep working; the numeric
        imputer is a ``SimpleImputer``, not an ``IterativeImputer``.
    random_state : int, default 0
        Currently unused, for the same reason as ``max_iter``.

    Returns
    -------
    tuple
        ``(imputed, imputers)`` where ``imputed`` is the imputed copy of
        ``data`` and ``imputers`` is a dict with the keys ``'numerical'``,
        ``'categorical'`` and ``'encoder'`` holding the transformer objects
        (fitted only if the matching column group was non-empty).
    """
    # Work on a copy: assigning into the caller's frame would silently change
    # it and can raise SettingWithCopyWarning when that frame is a slice.
    result = data.copy()

    # Step 1: pick the column groups that get imputed.
    num_cols = result.select_dtypes(include=['number']).columns
    cat_cols = result.select_dtypes(include=['category']).columns

    imputer_num = SimpleImputer(strategy='mean')
    encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    imputer_cat = SimpleImputer(strategy='most_frequent')

    # Step 2: mean-impute the numeric columns.
    if len(num_cols) > 0:
        result[num_cols] = imputer_num.fit_transform(result[num_cols])

    # Step 3: most-frequent imputation of the categorical columns.
    if len(cat_cols) > 0:
        # Remember each column's categorical dtype before it is overwritten.
        cat_dtypes = {col: result[col].dtype for col in cat_cols}

        # Encode to numeric codes so SimpleImputer can operate on them.
        encoded_cat_data = encoder.fit_transform(result[cat_cols])
        imputed_codes = imputer_cat.fit_transform(encoded_cat_data)

        # Decode back to labels. inverse_transform yields a plain array, so
        # rebuild every column with its original categories to keep the
        # category dtype instead of degrading to object.
        decoded = encoder.inverse_transform(imputed_codes)
        for position, col in enumerate(cat_cols):
            result[col] = pd.Categorical(decoded[:, position], dtype=cat_dtypes[col])

    imputers = {'numerical': imputer_num, 'categorical': imputer_cat, 'encoder': encoder}
    return result, imputers


# Impute the numerical and categorical columns of each training frame and
# persist every imputed frame right after it is produced.
imputed_data, imputers_train_1 = impute_numeric_categorical(train_1, max_iter=8)
imputed_data.to_pickle('train_1_imputed.bin')

imputed_data, imputers_train_2_1 = impute_numeric_categorical(train_2_1, max_iter=8)
imputed_data.to_pickle('train_2_1_imputed.bin')

imputed_data, imputers_train_2_2 = impute_numeric_categorical(train_2_2, max_iter=8)
imputed_data.to_pickle('train_2_2_imputed.bin')

# Gather the transformer objects per dataset (same key order as before) and
# store them together in a single pickle file.
imputers = dict(
    train_1=imputers_train_1,
    train_2_1=imputers_train_2_1,
    train_2_2=imputers_train_2_2,
)
with open('imputers.pkl', 'wb') as f:
    pickle.dump(imputers, f)

In [None]:
# Display the most recently imputed frame. When the cells are run in order,
# imputed_data was last bound to the result for train_2_2.
imputed_data