In [1]:
#import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import dask.dataframe as dd
from IPython.display import display, HTML
import pandas as pd
import cudf

In [2]:
from cuml.linear_model import LinearRegression, LogisticRegression
import cudf
import numpy as np
import gc
import cupy as cp
import joblib  # For saving the model

import cudf
from cuml.linear_model import LinearRegression, LogisticRegression
import joblib

def regression_impute_column(df, target_column, features, is_categorical=False, model_save_path=None):
    """
    Fill the missing values of one column with predictions from a regression model.

    The model is trained on the rows where ``target_column`` is present and
    then used to predict the rows where it is null. Missing values in the
    feature columns are replaced with 0 before fitting and predicting.

    Parameters:
    df (cudf.DataFrame): The input dataframe. It is modified in place and also returned.
    target_column (str): The name of the column to impute.
    features (list): List of column names to be used as features for the model.
        ``target_column`` is dropped from this list if present.
    is_categorical (bool): If True a LogisticRegression model is used,
        otherwise a LinearRegression model.
    model_save_path (str): Path to save the trained model with joblib. The
        parent directory is created if needed. Nothing is saved when falsy.

    Returns:
    The same dataframe object, with the nulls of ``target_column`` replaced by
    predictions (or untouched when the column has no nulls).

    Raises:
    ValueError: If imputation is needed but there are no feature columns, or
        no rows with an observed target value to train on.
    """
    import os  # local import so this definition does not rely on another cell

    print("Preparing data...")

    # Ensure the target column is not in the features list
    features = [col for col in features if col != target_column]

    # Compute the null mask once; it drives the early exit, the train/predict
    # split and the final write-back.
    missing_mask = df[target_column].isnull()
    n_missing = int(missing_mask.sum())

    if n_missing == 0:
        print("No imputation needed.")
        return df  # No imputation needed if no missing values

    # Fail early with a readable message instead of an opaque error from fit().
    if not features:
        raise ValueError(
            f"Cannot impute '{target_column}': no feature columns left after excluding the target."
        )
    if n_missing == len(df):
        raise ValueError(
            f"Cannot impute '{target_column}': every value is missing, so there is nothing to train on."
        )

    # Select only the specified features for training and imputation
    known_rows = df[~missing_mask]
    X_train = known_rows[features].fillna(0)
    y_train = known_rows[target_column]
    X_missing = df[missing_mask][features].fillna(0)

    # Choose the model based on the data type
    if is_categorical:
        model = LogisticRegression()
    else:
        model = LinearRegression()

    print("Start imputing...")
    # Fit the model
    model.fit(X_train, y_train)

    # Save the model if a path is provided
    if model_save_path:
        save_dir = os.path.dirname(model_save_path)
        if save_dir:
            # joblib.dump does not create missing directories itself.
            os.makedirs(save_dir, exist_ok=True)
        joblib.dump(model, model_save_path)
        print(f"Model saved to {model_save_path}")

    # Predict the missing values
    predicted_values = model.predict(X_missing)

    # Update the DataFrame with imputed values (rows are in the same order as
    # X_missing because both come from the same mask).
    df.loc[missing_mask, target_column] = predicted_values
    return df




In [3]:
def batch_impute_and_save(df, features, start_index=0, batch_size=9, base_filename='imputed_df'):
    """
    Impute columns one at a time and checkpoint the DataFrame to Parquet.

    Columns are processed in the order given by ``features``, starting at
    ``start_index``. Each column is imputed with ``regression_impute_column``
    using the full ``features`` list as predictors. Whenever the number of
    processed columns reaches a multiple of ``batch_size`` the DataFrame is
    written to ``impute_data/<base_filename>_<count>.parquet`` and the function
    returns, so the caller must invoke it again to continue. Once the last
    column has been processed, ``<base_filename>_final.parquet`` is written.

    Args:
    df (cudf.DataFrame): The DataFrame to impute.
    features (list): Columns to impute; also the feature set for each model.
    start_index (int): Column index to start imputation.
    batch_size (int): Number of columns to impute per batch.
    base_filename (str): Base name for saving intermediate Parquet files.

    Returns:
    int: Index of the next column to process, i.e. one past the last column
    imputed. Equals ``start_index`` when there was nothing left to process.
    """
    import os  # local import so this definition does not rely on another cell

    total_columns = len(features)
    # Defined up front so the return value exists even if the loop never runs.
    next_index = start_index

    for i, target in enumerate(features[start_index:], start=start_index):
        # Only this one column is treated as categorical.
        is_categorical = target == "merchant_profile_01"

        # Define the path to save the model
        model_save_path = f'impute_models/{target}_model.pkl'

        # Pass the model_save_path to the regression_impute_column function
        df = regression_impute_column(df, target, features, is_categorical=is_categorical, model_save_path=model_save_path)

        print(f'Successfully imputed values for {"categorical" if is_categorical else "numerical"} column: {target}')

        next_index = i + 1
        if next_index % batch_size == 0:
            # Save intermediate results
            save_path = f'impute_data/{base_filename}_{next_index}.parquet'
            os.makedirs(os.path.dirname(save_path), exist_ok=True)
            df.to_parquet(save_path)
            print(f'Saved intermediate results to {save_path}')
            if next_index < total_columns:
                return next_index  # Return index to indicate progress
            # The last column landed exactly on a batch boundary: fall through
            # so the final file is still written.
            break

    # NOTE(review): the final file goes to the working directory, while the
    # checkpoints go under impute_data/ -- confirm this is intended.
    save_path = f'{base_filename}_final.parquet'
    df.to_parquet(save_path)
    print(f'Final save to {save_path}')
    return next_index

def continue_imputation(latest_parquet, batch_size=9):
    """
    Load the latest Parquet checkpoint and continue the imputation process.

    The resume position is taken from the file name: the text after the last
    underscore (without the extension) must be the number of columns already
    processed, as in ``imputed_df_54.parquet``. A name ending in ``_final`` is
    treated as a finished run. Every column of the loaded file except
    'ind_recommended' and 'activation' is imputed.

    Args:
    latest_parquet (str): Path to the latest Parquet file.
    batch_size (int): Number of columns per batch, forwarded to
        ``batch_impute_and_save``.

    Returns:
    None

    Raises:
    ValueError: If the file name does not end in ``_<integer>`` or ``_final``.
    """
    import os  # local import so this definition does not rely on another cell

    # Parse the checkpoint index before loading anything, so a bad path fails
    # fast and a finished run does not pay for reading the file.
    stem = os.path.splitext(os.path.basename(latest_parquet))[0]
    suffix = stem.rsplit('_', 1)[-1]
    if suffix == 'final':
        print("Imputation already completed.")
        return
    try:
        last_index_processed = int(suffix)
    except ValueError:
        raise ValueError(
            f"Cannot determine the resume index from '{latest_parquet}': "
            "expected a file name ending in '_<number>.parquet'."
        ) from None

    df = cudf.read_parquet(latest_parquet)
    exclude_columns = ['ind_recommended', 'activation']
    features_to_use = [col for col in df.columns if col not in exclude_columns]

    if last_index_processed >= len(features_to_use):
        print("Imputation already completed.")
        return
    batch_impute_and_save(df, features_to_use, start_index=last_index_processed, batch_size=batch_size)

In [4]:
# First-run entry point, kept disabled for reference: load the raw frame and
# impute every column except the two excluded ones, starting from column 0.
# df = cudf.read_parquet('temp_df.parquet')
# exclude_columns = ['ind_recommended', 'activation']
# features_to_use = [col for col in df.columns if col not in exclude_columns]
# batch_impute_and_save(df, features_to_use, batch_size=6)

# Resume from the checkpoint whose file name records 54 processed columns.
continue_imputation(latest_parquet='impute_data/imputed_df_54.parquet')



Preparing data...


  return init_func(self, *args, **filtered_kwargs)


Start imputing...
Model saved to impute_models/merchant_profile_03_model.pkl
Successfully imputed values for numerical column: merchant_profile_03
Preparing data...
Start imputing...
Model saved to impute_models/customer_digital_activity_01_model.pkl
Successfully imputed values for numerical column: customer_digital_activity_01
Preparing data...
Start imputing...
Model saved to impute_models/merchant_spend_10_model.pkl
Successfully imputed values for numerical column: merchant_spend_10
Preparing data...
Start imputing...
Model saved to impute_models/customer_profile_03_model.pkl
Successfully imputed values for numerical column: customer_profile_03
Preparing data...
Start imputing...
Model saved to impute_models/customer_digital_activity_02_model.pkl
Successfully imputed values for numerical column: customer_digital_activity_02
Preparing data...
Start imputing...
Model saved to impute_models/customer_profile_04_model.pkl
Successfully imputed values for numerical column: customer_profile