In [1]:
import pandas as pd
import numpy as np
import duckdb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Opt in to pandas' 'future.no_silent_downcasting' option for the whole notebook.
pd.set_option('future.no_silent_downcasting', True)

# Open a DuckDB connection on the "capstone.db" database file; `conn` is reused by the data-loading cell.
conn = duckdb.connect(database="capstone.db")

In [2]:
# Pull the complete churn table out of DuckDB as a pandas DataFrame.
df = conn.execute("SELECT * FROM churn_data").fetchdf()

# Re-encode the boolean flags as 1/0; anything that is not True/False maps to NaN.
flag_cols = ["auto_payment", "churn"]
df[flag_cols] = df[flag_cols].apply(lambda x: x.map({True: 1, False: 0}))

# Target is the churn flag; features are everything except the target and two unused columns.
y = df["churn"]
X = df.drop(columns=["churn", "apps", "service_type"])

# Rows flagged broadband == 1: missing call-related metrics are set to 0.
broadband_rows = X['broadband'] == 1
call_cols = ['roaming_usage', 'call_drops', 'avg_call_duration']
X.loc[broadband_rows, call_cols] = X.loc[broadband_rows, call_cols].fillna(0)

# Rows with broadband == 0 and postpaid == 0: a missing auto_payment flag is set to 0.
no_broadband_no_postpaid = (X['broadband'] == 0) & (X['postpaid'] == 0)
X.loc[no_broadband_no_postpaid, 'auto_payment'] = X.loc[no_broadband_no_postpaid, 'auto_payment'].fillna(0)

In [3]:
# Hold out 20% of the rows as the test split, stratified on the target, with a fixed seed
# so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=23,
)

In [12]:
def fill_missing_values(df_train, df_test, condition_column, columns, random_state=None):
    """Impute missing values with uniform random draws, group by group.

    For every flag column in ``condition_column`` the rows where that flag
    equals 1 form a group. For every column in ``columns``, the minimum and
    maximum observed in the *training* rows of the group define the range
    from which replacements are drawn (as float32) for the missing entries
    of that group in both the training and the test frame. Using only the
    training range keeps test-set information out of the imputation.

    Both frames are modified in place and also returned.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames to impute; both must contain every column named in
        ``condition_column`` and ``columns``.
    condition_column : iterable of str
        Names of the 0/1 flag columns that define the groups.
    columns : iterable of str
        Names of the numeric columns to impute.
    random_state : None, int, or numpy.random.Generator, optional
        ``None`` (default) draws from NumPy's global legacy RNG, so an
        earlier ``np.random.seed(...)`` call still controls the result.
        Any other value is passed to ``numpy.random.default_rng``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_train, df_test)`` — the same objects that were passed in.
    """
    # Both np.random (module) and Generator expose .uniform(low, high, size).
    rng = np.random if random_state is None else np.random.default_rng(random_state)

    for condition in condition_column:
        train_in_group = df_train[condition] == 1
        test_in_group = df_test[condition] == 1

        for col in columns:
            observed = df_train.loc[train_in_group, col]
            min_val, max_val = observed.min(), observed.max()

            # An empty group or an all-missing column yields NaN bounds, which
            # would make uniform() raise; there is no range to sample from, so skip.
            if pd.isna(min_val) or pd.isna(max_val):
                continue

            # Train is filled before test, matching the original draw order.
            for frame, in_group in ((df_train, train_in_group), (df_test, test_in_group)):
                # Boolean masks (rather than index labels) stay correct even
                # when the frame's index contains duplicate labels.
                missing = in_group & frame[col].isnull()
                n_missing = int(missing.sum())
                if n_missing == 0:
                    continue
                draws = rng.uniform(min_val, max_val, size=n_missing).astype(np.float32)
                frame.loc[missing, col] = draws

    return df_train, df_test

# Seed NumPy's global RNG so the random imputation below produces the same
# values on every Restart & Run All.
np.random.seed(23)
X_train, X_test = fill_missing_values(
    X_train,
    X_test,
    ['broadband', 'prepaid', 'postpaid'],
    ['avg_call_duration', 'data_usage', 'monthly_charge'],
)

# 'prepaid' was still needed above to select rows for imputation; drop it now
# (dummy variable trap). Reassign instead of mutating in place.
X_train = X_train.drop(columns=['prepaid'])
X_test = X_test.drop(columns=['prepaid'])

# Remaining gaps: auto_payment defaults to 0, tenure to the rounded mean of the
# training split. The mean is computed once, before any filling, and reused for
# the test split so no test-set statistics are involved.
tenure_fill = round(X_train['tenure'].mean())
remaining_fill_values = {'auto_payment': 0, 'tenure': tenure_fill}
X_train = X_train.fillna(remaining_fill_values)
X_test = X_test.fillna(remaining_fill_values)

In [14]:
# Re-attach the target column to each feature frame (column-wise, aligned on the index).
train_data = pd.concat((X_train, y_train), axis="columns")
test_data = pd.concat((X_test, y_test), axis="columns")

# Persist both splits as Parquet files via the pyarrow engine.
train_data.to_parquet(
    "train_data.parquet",
    engine='pyarrow',
)
test_data.to_parquet(
    "test_data.parquet",
    engine='pyarrow',
)