In [1]:
import pandas as pd
import re
from sklearn.linear_model import LinearRegression
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

## IO

In [2]:
# Single seed shared by every stochastic step in this notebook.
random_state = 7
# The regression imputer further down draws its noise from NumPy's global RNG
# (np.random.normal), so seed that generator too; without this the imputed
# values, and everything derived from them, change on every run.
np.random.seed(random_state)

In [3]:
# Read the raw training extract and report its (rows, columns) shape.
df = pd.read_csv("./data/raw/cell2celltrain.csv")
print(df.shape)

(51047, 58)


## Column Cleaning

In [4]:
def camel_to_snake(name):
    """Convert a CamelCase identifier to lower-case snake_case.

    Two regex passes are applied. The first inserts an underscore in front of
    a capitalised word (an upper-case letter followed by lower-case letters)
    that has some character before it. The second separates a lower-case
    letter or digit from an upper-case letter that directly follows it. The
    result is then lower-cased.
    """
    word_boundary = '(.)([A-Z][a-z]+)'
    lower_to_upper = '([a-z0-9])([A-Z])'
    underscore_between = r'\1_\2'

    with_word_breaks = re.sub(word_boundary, underscore_between, name)
    return re.sub(lower_to_upper, underscore_between, with_word_breaks).lower()

In [5]:
# Normalise every column header to snake_case so later cells use one naming style.
df.columns = list(map(camel_to_snake, df.columns))

In [6]:
# Remove three columns in a single call; none of them is used after this point.
df = df.drop(columns=['not_new_cellphone_user', 'service_area', 'customer_id'])

In [7]:
## New Features
# Each engineered feature is the row-wise sum of its source columns; the source
# columns are dropped as soon as the aggregate has been added.
derived_features = {
    # TotalCareIssues = CustomerCareCalls + BlockedCalls + DroppedBlockedCalls
    'total_care_issues': ['customer_care_calls', 'blocked_calls', 'dropped_blocked_calls'],
    # OverallRevenue = MonthlyRevenue + TotalRecurringCharge
    'overall_revenue': ['monthly_revenue', 'total_recurring_charge'],
    # OverallUsage = MonthlyMinutes + OverageMinutes + RoamingCalls
    'overall_usage': ['monthly_minutes', 'overage_minutes', 'roaming_calls'],
    # ValueAddedServiceUsage = DirectorAssistedCalls + ThreewayCalls + CallForwardingCalls + CallWaitingCalls
    'value_added_service_usage': ['director_assisted_calls', 'threeway_calls', 'call_forwarding_calls', 'call_waiting_calls'],
    # TotalCalls = InboundCalls + OutboundCalls
    'total_calls': ['inbound_calls', 'outbound_calls'],
    # TotalPeakOffPeakCalls = PeakCallsInOut + OffPeakCallsInOut
    'total_peak_off_peak_calls': ['peak_calls_in_out', 'off_peak_calls_in_out'],
}

for feature_name, source_columns in derived_features.items():
    feature_total = df[source_columns[0]]
    for source_column in source_columns[1:]:
        # Plain `+` propagates NaN: a missing component gives a missing total.
        feature_total = feature_total + df[source_column]
    df[feature_name] = feature_total
    df = df.drop(source_columns, axis=1)


## Make Data Categories

In [8]:
def get_data_categories(df, verbose=False):
    """Group the columns of ``df`` by dtype.

    Returns a 5-tuple, in this order:
        quantitative_columns: Index of the float64 columns.
        quantitative_discrete_columns: list of int64 columns, id columns excluded.
        categorical_columns: list of object columns whose ``nunique()`` is not 2.
        binary_columns: list of object columns whose ``nunique()`` is exactly 2.
        id_columns: the hard-coded list ``['customer_id']``.

    When ``verbose`` is true, every group and its size is printed.
    """
    id_columns = ['customer_id']

    quantitative_columns = df.select_dtypes(include=['float64']).columns
    quantitative_discrete_columns = [
        name
        for name in df.select_dtypes(include=['int64']).columns
        if name not in id_columns
    ]

    # Object columns are binary when they hold exactly two distinct values,
    # otherwise they are treated as general categoricals.
    binary_columns = []
    categorical_columns = []
    for name in df.select_dtypes(include=['object']).columns:
        if df[name].nunique() == 2:
            binary_columns.append(name)
        else:
            categorical_columns.append(name)

    if verbose:
        report = (
            ("\nQuantitative Variables:", "Total Quantitative-Continuous Variables:", quantitative_columns),
            ("\nQuantitative Discrete Variables:", "Total Quantitative-Discrete Variables:", quantitative_discrete_columns),
            ("\nCategorical Variables:", "Total Categorical Variables:", categorical_columns),
            ("\nBinary Variables:", "Total Binary Variables:", binary_columns),
        )
        for listing_label, count_label, names in report:
            print(listing_label, list(names))
            print(count_label, len(names))

    return quantitative_columns, quantitative_discrete_columns, categorical_columns, binary_columns, id_columns

# Classify the columns once; later cells reuse these groupings.
(
    quantitative_columns,
    quantitative_discrete_columns,
    categorical_columns,
    binary_columns,
    id_columns,
) = get_data_categories(df)

## Binary Variable Cleaning

In [9]:
# Encode every two-valued text column as 0/1. `homeownership` is labelled
# Known/Unknown; every other binary column is mapped from Yes/No.
for col in binary_columns:
    value_codes = {"Known": 1, "Unknown": 0} if col == "homeownership" else {"Yes": 1, "No": 0}
    df[col] = df[col].map(value_codes)

## Quantitative Variables Imputation

- To prevent data leakage, the data is split into train and test sets before imputation is applied.

In [10]:
# Hold out 20% of the rows before any imputation model is fitted.
train, test = train_test_split(
    df,
    train_size=0.80,
    random_state=random_state,
)

# X_train = train.drop('churn', axis=1).reset_index(drop=True)
# y_train = train['churn'].reset_index(drop=True)

# X_test = test.drop('churn', axis=1).reset_index(drop=True)
# y_test = test['churn'].reset_index(drop=True)

In [11]:
def random_regression_imputer(df, column):
    """Fill the missing values of ``column`` with noisy linear-regression predictions.

    A ``LinearRegression`` is fitted on the rows where ``column`` is present,
    using every other column of ``df`` as a predictor (NaNs in the predictors
    are replaced with 0). Each missing entry receives the model prediction
    plus Gaussian noise whose scale is the standard deviation of the observed
    values; the result is clipped below at 0, element by element.

    Parameters:
        df (pd.DataFrame): Frame to impute; it is modified in place. Expected
            to hold only numeric columns, since all of them are fed to the
            regressor.
        column (str): Name of the column whose gaps are filled.

    Returns:
        pd.Series: ``df[column]`` after imputation.
    """
    missing_mask = df[column].isna()
    if not missing_mask.any():
        # Nothing to impute; this also avoids calling predict() on zero rows.
        return df[column]

    # Every other column acts as a predictor.
    features = df.columns.difference([column])

    # Training rows are those where the target is observed; NaN predictors become 0.
    X_known = df.loc[~missing_mask, features].fillna(0)
    y_known = df.loc[~missing_mask, column]

    # Train a Linear Regression model
    model = LinearRegression()
    model.fit(X_known, y_known)

    # Predict the missing entries and add noise so they are not all on the regression plane.
    X_missing = df.loc[missing_mask, features].fillna(0)
    predicted_values = model.predict(X_missing)
    random_noise = np.random.normal(
        loc=0, scale=y_known.std(), size=predicted_values.shape
    )

    # Clip each value at 0. np.maximum works element-wise; np.max(values, 0)
    # would reduce along axis 0 and assign one scalar to every missing row.
    df.loc[missing_mask, column] = np.maximum(predicted_values + random_noise, 0)
    return df[column]

In [12]:
# Impute train
# Work on a copy restricted to the float columns. The imputer fills that copy in
# place, so columns imputed earlier serve as predictors for later ones.
quant_df = train[quantitative_columns].copy()
for col in quant_df.columns:
    if not train[col].isna().any():
        continue
    train[col] = random_regression_imputer(quant_df, col)

In [13]:
# Impute test
# The test split is imputed from its own rows only, through a separate copy of
# its float columns.
quant_df = test[quantitative_columns].copy()
for col in quant_df.columns:
    has_gaps = test[col].isna().any()
    if has_gaps:
        test[col] = random_regression_imputer(quant_df, col)

## Trim Outliers

In [14]:
def trim_outliers_iqr_multiple_columns(df: pd.DataFrame, columns: list, threshold: float, verbose: bool = False) -> pd.DataFrame:
    """
    Trim outliers from multiple DataFrame columns using the IQR method.

    Columns are processed one after another, and each column's quartiles are
    computed on the rows that survived the previous columns. A row is kept
    only if its value lies within
    [Q1 - threshold * IQR, Q3 + threshold * IQR], bounds included. Rows whose
    value is NaN fail that comparison and are therefore dropped as well.

    Parameters:
        df (pd.DataFrame): The DataFrame to process. It is not modified.
        columns (list): A list (or other iterable) of column names from which
            to trim outliers.
        threshold (float): Multiple of the IQR that sets the width of the
            fence on either side of the quartiles.
        verbose (bool): If True, print the column name and the frame's shape
            before and after each column is filtered.

    Returns:
        pd.DataFrame: DataFrame with outliers removed and a fresh 0..n-1 index.
    """
    for column in columns:
        if verbose:
            print(f"Processing column: {column}")

        Q1, Q3 = df[column].quantile([0.25, 0.75])
        IQR = Q3 - Q1
        lower_bound = Q1 - threshold * IQR
        upper_bound = Q3 + threshold * IQR

        if verbose:
            print("Dimensions before:", df.shape)

        # between() is inclusive on both ends and is False for NaN values.
        df = df[df[column].between(lower_bound, upper_bound)]

        if verbose:
            print("Dimensions after:", df.shape)

    return df.reset_index(drop=True)

In [15]:
# Trim the training split with a 3x IQR fence computed from its own quartiles.
train = trim_outliers_iqr_multiple_columns(train, quantitative_columns, threshold=3)
# Trim the test split with a 3x IQR fence computed from its own quartiles.
test = trim_outliers_iqr_multiple_columns(test, quantitative_columns, threshold=3)

## Categorical Cleaning

In [16]:
def categorical_to_numeric(df, verbose=False):
    """Turn ``handset_price`` and ``credit_rating`` into numeric columns.

    ``handset_price``: the string "Unknown" is treated as missing, the column
    is cast to float, and the gaps are filled with the column mean.
    ``credit_rating``: the text before the first '-' is kept, stripped of
    surrounding whitespace, and cast to int.

    ``df`` is modified in place and also returned. With ``verbose`` set, the
    NaN count of each column listed in the notebook-level
    ``categorical_columns`` is printed.
    """
    price = df['handset_price'].replace("Unknown", np.nan).astype(float)
    df['handset_price'] = price
    df['handset_price'] = df['handset_price'].fillna(df['handset_price'].mean())

    rating_prefix = df['credit_rating'].str.split('-').str[0].str.strip()
    df['credit_rating'] = rating_prefix
    df['credit_rating'] = df['credit_rating'].astype(int)

    if verbose:
        print (df[categorical_columns].isna().sum())
    return df

In [17]:
# Apply the same conversion to both splits. Each call fills missing handset
# prices with the mean of the frame it is given, so train and test use their
# own means.
train = categorical_to_numeric(train)
test = categorical_to_numeric(test)

In [18]:
def ohe_encoding(df, columns):
    """One-hot encode ``columns`` of ``df`` and return the widened frame.

    A new ``OneHotEncoder`` (first category of each column dropped) is fitted
    on ``df[columns]``. The original columns are removed and the resulting
    indicator columns are appended on the right.

    Parameters:
        df (pd.DataFrame): Frame containing ``columns``. It is not modified.
        columns (list): Names of the categorical columns to encode.

    Returns:
        pd.DataFrame: ``df`` without ``columns``, plus the indicator columns.
    """
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore', drop='first')
    encoder.fit(df[columns])

    encoded = encoder.transform(df[columns])
    # Reuse df's index: concat(axis=1) aligns on index labels, so a default
    # RangeIndex here would misalign rows (and introduce NaNs) whenever df's
    # index is not already 0..n-1.
    encoded_df = pd.DataFrame(
        encoded,
        columns=encoder.get_feature_names_out(),
        index=df.index,
    )

    return pd.concat([df.drop(columns, axis=1), encoded_df], axis=1)

In [19]:
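# The pipeline diverges here: one branch one-hot encodes the categorical columns (`*_ohe`), the other keeps them as pandas `category` dtype.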

In [20]:
# Nominal columns that are one-hot encoded in one branch and cast to `category` in the other.
ohe_vars = ["prizm_code", "occupation", "marital_status"]

In [21]:
# One-hot encoded branch.
# NOTE(review): ohe_encoding fits a new encoder on every call, so train and test
# are encoded independently. If the two splits do not contain the same set of
# categories, their indicator columns will differ — consider fitting on train
# only and reusing that encoder for test.
train_ohe = ohe_encoding(train, ohe_vars).copy()
test_ohe = ohe_encoding(test, ohe_vars).copy()

In [22]:
# Categorical branch: keep the same columns, but as pandas `category` dtype
# instead of one-hot indicators.
for ohe_var in ohe_vars:
    for split in (train, test):
        split[ohe_var] = split[ohe_var].astype('category')

## Zero Inflated Variables

In [23]:
# Get zero inflated features
def get_zero_inflated_features(df, threshold=0.5):
    """Return the columns of ``df`` whose share of zeros exceeds ``threshold``.

    The share is the count reported for the value 0 by ``value_counts()``
    divided by the number of rows in ``df``. Columns for which no count is
    reported for 0 are skipped. Column order is preserved.
    """
    n_rows = len(df)
    zero_inflated = []
    for col in df.columns:
        zero_count = df[col].value_counts().get(0)
        if zero_count is not None and zero_count / n_rows > threshold:
            zero_inflated.append(col)
    return zero_inflated

def make_zero_inflated_indicators(df):
    """Add a 0/1 ``<feature>_is_zero`` column for each zero-heavy numeric feature.

    Candidates are the notebook-level ``quantitative_discrete_columns`` and
    ``quantitative_columns``. A candidate qualifies when
    ``get_zero_inflated_features`` reports it at ``threshold=0.1``.
    ``df`` is modified in place and also returned.
    """
    candidate_features = [*quantitative_discrete_columns, *quantitative_columns]
    flagged_features = get_zero_inflated_features(df[candidate_features], threshold=0.1)

    for feature in flagged_features:
        is_zero = df[feature].eq(0)
        df[f'{feature}_is_zero'] = is_zero.astype(int)
    return df

# Recombine the processed splits so the zero indicators are derived from all rows at once.
# NOTE(review): the combined frame is split again by a later train_test_split
# call. Rows were trimmed in between, so that second split need not reproduce
# the partition used for imputation — confirm this does not reintroduce leakage.
df = pd.concat([train, test], axis=0).reset_index(drop=True)
df = make_zero_inflated_indicators(df)

In [24]:
# Same recombination and zero indicators for the one-hot encoded branch.
df_ohe = make_zero_inflated_indicators(
    pd.concat([train_ohe, test_ohe], axis=0).reset_index(drop=True)
)

## Test Train Split

### Categorical

In [25]:
# Final 80/20 split of the recombined, fully processed categorical-branch frame.
train, test = train_test_split(df, train_size = 0.80, random_state = random_state)

In [26]:
# Separate the `churn` target from the features; each piece gets a fresh 0..n-1 index.
X_train = train.drop(columns='churn').reset_index(drop=True)
y_train = train['churn'].reset_index(drop=True)

X_test = test.drop(columns='churn').reset_index(drop=True)
y_test = test['churn'].reset_index(drop=True)

In [27]:
# No missing values may remain in the feature matrices.
assert not X_train.isna().any().any()
assert not X_test.isna().any().any()

In [28]:
# The target must be fully populated as well.
assert not y_train.isna().any()
assert not y_test.isna().any()

In [29]:
# Persist the categorical-branch splits (row index omitted from the files).
for frame, path in (
    (X_train, "./data/prod/X_train_cat.csv"),
    (y_train, "./data/prod/y_train_cat.csv"),
    (X_test, "./data/prod/X_test_cat.csv"),
    (y_test, "./data/prod/y_test_cat.csv"),
):
    frame.to_csv(path, index=False)

### OHE

In [30]:
# Final 80/20 split of the one-hot encoded frame, using the shared seed.
train_ohe, test_ohe = train_test_split(
    df_ohe,
    train_size=0.80,
    random_state=random_state,
)

In [31]:
# Separate the `churn` target from the one-hot encoded features.
# The feature frames are re-indexed as well, so X and y share the same 0..n-1
# index (as in the categorical branch) rather than X keeping the shuffled
# labels left by train_test_split.
X_train = train_ohe.drop("churn", axis=1).reset_index(drop=True)
y_train = train_ohe["churn"].reset_index(drop=True)

X_test = test_ohe.drop("churn", axis=1).reset_index(drop=True)
y_test = test_ohe["churn"].reset_index(drop=True)

In [32]:
# The encoded feature matrices must not contain missing values.
assert not X_train.isna().any().any()
assert not X_test.isna().any().any()

In [33]:
# Neither may the targets.
assert not y_train.isna().any()
assert not y_test.isna().any()

In [34]:
# Persist the one-hot encoded splits (row index omitted from the files).
for frame, path in (
    (X_train, "./data/prod/X_train_ohe.csv"),
    (y_train, "./data/prod/y_train_ohe.csv"),
    (X_test, "./data/prod/X_test_ohe.csv"),
    (y_test, "./data/prod/y_test_ohe.csv"),
):
    frame.to_csv(path, index=False)