In [91]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn as sk
import seaborn as sns
from scipy.stats import zscore
import math
from sklearn.impute import KNNImputer



In [92]:
data = pd.read_csv('customer.csv')

# The Unnamed: 0 column resulted from saving the csv file
data.drop('Unnamed: 0', axis=1, inplace=True) 
 
# Replace the null values in the 'is_employed' column with 'False' (empty means no employment)
data.fillna({'is_employed': 'False'}, inplace=True)

data['recent_move_b'] = data['recent_move_b'].apply(lambda x: True if x == 'T' else False)

data['is_employed'] = data['is_employed'].apply(lambda x: True if x == 'True' else False)

data.drop('custid', axis=1, inplace=True)

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72458 entries, 0 to 72457
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   sex             72458 non-null  object 
 1   is_employed     72458 non-null  bool   
 2   income          72458 non-null  float64
 3   marital_status  72458 non-null  object 
 4   health_ins      72458 non-null  bool   
 5   housing_type    70772 non-null  object 
 6   num_vehicles    70772 non-null  float64
 7   age             72458 non-null  int64  
 8   state_of_res    72458 non-null  object 
 9   code_column     72458 non-null  int64  
 10  gas_usage       70772 non-null  float64
 11  rooms           72458 non-null  int64  
 12  recent_move_b   72458 non-null  bool   
dtypes: bool(3), float64(3), int64(3), object(4)
memory usage: 5.7+ MB


In [93]:
# In the dictionary, we are indicated that the values 1, 2, 3 indicate special cases which we decided to separate
data['gas_payment'] = data['gas_usage'].replace([x for x in data['gas_usage'] if x not in [1, 2, 3]], 0)
data['gas_bill'] = data['gas_usage'].replace([x for x in data['gas_usage'] if x in [1, 2, 3]], 0)

data.drop('gas_usage', axis=1, inplace=True)

In [94]:
# Print the columns with missing values
print(f' --- PRE CLEANING --- \nColumns with missing values: \n{data.isna().sum()}')
old_len = len(data)
print('Length of the dataframe: ', old_len)   

# Drop rows with missing values except age column
data.dropna(inplace=True)


print(f'\n --- POST CLEANING --- \nLength of the dataframe: {len(data)}')
print('Number of rows dropped: ', old_len - len(data))

 --- PRE CLEANING --- 
Columns with missing values: 
sex                  0
is_employed          0
income               0
marital_status       0
health_ins           0
housing_type      1686
num_vehicles      1686
age                  0
state_of_res         0
code_column          0
rooms                0
recent_move_b        0
gas_payment          0
gas_bill          1686
dtype: int64
Length of the dataframe:  72458

 --- POST CLEANING --- 
Length of the dataframe: 70772
Number of rows dropped:  1686


In [95]:
data['age'] = data['age'].replace(0, np.nan)

In [96]:
scaling_cols = ['income', 'gas_payment']

scaling_df = data[scaling_cols]


# Dataset with log scaling
data_log = data.copy()
data_log[scaling_cols] = np.log1p(scaling_df)

# Dataset with zscore scaling
data_zs = data.copy()
data_zs[scaling_cols] = scaling_df.apply(zscore)

data_mm = data.copy()

dfs = [data_mm, data_log, data_zs]

for i, df in enumerate(dfs):
    numerical_cols = df.select_dtypes(include=[np.number])
    numerical_cols.drop('age', axis=1, inplace=True)
    cols = (numerical_cols-numerical_cols.min())/(numerical_cols.max()-numerical_cols.min())
    df[numerical_cols.columns] = cols
    dfs[i] = df


dfs = [data] + dfs

  result = func(self.values, **kwargs)


In [97]:
from sklearn.preprocessing import OneHotEncoder

encoders = []

for i, df in enumerate(dfs):
    categorical_cols = df.select_dtypes(include=[object])
    if categorical_cols.empty:
        continue

    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    encoded_array = encoder.fit_transform(categorical_cols)
    encoded_df = pd.DataFrame(
        encoded_array,
        columns=encoder.get_feature_names_out(categorical_cols.columns),
        index=df.index
    )
    
    df.drop(columns=categorical_cols.columns, inplace=True)
    df = pd.concat([df, encoded_df], axis=1)
    encoders.append(encoder)
    dfs[i] = df




In [98]:
dfs[3].head()

Unnamed: 0,is_employed,income,health_ins,num_vehicles,age,code_column,rooms,recent_move_b,gas_payment,gas_bill,...,state_of_res_South Dakota,state_of_res_Tennessee,state_of_res_Texas,state_of_res_Utah,state_of_res_Vermont,state_of_res_Virginia,state_of_res_Washington,state_of_res_West Virginia,state_of_res_Wisconsin,state_of_res_Wyoming
0,False,0.022866,True,0.0,24.0,0.103726,0.4,False,0.0,0.368421,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,False,0.023815,True,0.0,82.0,0.103726,1.0,True,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,False,0.022075,True,0.333333,31.0,0.103726,0.4,False,0.0,0.070175,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,False,0.035343,True,0.166667,93.0,0.103726,0.2,False,0.0,0.210526,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,False,0.036316,True,0.333333,67.0,0.103726,0.2,False,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [99]:
dfs[2].head()

Unnamed: 0,is_employed,income,health_ins,num_vehicles,age,code_column,rooms,recent_move_b,gas_payment,gas_bill,...,state_of_res_South Dakota,state_of_res_Tennessee,state_of_res_Texas,state_of_res_Utah,state_of_res_Vermont,state_of_res_Virginia,state_of_res_Washington,state_of_res_West Virginia,state_of_res_Wisconsin,state_of_res_Wyoming
0,False,0.711953,True,0.0,24.0,0.103726,0.4,False,0.0,0.368421,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,False,0.715735,True,0.0,82.0,0.103726,1.0,True,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,False,0.708641,True,0.333333,31.0,0.103726,0.4,False,0.0,0.070175,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,False,0.750436,True,0.166667,93.0,0.103726,0.2,False,0.0,0.210526,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,False,0.752717,True,0.333333,67.0,0.103726,0.2,False,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [100]:
for i, df in enumerate(dfs):
    imputer = KNNImputer(n_neighbors=5)
    imputed_data = imputer.fit_transform(df)
    age = np.round(imputed_data[:, 4]) # Rounds to the nearest integer
    df['age'] = (age - age.min())/(age.max()-age.min()) # Min-Max scaling
    dfs[i] = df

## Algorithms