In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

In [None]:
#Read data from file
#The location can be overridden with the CS_DATA_PATH environment variable;
#when the variable is not set, the original local path is used unchanged.
import os

DATA_PATH = os.environ.get('CS_DATA_PATH', 'C:/Users/yousefi.k/Downloads/S08/cs_02.csv')
data = pd.read_csv(DATA_PATH)
#Preview the first rows
data.head()

In [None]:
#Dimensions of the freshly loaded data as (rows, columns)
data.shape

In [None]:
#Column names, dtypes and non-null counts of the freshly loaded data
data.info()

In [None]:
#Step 1: Determine the type of MVs
#Know the cause
#Start with the values pandas already treats as missing, counted per column
data.isnull().sum(axis=0)

In [None]:
#The number of '.' placeholders in each column
data.eq('.').sum(axis=0)

In [None]:
#Replace every '.' placeholder with NaN (the frame is modified in place)
data.mask(data == '.', np.nan, inplace=True)

In [None]:
#The number of MVs in each column, now that '.' has been replaced with NaN
data.isna().sum()

In [None]:
#Get info: column dtypes and non-null counts after '.' was replaced with NaN
data.info()

In [None]:
#Use the astype method to change the data type of a column
#The converted column is only displayed here; nothing is assigned back to data
data['customer_satisfaction_score'].astype(float)

In [None]:
#Change data type of numeric columns
#Every column from position 4 onwards is cast to float and written back
numeric_block = data.iloc[:, 4:].astype('float')
data[data.columns[4:]] = numeric_block

In [None]:
#Check the dtypes again after casting the columns from position 4 onwards to float
data.info()

In [None]:
#Step 2: Determine the extent of MVs
#Summary of MVs in each column: count (freq) and share of all rows in percent (pct)
null_counts = data.isnull().sum(axis=0)
mvs_summary = null_counts.to_frame(name='freq')
mvs_summary['pct'] = (mvs_summary['freq'] / len(data) * 100).round(1)
#Columns with the largest share of MVs first
mvs_summary.sort_values(by='pct', ascending=False)

In [None]:
data.loc[:, 'mvs'] = np.sum(data.isnull(), axis = 1)
data.sort_values(by = 'mvs', ascending = False).head(10)

In [None]:
#Decision: remove cases with more than 50% mvs
#NOTE(review): the row labels are hard-coded (presumably read off the sorted 'mvs' table) - confirm they still apply if the input data changes
rows_over_half_missing = [84, 65, 87, 85]
data.drop(index=rows_over_half_missing, inplace=True)

In [None]:
#Decision: remove customers with 0 longevity
data.drop(index = data.loc[data['customer_longevity'] == '0', :].index, inplace = True)

In [None]:
#Summary of MVs in each column, recomputed on the rows that remain
mvs_summary = pd.DataFrame({'freq': data.isna().sum()})
mvs_summary['pct'] = mvs_summary['freq'].div(data.shape[0]).mul(100).round(1)
mvs_summary.sort_values('pct', ascending=False)

In [None]:
#Step 3: Diagnose the randomness of the MVs processes
#Flag each row: 1 when customer_satisfaction_score is missing, 0 when it is present
score_is_missing = data['customer_satisfaction_score'].isnull()
data['if_null'] = np.where(score_is_missing, 1, 0)
#Inspect the last rows together with the new flag
data.tail()

In [None]:
#Evaluate the randomness of the MVs in customer_satisfaction_score from age perspective
#Mean age of the rows without (0) and with (1) a missing score
data['age'].groupby(data['if_null']).mean()

In [None]:
#Box plot for evaluating the randomness of the MVs in customer_satisfaction_score from age perspective
#One box per flag value: if_null == 0 first, then if_null == 1
age_by_flag = [data.loc[data['if_null'] == flag, 'age'] for flag in (0, 1)]
plt.boxplot(age_by_flag)
plt.xticks(ticks=[1, 2], labels=[0, 1])
plt.title('MVs in customer satisfaction score \n from age perspective')
plt.show()

In [None]:
#Evaluate the randomness of the MVs in customer_satisfaction_score from customer longevity perspective
#NOTE(review): this comment previously said "gender", but the code tabulates customer_longevity - confirm which variable was intended
#Cross tabulation analysis: share of present (0) and missing (1) scores within each customer_longevity level, rounded to 2 decimals
cross_tab_pct = round(pd.crosstab(data['customer_longevity'], data['if_null'], normalize = 'index'), 2)
cross_tab_pct

In [None]:
#Remove temporary variables: mvs and if_null (the frame is modified in place)
data.drop(['mvs', 'if_null'], axis=1, inplace=True)
data.head(5)

In [None]:
#Dimensions of the data after the row removals and after dropping the temporary columns
data.shape

In [None]:
#Step 4: Select the imputation method
#Method 1: complete case approach
#Keep only the rows without any MV; data itself is left untouched
data_complete_case = data.dropna()
print(data_complete_case.shape)
#Remaining MVs per column
data_complete_case.isnull().sum()

In [None]:
#Method 2: mean substitution
data_mean_sub = data.copy()
#Substitute NAs w/ the mean of each column (columns from position 4 onwards)
numeric_part = data_mean_sub.iloc[:, 4:]
data_mean_sub.iloc[:, 4:] = numeric_part.fillna(numeric_part.mean())
print(data_mean_sub.shape)
#Remaining MVs per column
data_mean_sub.isnull().sum()

In [None]:
#Data preparation for MICE
#Drop the first column, renumber the rows from 0 and
#convert the object columns (gender, customer_longevity) to category data type
data_mice_imputation = (
    data.iloc[:, 1:]
    .reset_index(drop=True)
    .astype({'gender': 'category', 'customer_longevity': 'category'})
)

In [None]:
#Method 3: MICE
#%pip install miceforest
from miceforest import ImputationKernel #It uses lightgbm as a backend

#NOTE(review): the imputed result below overwrites the kernel's input name, so
#re-running this cell on its own feeds already-imputed data to the kernel
mice_kernel = ImputationKernel(data=data_mice_imputation, random_state=123)
#Run the kernel on the data for 10 iterations
mice_kernel.mice(10)
#Create the imputed data
data_mice_imputation = mice_kernel.complete_data()
print(data_mice_imputation.shape)
#Remaining MVs per column
data_mice_imputation.isnull().sum()

In [None]:
#Step 5: correlation analysis
#Method 1: complete case approach
#Pairwise correlations of the columns from position 4 onwards, rounded to 2 decimals
corr_complete_case = data_complete_case.iloc[:, 4:].corr().round(2)
corr_complete_case

In [None]:
#Method 2: mean substitution
#Pairwise correlations of the columns from position 4 onwards, rounded to 2 decimals
corr_mean_sub = data_mean_sub.iloc[:, 4:].corr().round(2)
corr_mean_sub

In [None]:
#Method 3: MICE
#Pairwise correlations of the columns from position 3 onwards, rounded to 2 decimals
corr_mice_imputation = data_mice_imputation.iloc[:, 3:].corr().round(2)
corr_mice_imputation

In [None]:
#Final correlation table
#Element-wise average of the three correlation tables, rounded to 2 decimals
corr_table = (
    corr_complete_case
    .add(corr_mean_sub)
    .add(corr_mice_imputation)
    .div(3)
    .round(2)
)
corr_table

In [None]:
#Summary
#   The missing data process is MCAR
#   Imputation is the most logical course of action
#   Correlations differ slightly across imputation techniques

In [None]:
#Problem of Masking
import numpy as np
#x: five 2s, five 3s, five 4s and one extreme value (1000)
x = np.array([2] * 5 + [3] * 5 + [4] * 5 + [1000])
#y: small values followed by two extreme values (10000 and 100000)
y = np.array([2] * 2 + [3] * 3 + [4] * 3 + [10000, 100000])

In [None]:
#Classic method for outlier detection
#A point is flagged when |(value - mean) / sd| > 3
for sample in (x, y):
    z_scores = (sample - sample.mean()) / sample.std()
    print(np.abs(z_scores) > 3)

In [None]:
#Tukey's method
from scipy.stats import iqr

def tukey_outliers(values, k = 1.5):
    """Flag the points of `values` that fall outside Tukey's fences.

    A point is flagged when it is above q(0.75) + k * IQR or
    below q(0.25) - k * IQR.

    Parameters
    ----------
    values : array-like of numbers
    k : float, multiplier applied to the IQR (1.5 by default)

    Returns
    -------
    Boolean numpy array with one entry per point, True for flagged points.
    """
    values = np.asarray(values)
    spread = iqr(values)
    upper_fence = np.quantile(values, 0.75) + k * spread
    lower_fence = np.quantile(values, 0.25) - k * spread
    #Both fences are checked; the earlier version only tested the upper one
    return (values > upper_fence) | (values < lower_fence)

print(tukey_outliers(x))
print(tukey_outliers(y))