In [None]:
import random
from sklearn.metrics import mean_squared_error, accuracy_score, f1_score

random.seed(25)  # fixed seed so the same cells are removed on every run

# Column positions of the independent variables. The response variable is
# identified by name ('Status'), as everywhere else in this notebook, rather than
# by assuming it is the last column.
feature_positions = [pos for pos, name in enumerate(raw_training_data.columns) if name != 'Status']

shape = raw_training_data.shape  # (number of rows, number of columns) of the raw training data
num_entries = shape[0] * len(feature_positions)  # number of entries across all independent variables
num_null = int(num_entries * 0.01)  # make 1% of the data be null entries

# Boolean grid marking the cells to blank out. random.sample draws WITHOUT
# replacement, so exactly num_null distinct cells are selected (independent
# random draws could hit the same cell twice and remove fewer than 1%).
removed_mask = np.zeros(shape, dtype=bool)
for flat_index in random.sample(range(num_entries), num_null):
    row_pos, feature_offset = divmod(flat_index, len(feature_positions))
    removed_mask[row_pos, feature_positions[feature_offset]] = True

# A cell that was already missing in the raw data has no true value to compare an
# imputation against, so it is not counted as a removed entry.
removed_mask &= raw_training_data.notna().to_numpy()

# Copy of the raw training data with the selected cells replaced by NaN; the raw
# frame itself is left untouched. mask() handles the dtype upcast (e.g. int ->
# float) itself, so no per-cell NaN assignment into integer columns is needed.
copy_raw_training_data = raw_training_data.mask(removed_mask)

# list of all removed entries as positional (row, column) pairs, in row-major order
null_entries = [(int(row_index), int(col_index)) for row_index, col_index in np.argwhere(removed_mask)]

# Collect the original (pre-removal) values found at every removed position,
# grouped by column name: {column name: [actual values, in null_entries order]}
actual_entries_store = {}
for row_pos, col_pos in null_entries:
    removed_column = copy_raw_training_data.columns[col_pos]  # name of the column holding this removed entry
    true_value = raw_training_data.iloc[row_pos, col_pos]  # value the raw data has at that position
    # setdefault creates the column's list the first time the column is seen
    actual_entries_store.setdefault(removed_column, []).append(true_value)
    
# Split the independent variables (everything except the response 'Status') into
# categorical columns (object dtype) and numerical columns (all remaining ones).
object_columns = copy_raw_training_data.select_dtypes(include=['object']).columns
categorical_cols = [name for name in object_columns if name != 'Status']
num_cols = [
    name
    for name in copy_raw_training_data.columns.drop(['Status'])
    if name not in categorical_cols
]

In [None]:
# Imputation 1: impute missing values with median for numerical variables and mode for categorical variables

# Positional row numbers of the removed entries, grouped by column name in a single
# pass (keeps the order of null_entries, so it lines up with actual_entries_store).
removed_rows_by_col = {}
for row, col in null_entries:
    removed_rows_by_col.setdefault(copy_raw_training_data.columns[col], []).append(row)

imputed1_entries_store = {}  # {column name: [imputed values at the removed positions]}
for col_name in copy_raw_training_data.columns.drop(['Status']):  # every column except the response
    col_data = copy_raw_training_data[col_name]
    # Any numeric dtype (not only float64/int64) is imputed with the median;
    # booleans and everything else are imputed with the most frequent value.
    is_numeric = (pd.api.types.is_numeric_dtype(col_data.dtype)
                  and not pd.api.types.is_bool_dtype(col_data.dtype))
    if is_numeric:
        fill_value = col_data.median()
    else:
        fill_value = col_data.mode().iloc[0]
    # Fill into a new Series and assign it back explicitly: an in-place fillna on a
    # column pulled out of the frame is chained assignment and is not guaranteed to
    # update the frame itself.
    col_data = col_data.fillna(fill_value)
    copy_raw_training_data[col_name] = col_data
    # null_entries holds POSITIONS, so they must be read with .iloc; .loc would treat
    # them as index labels and return wrong rows on any non-default index.
    imputed1_entries_store[col_name] = [col_data.iloc[row] for row in removed_rows_by_col.get(col_name, [])]
#---------------------------------------------------------------------------------------------------------------------------
# test performance of imputation method 1
def _scorable_pairs(actual, imputed):
    """Return (actual, imputed) as two lists limited to positions whose actual value is known.

    Positions where the actual value is null cannot be scored and are dropped from
    both lists. Raises ValueError when the two inputs differ in length.
    """
    if len(actual) != len(imputed):
        raise ValueError(
            f"actual and imputed values differ in length: {len(actual)} vs {len(imputed)}"
        )
    kept = [(act, imp) for act, imp in zip(actual, imputed) if not pd.isna(act)]
    return [act for act, _ in kept], [imp for _, imp in kept]


# for numerical variables, measure performance using RMSE.
numerical_data = []
for col_name, actual_vals in actual_entries_store.items():  # loop over all variables and actual values
    if col_name not in num_cols:  # only numerical columns are scored with RMSE
        continue
    actual_vals, imputed_vals = _scorable_pairs(actual_vals, imputed1_entries_store.get(col_name, []))
    if not actual_vals:  # nothing left to score for this column
        continue
    # change imputed and actual values to be numeric
    actual_vals = pd.to_numeric(actual_vals, errors='coerce')
    imputed_vals = pd.to_numeric(imputed_vals, errors='coerce')
    # RMSE as the square root of the MSE: this works on every scikit-learn version,
    # whereas newer releases no longer accept mean_squared_error(..., squared=False).
    rmse = float(np.sqrt(mean_squared_error(actual_vals, imputed_vals)))
    numerical_data.append({"Variable": col_name, "RMSE": round(rmse, 3)})  # add the column name and its RMSE

# for categorical variables, measure performance using accuracy and F1 score.
categorical_data = []
for col_name in categorical_cols:  # loop over all categorical variables
    # .get: a categorical column may have had no entries removed at all
    actual_vals, imputed_vals = _scorable_pairs(
        actual_entries_store.get(col_name, []),
        imputed1_entries_store.get(col_name, []),
    )
    if not actual_vals:  # no removed entries to score for this column
        continue
    # change imputed and actual values to be a string
    actual_vals = [str(val) for val in actual_vals]
    imputed_vals = [str(val) for val in imputed_vals]
    accuracy = accuracy_score(actual_vals, imputed_vals)  # calculate accuracy score
    f1 = f1_score(actual_vals, imputed_vals, average='weighted')  # calculate F1 score
    # built-in round() works whether the metric is a NumPy scalar or a plain Python
    # float (a plain float has no .round method)
    categorical_data.append({"Variable": col_name,
                             "Accuracy": round(float(accuracy), 3),
                             "F1 Score": round(float(f1), 3)})  # add the column name and its accuracy and F1 score
#---------------------------------------------------------------------------------------------------------------------------
# Print one plain-text table per variable type: the heading first, then the
# collected metric records rendered as a DataFrame without its index column.
for heading, records in (
    ("Numerical variables performance for imputation 1:", numerical_data),
    ("\nCategorical variables performance for imputation 1:", categorical_data),
):
    print(heading)
    print(pd.DataFrame(records).to_string(index=False))