### Import Relevent Packages

In [14]:
# use pip to install all the libraries we need
import sys
!{sys.executable} -m pip install numpy pandas matplotlib scikit-learn seaborn | grep -v 'already satisfied'

# import necessary packages
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import warnings
import random

from sklearn.impute import KNNImputer
from sklearn.metrics import mean_squared_error, accuracy_score, f1_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier

# remove warnings
warnings.filterwarnings('ignore')

Defaulting to user installation because normal site-packages is not writeable
distutils: /home/trf1/.local/lib/python3.9/site-packages
sysconfig: /home/trf1/.local/lib64/python3.9/site-packages[0m
user = True
home = None
root = None
prefix = None[0m


### Get Data

In [15]:
# read the training data, dropping the ID column, as this is duplicated by pandas
raw_training_data = pd.read_csv(r"train.csv").drop(columns=["id"])

# copy the raw training data
copy_raw_training_data = raw_training_data.copy()

# separate out the independent variables from the target
X_copy = copy_raw_training_data.drop(labels=['Status'], axis=1)
Y_copy = copy_raw_training_data["Status"]

### Define Useful Functions

In [16]:
def add_missing_values(X_full, Y_full, missing_rate):
    """
    Takes a dataframe and adds in some missing values.

    Parameters:
    - X_full: a dataframe containing all columns with independent variables
    - Y_full: a dataframe containing the response variable column only
    - missing_rate: a float between 0 and 1 which specifies the proportion of lines which should have missing values

    Returns:
    - X_missing: the dataframe X_full but with some missing values
    - Y_missing: an exact copy of Y_full
    - actual_entries_store: dictionary containing all entries that were set to NaN, keys are the data frames columns
    """
    # set a seed for reproducibility
    np.random.seed(24)
    
    # copy the input dataframes
    X_missing = X_full.copy()
    y_missing = Y_full.copy()

    # create empty dictionary to store values of entries before setting them to nan
    actual_entries_store = {}

    # remove a percentage of entries in each column at random, specified by missing_rate
    for col in X_missing.columns:
        index_list = X_missing.sample(frac=missing_rate+random.uniform(-0.01, 0.01)).index  # sample a proportion of the indices within the column, with a random component for the rate
        actual_entries_store[col] = []
        for value in index_list:
            actual_entries_store[col].append(X_missing.loc[value, col])   # store the actual entries in the dictionary
        X_missing.loc[index_list, col] = np.nan # replace the values with nan

    return X_missing, y_missing, actual_entries_store

In [17]:
def get_imputed_values(imputed_data, null_entries):
    """
    Gets the value of all the imputed entries and outputs them in a dictionary.
    
    Parameters:
    - imputed_data: the dataframe containing some actual values and some imputed values
    - null_entries: the locations of all the entries which were null before imputation

    Returns:
    - imputed_entries_store: dictionary containing all imputed entries
    """
    imputed_entries_store = {} # create dictionary for storing the actual values in the locations that are removed
    
    for row, col in null_entries: # loop over every removed entry:
        col_name = imputed_data.columns[col] # store column name of current null entry
        if col_name not in imputed_entries_store: # check if column name is already in the dictionary
            imputed_entries_store[col_name] = [] # if not already in dictionary, create empty list
        imputed_entries_store[col_name].append(imputed_data.iloc[row, col])# add actual value to the list for correct column

    return imputed_entries_store

In [18]:
def calc_num_metrics(actual_entries_store, imputed_entries_store, num_cols):
    """
    Calculates performance metric RMSE for numerical variables for the imputation method.
    
    Parameters:
    - actual_entries_store: dictionary containing all entries that were set to NaN, keys are the data frames columns
    - imputed_entries_store: dictionary containing all imputed entries, indexed the same as actual_entries_store
    - num_cols: list of the numerical columns in the data frame
    
    Returns:
    - Data frame as a string showing the RMSE for the imputation method for each numerical variable
    """
    numerical_data = []
    for col_name, actual_vals in actual_entries_store.items(): # loop over all variables and actual values
        if col_name in num_cols: # if the current column is numerical:
            # change imputed and actual values to be numeric
            imputed_vals = pd.to_numeric(imputed_entries_store.get(col_name, []))
            actual_vals = pd.to_numeric(actual_vals)
            numerical_data.append({"Variable": col_name, "RMSE": round(mean_squared_error(actual_vals, imputed_vals, squared=False),3)}) # add the column name and its RMSE
    
    return pd.DataFrame(numerical_data).to_string(index=False)

In [19]:
def calc_categorical_metrics(actual_entries_store, imputed_entries_store, categorical_cols):
    """
    Calculates performance metric accuracy and F1 score for categorical variables for the imputation method.
    
    Parameters:
    - actual_entries_store: dictionary containing all entries that were set to NaN, keys are the data frames columns
    - imputed_entries_store: dictionary containing all imputed entries, indexed the same as actual_entries_store
    - categorical_cols: list of the categorical columns in the data frame
    
    Returns:
    - Data frame as a string showing the accuracy and F1 score for the imputation method for each categorical variable
    """
    categorical_data = []
    for col_name in categorical_cols: # loop over all categorical variables
        # change imputed and actual values to be a string
        imputed_vals = [str(val) for val in imputed_entries_store.get(col_name, [])]
        actual_vals = [str(val) for val in actual_entries_store[col_name]]
        accuracy = accuracy_score(actual_vals, imputed_vals) # calculate accuracy score
        f1 = f1_score(actual_vals, imputed_vals, average='weighted') # calculate F1 score
        categorical_data.append({"Variable": col_name, "Accuracy": round(accuracy,3), "F1 Score": round(f1,3)}) # add the column name and its accuracy and F1 score
    
    return pd.DataFrame(categorical_data).to_string(index=False)

### Prepare Data for Imputation

In [20]:
# set random seed for reproducability
random.seed(24)

# apply the add_missing_values function to our data
X_miss, Y_miss, actual_entries_X = add_missing_values(X_copy, Y_copy, 0.05)

# create a list to store all of the entries which are null: (row, column)
null_entries_X = [(row_index, col_index) 
                for row_index, row in enumerate(X_miss.values) 
                for col_index, val in enumerate(row) 
                if pd.isnull(val)]

We have all the components we need in order to test out our different imputation methods:
* random seed is set
* X_miss and Y_miss can be duplicated and the imputation methods can be applied to each of the copies
* actual_entries_X stores the original values from X_copy which have now been replaced by NaN
* null_entries_X stores the locations of all NaN values in X_miss

So now all we need to do for each imputation method is the following:
* make a new copy of X_miss
* apply the imputation algorithm to the copy
* apply the get_imputed_values function to the resulting dataset
* apply the calc_num_metrics and calc_categorical_metrics functions to assess the performance of the imputation

In [21]:
random.seed(24) # set a random seed for reproducability

#-------------------------------------------------------------------------------------------------------------------------
copy_raw_training_data = raw_training_data.copy()  # copy the raw training data
shape = copy_raw_training_data.shape  # store dimension of raw training data
num_entries = shape[0] * (shape[1] - 1)  # store number of entries for all independent variables
num_null = int(num_entries * 0.01)  # make 1% of the data be null entries

# randomly select entries and replace them with NaN, excluding the response variable
for _ in range(num_null):
    rand_row, rand_col = random.randint(0, shape[0] - 1) , random.randint(0, shape[1] - 2) # select a random row and column
    copy_raw_training_data.iloc[rand_row, rand_col] = np.nan # store entry as nan
    
missing_raw_training_data = copy_raw_training_data.copy()
    
# create a list to store all of the entries which are null: (row, column)
null_entries = [(row_index, col_index) 
                for row_index, row in enumerate(copy_raw_training_data.values) 
                for col_index, val in enumerate(row) 
                if pd.isnull(val)] 

# store the names of all categorical and numerical columns (excluding the response variable)
categorical_cols = [col for col in copy_raw_training_data.select_dtypes(include=['object']).columns.tolist() if col != 'Status']
num_cols = [x for x in copy_raw_training_data.columns.drop(['Status']) if x not in categorical_cols]

# store the actual entries that are in the missing value locations as a dictionary
actual_entries_store = get_imputed_values(raw_training_data, null_entries)
#-----------------------------------------------------------------------------------------------------------------------

# specify how to encode each categorical variable as an integer, and define a reverse of this encoding
numerical_encode = {
    "Drug": {"Placebo": 0, "D-penicillamine": 1}, "Sex": {"F": 0, "M": 1}, "Ascites": {"N": 0, "Y": 1},
    "Hepatomegaly": {"N": 0, "Y": 1}, "Spiders": {"N": 0, "Y": 1}, "Edema": {"N": 0, "Y": 1, "S": 2}}

reverse_numerical_encode = {feature: {value: key for key, value in encoding.items()}
                            for feature, encoding in numerical_encode.items()}

### Imputation Methods

In [22]:
# Imputation 1: impute missing values with median for numerical variables and mode for categorical variables
random.seed(24)

# apply the add_missing_values function to our data
X_copy_miss, Y_copy_miss, actual_entries_X = add_missing_values(X_copy, Y_copy, 0.05)

# create a list to store all of the entries which are null: (row, column)
null_entries_X = [(row_index, col_index) 
                for row_index, row in enumerate(X_copy_miss.values) 
                for col_index, val in enumerate(row) 
                if pd.isnull(val)]

actual_entries_X = get_imputed_values(X_copy, null_entries_X) # redefine actual location of missing entries
X_copy_miss2 = X_copy_miss.copy() # define the same data frame with missing entries to access later
#-------------------------------------------------------------------------------------------------------------------------
imputed1_entries_store = {} # create empty dictionary
for col_name in X_copy_miss.columns: # loop over each column of the data set 
    col_data = X_copy_miss[col_name] # store data for current column
    if col_data.dtype in ['float64', 'int64']: # check if data type of current column is numeric
        col_data.fillna(col_data.median(), inplace=True) # impute numerical nulls with median
    else:
        col_data.fillna(col_data.mode().iloc[0], inplace=True) # impute categorical nulls with mode
    imputed1_entries_store[col_name] = [col_data.loc[row] for row, col in null_entries_X if col_name == X_copy_miss.columns[col]] # store the imputed values in the dictionary

# Display performance metrics
print("Numerical variables performance for imputation 1:")
print(calc_num_metrics(actual_entries_X, imputed1_entries_store, num_cols))
print("\nCategorical variables performance for imputation 1:")
print(calc_categorical_metrics(actual_entries_X, imputed1_entries_store, categorical_cols))

Numerical variables performance for imputation 1:
     Variable     RMSE
     Alk_Phos 1726.848
         SGOT   44.724
       Copper   76.725
    Platelets   85.359
        Stage    0.882
  Prothrombin    0.815
  Cholesterol  193.877
          Age 3617.757
    Bilirubin    4.378
Tryglicerides   63.538
       N_Days 1138.424
      Albumin    0.358

Categorical variables performance for imputation 1:
    Variable  Accuracy  F1 Score
        Drug     0.512     0.347
         Sex     0.939     0.909
     Ascites     0.945     0.919
Hepatomegaly     0.550     0.390
     Spiders     0.779     0.683
       Edema     0.899     0.851


In [23]:
# Imputation 2: impute each missing value with the mean value from k nearest neighbours
X_copy_miss = X_copy_miss2.replace(numerical_encode)

imputer2 = KNNImputer(n_neighbors=2, weights="uniform")    # create a KNN imputer
data_imputed_knn_X = pd.DataFrame(imputer2.fit_transform(X=X_copy_miss, y=Y_copy_miss), columns=X_copy_miss.columns) # apply the imputer to the data with missing values

# add the target back to the result
data_imputed_knn = pd.concat([data_imputed_knn_X, Y_copy_miss], axis=1)

In [24]:
imputed2_entries_store = get_imputed_values(data_imputed_knn, null_entries_X)

In [25]:
knn_metrics = calc_num_metrics(get_imputed_values(X_copy.replace(numerical_encode), null_entries_X), imputed2_entries_store, data_imputed_knn.columns)
print(knn_metrics)

     Variable     RMSE
     Alk_Phos 1874.174
 Hepatomegaly    0.521
         Drug    0.568
         SGOT   42.066
      Ascites    0.213
       Copper   69.414
    Platelets   99.137
        Stage    1.102
      Spiders    0.413
  Prothrombin    0.879
          Sex    0.263
  Cholesterol  173.945
          Age 4274.268
        Edema    0.490
    Bilirubin    3.821
Tryglicerides   58.302
       N_Days 1163.729
      Albumin    0.395


In [26]:
# Imputation 3: for numerical variables, implement linear regression using the non-null rows to determine the regression lines.
# For rows with more 2 or 3 missing values, fill in 1 or 2, respecitvely, entries with the median for that column
# For categorical variables, implement a decision tree classifier to impute the missing entries.
X_copy_miss = X_copy_miss.replace(reverse_numerical_encode)

np.random.seed(24)
# Store the missing data frame for only numerical variables. Store another data frame by dropping the null rows.
num_missing_raw_training_data = X_copy_miss.drop(categorical_cols, axis=1)
linear_regres_df = num_missing_raw_training_data.dropna()

# fit linear regression models (for each numerical variable as the response) and store the coefficients and intercepts
regressions = {col: {'intercept': LinearRegression().fit(linear_regres_df.drop(col, axis=1),linear_regres_df[col]).intercept_,
                     'coeffs': LinearRegression().fit(linear_regres_df.drop(col, axis=1),linear_regres_df[col]).coef_}
               for col in linear_regres_df.columns}

# identify the rows with 1 missing value, and more than 1 missing value
rows_with_one_missing = num_missing_raw_training_data[num_missing_raw_training_data.isnull().sum(axis=1) == 1]
rows_with_over1_missing = num_missing_raw_training_data[num_missing_raw_training_data.isnull().sum(axis=1) >= 2]

# impute missing values using the median of the column in the rows with 2 or more missing values
median_vals = X_copy_miss.median() # store median value for each column in missing data frame
for idx, row in rows_with_over1_missing.iterrows(): # loop over rows with more than 1 missing value:
    missing_indices = row.isnull() # store the columns in current row with null entries
    fill_indices = np.random.choice(missing_indices[missing_indices].index, missing_indices.sum() - 1, replace=False) # sellect indicies to fill at random
    rows_with_over1_missing.loc[idx, fill_indices] = median_vals[fill_indices] #fill these entries using median of the column

# combine data frames such that each row only contains 1 missing value
new_rows_with_one_missing = pd.concat([rows_with_over1_missing, rows_with_one_missing]).sort_index()
#--------------------------------------------------------------------------------------------------------------------------

# impute missing values using linear regression
for idx, row in new_rows_with_one_missing.iterrows(): # loop over each row in this new data frame
    for col in num_cols: # loop over numerical variables:
        if pd.isna(row[col]): # if value in current cell is na:
            row[col] = (regressions[col]['coeffs'] * row.dropna()).sum() + regressions[col]['intercept'] # calc the imputed value using linear regression coefficients and intercept

# map to convert numerical variables in 'null_entries' to their names
mapping = {0: 'N_Days', 2: 'Age', 8: 'Bilirubin', 9: 'Cholesterol', 10: 'Albumin', 11: 'Copper', 12: 'Alk_Phos', 13: 'SGOT', 14: 'Tryglicerides', 15: 'Platelets', 16: 'Prothrombin', 17: 'Stage'}

#---------------------------------------------------------------------------------------------------------------------------

# create the data set with numerical variables imputed as above and categorical variables still missing
for index, row in new_rows_with_one_missing.iterrows():
    for col in num_cols:
        X_copy_miss.loc[index,col] = new_rows_with_one_missing.loc[index][col]
        
#--------------------------------------------------------------------------------------------------------------------------

# Categorical imputation method:
# split data set into numerical and categorical sets, dropping status column and all missing rows, and numerically encode
dropped_missing_data = X_copy_miss.replace(numerical_encode).dropna()

# train decision tree classifier for each categorical variable, predict the missing values, and impute into original dataset
missing_training_data_encoded = X_copy_miss.replace(numerical_encode).copy() # new data frame by numerically encoding

for cat_var in categorical_cols: # loop over each categorical variable
    # define the training data and labels
    X = dropped_missing_data.drop(columns=categorical_cols)
    y = dropped_missing_data[cat_var]
    # fit the tree
    decision_tree = DecisionTreeClassifier()
    decision_tree.fit(X, y)
    # predict the missing categorical variables and impute into missing_raw_training_data
    X_missing = X_copy_miss.drop(columns=categorical_cols)
    missing_predictions = decision_tree.predict(X_missing)
    X_copy_miss.loc[X_copy_miss[cat_var].isnull(), cat_var] = missing_predictions[X_copy_miss[cat_var].isnull()]
# restore the original categorical entries instead of being numerical
X_copy_miss = X_copy_miss.replace(reverse_numerical_encode)

for cat_var in categorical_cols: # loop over each categorical variable
    # define the training data and labels
    X = dropped_missing_data.drop(columns=categorical_cols)
    y = dropped_missing_data[cat_var]
    # fit the tree
    decision_tree = DecisionTreeClassifier()
    decision_tree.fit(X, y)
    # predict the missing categorical variables and impute into missing_raw_training_data
    X_missing = X_copy_miss.drop(columns=categorical_cols)
    missing_predictions = decision_tree.predict(X_missing)
    X_copy_miss.loc[X_copy_miss[cat_var].isnull(), cat_var] = missing_predictions[X_copy_miss[cat_var].isnull()]
# restore the original categorical entries instead of being numerical
X_copy_miss = X_copy_miss.replace(reverse_numerical_encode)
#--------------------------------------------------------------------------------------------------------------------------

imputed3_entries_store = get_imputed_values(X_copy_miss, null_entries_X) # store imputed values in dictionary
print("Numerical variables performance for imputation 3:")
print(calc_num_metrics(actual_entries_X, imputed3_entries_store, num_cols))
print("\nNumerical variables performance for imputation 3:")
print(calc_categorical_metrics(actual_entries_X, imputed3_entries_store, categorical_cols))

TypeError: Cannot convert [['D-penicillamine' 'Placebo' nan ... nan 'D-penicillamine'
  'D-penicillamine']] to numeric