### Import Relevent Packages

In [241]:
# use pip to install all the libraries we need
import sys
!{sys.executable} -m pip install numpy pandas matplotlib scikit-learn seaborn | grep -v 'already satisfied'

# import necessary packages
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import warnings
import random

from sklearn.impute import KNNImputer
from sklearn.metrics import mean_squared_error, accuracy_score, f1_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier

# remove warnings
warnings.filterwarnings('ignore')

Defaulting to user installation because normal site-packages is not writeable
distutils: /home/trf1/.local/lib/python3.9/site-packages
sysconfig: /home/trf1/.local/lib64/python3.9/site-packages[0m
user = True
home = None
root = None
prefix = None[0m


### Get Data

In [242]:
# read the training data, dropping the ID column, as this is duplicated by pandas
raw_training_data = pd.read_csv(r"train.csv").drop(columns=["id"])

# copy the raw training data
copy_raw_training_data = raw_training_data.copy()

# separate out the independent variables from the target
X_copy = copy_raw_training_data.drop(labels=['Status'], axis=1)
Y_copy = copy_raw_training_data["Status"]

### Define Useful Functions

In [243]:
def add_missing_values(X_full, Y_full, missing_rate):
    """
    Takes a dataframe and adds in some missing values.

    Parameters:
    - X_full: a dataframe containing all columns with independent variables
    - Y_full: a dataframe containing the response variable column only
    - missing_rate: a float between 0 and 1 which specifies the proportion of lines which should have missing values

    Returns:
    - X_missing: the dataframe X_full but with some missing values
    - Y_missing: an exact copy of Y_full
    - actual_entries_store: dictionary containing all entries that were set to NaN, keys are the data frames columns
    """
    # set a seed for reproducibility
    np.random.seed(24)
    
    # copy the input dataframes
    X_missing = X_full.copy()
    y_missing = Y_full.copy()

    # create empty dictionary to store values of entries before setting them to nan
    actual_entries_store = {}

    # remove a percentage of entries in each column at random, specified by missing_rate
    for col in X_missing.columns:
        index_list = X_missing.sample(frac=missing_rate+random.uniform(-0.01, 0.01)).index  # sample a proportion of the indices within the column, with a random component for the rate
        actual_entries_store[col] = []
        for value in index_list:
            actual_entries_store[col].append(X_missing.loc[value, col])   # store the actual entries in the dictionary
        X_missing.loc[index_list, col] = np.nan # replace the values with nan

    return X_missing, y_missing, actual_entries_store

In [244]:
def get_imputed_values(imputed_data, null_entries):
    """
    Gets the value of all the imputed entries and outputs them in a dictionary.
    
    Parameters:
    - imputed_data: the dataframe containing some actual values and some imputed values
    - null_entries: the locations of all the entries which were null before imputation

    Returns:
    - imputed_entries_store: dictionary containing all imputed entries
    """
    imputed_entries_store = {} # create dictionary for storing the actual values in the locations that are removed
    
    for row, col in null_entries: # loop over every removed entry:
        col_name = imputed_data.columns[col] # store column name of current null entry
        if col_name not in imputed_entries_store: # check if column name is already in the dictionary
            imputed_entries_store[col_name] = [] # if not already in dictionary, create empty list
        imputed_entries_store[col_name].append(imputed_data.iloc[row, col])# add actual value to the list for correct column

    return imputed_entries_store

In [245]:
def calc_num_metrics(actual_entries_store, imputed_entries_store, num_cols):
    """
    Calculates performance metric RMSE for numerical variables for the imputation method.
    
    Parameters:
    - actual_entries_store: dictionary containing all entries that were set to NaN, keys are the data frames columns
    - imputed_entries_store: dictionary containing all imputed entries, indexed the same as actual_entries_store
    - num_cols: list of the numerical columns in the data frame
    
    Returns:
    - Data frame as a string showing the RMSE for the imputation method for each numerical variable
    """
    numerical_data = []
    for col_name, actual_vals in actual_entries_store.items(): # loop over all variables and actual values
        if col_name in num_cols: # if the current column is numerical:
            # change imputed and actual values to be numeric
            imputed_vals = pd.to_numeric(imputed_entries_store.get(col_name, []))
            actual_vals = pd.to_numeric(actual_vals)
            numerical_data.append({"Variable": col_name, "RMSE": round(mean_squared_error(actual_vals, imputed_vals, squared=False),3)}) # add the column name and its RMSE
    
    return pd.DataFrame(numerical_data).to_string(index=False)

In [246]:
def calc_categorical_metrics(actual_entries_store, imputed_entries_store, categorical_cols):
    """
    Calculates performance metric accuracy and F1 score for categorical variables for the imputation method.
    
    Parameters:
    - actual_entries_store: dictionary containing all entries that were set to NaN, keys are the data frames columns
    - imputed_entries_store: dictionary containing all imputed entries, indexed the same as actual_entries_store
    - categorical_cols: list of the categorical columns in the data frame
    
    Returns:
    - Data frame as a string showing the accuracy and F1 score for the imputation method for each categorical variable
    """
    categorical_data = []
    for col_name in categorical_cols: # loop over all categorical variables
        # change imputed and actual values to be a string
        imputed_vals = [str(val) for val in imputed_entries_store.get(col_name, [])]
        actual_vals = [str(val) for val in actual_entries_store[col_name]]
        accuracy = accuracy_score(actual_vals, imputed_vals) # calculate accuracy score
        f1 = f1_score(actual_vals, imputed_vals, average='weighted') # calculate F1 score
        categorical_data.append({"Variable": col_name, "Accuracy": round(accuracy,3), "F1 Score": round(f1,3)}) # add the column name and its accuracy and F1 score
    
    return pd.DataFrame(categorical_data).to_string(index=False)

In [247]:
def numerical_encode(X):
    """
    Encodes the categorical variables in a given dataframe numerically.
    
    Parameters:
    - X: dataframe, should have columns labelled "Drug", "Sex", "Ascites", "Hepatomegaly", "Spiders", "Edema"
    
    Returns:
    - X_encoded: dataframe with categorical variables encoded numerically
    """

    # make a copy of the dataframe
    X_encoded = X.copy()

    # specify how each numerical column should be encoded
    encodings = {
        "Drug": {"Placebo": 0, "D-penicillamine": 1},
        "Sex": {"F": 0, "M": 1},
        "Ascites": {"N": 0, "Y": 1},
        "Hepatomegaly": {"N": 0, "Y": 1},
        "Spiders": {"N": 0, "Y": 1},
        "Edema": {"N": 0, "Y": 1, "S": 2}
    }

    # replace the values in categorical columns by their encoded values
    X_encoded = X_encoded.replace(encodings)

    # output the encoded dataframe
    return X_encoded

In [248]:
def reverse_numerical_encode(X):
    """
    Reverses the numerical encoding process by replacing numerical values in categorical columns with the closest category.
    
    Parameters:
    - X: dataframe which has its categorical columns encoded
    
    Returns:
    - X_decoded: dataframe with categorical columns decoded
    """

    # make a copy of the input dataframe
    X_decoded = X.copy()

    # specify the inverse of the encoding
    decodings = {
        "Drug": {0: "Placebo", 1: "D-penicillamine"},
        "Sex": {0: "F", 1: "M"},
        "Ascites": {0: "N", 1: "Y"},
        "Hepatomegaly": {0: "N", 1: "Y"},
        "Spiders": {0: "N", 1: "Y"},
        "Edema": {0: "N", 1: "Y", 2: "S"}
    }

    # replace each categorical column with numbers closest to the ones in the list above
    for col in decodings:
        new_col = []                            # stores a replacement column with numbers that correspond to decodings
        for entry in X_decoded[col]:
            closest_val = 0                     # stores the closest value so far, returns 0 if entry is nan
            min_dist = 100                      # stores highscore of minimum distance, starts very high to ensure it's overtaken
            for option in decodings[col]:
                dist = abs(option - entry)      # calculate the distance of the given entry to each option
                if dist < min_dist:
                    closest_val = option        # updates closest value and minimum distance if it's a new highscore
                    min_dist = dist
            new_col.append(closest_val)         # adds the final closest value to the new column
        new_col_df = pd.DataFrame(new_col)      # makes the replacement column into a dataframe and replaces the old column with it
        X_decoded[col] = new_col_df

    # apply the reverse encoding now that all entries are options for categories
    X_decoded = X_decoded.replace(decodings)

    # output the decoded dataframe
    return X_decoded

### Prepare Data for Imputation

In [249]:
# set random seed for reproducability
random.seed(24)

# apply the add_missing_values function to our data
X_miss, Y_miss, actual_entries_X = add_missing_values(X_copy, Y_copy, 0.05)

# create a list to store all of the entries which are null: (row, column)
null_entries_X = [(row_index, col_index) 
                for row_index, row in enumerate(X_miss.values) 
                for col_index, val in enumerate(row) 
                if pd.isnull(val)]

# store the names of all categorical and numerical columns (excluding the response variable)
categorical_cols = [col for col in X_miss.select_dtypes(include=['object']).columns.tolist()]
num_cols = [x for x in X_miss.columns if x not in categorical_cols]

We have all the components we need in order to test out our different imputation methods:
* random seed is set
* X_miss and Y_miss can be duplicated and the imputation methods can be applied to each of the copies
* actual_entries_X stores the original values from X_copy which have now been replaced by NaN
* null_entries_X stores the locations of all NaN values in X_miss
* categorical_cols and num_cols store the names of categorical and numerical columns respectively

So now all we need to do for each imputation method is the following:
* make a new copy of X_miss
* apply the imputation algorithm to the copy
* apply the get_imputed_values function to the resulting dataset
* apply the calc_num_metrics and calc_categorical_metrics functions to assess the performance of the imputation

### Imputation Methods

In [250]:
# Imputation 1: impute missing values with median for numerical variables and mode for categorical variables

# make a copy of the data with missing values
X_miss_copy1 = X_miss.copy()

# apply the imputation method
imputed1_entries_store = {} # create empty dictionary
for col_name in X_miss_copy1.columns: # loop over each column of the data set 
    col_data = X_miss_copy1[col_name] # store data for current column
    if col_data.dtype in ['float64', 'int64']: # check if data type of current column is numeric
        col_data.fillna(col_data.median(), inplace=True) # impute numerical nulls with median
    else:
        col_data.fillna(col_data.mode().iloc[0], inplace=True) # impute categorical nulls with mode
    imputed1_entries_store[col_name] = [col_data.loc[row] for row, col in null_entries_X if col_name == X_miss_copy1.columns[col]] # store the imputed values in the dictionary

# display performance metrics
print("Numerical variables performance for imputation 1:")
print(calc_num_metrics(actual_entries_X, imputed1_entries_store, num_cols))
print("\nCategorical variables performance for imputation 1:")
print(calc_categorical_metrics(actual_entries_X, imputed1_entries_store, categorical_cols))

Numerical variables performance for imputation 1:
     Variable     RMSE
       N_Days 1138.424
          Age 3617.757
    Bilirubin    4.378
  Cholesterol  193.877
      Albumin    0.358
       Copper   76.725
     Alk_Phos 1726.848
         SGOT   44.724
Tryglicerides   63.538
    Platelets   85.359
  Prothrombin    0.815
        Stage    0.882

Categorical variables performance for imputation 1:
    Variable  Accuracy  F1 Score
        Drug     0.512     0.347
         Sex     0.939     0.909
     Ascites     0.945     0.919
Hepatomegaly     0.550     0.390
     Spiders     0.779     0.683
       Edema     0.899     0.851


In [251]:
# Imputation 2: impute each missing value with the mean value from k nearest neighbours

# make a copy of the data with missing values
X_miss_copy2 = X_miss.copy()

# encode the categorical data to be numerical
X_miss_copy2 = numerical_encode(X_miss_copy2)

# create a KNN imputer and apply it to this copy of the data
imputer2 = KNNImputer(n_neighbors=20, weights="uniform")    # create a KNN imputer
X_miss_copy2 = pd.DataFrame(imputer2.fit_transform(X=X_miss_copy2, y=Y_miss), columns=X_miss_copy2.columns) # apply the imputer to the data with missing values

# reverse the numerical encoding of the categorical data
X_miss_copy2 = reverse_numerical_encode(X_miss_copy2)

In [252]:
# get the imputed values from this imputation method and store them
imputed2_entries_store = get_imputed_values(X_miss_copy2, null_entries_X)

In [253]:
# display performance metrics
print("Numerical variables performance for imputation 1:")
print(calc_num_metrics(actual_entries_X, imputed2_entries_store, num_cols))
print("\nCategorical variables performance for imputation 1:")
print(calc_categorical_metrics(actual_entries_X, imputed2_entries_store, categorical_cols))

Numerical variables performance for imputation 1:
     Variable     RMSE
       N_Days 1219.340
          Age 3748.735
    Bilirubin    4.492
  Cholesterol  205.170
      Albumin    0.374
       Copper   78.695
     Alk_Phos 1717.350
         SGOT   48.352
Tryglicerides   65.929
    Platelets   89.474
  Prothrombin    0.859
        Stage    0.929

Categorical variables performance for imputation 1:
    Variable  Accuracy  F1 Score
        Drug     0.539     0.523
         Sex     0.932     0.906
     Ascites     0.934     0.913
Hepatomegaly     0.472     0.473
     Spiders     0.773     0.695
       Edema     0.881     0.842


In [254]:
# Imputation 3: for numerical variables, implement linear regression using the non-null rows to determine the regression lines.
# For rows with more 2 or 3 missing values, fill in 1 or 2, respecitvely, entries with the median for that column
# For categorical variables, implement a decision tree classifier to impute the missing entries.

# make a copy of the data with missing values
X_miss_copy3 = X_miss.copy()

np.random.seed(24)
# Store the missing data frame for only numerical variables. Store another data frame by dropping the null rows.
num_missing_raw_training_data = X_miss_copy3.drop(categorical_cols, axis=1)
linear_regres_df = num_missing_raw_training_data.dropna()

# identify the rows with 1 missing value, and more than 1 missing value
rows_with_one_missing = num_missing_raw_training_data[num_missing_raw_training_data.isnull().sum(axis=1) == 1]
rows_with_over1_missing = num_missing_raw_training_data[num_missing_raw_training_data.isnull().sum(axis=1) >= 2]

# impute missing values using the median of the column in the rows with 2 or more missing values
median_vals = num_missing_raw_training_data.median() # store median value for each column in missing data frame
for idx, row in rows_with_over1_missing.iterrows(): # loop over rows with more than 1 missing value:
    missing_indices = row.isnull() # store the columns in current row with null entries
    fill_indices = np.random.choice(missing_indices[missing_indices].index, missing_indices.sum() - 1, replace=False) # sellect indicies to fill at random
    rows_with_over1_missing.loc[idx, fill_indices] = median_vals[fill_indices] #fill these entries using median of the column

# combine data frames such that each row only contains 1 missing value
new_rows_with_one_missing = pd.concat([rows_with_over1_missing, rows_with_one_missing]).sort_index()

# fit linear regression models (for each numerical variable as the response) and store the coefficients and intercepts
regressions = {col: {'intercept': LinearRegression().fit(linear_regres_df.drop(col, axis=1),linear_regres_df[col]).intercept_,
                     'coeffs': LinearRegression().fit(linear_regres_df.drop(col, axis=1),linear_regres_df[col]).coef_}
               for col in linear_regres_df.columns}

#--------------------------------------------------------------------------------------------------------------------------

# impute missing values using linear regression
for idx, row in new_rows_with_one_missing.iterrows(): # loop over each row in this new data frame
    for col in num_cols: # loop over numerical variables:
        if pd.isna(row[col]): # if value in current cell is na:
            row[col] = (regressions[col]['coeffs'] * row.dropna()).sum() + regressions[col]['intercept'] # calc the imputed value using linear regression coefficients and intercept

# map to convert numerical variables in 'null_entries' to their names
mapping = {0: 'N_Days', 2: 'Age', 8: 'Bilirubin', 9: 'Cholesterol', 10: 'Albumin', 11: 'Copper', 12: 'Alk_Phos', 13: 'SGOT', 14: 'Tryglicerides', 15: 'Platelets', 16: 'Prothrombin', 17: 'Stage'}

#---------------------------------------------------------------------------------------------------------------------------

# create the data set with numerical variables imputed as above and categorical variables still missing
for index, row in new_rows_with_one_missing.iterrows():
    for col in num_cols:
        X_miss_copy3.loc[index,col] = new_rows_with_one_missing.loc[index][col]
        
#--------------------------------------------------------------------------------------------------------------------------

# Categorical imputation method:
# split data set into numerical and categorical sets, dropping status column and all missing rows, and numerically encode
dropped_missing_data = numerical_encode(X_miss_copy3).dropna()

for cat_var in categorical_cols: # loop over each categorical variable
    # define the training data and labels
    X = dropped_missing_data.drop(columns=categorical_cols)
    y = dropped_missing_data[cat_var]
    # fit the tree
    decision_tree = DecisionTreeClassifier()
    decision_tree.fit(X, y)
    # predict the missing categorical variables and impute into missing_raw_training_data
    X_missing = dropped_missing_data.drop(columns=categorical_cols)
    missing_predictions = decision_tree.predict(X_missing)
    dropped_missing_data.loc[dropped_missing_data[cat_var].isnull(), cat_var] = missing_predictions[dropped_missing_data[cat_var].isnull()]
# restore the original categorical entries instead of being numerical
X_miss_copy3 = reverse_numerical_encode(dropped_missing_data)

#--------------------------------------------------------------------------------------------------------------------------

imputed3_entries_store = get_imputed_values(X_miss_copy3, null_entries_X) # store imputed values in dictionary
print("Numerical variables performance for imputation 3:")
print(calc_num_metrics(actual_entries_X, imputed3_entries_store, num_cols))
print("\nNumerical variables performance for imputation 3:")
print(calc_categorical_metrics(actual_entries_X, imputed3_entries_store, categorical_cols))

IndexError: index 5739 is out of bounds for axis 0 with size 5739