In [1]:
# use pip to install all the libraries we need
import sys
!{sys.executable} -m pip install numpy pandas matplotlib scikit-learn seaborn | grep -v 'already satisfied'

# import necessary packages
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import warnings
import random

from sklearn.impute import KNNImputer
from sklearn.metrics import mean_squared_error, accuracy_score, f1_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier

# NOTE(review): the 'ignore' action is installed without a category or module filter, so it hides
# warnings of every kind from this point on (deprecation notices included) - consider narrowing it
warnings.filterwarnings('ignore') # silence all warnings

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
# load the training and test sets and remove the "id" column from both
# (per the original note, it only duplicates the row index that pandas creates)
raw_training_data = pd.read_csv("train.csv").drop("id", axis=1)
raw_test_data = pd.read_csv("test.csv").drop("id", axis=1)

# working copy, so the raw training frame itself is never modified
copy_raw_training_data = raw_training_data.copy(deep=True)

Because many of the imputation methods only work on numerical data, we need to encode the categorical variables in our dataframe in some way. We chose to encode them with an integer for each category, since they all had very few options.

In [3]:
random.seed(25)  # fixed seed, so the same cells are blanked on every run
copy_raw_training_data = raw_training_data.copy()  # work on a copy of the raw training data
shape = copy_raw_training_data.shape  # (number of rows, number of columns)
num_entries = shape[0] * (shape[1] - 1)  # number of cells in all columns except one (the response variable)
num_null = int(num_entries * 0.01)  # number of random draws: 1% of those cells

# blank out randomly drawn cells; the column draw stops one short of the last column,
# which keeps the response variable untouched
# (the same cell can be drawn more than once, so the count of distinct NaNs may be below num_null)
last_row_pos, last_feature_pos = shape[0] - 1, shape[1] - 2
for _ in range(num_null):
    rand_row = random.randint(0, last_row_pos)  # the row is drawn first ...
    rand_col = random.randint(0, last_feature_pos)  # ... then the column
    copy_raw_training_data.iloc[rand_row, rand_col] = np.nan

# snapshot of the data with missing values, taken before any imputation is applied
missing_raw_training_data = copy_raw_training_data.copy()

# (row, column) positions of every null entry, in row-major order
null_entries = [(int(row_pos), int(col_pos))
                for row_pos, col_pos in np.argwhere(copy_raw_training_data.isna().to_numpy())]

# look up the true value of every blanked position in the untouched raw data, grouped by column name
actual_entries_store = {}
for row, col in null_entries:
    col_name = copy_raw_training_data.columns[col]
    actual_entries_store.setdefault(col_name, []).append(raw_training_data.iloc[row, col])

# names of the categorical (object dtype) and numerical columns, leaving out the response variable 'Status'
categorical_cols = [name
                    for name in copy_raw_training_data.select_dtypes(include=['object']).columns
                    if name != 'Status']
num_cols = [name
            for name in copy_raw_training_data.columns.drop(['Status'])
            if name not in categorical_cols]
#--------------------------------------------------------------------------------------------------------------------------
# function to calculate performance metrics for numerical variables
def calc_num_metrics(actual_entries_store, imputed_entries_store, num_cols):
    """
    Calculates the RMSE of an imputation method for each numerical variable.

    Parameters:
    - actual_entries_store: dictionary mapping column name -> list of the true values that were set to NaN
    - imputed_entries_store: dictionary mapping column name -> list of the imputed values, in the same order
      as the lists in actual_entries_store
    - num_cols: collection of the numerical column names; columns of actual_entries_store that are not
      in it are skipped

    Returns:
    - Data frame as a string with one row per numerical variable and its RMSE, rounded to 3 decimal places
    """
    numerical_data = []
    for col_name, actual_vals in actual_entries_store.items(): # loop over all variables and actual values
        if col_name not in num_cols: # only numerical columns are scored here
            continue
        # change imputed and actual values to be numeric
        imputed_vals = pd.to_numeric(imputed_entries_store.get(col_name, []))
        actual_vals = pd.to_numeric(actual_vals)
        # take the root of the MSE explicitly: the `squared=False` shortcut of mean_squared_error
        # is deprecated in recent scikit-learn releases and later removed
        rmse = np.sqrt(mean_squared_error(actual_vals, imputed_vals))
        numerical_data.append({"Variable": col_name, "RMSE": round(rmse, 3)}) # add the column name and its RMSE
    return pd.DataFrame(numerical_data).to_string(index=False)

# function to calculate performance metrics for categorical variables
def calc_categorical_metrics(actual_entries_store, imputed_entries_store, categorical_cols):
    """
    Calculates the accuracy and weighted F1 score of an imputation method for each categorical variable.

    Parameters:
    - actual_entries_store: dictionary mapping column name -> list of the true values that were set to NaN
    - imputed_entries_store: dictionary mapping column name -> list of the imputed values, in the same order
      as the lists in actual_entries_store
    - categorical_cols: list of the categorical column names

    Returns:
    - Data frame as a string with one row per categorical variable and its accuracy and F1 score,
      both rounded to 3 decimal places. Categorical columns with no entry in actual_entries_store
      (nothing was removed from them) are left out of the table.
    """
    categorical_data = []
    for col_name in categorical_cols: # loop over all categorical variables
        if col_name not in actual_entries_store: # nothing was removed from this column, so there is nothing to score
            continue
        # compare as strings so that imputed and actual labels share one type
        imputed_vals = [str(val) for val in imputed_entries_store.get(col_name, [])]
        actual_vals = [str(val) for val in actual_entries_store[col_name]]
        accuracy = accuracy_score(actual_vals, imputed_vals) # calculate accuracy score
        f1 = f1_score(actual_vals, imputed_vals, average='weighted') # calculate F1 score
        categorical_data.append({"Variable": col_name, "Accuracy": round(accuracy,3), "F1 Score": round(f1,3)}) # add the column name and its accuracy and F1 score
    return pd.DataFrame(categorical_data).to_string(index=False)

In [4]:
def get_imputed_values(imputed_data, null_entries):
    """
    Collects the values found at the previously-null positions of an imputed data frame.

    Parameters:
    - imputed_data: the data frame after imputation; values are read by integer position (.iloc)
    - null_entries: the (row, column) positions of all the entries which were null before imputation

    Returns:
    - imputed_entries_store: dictionary mapping each column name to the list of values at its null
      positions, in the order those positions appear in null_entries
    """
    imputed_entries_store = {}
    column_names = imputed_data.columns

    for row_pos, col_pos in null_entries:
        # setdefault creates the list the first time a column is seen
        imputed_entries_store.setdefault(column_names[col_pos], []).append(imputed_data.iloc[row_pos, col_pos])

    return imputed_entries_store

In [5]:
# Imputation 1: impute missing values with median for numerical variables and mode for categorical variables

imputed1_entries_store = {} # column name -> imputed values at the positions that were blanked
for col_name in copy_raw_training_data.columns.drop(['Status']): # loop over each column of the data set
    col_data = copy_raw_training_data[col_name] # data for current column
    if col_data.dtype in ['float64', 'int64']: # numerical column: use the median
        fill_value = col_data.median()
    else: # categorical column: use the mode (first one if there are ties)
        fill_value = col_data.mode().iloc[0]
    # assign the filled column back explicitly: an inplace fill on a column pulled out of the frame
    # is not guaranteed to reach the frame itself (it does not under pandas Copy-on-Write)
    col_data = col_data.fillna(fill_value)
    copy_raw_training_data[col_name] = col_data
    imputed1_entries_store[col_name] = [col_data.loc[row] for row, col in null_entries if col_name == copy_raw_training_data.columns[col]] # store the imputed values in the dictionary

# Display performance metrics
print("Numerical variables performance for imputation 1:")
print(calc_num_metrics(actual_entries_store, imputed1_entries_store, num_cols))
print("\nCategorical variables performance for imputation 1:")
print(calc_categorical_metrics(actual_entries_store, imputed1_entries_store, categorical_cols))

Numerical variables performance for imputation 1:
     Variable     RMSE
       N_Days 1178.102
    Bilirubin    4.242
  Prothrombin    0.796
          Age 3782.324
  Cholesterol  129.944
      Albumin    0.349
       Copper   50.941
     Alk_Phos 2026.417
    Platelets   83.103
         SGOT   42.677
Tryglicerides   76.562
        Stage    0.897

Categorical variables performance for imputation 1:
    Variable  Accuracy  F1 Score
        Drug     0.570     0.414
         Sex     0.920     0.882
     Ascites     0.949     0.925
Hepatomegaly     0.549     0.389
     Spiders     0.809     0.723
       Edema     0.918     0.878


In [6]:
# integer code for every category of each categorical variable
numerical_encode = {
    "Drug": {"Placebo": 0, "D-penicillamine": 1},
    "Sex": {"F": 0, "M": 1},
    "Ascites": {"N": 0, "Y": 1},
    "Hepatomegaly": {"N": 0, "Y": 1},
    "Spiders": {"N": 0, "Y": 1},
    "Edema": {"N": 0, "Y": 1, "S": 2},
}

# inverse lookup (integer code -> category label), derived per variable from the table above
reverse_numerical_encode = {feature: dict(zip(encoding.values(), encoding.keys()))
                            for feature, encoding in numerical_encode.items()}

# encoded copy of the training data: categorical labels are swapped for their integer codes
training_data_encoded = copy_raw_training_data.replace(to_replace=numerical_encode)

#---------------------------------------------------------------------------------------------------------------------------

# split the encoded data into the independent variables and the target column 'Status'
X_copy = training_data_encoded.drop(columns=['Status'])
Y_copy = training_data_encoded.loc[:, "Status"]

def add_missing_values(X_full, Y_full, missing_rate):
    """
    Returns copies of the inputs in which a share of the entries of every column of X_full is set to NaN.

    NumPy's global random state is re-seeded with 24 on every call, and the rows to blank
    are sampled separately for each column.

    Parameters:
    - X_full: a dataframe containing all columns with independent variables
    - Y_full: the response variable column only
    - missing_rate: a float between 0 and 1, the fraction of rows sampled (and blanked) in each column

    Returns:
    - X_missing: a copy of X_full with the sampled entries replaced by NaN
    - y_missing: an exact copy of Y_full
    - actual_entries_store: dictionary mapping each column name to the list of original values
      that were replaced by NaN, in the order they were sampled
    """
    np.random.seed(24)  # fixed seed for reproducibility

    # the inputs themselves are left untouched
    X_missing, y_missing = X_full.copy(), Y_full.copy()

    actual_entries_store = {}
    for col in X_missing.columns:
        # row labels to blank in this column
        index_list = X_missing.sample(frac=missing_rate).index
        # remember the original values before they are overwritten
        actual_entries_store[col] = [X_missing.loc[label, col] for label in index_list]
        X_missing.loc[index_list, col] = np.nan

    return X_missing, y_missing, actual_entries_store

# run add_missing_values on the encoded data with a missing rate of 5% per column
X_copy_miss, Y_copy_miss, actual_entries_X = add_missing_values(X_copy, Y_copy, missing_rate=0.05)

#---------------------------------------------------------------------------------------------------------------------------
# (row, column) positions of every null entry in X_copy_miss, in row-major order
null_entries_X = [(int(row_pos), int(col_pos))
                  for row_pos, col_pos in np.argwhere(X_copy_miss.isna().to_numpy())]

In [7]:
# Imputation 2: impute each missing value with the mean value from k nearest neighbours

imputer2 = KNNImputer(n_neighbors=2, weights="uniform")    # create a KNN imputer

# apply the imputer to the data with missing values; the result is rebuilt as a data frame with
# both the column names and the row index of the input, so that the concat below lines rows up
# correctly even when the index is not the default 0..n-1
data_imputed_knn_X = pd.DataFrame(
    imputer2.fit_transform(X=X_copy_miss, y=Y_copy_miss),
    columns=X_copy_miss.columns,
    index=X_copy_miss.index,
)

# add the target back to the result
data_imputed_knn = pd.concat([data_imputed_knn_X, Y_copy_miss], axis=1)

In [8]:
# collect the KNN-imputed values at the positions that were blanked in X_copy_miss, grouped by column name
# NOTE(review): the imputation 3 cell assigns to this same name again, so re-running later cells changes it
imputed2_entries_store = get_imputed_values(data_imputed_knn, null_entries_X)

In [9]:
# RMSE of the KNN imputation, computed for every column of data_imputed_knn that has removed entries
# NOTE(review): all columns are passed as numerical here, so the integer-encoded categorical
# variables are scored with RMSE on their category codes - confirm this is intended
knn_metrics = calc_num_metrics(actual_entries_X, imputed2_entries_store, data_imputed_knn.columns)
print(knn_metrics)

     Variable     RMSE
       N_Days 1347.275
         Drug    0.596
          Age 4256.052
          Sex    0.320
      Ascites    0.232
 Hepatomegaly    0.630
      Spiders    0.565
        Edema    0.561
    Bilirubin    5.173
  Cholesterol  223.777
      Albumin    0.416
       Copper   87.412
     Alk_Phos 2020.361
         SGOT   59.019
Tryglicerides   68.322
    Platelets  112.273
  Prothrombin    0.987
        Stage    1.162


In [10]:
# Imputation 3: for numerical variables, fit a linear regression on the rows without nulls and use it to predict the missing entry.
# For rows with 2 or more missing numerical values, all but one of them (chosen at random) are first filled with the median
# of their column, so that every row is left with exactly one value for the regression to predict.
# For categorical variables, implement a decision tree classifier to impute the missing entries.

np.random.seed(24)
# numerical columns of the data with missing values, and the subset of its rows that contain no nulls
num_missing_raw_training_data = missing_raw_training_data.drop(categorical_cols, axis=1).drop(['Status'], axis=1)
linear_regres_df = num_missing_raw_training_data.dropna()

# fit one linear regression per numerical variable (that variable as the response, all others as predictors)
# and store its intercept and coefficients; each model is fitted once and both attributes are read from it
regressions = {}
for col in linear_regres_df.columns:
    fitted_model = LinearRegression().fit(linear_regres_df.drop(col, axis=1), linear_regres_df[col])
    regressions[col] = {'intercept': fitted_model.intercept_, 'coeffs': fitted_model.coef_}

# identify the rows with 1 missing value, and more than 1 missing value
# (.copy() so that the median fill below writes to an independent frame rather than to a filtered slice)
missing_per_row = num_missing_raw_training_data.isnull().sum(axis=1)
rows_with_one_missing = num_missing_raw_training_data[missing_per_row == 1]
rows_with_over1_missing = num_missing_raw_training_data[missing_per_row >= 2].copy()

# impute missing values using the median of the column in the rows with 2 or more missing values
# (medians are taken over the numerical columns only; the full frame also holds string columns)
median_vals = num_missing_raw_training_data.median()
for idx, row in rows_with_over1_missing.iterrows(): # loop over rows with more than 1 missing value:
    missing_indices = row.isnull() # marks the columns of the current row that are null
    fill_indices = np.random.choice(missing_indices[missing_indices].index, missing_indices.sum() - 1, replace=False) # select all but one of them at random
    rows_with_over1_missing.loc[idx, fill_indices] = median_vals[fill_indices] # fill these entries using the median of the column

# combine data frames such that each row only contains 1 missing value
new_rows_with_one_missing = pd.concat([rows_with_over1_missing, rows_with_one_missing]).sort_index()

#--------------------------------------------------------------------------------------------------------------------------

# impute the remaining missing value of each row using linear regression;
# the result is written with .loc because the Series yielded by iterrows() may be a copy of the row,
# in which case assigning to it would never reach the data frame
for idx, row in new_rows_with_one_missing.iterrows(): # loop over each row in this new data frame
    for col in num_cols: # loop over numerical variables:
        if pd.isna(row[col]): # if value in current cell is na:
            predictors = row.dropna() # the other numerical values of the row, in the column order used to fit the model
            new_rows_with_one_missing.loc[idx, col] = (regressions[col]['coeffs'] * predictors).sum() + regressions[col]['intercept']

# map from column position (as used in 'null_entries') to column name, for the numerical variables only;
# derived from the frame instead of being typed out by hand
mapping = {missing_raw_training_data.columns.get_loc(col_name): col_name for col_name in num_cols}

# create dictionary of the imputed numerical values, keyed by column name
# NOTE(review): this reuses the name 'imputed2_entries_store' and so overwrites the KNN results of imputation 2;
# the metrics below are computed from 'imputed3_entries_store' instead - confirm whether this dictionary is still needed
imputed2_entries_store = {col_name: [] for col_name in mapping.values()} # initialise dictionary with values being empty list
for row, col in null_entries: # loop over each null entry:
    if col in mapping: # only null entries in numerical columns
        imputed2_entries_store[mapping[col]].append(new_rows_with_one_missing.loc[row, mapping[col]]) # add the imputed value to the corresponding list

#---------------------------------------------------------------------------------------------------------------------------

# write the imputed numerical values back into the full data set; the categorical variables are still missing at this point
missing_raw_training_data.loc[new_rows_with_one_missing.index, num_cols] = new_rows_with_one_missing[num_cols]

#--------------------------------------------------------------------------------------------------------------------------

# Categorical imputation method:
# training set for the classifiers: numerically encoded, rows with any remaining null dropped, 'Status' removed
dropped_missing_data = missing_raw_training_data.replace(numerical_encode).dropna().drop(columns=['Status'])

# NOTE(review): this encoded copy is not used anywhere below in this cell
missing_training_data_encoded = missing_raw_training_data.replace(numerical_encode).copy() # new data frame by numerically encoding

# the predictors are the numerical columns only, so both design matrices are the same for every categorical variable
X = dropped_missing_data.drop(columns=categorical_cols)
X_missing = missing_raw_training_data.drop(columns=categorical_cols).drop(columns=['Status'])

# train a decision tree classifier for each categorical variable, predict the missing values, and impute into the data set
for cat_var in categorical_cols: # loop over each categorical variable
    y = dropped_missing_data[cat_var] # labels: the encoded values of the current variable
    # fit the tree
    decision_tree = DecisionTreeClassifier()
    decision_tree.fit(X, y)
    # predict for every row, then keep only the predictions for rows where the variable is missing
    missing_predictions = decision_tree.predict(X_missing)
    is_missing = missing_raw_training_data[cat_var].isnull()
    missing_raw_training_data.loc[is_missing, cat_var] = missing_predictions[is_missing.to_numpy()]

# turn the predicted integer codes back into the original category labels
missing_raw_training_data = missing_raw_training_data.replace(reverse_numerical_encode)

#--------------------------------------------------------------------------------------------------------------------------

imputed3_entries_store = get_imputed_values(missing_raw_training_data, null_entries) # store imputed values in dictionary
print("Numerical variables performance for imputation 3:")
print(calc_num_metrics(actual_entries_store, imputed3_entries_store, num_cols))
print("\nCategorical variables performance for imputation 3:")
print(calc_categorical_metrics(actual_entries_store, imputed3_entries_store, categorical_cols))

Numerical variables performance for imputation 3:
     Variable     RMSE
       N_Days 1019.818
    Bilirubin    3.600
  Prothrombin    0.726
          Age 3826.653
  Cholesterol  111.272
      Albumin    0.321
       Copper   43.167
     Alk_Phos 1797.119
    Platelets   77.763
         SGOT   41.895
Tryglicerides   68.187
        Stage    0.841

Numerical variables performance for imputation 3:
    Variable  Accuracy  F1 Score
        Drug     0.640     0.637
         Sex     0.880     0.884
     Ascites     0.937     0.918
Hepatomegaly     0.829     0.829
     Spiders     0.809     0.823
       Edema     0.871     0.885
