In [59]:
# use pip to install all the libraries we need
import sys
!{sys.executable} -m pip install numpy pandas matplotlib scikit-learn seaborn | grep -v 'already satisfied'

# import necessary packages
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import warnings
import random

from sklearn.impute import KNNImputer
from sklearn.metrics import mean_squared_error, accuracy_score, f1_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

warnings.filterwarnings('ignore') # remove warnings

Defaulting to user installation because normal site-packages is not writeable
distutils: /home/trf1/.local/lib/python3.9/site-packages
sysconfig: /home/trf1/.local/lib64/python3.9/site-packages[0m
user = True
home = None
root = None
prefix = None[0m


In [54]:
# read the data, dropping the ID column, as this is duplicated by pandas
raw_training_data = pd.read_csv(r"train.csv").drop(columns=["id"])
raw_test_data = pd.read_csv(r"test.csv").drop(columns=["id"])

copy_raw_training_data = raw_training_data.copy()  # copy the raw training data

In [55]:

X_copy = copy_raw_training_data.drop(labels=['Status'], axis=1)
Y_copy = copy_raw_training_data["Status"]

In [60]:
# convert all non-numerical columns into categorical data
non_numeric_columns = copy_raw_training_data.select_dtypes(exclude=['number']).columns
copy_raw_training_data[non_numeric_columns] = copy_raw_training_data[non_numeric_columns].astype('category')

copy_raw_training_data.dtypes

N_Days              int64
Drug             category
Age                 int64
Sex              category
Ascites          category
Hepatomegaly     category
Spiders          category
Edema            category
Bilirubin         float64
Cholesterol       float64
Albumin           float64
Copper            float64
Alk_Phos          float64
SGOT              float64
Tryglicerides     float64
Platelets         float64
Prothrombin       float64
Stage             float64
Status           category
dtype: object

In [66]:
categorical_attributes = ["Drug", "Sex", "Ascites", "Hepatomegaly", "Spiders", "Edema"]

onehot = ColumnTransformer([
        ("One_hot_encoder", OneHotEncoder(), categorical_attributes)], 
        remainder='passthrough')

X_copy_onehot = pd.DataFrame(onehot.fit_transform(X_copy))

display(X_copy_onehot)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,...,2.3,316.0,3.35,172.0,1601.0,179.80,63.0,394.0,9.7,3.0
1,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.9,364.0,3.54,63.0,1440.0,134.85,88.0,361.0,11.0,3.0
2,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,...,3.3,299.0,3.55,131.0,1029.0,119.35,50.0,199.0,11.7,4.0
3,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.6,256.0,3.50,58.0,1653.0,71.30,96.0,269.0,10.7,3.0
4,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,...,1.1,346.0,3.65,63.0,1181.0,125.55,96.0,298.0,10.6,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7900,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.8,309.0,3.56,38.0,1629.0,79.05,224.0,344.0,9.9,2.0
7901,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0.9,260.0,3.43,62.0,1440.0,142.00,78.0,277.0,10.0,4.0
7902,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,2.0,225.0,3.19,51.0,933.0,69.75,62.0,200.0,12.7,2.0
7903,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0.7,248.0,2.75,32.0,1003.0,57.35,118.0,221.0,10.6,4.0


In [None]:
# create random number generator
rng = np.random.RandomState(42)

def add_missing_values(X_full, Y_full, missing_rate=0.75):
    """
    Takes a dataframe and adds in some missing values.

    Parameters:
    - X_full: a dataframe containing all columns with independent variables
    - Y_full: a dataframe containing the response variable column only
    - missing_rate: an integer between 0 and 1 which specifies how many lines should have missing values

    Returns:
    - X_missing: the dataframe X_full but with some missing values
    - Y_missing: an exact copy of Y_full
    """
    n_samples, n_features = X_full.shape

    # Add missing values in some of the lines (default is 75%)
    n_missing_samples = int(n_samples * missing_rate)

    # Create an array the same length as the number of samples with a specified proportion of True values
    missing_samples = np.zeros(n_samples, dtype=bool)
    missing_samples[:n_missing_samples] = True

    # Randomly add missing values to the specified proportion of lines
    rng.shuffle(missing_samples)
    missing_features = rng.randint(0, n_features, n_missing_samples)
    X_missing = X_full.copy()
    X_missing.iloc[missing_samples, missing_features] = np.nan
    y_missing = Y_full.copy()

    return X_missing, y_missing

In [None]:

random.seed(25) # set a random seed
copy_raw_training_data = raw_training_data.copy()  # copy the raw training data
shape = copy_raw_training_data.shape  # store dimension of raw training data
num_entries = shape[0] * (shape[1] - 1)  # store number of entries for all independent variables
num_null = int(num_entries * 0.01)  # make 1% of the data be null entries

# randomly select entries and replace them with NaN, excluding the response variable
for _ in range(num_null):
    rand_row, rand_col = random.randint(0, shape[0] - 1) , random.randint(0, shape[1] - 2) # select a random row and column
    copy_raw_training_data.iloc[rand_row, rand_col] = np.nan # store entry as nan
    
# create a list to store all of the entries which are null: (row, column)
null_entries = [(row_index, col_index) 
                for row_index, row in enumerate(copy_raw_training_data.values) 
                for col_index, val in enumerate(row) 
                if pd.isnull(val)]

# store all of the entries for the original data set in the positions of the removed entries
actual_entries_store = {} # create dictionary for storing the actual values in the locations that are removed
for row, col in null_entries: # loop over every removed entry:
    col_name = copy_raw_training_data.columns[col] # store column name of current null entry
    if col_name not in actual_entries_store: # check if column name is already in the dictionary
        actual_entries_store[col_name] = [] # if not already in dictionary, create empty list
    actual_entries_store[col_name].append(raw_training_data.iloc[row, col])# add actual value to the list for correct column
    
# store the names of all categorical and numerical columns (excluding the response variable)
categorical_cols = [col for col in copy_raw_training_data.select_dtypes(include=['object']).columns.tolist() if col != 'Status']
num_cols = [x for x in copy_raw_training_data.columns.drop(['Status']) if x not in categorical_cols]
#--------------------------------------------------------------------------------------------------------------------------
# function to calculate performance metrics for numerical variables
def calc_num_metrics(actual_entries_store, imputed_entries_store, num_cols):
    """
    Calculates performance metric RMSE for numerical variables for the imputation method
    
    Parameters:
    - actual_entries_store: dictionary containing all entries that were set to NaN, keys are the data frames columns
    - imputed_entries_store: dictionary containing all imputed entries, indexed the same as actual_entries_store
    - num_cols: list of the numerical columns in the data frame
    
    Returns:
    - Data frame as a string showing the RMSE for the imputation method for each numerical variable
    """
    numerical_data = []
    for col_name, actual_vals in actual_entries_store.items(): # loop over all variables and actual values
        if col_name in num_cols: # if the current column is numerical:
            # change imputed and actual values to be numeric
            imputed_vals = pd.to_numeric(imputed_entries_store.get(col_name, []))
            actual_vals = pd.to_numeric(actual_vals)
            numerical_data.append({"Variable": col_name, "RMSE": round(mean_squared_error(actual_vals, imputed_vals, squared=False),3)}) # add the column name and its RMSE
    return pd.DataFrame(numerical_data).to_string(index=False)

# function to calculate performance metrics for categorical variables
def calc_categorical_metrics(actual_entries_store, imputed_entries_store, categorical_cols):
    """
    Calculates performance metric accuracy and F1 score for categorical variables for the imputation method
    
    Parameters:
    - actual_entries_store: dictionary containing all entries that were set to NaN, keys are the data frames columns
    - imputed_entries_store: dictionary containing all imputed entries, indexed the same as actual_entries_store
    - categorical_cols: list of the categorical columns in the data frame
    
    Returns:
    - Data frame as a string showing the accuracy and F1 score for the imputation method for each categorical variable
    """
    categorical_data = []
    for col_name in categorical_cols: # loop over all categorical variables
        # change imputed and actual values to be a string
        imputed_vals = [str(val) for val in imputed_entries_store.get(col_name, [])]
        actual_vals = [str(val) for val in actual_entries_store[col_name]]
        accuracy = accuracy_score(actual_vals, imputed_vals) # calculate accuracy score
        f1 = f1_score(actual_vals, imputed_vals, average='weighted') # calculate F1 score
        categorical_data.append({"Variable": col_name, "Accuracy": round(accuracy,3), "F1 Score": round(f1,3)}) # add the column name and its accuracy and F1 score
    return pd.DataFrame(categorical_data).to_string(index=False)

In [None]:
# Imputation 1: impute missing values with median for numerical variables and mode for categorical variables

imputed1_entries_store = {} # create empty dictionary
for col_name in copy_raw_training_data.columns.drop(['Status']): # loop over each column of the data set 
    col_data = copy_raw_training_data[col_name] # store data for current column
    if col_data.dtype in ['float64', 'int64']: # check if data type of current column is numeric
        col_data.fillna(col_data.median(), inplace=True) # impute numerical nulls with median
    else:
        col_data.fillna(col_data.mode().iloc[0], inplace=True) # impute categorical nulls with mode
    imputed1_entries_store[col_name] = [col_data.loc[row] for row, col in null_entries if col_name == copy_raw_training_data.columns[col]] # store the imputed values in the dictionary

# Display performance metrics
print("Numerical variables performance for imputation 1:")
print(calc_num_metrics(actual_entries_store, imputed1_entries_store, num_cols))
print("\nCategorical variables performance for imputation 1:")
print(calc_categorical_metrics(actual_entries_store, imputed1_entries_store, categorical_cols))

Numerical variables performance for imputation 1:
     Variable     RMSE
       N_Days 1178.102
    Bilirubin    4.242
  Prothrombin    0.796
          Age 3782.324
  Cholesterol  129.944
      Albumin    0.349
       Copper   50.941
     Alk_Phos 2026.417
    Platelets   83.103
         SGOT   42.677
Tryglicerides   76.562
        Stage    0.897

Categorical variables performance for imputation 1:
    Variable  Accuracy  F1 Score
        Drug     0.570     0.414
         Sex     0.920     0.882
     Ascites     0.949     0.925
Hepatomegaly     0.549     0.389
     Spiders     0.809     0.723
       Edema     0.918     0.878


In [None]:
copy_raw_training_data.dtypes

N_Days           float64
Drug              object
Age              float64
Sex               object
Ascites           object
Hepatomegaly      object
Spiders           object
Edema             object
Bilirubin        float64
Cholesterol      float64
Albumin          float64
Copper           float64
Alk_Phos         float64
SGOT             float64
Tryglicerides    float64
Platelets        float64
Prothrombin      float64
Stage            float64
Status            object
dtype: object

In [None]:
# Imputation 2: impute each missing value with the mean value from k nearest neighbours

X_copy_miss, Y_copy_miss = add_missing_values(X_copy, Y_copy, missing_rate=0.75)

independent_vars = copy_raw_training_data.drop(labels=['Status'], axis=1)   # ignore the status column
numerical = independent_vars.select_dtypes(include=["float64", "int64"])    # separate out the numerical data
categorical = independent_vars.select_dtypes(exclude=["float64", "int64"])

imputed2_entries_store = {} # create empty dictionary
imputer2 = KNNImputer(n_neighbors=2)    # create a KNN imputer
imputer2.fit_transform(X_copy_miss)



ValueError: could not convert string to float: 'Placebo'