# Case 1

## Import libraries

In [14]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.impute import KNNImputer

## Load data

In [15]:
# Locations of the labelled data set and of the unlabelled "new" data set
data_path_1 = '../data/case1Data.csv'
data_path_2 = '../data/case1Data_Xnew.csv'

# Plain numpy view of both files; the header row is skipped
data_np, data_np_new = (
    np.loadtxt(csv_path, delimiter=',', skiprows=1)
    for csv_path in (data_path_1, data_path_2)
)

# Shapes of the numpy arrays:
#   first  -> 100 rows and 101 columns (100 features and 1 target)
#   second -> 1000 rows and 100 columns (100 features and no target)
for array in (data_np, data_np_new):
    print(array.shape)

# Same files as pandas dataframes; the first row provides the column names
# (comma separator and header=0 are the read_csv defaults)
data_pd = pd.read_csv(data_path_1)
data_pd_new = pd.read_csv(data_path_2)

# Shapes of the pandas dataframes
for frame in (data_pd, data_pd_new):
    print(frame.shape)

(100, 101)
(1000, 100)
(100, 101)
(1000, 100)


## Handling missing values

Possible methods:
- Drop rows
- Imputation: Mean, Median, Mode
- Interpolation: Linear, Quadratic
- K-Nearest Neighbors (KNN)

In [31]:
# Use KNNImputer from scikit-learn to impute the missing feature values with the mean of the k-nearest neighbors.
# The imputer is fitted ONCE on the training features and then re-used (transform only) for the new data,
# so both data sets are imputed with the same neighbours model.

# class sklearn.impute.KNNImputer(*, missing_values=nan, n_neighbors=5, weights='uniform', metric='nan_euclidean', copy=True, add_indicator=False, keep_empty_features=False)
imputer = KNNImputer(n_neighbors=5, missing_values=np.nan)

# The feature columns are the columns of the new (unlabelled) data. Any other column of the
# training frame (presumably the target - verify against the CSV header) is kept out of the
# imputation: it must not influence the neighbour search and its values must not be fabricated.
feature_cols = list(data_pd_new.columns)
unknown_cols = [col for col in feature_cols if col not in data_pd.columns]
if unknown_cols:
    raise ValueError(f'Feature columns missing from the training data: {unknown_cols}')

# Fit the imputer on the training features, then apply the fitted imputer to both data sets
features_imputed = imputer.fit_transform(data_pd[feature_cols])
features_new_imputed = imputer.transform(data_pd_new[feature_cols])

# Round the last 5 feature columns to a whole number
# NOTE(review): these are assumed to be the categorical features - confirm against the data description.
# Slicing the feature matrices (not the full training matrix) guarantees the same columns
# are rounded in both data sets, wherever the non-feature column sits in the training file.
features_imputed[:, -5:] = np.round(features_imputed[:, -5:])
features_new_imputed[:, -5:] = np.round(features_new_imputed[:, -5:])

# Put the imputed features back into the original column layout of the training data
data_imputed_pd = data_pd.copy()
data_imputed_pd[feature_cols] = features_imputed
data_imputed = data_imputed_pd.to_numpy(dtype=float)
data_new_imputed = features_new_imputed

# Print the shape of the imputed data
print(data_imputed.shape)
print(data_new_imputed.shape)

# Save the imputed data to a csv file
np.savetxt('../data/case1Data_imputed.csv', data_imputed, delimiter=',')
np.savetxt('../data/case1Data_Xnew_imputed.csv', data_new_imputed, delimiter=',')

(100, 101)
(1000, 100)


## Building models