# Case 1 - Data Wrangling

### Table of Contents

1. **Importing Libraries**

2. **Loading Data**

3. **Wrangling case1Data.csv**

4. **Wrangling case1Data_Xnew.csv**

## 1. Importing Libraries

In [8]:
import numpy as np
import pandas as pd

# Imputers
from sklearn.impute import KNNImputer, SimpleImputer

# Standardization scalers
from sklearn.preprocessing import StandardScaler

# Splitting data
from sklearn.model_selection import train_test_split

# Set seeds for reproducibility.
# random.seed only affects Python's built-in generator; NumPy's global
# generator (which scikit-learn falls back to when random_state is None)
# has to be seeded separately.
import random
random.seed(42)
np.random.seed(42)

## 2. Loading Data

In [9]:
# Read case1Data.csv (header row skipped) into a NumPy array of floats
data = np.loadtxt('../data/case1Data.csv', skiprows=1, delimiter=',')
print("case1Data.csv: ", data.shape)

# Read case1Data_Xnew.csv the same way, then wrap the values in a pandas
# DataFrame so the preprocessing cell can address columns through .iloc
X_new = np.loadtxt('../data/case1Data_Xnew.csv', skiprows=1, delimiter=',')
X_new = pd.DataFrame(X_new)
print("case1Data_Xnew.csv: ", X_new.shape)

case1Data.csv:  (100, 101)
case1Data_Xnew.csv:  (1000, 100)


## 3. Wrangling case1Data.csv

In [10]:
# Column 0 holds the target (y); every remaining column is a feature (X)
y, X = data[:, 0], data[:, 1:]
print("X: ", X.shape)
print("y: ", y.shape)

# Persist both parts as separate comma-delimited files (no header row)
np.savetxt('../data/case1Data_y.csv', y, delimiter=',')
np.savetxt('../data/case1Data_X.csv', X, delimiter=',')

X:  (100, 100)
y:  (100,)


## 4. Wrangling case1Data_Xnew.csv

In [11]:
# Column layout used throughout this cell: the last five columns are handled
# as categorical variables, all columns before them as continuous variables.
continuous_cols = slice(None, -5)
categorical_cols = slice(-5, None)

# Preprocessing estimators (all from scikit-learn).
# NOTE(review): each of them is fitted on X_new itself rather than on the
# training features saved to case1Data_X.csv -- confirm this is intended.
scaler = StandardScaler()
continuous_imputer = KNNImputer(missing_values=np.nan, n_neighbors=5)
categorical_imputer = SimpleImputer(strategy='most_frequent')

# Step 1: standardize the continuous columns (all columns except the last five)
standardized = scaler.fit_transform(X_new.iloc[:, continuous_cols])
X_new.iloc[:, continuous_cols] = standardized

# Step 2: KNN imputation of the continuous columns -- a missing value is
# replaced by the mean of its k nearest neighbors (k=5)
knn_filled = continuous_imputer.fit_transform(X_new.iloc[:, continuous_cols])
X_new.iloc[:, continuous_cols] = pd.DataFrame(knn_filled)

# Step 3: mode imputation of the categorical columns -- a missing value is
# replaced by the most frequent value of its column
mode_filled = categorical_imputer.fit_transform(X_new.iloc[:, categorical_cols])
X_new.iloc[:, categorical_cols] = mode_filled

# Step 4: one-hot encode the last five columns with pandas.get_dummies
categorical_labels = X_new.columns[-5:]
X_new = pd.get_dummies(X_new, columns=categorical_labels)

# Step 5: convert to a float64 NumPy array, write it to disk and report its shape
X_new = np.asarray(X_new, dtype=np.float64)
np.savetxt('../data/case1Data_Xnew_wrangled.csv', X_new, delimiter=',')
print("X_new: ", X_new.shape)

X_new:  (1000, 116)
