# **Pre-Processing Exercise**

_John Andrew Dixon_

---

**Setup**

In [29]:
# Import necessary modules

# For working with the data
import pandas as pd

# For performing a TTS
from sklearn.model_selection import train_test_split

# For scaling numerical features
from sklearn.preprocessing import StandardScaler

# For One-Hot Encoding nominal features
from sklearn.compose import make_column_selector
from sklearn.preprocessing import OneHotEncoder


In [30]:
# Load the data into a DataFrame and verify
url = "https://docs.google.com/spreadsheets/d/e/2PACX-1vT3R_qvIyzGAYylk0aIdTFxAtcxLdjBtfEGfSyAI1PfOnr0YpN_QjbbH1j5OScoYNcoyOjY_c9tQQ0H/pub?output=csv"
df= pd.read_csv(url)
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


---

## **Tasks**

> **Question**: _How well can the total charge be predicted based on the age, sex, bmi, number of children, smoking habit and region of the patient?_ 

### **Define features (X) and target (y)**

In [31]:
# Define the feature matrix
X = df.drop(columns="charges")
X

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,female,27.900,0,yes,southwest
1,18,male,33.770,1,no,southeast
2,28,male,33.000,3,no,southeast
3,33,male,22.705,0,no,northwest
4,32,male,28.880,0,no,northwest
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest
1334,18,female,31.920,0,no,northeast
1335,18,female,36.850,0,no,southeast
1336,21,female,25.800,0,no,southwest


In [32]:
# Define the target vector
y = df['charges']
y

0       16884.92400
1        1725.55230
2        4449.46200
3       21984.47061
4        3866.85520
           ...     
1333    10600.54830
1334     2205.98080
1335     1629.83350
1336     2007.94500
1337    29141.36030
Name: charges, Length: 1338, dtype: float64

 ### **Is this a classification or regression task?**

 Given that the goal is to predict total charge ( `charges`, a numerical feature that is continuous), I believe this is a regression task.

### **Train test split the data to prepare for machine learning**

In [33]:
# Perform the Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

### **Identify each feature as numerical, ordinal, or nominal. (Please provide this answer in a text cell in your Colab notebook)**

In [34]:
# Takes a sample of the data
X_train.sample(10)

Unnamed: 0,age,sex,bmi,children,smoker,region
641,42,male,28.31,3,yes,northwest
38,35,male,36.67,1,yes,northeast
613,34,female,19.0,3,no,northeast
1325,61,male,33.535,0,no,northeast
1333,50,male,30.97,3,no,northwest
760,22,female,34.58,2,no,northeast
957,24,male,26.79,1,no,northwest
690,21,male,27.36,0,no,northeast
1236,63,female,21.66,0,no,northeast
445,45,female,33.1,0,no,southwest


Numerical: `age`, `bmi`, `children`

Nominal: `smoker`, `sex`, `region`

Ordinal: `None`

### **Ordinal encode any ordinal features**

No ordinal features to encode.

### **Scale any numeric features**

In [35]:
# Create a column selector to select only numeric features
numeric_selector = make_column_selector(dtype_include='number')

# Select only numeric features from the X_train feature matrix
X_train_numeric = X_train[numeric_selector]
X_test_numeric = X_test[numeric_selector]

In [36]:
# Instantiate a Standard Scaler
scaler = StandardScaler()

# Fit the standard scaler to the Numeric features of the training feature matrix
scaler.fit(X_train_numeric)

# Transform the training and testing data according to the standard scaler
X_train_numeric_scaled = scaler.transform(X_train_numeric)
X_test_numeric_scaled = scaler.transform(X_test_numeric)

In [41]:
# Make the Numpy arrays into DataFrames for future concatenation
X_train_scaled_df = pd.DataFrame(X_train_numeric_scaled, columns=X_train_numeric.columns)
X_test_scaled_df = pd.DataFrame(X_test_numeric_scaled, columns=X_test_numeric.columns)
X_train_scaled_df

Unnamed: 0,age,bmi,children
0,-1.087167,-1.140875,-0.917500
1,-0.802106,-0.665842,0.743605
2,0.836992,1.528794,-0.086947
3,0.551932,0.926476,-0.086947
4,0.480667,-0.268178,0.743605
...,...,...,...
998,-1.514757,0.139468,2.404710
999,-0.018189,-1.105101,3.235263
1000,1.335848,-0.887967,-0.917500
1001,-0.160720,2.843247,0.743605


### **One Hot Encode any nominal features**

In [43]:
# Create a column selector to select only nominal features
nominal_selector = make_column_selector(dtype_include='object')

# Select only nominal features from the X_train feature matrix
X_train_nominal = X_train[nominal_selector]
X_test_nominal = X_test[nominal_selector]


In [46]:
# Instantiate a One Hot Encoder
one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Fit the One-Hot Encoder onto the training data
one_hot_encoder.fit(X_train_nominal)

# Transform the training and testing data according to the One-Hot encoder
X_train_nominal_ohe = one_hot_encoder.transform(X_train_nominal)
X_test_nominal_ohe = one_hot_encoder.transform(X_test_nominal)

In [53]:
# Get the name of the generated features 
ohe_columns = one_hot_encoder.get_feature_names_out(X_train_nominal.columns)

# Convert OHE arrays to columns
X_train_ohe_df = pd.DataFrame(X_train_nominal_ohe, columns=ohe_columns)
X_test_ohe_df = pd.DataFrame(X_test_nominal_ohe, columns=ohe_columns)

X_train_ohe_df

Unnamed: 0,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
1,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
3,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
4,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...
998,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
999,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1000,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
1001,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


### **Concatenate all features back into one dataframe.**

In [55]:
preprocess_train_df = pd.concat([X_train_scaled_df, X_train_ohe_df], axis=1)
preprocess_test_df = pd.concat([X_test_scaled_df, X_test_ohe_df], axis=1)

preprocess_train_df

Unnamed: 0,age,bmi,children,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,-1.087167,-1.140875,-0.917500,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
1,-0.802106,-0.665842,0.743605,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.836992,1.528794,-0.086947,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
3,0.551932,0.926476,-0.086947,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
4,0.480667,-0.268178,0.743605,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
998,-1.514757,0.139468,2.404710,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
999,-0.018189,-1.105101,3.235263,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1000,1.335848,-0.887967,-0.917500,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
1001,-0.160720,2.843247,0.743605,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
