# **Abalone Preprocessing Exercise**

_John Andrew Dixon_

---

**Setup**

In [110]:
# Import necessary modules

# For working with the data
import pandas as pd

# For performing a TTS
from sklearn.model_selection import train_test_split

# For scaling numerical features and encoding nominal features
from sklearn.preprocessing import StandardScaler, OneHotEncoder


# For working with NumPY arrays, the main output of sklearn
import numpy as np

# For creating column selectors and column transformers
from sklearn.compose import make_column_selector, make_column_transformer

In [111]:
# Load the data into a DataFrame and verify
url = "https://docs.google.com/spreadsheets/d/1jfU2oFSfhX1ywUbqETExDJuztO95r3h6pbWAm7xpwNY/gviz/tq?tqx=out:csv&sheet=users"
df= pd.read_csv(url)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4177 entries, 0 to 4176
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   sex             4177 non-null   object 
 1   length          4177 non-null   float64
 2   diameter        4177 non-null   float64
 3   height          4177 non-null   float64
 4   whole_weight    4177 non-null   float64
 5   shucked_weight  4177 non-null   float64
 6   viscera_weight  4177 non-null   float64
 7   shell_weight    4177 non-null   float64
 8   rings           4177 non-null   int64  
dtypes: float64(7), int64(1), object(1)
memory usage: 293.8+ KB


---

## **Tasks**

### **Separate your data into the feature matrix (X) and the target vector (y).**
- Rings will be your y
- The rest of the features will be your X

In [112]:
# Create the feature matrix (X)
X = df.drop(columns="rings")
X

Unnamed: 0,sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight
0,M,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500
1,M,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700
2,F,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100
3,M,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550
4,I,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550
...,...,...,...,...,...,...,...,...
4172,F,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490
4173,M,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605
4174,M,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080
4175,F,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960


In [113]:
# Create the target vector (y)
y = df['rings']
y

0       15
1        7
2        9
3       10
4        7
        ..
4172    11
4173    10
4174     9
4175    10
4176    12
Name: rings, Length: 4177, dtype: int64

### **Train/test split the data. Please use the random number 42 for consistency.**

In [114]:
# Perform the Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 

### **Create a ColumnTransformer to preprocess the data.** 
Remember to:
1. Create column selectors for the numeric and categorical columns.
2. Create a StandardScaler for scaling numeric columns.
3. Create a OneHotEncoder for one-hot encoding the categorical columns.
4. Match each transformer with the appropriate selector in a tuple.
5. Use the tuples to create a ColumnTransformer to preprocess the data.

In [115]:
# Create column selectors for the numeric and categorical columns.
numerical_selector = make_column_selector(dtype_include="number")
nominal_selector =  make_column_selector(dtype_include="object") # Categorical selector

In [116]:
# Create a StandardScaler for scaling numeric columns.
scaler = StandardScaler()

In [117]:
# Create a OneHotEncoder for one-hot encoding the categorical columns.
one_hot_encoder = OneHotEncoder(handle_unknown="ignore")

In [118]:
# Match each transformer with the appropriate selector in a tuple.
numeric_tuple = (scaler, numerical_selector)
nominal_tuple = (one_hot_encoder, nominal_selector) # Categorical Tuple

In [119]:
# Use the tuples to create a ColumnTransformer to preprocess the data.
# Perused the Documentation and found out that verbose_feature_names_out affects
# the output of get_feature_names_out() which I also found by looking at documentation
col_transformer = make_column_transformer(nominal_tuple, numeric_tuple, remainder='passthrough', verbose_feature_names_out=False)

### **Transform your data and display the result.**
- Individual transformers do NOT need to be fit separately.  Just fit the resulting preprocessing object once on the training data, and use it to transform both the training and testing data.

In [120]:
# Fit the ColumnTransformer to the training data
col_transformer.fit(X_train)
print()




In [121]:
# Transform both the training and the testing data
X_train_processed = col_transformer.transform(X_train)
X_test_processed = col_transformer.transform(X_test)

print("X_train_processed data missing values:", np.isnan(X_train_processed).sum().sum())
print("X_train_processed datatypes:", X_train_processed.dtype)
print("X_train_processed shape:", X_train_processed.shape)

print()

print("X_test_processed data missing values:", np.isnan(X_test_processed).sum().sum())
print("X_test_processed datatypes:", X_test_processed.dtype)
print("X_test_processed shape:", X_test_processed.shape)

display(X_train_processed)
X_test_processed

X_train_processed data missing values: 0
X_train_processed datatypes: float64
X_train_processed shape: (3341, 10)

X_test_processed data missing values: 0
X_test_processed datatypes: float64
X_test_processed shape: (836, 10)


array([[ 0.        ,  1.        ,  0.        , ..., -0.32540694,
        -0.40512998, -0.21213236],
       [ 0.        ,  1.        ,  0.        , ..., -0.48012641,
        -0.82093157, -0.71241871],
       [ 1.        ,  0.        ,  0.        , ..., -1.35462776,
        -1.34639511, -1.39137875],
       ...,
       [ 0.        ,  0.        ,  1.        , ..., -0.52273032,
        -0.43254547, -0.36936522],
       [ 0.        ,  0.        ,  1.        , ...,  0.69708696,
         0.34422673, -0.03345867],
       [ 1.        ,  0.        ,  0.        , ...,  0.83611025,
         0.45845793,  0.22383145]])

array([[ 0.        ,  0.        ,  1.        , ...,  0.27104784,
         1.10272193,  0.60976664],
       [ 0.        ,  0.        ,  1.        , ...,  0.11857068,
         0.31224199,  0.03801081],
       [ 1.        ,  0.        ,  0.        , ..., -0.24916836,
         0.39905771,  0.68123611],
       ...,
       [ 0.        ,  1.        ,  0.        , ..., -0.03614879,
        -0.20865231, -0.22642626],
       [ 0.        ,  1.        ,  0.        , ..., -0.47339947,
        -0.81636232, -0.39795301],
       [ 0.        ,  1.        ,  0.        , ..., -1.17748518,
        -1.30984112, -1.17697032]])

The above is verbose and hard to understand. I'll display it as a DataFrame:

In [122]:
# Get the names of the features as created by the column transformer
# Ref: https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html
features = col_transformer.get_feature_names_out()

# Make each a DataFrame
X_train_processed_df = pd.DataFrame(X_train_processed, columns=features)
X_test_processed_df = pd.DataFrame(X_test_processed, columns=features)

# Display each DataFrame
display(X_train_processed_df)
X_test_processed_df

Unnamed: 0,sex_F,sex_I,sex_M,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight
0,0.0,1.0,0.0,0.210175,0.369725,-0.347950,-0.323164,-0.325407,-0.405130,-0.212132
1,0.0,1.0,0.0,-0.419444,-0.542386,-0.936082,-0.665985,-0.480126,-0.820932,-0.712419
2,1.0,0.0,0.0,-1.846581,-1.859881,-1.641840,-1.388250,-1.354628,-1.346395,-1.391379
3,0.0,1.0,0.0,-2.098428,-2.113245,-1.759467,-1.453355,-1.363597,-1.533734,-1.462848
4,0.0,0.0,1.0,-0.251546,-0.289022,-0.112697,-0.409632,-0.437522,-0.350299,-0.176398
...,...,...,...,...,...,...,...,...,...,...
3336,1.0,0.0,0.0,-0.293521,-0.086330,-0.583203,-0.532723,-0.468915,-0.441684,-0.676684
3337,1.0,0.0,0.0,1.217565,1.433855,1.181193,1.138658,0.815929,1.120999,1.145788
3338,0.0,0.0,1.0,-0.125622,-0.137003,-0.347950,-0.509325,-0.522730,-0.432545,-0.369365
3339,0.0,0.0,1.0,0.420048,0.572417,-0.465577,0.453016,0.697087,0.344227,-0.033459


Unnamed: 0,sex_F,sex_I,sex_M,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight
0,0.0,0.0,1.0,0.671895,0.471071,0.475434,0.554743,0.271048,1.102722,0.609767
1,0.0,0.0,1.0,0.545971,0.319052,0.240182,0.084763,0.118571,0.312242,0.038011
2,1.0,0.0,0.0,0.294124,0.369725,1.298819,0.305511,-0.249168,0.399058,0.681236
3,1.0,0.0,0.0,0.923743,0.825781,0.710687,0.876201,0.797991,0.782875,1.002849
4,0.0,0.0,1.0,-0.419444,-0.238349,0.122555,-0.434047,-0.563092,-0.665577,-0.176398
...,...,...,...,...,...,...,...,...,...,...
831,1.0,0.0,0.0,-0.041673,0.217707,0.240182,-0.209230,-0.262622,-0.213222,-0.033459
832,1.0,0.0,0.0,0.587946,0.268380,0.240182,0.140713,0.255352,-0.098990,0.180950
833,0.0,1.0,0.0,0.168200,0.217707,0.240182,-0.180746,-0.036149,-0.208652,-0.226426
834,0.0,1.0,0.0,-0.503394,-0.542386,-0.465577,-0.509325,-0.473399,-0.816362,-0.397953
