<a href="https://colab.research.google.com/github/kellianneyang/bootcamp-assignments/blob/main/Abalone_Preprocessing_Exercise_(Core)_Kellianne_Yang.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Abalone Preprocessing Exercise (Core)**

**Name:** Kellianne Yang

# Preliminary Steps

In [None]:
# import libraries
import pandas as pd
# import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [None]:
# read the abalone data from the Google Sheets CSV export into a DataFrame
df = pd.read_csv(
    filepath_or_buffer='https://docs.google.com/spreadsheets/d/1jfU2oFSfhX1ywUbqETExDJuztO95r3h6pbWAm7xpwNY/gviz/tq?tqx=out:csv&sheet=users'
)

In [None]:
# sanity-check the load: .info() lists each column's name, dtype, and
# non-null count; .head() in the next cell shows sample rows

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4177 entries, 0 to 4176
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   sex             4177 non-null   object 
 1   length          4177 non-null   float64
 2   diameter        4177 non-null   float64
 3   height          4177 non-null   float64
 4   whole_weight    4177 non-null   float64
 5   shucked_weight  4177 non-null   float64
 6   viscera_weight  4177 non-null   float64
 7   shell_weight    4177 non-null   float64
 8   rings           4177 non-null   int64  
dtypes: float64(7), int64(1), object(1)
memory usage: 293.8+ KB


In [None]:
# preview the first 10 rows to confirm the values look as expected
df.head(10)

Unnamed: 0,sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7
5,I,0.425,0.3,0.095,0.3515,0.141,0.0775,0.12,8
6,F,0.53,0.415,0.15,0.7775,0.237,0.1415,0.33,20
7,F,0.545,0.425,0.125,0.768,0.294,0.1495,0.26,16
8,M,0.475,0.37,0.125,0.5095,0.2165,0.1125,0.165,9
9,F,0.55,0.44,0.15,0.8945,0.3145,0.151,0.32,19


# 1. Separate your data into the feature matrix (X) and the target vector (y)

In [None]:
# name of the column the model will predict
target: str = 'rings'

## 1. rings will be your y

In [None]:
# target vector: every row of the target column
y = df.loc[:, target]

In [None]:
# check target (y): should be the 'rings' Series with one value per row of df
print(y)

0       15
1        7
2        9
3       10
4        7
        ..
4172    11
4173    10
4174     9
4175    10
4176    12
Name: rings, Length: 4177, dtype: int64


## 2. The rest of the features will be your X

In [None]:
# feature matrix: every column except the target
X = df.drop(columns=[target])

In [None]:
# check features (X); ending the cell with the bare expression lets the
# notebook render the DataFrame as a single rich table, whereas print() emits
# plain text that wraps wide frames across several chunks
X.head(10)

  sex  length  diameter  height  whole_weight  shucked_weight  viscera_weight  \
0   M   0.455     0.365   0.095        0.5140          0.2245          0.1010   
1   M   0.350     0.265   0.090        0.2255          0.0995          0.0485   
2   F   0.530     0.420   0.135        0.6770          0.2565          0.1415   
3   M   0.440     0.365   0.125        0.5160          0.2155          0.1140   
4   I   0.330     0.255   0.080        0.2050          0.0895          0.0395   
5   I   0.425     0.300   0.095        0.3515          0.1410          0.0775   
6   F   0.530     0.415   0.150        0.7775          0.2370          0.1415   
7   F   0.545     0.425   0.125        0.7680          0.2940          0.1495   
8   M   0.475     0.370   0.125        0.5095          0.2165          0.1125   
9   F   0.550     0.440   0.150        0.8945          0.3145          0.1510   

   shell_weight  
0         0.150  
1         0.070  
2         0.210  
3         0.155  
4         0.055  


# 2. Train/test split the data. Please use `random_state=42` for consistency.

In [None]:
# split features and target into train and test sets;
# the fixed random_state keeps the split reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

# 3. Create a ColumnTransformer to preprocess the data. Remember to:

## 1. Create column selectors for the numeric and categorical columns.

In [None]:
# selector for the numeric-dtype columns (paired with the scaler in num_tuple)
num_selector = make_column_selector(
    dtype_include='number',
)

In [None]:
# selector for the object-dtype columns (paired with the encoder in cat_tuple)
cat_selector = make_column_selector(
    dtype_include='object',
)

## 2. Create a StandardScaler for scaling numeric columns.

In [None]:
# create standard scaler with default settings; it is not fit here --
# it is paired with the numeric selector in num_tuple and fit through
# the column transformer
scaler = StandardScaler()

## 3. Create a OneHotEncoder for one-hot encoding the categorical columns.

In [None]:
# one-hot encoder for the categorical columns; handle_unknown='ignore' means a
# category that was not seen during fit does not raise an error at transform
ohe = OneHotEncoder(handle_unknown='ignore')

## 4. Match each transformer with the appropriate selector in a tuple.

In [None]:
# pair the scaler with the numeric column selector;
# order is (transformer, columns), as passed to make_column_transformer below
num_tuple = (scaler, num_selector)

In [None]:
# pair the one-hot encoder with the categorical column selector;
# order is (transformer, columns), as passed to make_column_transformer below
cat_tuple = (ohe, cat_selector)

## 5. Use the tuples to create a ColumnTransformer to preprocess the data.

In [None]:
# build the preprocessing object from the two (transformer, selector) pairs;
# remainder='passthrough' keeps any column matched by neither selector
col_transformer = make_column_transformer(
    num_tuple,
    cat_tuple,
    remainder='passthrough',
)

# 4. Transform your data and display the result.

## 1. Individual transformers do NOT need to be fit separately.  Just fit the resulting preprocessing object once on the training data, and use it to transform both the training and testing data.

In [None]:
# fit the column transformer on the TRAINING DATA ONLY, so that nothing
# from the test set influences the preprocessing
col_transformer.fit(X_train)

In [None]:
# transform training set with the fitted column transformer
# (no refit happens here; fitting was done in the previous cell)
X_train_processed = col_transformer.transform(X_train)

In [None]:
# transform testing set with the same transformer that was fit on X_train;
# the transformer is deliberately not refit on the test data
X_test_processed = col_transformer.transform(X_test)

In [None]:
# view the transformations (not necessary for modelling, but 
# helpful to see results of transformations)

In [None]:
# create dataframe for training set processed data (originally a numpy array).
# Column labels come from the fitted transformer's output feature names so each
# column in the preview can be traced back to its source feature, and the
# original row index is kept so rows still line up with y_train.
X_train_df = pd.DataFrame(
    X_train_processed,
    columns=col_transformer.get_feature_names_out(),
    index=X_train.index,
)

In [None]:
# view first 10 rows of the processed training data to eyeball the
# scaled numeric values and the one-hot indicator columns
X_train_df.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.749291,0.464226,-0.118869,0.457447,0.499098,0.743973,0.241135,1.0,0.0,0.0
1,-0.090254,-0.144654,-0.001647,-0.301655,-0.364269,-0.51404,-0.145838,1.0,0.0,0.0
2,1.127086,1.225326,0.81891,1.523852,1.692114,1.544526,1.179902,0.0,0.0,1.0
3,-0.59398,-0.449095,-1.056649,-0.651696,-0.617673,-0.738195,-0.647469,0.0,0.0,1.0
4,-0.258163,-0.093914,0.35002,-0.052352,-0.572823,-0.605532,0.785763,0.0,0.0,1.0
5,0.833245,0.920886,0.936133,1.091388,1.26828,1.10994,0.932669,0.0,0.0,1.0
6,-1.181662,-1.311675,-0.939426,-1.137074,-1.039266,-1.104162,-1.202846,0.0,1.0,0.0
7,-0.971775,-0.855015,-0.939426,-1.037352,-1.032538,-1.003521,-0.96278,0.0,1.0,0.0
8,-1.391548,-1.362415,-1.291094,-1.295813,-1.290427,-1.282571,-1.220762,0.0,1.0,0.0
9,-0.342117,-0.398355,-0.236092,-0.627275,-0.510033,-0.701598,-0.608055,0.0,0.0,1.0


In [None]:
# view info: confirm every processed column is numeric with no missing values
X_train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3132 entries, 0 to 3131
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       3132 non-null   float64
 1   1       3132 non-null   float64
 2   2       3132 non-null   float64
 3   3       3132 non-null   float64
 4   4       3132 non-null   float64
 5   5       3132 non-null   float64
 6   6       3132 non-null   float64
 7   7       3132 non-null   float64
 8   8       3132 non-null   float64
 9   9       3132 non-null   float64
dtypes: float64(10)
memory usage: 244.8 KB
