In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [2]:
datos = pd.read_csv(r'C:\Users\pedro\Desktop\Data Science\Machine Learning\Intro to ML\abalone.data')

In [4]:
datos.head()

Unnamed: 0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
0,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
1,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
2,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
3,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7
4,I,0.425,0.3,0.095,0.3515,0.141,0.0775,0.12,8


In [9]:
datos.columns =['Sex','Lenght','Diameter','Height','Whole weight','Shucked weight','Viscera weight','Shell weight','Rings']
datos.head()

Unnamed: 0,Sex,Lenght,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
1,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
2,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
3,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7
4,I,0.425,0.3,0.095,0.3515,0.141,0.0775,0.12,8


Name		Data Type	Meas.	Description
	----		---------	-----	-----------
	Sex		nominal			M, F, and I (infant)
	Length		continuous	mm	Longest shell measurement
	Diameter	continuous	mm	perpendicular to length
	Height		continuous	mm	with meat in shell
	Whole weight	continuous	grams	whole abalone
	Shucked weight	continuous	grams	weight of meat
	Viscera weight	continuous	grams	gut weight (after bleeding)
	Shell weight	continuous	grams	after being dried
	Rings		integer			+1.5 gives the age in years

In [10]:
X = datos.drop(columns = 'Rings')
y = datos['Rings']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [11]:
cat_selector = make_column_selector(dtype_include='object')
num_selector = make_column_selector(dtype_include='number')

In [12]:
scaler = StandardScaler()
ohe = OneHotEncoder(handle_unknown='ignore')

In [13]:
num_tuple = (scaler, num_selector)
cat_tuple = (ohe, cat_selector)

In [14]:
from sklearn.compose import make_column_transformer
col_transformer = make_column_transformer(num_tuple, cat_tuple, remainder = 'passthrough')

In [15]:
col_transformer.fit(X_train)

ColumnTransformer(remainder='passthrough',
                  transformers=[('standardscaler', StandardScaler(),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x00000210AB120AC8>),
                                ('onehotencoder',
                                 OneHotEncoder(handle_unknown='ignore'),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x00000210AB1207F0>)])

In [16]:
X_train_processed = col_transformer.transform(X_train)
X_test_processed = col_transformer.transform(X_test)

In [23]:
X_train_processed

array([[-1.54642176, -1.5561698 , -1.05355826, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.79572536,  0.52191671,  0.7068688 , ...,  0.        ,
         0.        ,  1.        ],
       [ 0.25201264,  0.31917656,  0.35478338, ...,  1.        ,
         0.        ,  0.        ],
       ...,
       [-0.04075575,  0.21780648,  0.23742158, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.41930886,  0.52191671,  0.23742158, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.58660508,  0.57260174,  0.00269797, ...,  1.        ,
         0.        ,  0.        ]])

In [24]:
X_test_processed

array([[ 0.75390131,  0.927397  ,  0.8242306 , ...,  1.        ,
         0.        ,  0.        ],
       [ 0.54478103,  0.57260174,  0.23742158, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.08471641,  0.11643641,  0.12005978, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [ 0.46113292,  0.72465685,  0.9415924 , ...,  1.        ,
         0.        ,  0.        ],
       [ 0.96302159,  1.07945211,  1.52840142, ...,  1.        ,
         0.        ,  0.        ],
       [-1.21182931, -0.84657929, -0.70147285, ...,  1.        ,
         0.        ,  0.        ]])

In [17]:
X_train_df = pd.DataFrame(X_train_processed)
X_train_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-1.546422,-1.55617,-1.053558,-1.258947,-1.260633,-1.337852,-1.212341,0.0,0.0,1.0
1,0.795725,0.521917,0.706869,0.605245,0.789463,0.749584,0.409117,0.0,0.0,1.0
2,0.252013,0.319177,0.354783,0.37885,0.602065,0.040129,0.172355,1.0,0.0,0.0
3,1.172142,0.927397,0.824231,1.234461,1.277152,1.490874,0.940037,1.0,0.0,0.0
4,-1.462774,-1.4548,-1.17092,-1.233452,-1.177094,-1.205966,-1.212341,0.0,0.0,1.0


In [18]:
X_train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3132 entries, 0 to 3131
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       3132 non-null   float64
 1   1       3132 non-null   float64
 2   2       3132 non-null   float64
 3   3       3132 non-null   float64
 4   4       3132 non-null   float64
 5   5       3132 non-null   float64
 6   6       3132 non-null   float64
 7   7       3132 non-null   float64
 8   8       3132 non-null   float64
 9   9       3132 non-null   float64
dtypes: float64(10)
memory usage: 244.8 KB


In [22]:
datos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4176 entries, 0 to 4175
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Sex             4176 non-null   object 
 1   Lenght          4176 non-null   float64
 2   Diameter        4176 non-null   float64
 3   Height          4176 non-null   float64
 4   Whole weight    4176 non-null   float64
 5   Shucked weight  4176 non-null   float64
 6   Viscera weight  4176 non-null   float64
 7   Shell weight    4176 non-null   float64
 8   Rings           4176 non-null   int64  
dtypes: float64(7), int64(1), object(1)
memory usage: 293.8+ KB
