## Importação dos pacotes

In [1]:
# importar pacotes necessários
import numpy as np
import pandas as pd

In [2]:
# definir parâmetros extras
pd.set_option('precision', 3)
pd.set_option('display.max_columns', 100)

## Carga dos dados

In [3]:
# carregar arquivo de dados de treino
data = pd.read_csv('abalone-train.csv', index_col='id')

# mostrar alguns exemplos de registros
data.head()

Unnamed: 0_level_0,sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2758,M,0.535,0.43,0.155,0.784,0.329,0.169,0.245,10
1384,F,0.63,0.485,0.17,1.321,0.595,0.345,0.345,9
1131,M,0.565,0.435,0.15,0.99,0.58,0.182,0.206,8
3726,I,0.5,0.395,0.145,0.786,0.332,0.181,0.245,8
3445,I,0.495,0.4,0.145,0.578,0.255,0.131,0.165,8


In [4]:
# quantas linhas e colunas existem?
data.shape

(2784, 9)

## Análise dos dados

In [5]:
# quais são as colunas e respectivos tipos de dados?
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2784 entries, 2758 to 852
Data columns (total 9 columns):
sex               2784 non-null object
length            2784 non-null float64
diameter          2784 non-null float64
height            2784 non-null float64
whole_weight      2784 non-null float64
shucked_weight    2784 non-null float64
viscera_weight    2784 non-null float64
shell_weight      2784 non-null float64
rings             2784 non-null int64
dtypes: float64(7), int64(1), object(1)
memory usage: 206.6+ KB


In [6]:
# existem colunas com dados nulos?
data[data.columns[data.isnull().any()]].isnull().sum()

Series([], dtype: float64)

In [7]:
# sumário estatístico das características numéricas
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
length,2784.0,0.522,0.12,0.075,0.45,0.54,0.61,0.815
diameter,2784.0,0.406,0.099,0.055,0.345,0.42,0.48,0.65
height,2784.0,0.139,0.039,0.0,0.11,0.14,0.165,0.515
whole_weight,2784.0,0.817,0.484,0.002,0.435,0.785,1.136,2.78
shucked_weight,2784.0,0.355,0.219,0.001,0.182,0.332,0.498,1.349
viscera_weight,2784.0,0.178,0.109,0.0005,0.091,0.168,0.247,0.76
shell_weight,2784.0,0.235,0.136,0.0015,0.126,0.23,0.322,0.885
rings,2784.0,9.936,3.241,1.0,8.0,10.0,11.0,29.0


In [8]:
# quais as correlações entre as características numéricas?
data.corr()

Unnamed: 0,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
length,1.0,0.987,0.89,0.926,0.899,0.902,0.903,0.556
diameter,0.987,1.0,0.896,0.926,0.895,0.9,0.91,0.573
height,0.89,0.896,1.0,0.881,0.832,0.858,0.882,0.594
whole_weight,0.926,0.926,0.881,1.0,0.97,0.97,0.956,0.536
shucked_weight,0.899,0.895,0.832,0.97,1.0,0.935,0.886,0.419
viscera_weight,0.902,0.9,0.858,0.97,0.935,1.0,0.914,0.501
shell_weight,0.903,0.91,0.882,0.956,0.886,0.914,1.0,0.623
rings,0.556,0.573,0.594,0.536,0.419,0.501,0.623,1.0


In [9]:
# show variable correlation which is more than 0.6 (positive or negative)
corr = data.corr()
corr[corr != 1][abs(corr) > 0.6].dropna(how='all', axis=1).dropna(how='all', axis=0)

Unnamed: 0,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
length,,0.987,0.89,0.926,0.899,0.902,0.903,
diameter,0.987,,0.896,0.926,0.895,0.9,0.91,
height,0.89,0.896,,0.881,0.832,0.858,0.882,
whole_weight,0.926,0.926,0.881,,0.97,0.97,0.956,
shucked_weight,0.899,0.895,0.832,0.97,,0.935,0.886,
viscera_weight,0.902,0.9,0.858,0.97,0.935,,0.914,
shell_weight,0.903,0.91,0.882,0.956,0.886,0.914,,0.623
rings,,,,,,,0.623,


In [10]:
data.groupby('rings').mean()

Unnamed: 0_level_0,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight
rings,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.075,0.055,0.01,0.002,0.001,0.0005,0.002
3,0.175,0.128,0.041,0.029,0.012,0.006042,0.009
4,0.223,0.165,0.054,0.06,0.025,0.01296,0.018
5,0.292,0.215,0.072,0.134,0.069,0.02911,0.039
6,0.365,0.275,0.09,0.265,0.119,0.05719,0.077
7,0.423,0.322,0.106,0.397,0.184,0.08532,0.111
8,0.497,0.384,0.127,0.638,0.292,0.1377,0.178
9,0.545,0.423,0.143,0.849,0.389,0.1873,0.234
10,0.569,0.444,0.153,0.978,0.431,0.2161,0.275
11,0.593,0.466,0.16,1.112,0.49,0.2444,0.311


In [11]:
numeric_feats = data.dtypes[data.dtypes != "object"].index
numeric_feats

Index(['length', 'diameter', 'height', 'whole_weight', 'shucked_weight',
       'viscera_weight', 'shell_weight', 'rings'],
      dtype='object')

In [13]:
data.head(10).T

id,2758,1384,1131,3726,3445,817,2742,3757,2609,2728
sex,M,F,M,I,I,I,I,I,F,I
length,0.535,0.63,0.565,0.5,0.495,0.35,0.46,0.52,0.63,0.405
diameter,0.43,0.485,0.435,0.395,0.4,0.27,0.345,0.41,0.495,0.31
height,0.155,0.17,0.15,0.145,0.145,0.09,0.105,0.14,0.2,0.11
whole_weight,0.784,1.32,0.99,0.786,0.578,0.205,0.415,0.699,1.43,0.91
shucked_weight,0.329,0.595,0.58,0.332,0.255,0.075,0.187,0.34,0.659,0.416
viscera_weight,0.169,0.345,0.182,0.181,0.131,0.0575,0.087,0.129,0.336,0.207
shell_weight,0.245,0.345,0.206,0.245,0.165,0.062,0.11,0.195,0.38,0.0995
rings,10,9,8,8,8,6,8,10,11,8


In [14]:
data.isna().sum()

sex               0
length            0
diameter          0
height            0
whole_weight      0
shucked_weight    0
viscera_weight    0
shell_weight      0
rings             0
dtype: int64

In [15]:
data.head()

Unnamed: 0_level_0,sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2758,M,0.535,0.43,0.155,0.784,0.329,0.169,0.245,10
1384,F,0.63,0.485,0.17,1.321,0.595,0.345,0.345,9
1131,M,0.565,0.435,0.15,0.99,0.58,0.182,0.206,8
3726,I,0.5,0.395,0.145,0.786,0.332,0.181,0.245,8
3445,I,0.495,0.4,0.145,0.578,0.255,0.131,0.165,8


In [16]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
print(scaler.fit(data))

data_scaled = pd.DataFrame(scaler.transform(data), columns=data.columns, index=data.index)
data_scaled.head()

ValueError: could not convert string to float: 'M'