## Importação dos pacotes

In [1]:
# importar pacotes necessários
import numpy as np
import pandas as pd

In [2]:
# configure pandas display options
# NOTE: use the fully-qualified 'display.precision' key. The bare 'precision'
# pattern also matches 'styler.format.precision' on recent pandas releases,
# which makes set_option raise OptionError instead of applying the setting.
pd.set_option('display.precision', 4)
pd.set_option('display.max_columns', 100)

## Carga dos dados

In [3]:
# read the training data file; the 'id' column becomes the row index
data = pd.read_csv(
    'abalone-train.csv',
    index_col='id',
)

# preview the first five records
data.head(n=5)

Unnamed: 0_level_0,sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2758,M,0.535,0.43,0.155,0.7845,0.3285,0.169,0.245,10
1384,F,0.63,0.485,0.17,1.3205,0.5945,0.345,0.345,9
1131,M,0.565,0.435,0.15,0.99,0.5795,0.1825,0.206,8
3726,I,0.5,0.395,0.145,0.7865,0.332,0.1815,0.2455,8
3445,I,0.495,0.4,0.145,0.578,0.2545,0.1305,0.1645,8


In [4]:
# how many rows and columns are there? (displayed as a (rows, columns) tuple)
data.shape

(2784, 9)

## Análise dos dados

In [5]:
# which columns exist, and what are their data types?
# (info() also prints the non-null count per column and the memory usage)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2784 entries, 2758 to 852
Data columns (total 9 columns):
sex               2784 non-null object
length            2784 non-null float64
diameter          2784 non-null float64
height            2784 non-null float64
whole_weight      2784 non-null float64
shucked_weight    2784 non-null float64
viscera_weight    2784 non-null float64
shell_weight      2784 non-null float64
rings             2784 non-null int64
dtypes: float64(7), int64(1), object(1)
memory usage: 206.6+ KB


In [6]:
# are there columns with null values?
# build the null mask once, keep only the columns that contain at least one
# null, and count the nulls in each of them (empty result => no nulls at all)
(
    data
    .isnull()
    .loc[:, lambda null_mask: null_mask.any()]
    .sum()
)

Series([], dtype: float64)

In [7]:
# statistical summary of the numeric columns, one row per column
data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
length,2784.0,0.5218,0.1198,0.075,0.45,0.54,0.61,0.815
diameter,2784.0,0.4059,0.0988,0.055,0.345,0.42,0.48,0.65
height,2784.0,0.1387,0.0388,0.0,0.11,0.14,0.165,0.515
whole_weight,2784.0,0.8168,0.4837,0.002,0.4351,0.7853,1.136,2.7795
shucked_weight,2784.0,0.3548,0.2186,0.001,0.1819,0.332,0.4981,1.3485
viscera_weight,2784.0,0.1781,0.1086,0.0005,0.091,0.1675,0.2471,0.76
shell_weight,2784.0,0.2347,0.136,0.0015,0.1258,0.23,0.3221,0.885
rings,2784.0,9.9361,3.2405,1.0,8.0,10.0,11.0,29.0


In [8]:
# what are the pairwise correlations between the numeric columns?
# Select the numeric columns explicitly: 'sex' holds strings, and recent
# pandas versions raise on non-numeric columns instead of silently
# dropping them from corr().
data.select_dtypes(include='number').corr()

Unnamed: 0,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
length,1.0,0.9866,0.8896,0.9263,0.8987,0.9022,0.9029,0.5559
diameter,0.9866,1.0,0.8962,0.9264,0.8947,0.8999,0.9095,0.5728
height,0.8896,0.8962,1.0,0.8805,0.8318,0.8577,0.8821,0.5936
whole_weight,0.9263,0.9264,0.8805,1.0,0.9699,0.9695,0.9565,0.5359
shucked_weight,0.8987,0.8947,0.8318,0.9699,1.0,0.9347,0.8864,0.4188
viscera_weight,0.9022,0.8999,0.8577,0.9695,0.9347,1.0,0.9137,0.5012
shell_weight,0.9029,0.9095,0.8821,0.9565,0.8864,0.9137,1.0,0.6227
rings,0.5559,0.5728,0.5936,0.5359,0.4188,0.5012,0.6227,1.0


In [9]:
# show only the strong correlations (absolute value above the threshold)
# between distinct columns
CORR_THRESHOLD = 0.6

# numeric columns only: the string column 'sex' cannot be correlated and
# makes corr() raise on recent pandas versions
corr = data.select_dtypes(include='number').corr()

# hide the diagonal by position instead of comparing floats with 1, so that a
# pair of distinct columns with a perfect correlation is still reported
off_diagonal = ~np.eye(len(corr), dtype=bool)
strong_corr = corr.where((corr.abs() > CORR_THRESHOLD) & off_diagonal)

# drop the rows/columns that have no strong correlation left
strong_corr.dropna(how='all', axis=1).dropna(how='all', axis=0)

Unnamed: 0,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
length,,0.9866,0.8896,0.9263,0.8987,0.9022,0.9029,
diameter,0.9866,,0.8962,0.9264,0.8947,0.8999,0.9095,
height,0.8896,0.8962,,0.8805,0.8318,0.8577,0.8821,
whole_weight,0.9263,0.9264,0.8805,,0.9699,0.9695,0.9565,
shucked_weight,0.8987,0.8947,0.8318,0.9699,,0.9347,0.8864,
viscera_weight,0.9022,0.8999,0.8577,0.9695,0.9347,,0.9137,
shell_weight,0.9029,0.9095,0.8821,0.9565,0.8864,0.9137,,0.6227
rings,,,,,,,0.6227,


In [10]:
# mean of every numeric column for each ring count
# ('sex' is excluded up front: averaging a string column raises on recent
# pandas versions, whereas older versions dropped it silently)
data.select_dtypes(include='number').groupby('rings').mean()

Unnamed: 0_level_0,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight
rings,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.075,0.055,0.01,0.002,0.001,0.0005,0.0015
3,0.1746,0.1283,0.0408,0.0287,0.0118,0.006,0.0092
4,0.2231,0.1649,0.0537,0.06,0.0252,0.013,0.0183
5,0.2917,0.2151,0.0721,0.1342,0.069,0.0291,0.039
6,0.3649,0.275,0.0904,0.2647,0.1193,0.0572,0.077
7,0.4227,0.3222,0.1057,0.3972,0.1836,0.0853,0.1111
8,0.4973,0.3839,0.1272,0.6383,0.2922,0.1377,0.1781
9,0.5454,0.4234,0.1429,0.8489,0.3885,0.1873,0.2344
10,0.5689,0.4444,0.1526,0.9784,0.4314,0.2161,0.2753
11,0.5931,0.4659,0.1599,1.1121,0.4903,0.2444,0.3114


In [11]:
# names of the numeric columns
# (select by "is numeric" rather than "is not object", so boolean, datetime,
# categorical or dedicated string dtypes are never mistaken for numbers)
numeric_feats = data.select_dtypes(include='number').columns
numeric_feats

Index(['length', 'diameter', 'height', 'whole_weight', 'shucked_weight',
       'viscera_weight', 'shell_weight', 'rings'],
      dtype='object')

In [12]:
# first ten records, transposed so that each record is shown as a column
data.iloc[:10].transpose()

id,2758,1384,1131,3726,3445,817,2742,3757,2609,2728
sex,M,F,M,I,I,I,I,I,F,I
length,0.535,0.63,0.565,0.5,0.495,0.35,0.46,0.52,0.63,0.405
diameter,0.43,0.485,0.435,0.395,0.4,0.27,0.345,0.41,0.495,0.31
height,0.155,0.17,0.15,0.145,0.145,0.09,0.105,0.14,0.2,0.11
whole_weight,0.7845,1.321,0.99,0.7865,0.578,0.2055,0.415,0.699,1.425,0.91
shucked_weight,0.3285,0.5945,0.5795,0.332,0.2545,0.075,0.187,0.3395,0.659,0.416
viscera_weight,0.169,0.345,0.1825,0.1815,0.1305,0.0575,0.087,0.129,0.336,0.2075
shell_weight,0.245,0.345,0.206,0.2455,0.1645,0.062,0.11,0.1945,0.38,0.0995
rings,10,9,8,8,8,6,8,10,11,8


In [13]:
# number of missing values in each column
data.isnull().sum()

sex               0
length            0
diameter          0
height            0
whole_weight      0
shucked_weight    0
viscera_weight    0
shell_weight      0
rings             0
dtype: int64

In [14]:
# summary of the 'sex' column: count, number of distinct values, most
# frequent value and its frequency
data['sex'].describe()

count     2784
unique       3
top          M
freq       992
Name: sex, dtype: object

In [15]:
# preview the first five records again
data.head(n=5)

Unnamed: 0_level_0,sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2758,M,0.535,0.43,0.155,0.784,0.329,0.169,0.245,10
1384,F,0.63,0.485,0.17,1.321,0.595,0.345,0.345,9
1131,M,0.565,0.435,0.15,0.99,0.58,0.182,0.206,8
3726,I,0.5,0.395,0.145,0.786,0.332,0.181,0.245,8
3445,I,0.495,0.4,0.145,0.578,0.255,0.131,0.165,8
