## Importação dos pacotes

In [1]:
# importar pacotes necessários
import numpy as np
import pandas as pd

In [2]:
# Configure pandas display options.
# Use the fully qualified 'display.precision' key: the abbreviated 'precision'
# pattern is ambiguous on newer pandas (it also matches
# 'styler.format.precision') and raises OptionError there.
pd.set_option('display.precision', 3)
pd.set_option('display.max_columns', 100)

## Carga dos dados de entrada

### Dados originais de treino

In [3]:
# read the original training set, using the animal name as the row index
train_csv = 'zoo-train.csv'
data = pd.read_csv(train_csv, index_col='animal_name')

# preview the first records
data.head()

Unnamed: 0_level_0,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,class_type
animal_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
swan,n,y,y,n,y,y,n,n,y,y,n,n,2,y,n,y,2
tuatara,n,n,y,n,n,n,y,y,y,y,n,n,4,y,n,n,3
bass,n,n,y,n,n,y,y,y,y,n,n,y,0,y,n,n,4
hare,y,n,n,y,n,n,n,y,y,y,n,n,4,y,n,n,1
chub,n,n,y,n,n,y,y,y,y,n,n,y,0,y,n,n,4


In [4]:
# how many rows and columns are there? (shape is a (rows, columns) tuple)
data.shape

(101, 17)

### Dados adicionais de treino

In [5]:
# read the additional training set, using the animal name as the row index
extra_train_csv = 'zoo-train2.csv'
data2 = pd.read_csv(extra_train_csv, index_col='animal_name')

# preview the first records
data2.head()

Unnamed: 0_level_0,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,class_type
animal_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
boa,n,n,y,n,n,n,y,y,y,y,n,n,0,y,n,y,3
barracuda,n,n,y,n,n,y,y,y,y,n,n,y,0,y,n,y,4
mosquito,n,n,y,n,y,n,n,n,n,y,n,n,6,n,n,n,6
anchovy,n,n,y,n,n,y,n,n,y,n,n,y,0,y,n,n,4
spider,n,n,y,n,n,n,y,y,n,y,y,n,8,n,n,n,7


In [6]:
# how many rows and columns are there in the additional set?
data2.shape

(43, 17)

### Unir dados de treinamento

In [7]:
# Stack the original and the additional training sets row-wise.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported equivalent and keeps the animal_name index.
# NOTE: this rebinds `data`, so re-running this cell alone stacks data2 again;
# re-run from the load cells instead.
data = pd.concat([data, data2])

# show the combined size
print(data.shape)

# preview the last records (these come from the additional set)
data.tail()

(144, 17)


Unnamed: 0_level_0,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,class_type
animal_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
gecko,n,n,y,n,n,n,n,y,y,y,n,n,4,y,y,n,3
gharial,n,n,y,n,n,y,y,y,y,y,n,n,4,y,n,y,3
trout,n,n,y,n,n,y,n,n,y,n,n,y,0,y,n,y,4
skink,n,n,y,n,n,n,n,y,y,y,n,n,4,y,n,n,3
palmetto,n,n,y,n,y,n,y,n,n,y,n,n,6,n,n,n,6


## Analisar dados de treinamento

In [8]:
# which columns exist, and what are their dtypes and non-null counts?
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 144 entries, swan to palmetto
Data columns (total 17 columns):
hair          144 non-null object
feathers      144 non-null object
eggs          144 non-null object
milk          144 non-null object
airborne      144 non-null object
aquatic       144 non-null object
predator      144 non-null object
toothed       144 non-null object
backbone      144 non-null object
breathes      144 non-null object
venomous      144 non-null object
fins          144 non-null object
legs          144 non-null int64
tail          144 non-null object
domestic      144 non-null object
catsize       144 non-null object
class_type    144 non-null int64
dtypes: int64(2), object(15)
memory usage: 11.2+ KB


In [9]:
# are there columns containing null values?
# flag every column that has at least one null ...
has_nulls = data.isnull().any()
cols_with_nulls = data.columns[has_nulls]

# ... and count the nulls in just those columns (empty result = no nulls)
data[cols_with_nulls].isnull().sum()

Series([], dtype: float64)

## Transformações nos dados

In [10]:
# the animal class is a label, not a quantity: store it as a categorical
data = data.astype({'class_type': 'category'})

In [11]:
# Convert the y/n attribute columns to 0/1.
#
# An explicit mapping is used instead of categorical codes: `.cat.codes`
# numbers the categories by their sorted *observed* values, so a column that
# happened to contain only 'y' would be encoded as 0, and a missing value
# would silently become -1.

objcols = data.select_dtypes(['object']).columns
print(objcols)

yes_no_codes = {'n': 0, 'y': 1}
for col in objcols:
    encoded = data[col].map(yes_no_codes)
    # anything other than 'y'/'n' maps to NaN: fail loudly instead of guessing
    if encoded.isnull().any():
        unexpected = sorted(set(data.loc[encoded.isnull(), col].astype(str)))
        raise ValueError(
            "column %r has values other than 'y'/'n': %s" % (col, unexpected)
        )
    # int8 keeps the compact dtype the categorical codes used to produce
    data[col] = encoded.astype('int8')

Index(['hair', 'feathers', 'eggs', 'milk', 'airborne', 'aquatic', 'predator',
       'toothed', 'backbone', 'breathes', 'venomous', 'fins', 'tail',
       'domestic', 'catsize'],
      dtype='object')


In [12]:
# re-check the column dtypes and non-null counts
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 144 entries, swan to palmetto
Data columns (total 17 columns):
hair          144 non-null int8
feathers      144 non-null int8
eggs          144 non-null int8
milk          144 non-null int8
airborne      144 non-null int8
aquatic       144 non-null int8
predator      144 non-null int8
toothed       144 non-null int8
backbone      144 non-null int8
breathes      144 non-null int8
venomous      144 non-null int8
fins          144 non-null int8
legs          144 non-null int64
tail          144 non-null int8
domestic      144 non-null int8
catsize       144 non-null int8
class_type    144 non-null category
dtypes: category(1), int64(1), int8(15)
memory usage: 4.2+ KB


In [13]:
# preview the last records of the frame
data.tail()

Unnamed: 0_level_0,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,class_type
animal_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
gecko,0,0,1,0,0,0,0,1,1,1,0,0,4,1,1,0,3
gharial,0,0,1,0,0,1,1,1,1,1,0,0,4,1,0,1,3
trout,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,1,4
skink,0,0,1,0,0,0,0,1,1,1,0,0,4,1,0,0,3
palmetto,0,0,1,0,1,0,1,0,0,1,0,0,6,0,0,0,6


## Outras análises nos dados

In [14]:
# statistical summary of the numeric features
data.describe()

Unnamed: 0,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize
count,144.0,144.0,144.0,144.0,144.0,144.0,144.0,144.0,144.0,144.0,144.0,144.0,144.0,144.0,144.0,144.0
mean,0.306,0.139,0.708,0.285,0.215,0.389,0.479,0.556,0.75,0.785,0.09,0.167,2.951,0.667,0.125,0.417
std,0.462,0.347,0.456,0.453,0.412,0.489,0.501,0.499,0.435,0.412,0.288,0.374,2.21,0.473,0.332,0.495
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.75,1.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,4.0,1.0,0.0,0.0
75%,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,4.0,1.0,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,8.0,1.0,1.0,1.0


In [15]:
# What are the pairwise correlations between the numeric features?
# Select the numeric columns explicitly: older pandas silently dropped the
# categorical class_type column, but since pandas 2.0 corr() no longer
# restricts itself to numeric columns by default.
data.select_dtypes(['number']).corr()

Unnamed: 0,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize
hair,1.0,-0.266,-0.835,0.884,-0.091,-0.405,-0.033,0.411,0.209,0.347,-0.104,-0.216,0.261,0.117,0.16,0.357
feathers,-0.266,1.0,0.258,-0.253,0.571,-0.073,-0.023,-0.449,0.232,0.21,-0.127,-0.18,-0.173,0.284,0.03,-0.095
eggs,-0.835,0.258,1.0,-0.949,0.262,0.324,-0.088,-0.543,-0.335,-0.299,0.096,0.123,-0.125,-0.259,-0.127,-0.418
milk,0.884,-0.253,-0.949,1.0,-0.256,-0.314,0.073,0.533,0.364,0.33,-0.199,-0.117,0.119,0.25,0.134,0.466
airborne,-0.091,0.571,0.262,-0.256,1.0,-0.245,-0.198,-0.518,-0.205,0.274,0.012,-0.234,0.173,-0.096,0.006,-0.34
aquatic,-0.405,-0.073,0.324,-0.314,-0.245,1.0,0.176,0.054,0.164,-0.622,-0.003,0.561,-0.39,0.05,-0.172,0.048
predator,-0.033,-0.023,-0.088,0.073,-0.198,0.176,1.0,0.242,0.136,-0.106,0.183,0.093,-0.055,0.147,-0.236,0.204
toothed,0.411,-0.449,-0.543,0.533,-0.518,0.054,0.242,1.0,0.613,0.076,-0.011,0.175,-0.128,0.376,0.127,0.274
backbone,0.209,0.232,-0.335,0.364,-0.205,0.164,0.136,0.613,1.0,0.088,-0.21,0.258,-0.428,0.782,0.17,0.39
breathes,0.347,0.21,-0.299,0.33,0.274,-0.622,-0.106,0.076,0.088,1.0,-0.071,-0.672,0.479,-0.012,0.147,-0.003


In [22]:
# Show only the strong pairwise correlations (|r| above the threshold,
# positive or negative).
CORR_THRESHOLD = 0.7

# correlate numeric columns only (the categorical class_type is not a quantity)
corr = data.select_dtypes(['number']).corr()

# Hide the diagonal by position instead of testing `corr != 1`: a value test
# would also hide genuinely perfect correlations between two different
# features and depends on exact float equality.
off_diagonal = ~np.eye(len(corr), dtype=bool)
strong = corr.where((corr.abs() > CORR_THRESHOLD) & off_diagonal)

# drop the features that have no strong correlation with any other feature
strong.dropna(how='all', axis=1).dropna(how='all', axis=0)

Unnamed: 0,hair,eggs,milk,backbone,tail
hair,,-0.835,0.884,,
eggs,-0.835,,-0.949,,
milk,0.884,-0.949,,,
backbone,,,,,0.782
tail,,,,0.782,


In [23]:
# average value of every feature within each animal class
by_class = data.groupby('class_type')
by_class.mean()

Unnamed: 0_level_0,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize
class_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,0.951,0.0,0.024,1.0,0.049,0.146,0.537,0.976,1.0,1.0,0.0,0.098,3.366,0.854,0.195,0.78
2,0.0,1.0,1.0,0.0,0.8,0.3,0.45,0.0,1.0,1.0,0.0,0.0,2.0,1.0,0.15,0.3
3,0.0,0.0,0.941,0.0,0.0,0.294,0.706,0.882,1.0,0.941,0.235,0.0,2.588,1.0,0.235,0.529
4,0.0,0.0,1.0,0.0,0.0,1.0,0.5,0.7,1.0,0.0,0.05,1.0,0.0,1.0,0.05,0.5
5,0.0,0.0,1.0,0.0,0.0,1.0,0.3,1.0,1.0,1.0,0.1,0.0,3.8,0.3,0.1,0.0
6,0.25,0.0,1.0,0.0,0.65,0.0,0.2,0.0,0.0,1.0,0.15,0.0,6.0,0.0,0.05,0.0
7,0.0,0.0,0.938,0.0,0.0,0.562,0.562,0.062,0.0,0.375,0.25,0.0,2.812,0.062,0.0,0.188


## Gravar dados consolidados

In [16]:
# write the consolidated training set to a single CSV file
consolidated_csv = 'zoo-train-all.csv'
data.to_csv(consolidated_csv)

In [17]:
# reload the consolidated training set written above; CSV does not keep
# dtypes, so class_type has to be cast back to a categorical
data = (
    pd.read_csv('zoo-train-all.csv', index_col='animal_name')
    .astype({'class_type': 'category'})
)

# preview the first records
data.head()

Unnamed: 0_level_0,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,class_type
animal_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
swan,0,1,1,0,1,1,0,0,1,1,0,0,2,1,0,1,2
tuatara,0,0,1,0,0,0,1,1,1,1,0,0,4,1,0,0,3
bass,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,4
hare,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,0,1
chub,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,4
