# Importando Bibliotecas

In [2]:
import pandas as pd

# Raise pandas' display limits so wide/long frames are shown without truncation.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

# Carregando Conjunto de Dados

In [3]:
# Path of the CSV file to load (relative to the notebook's working directory).
url = "imports-85.csv"

# Column names are supplied explicitly because the file is read with header=None.
col_names = [
    'symboling', 'normalized-losses', 'fuel-type', 'aspiration', 'num-of-doors',
    'body-style', 'drive-wheels', 'engine-location', 'wheel-base',
    'length', 'width', 'height', 'curb-weight', 'engine-type',
    'num-of-cylinders', 'engine-size',
    'fuel-system', 'bore', 'stroke', 'compression-ratio', 'horsepower',
    'peak-rpm', 'city-mpg', 'highway-mpg', 'price',
]

# `sep` must be passed by keyword: positional arguments after the path were
# deprecated in pandas 1.3 and raise TypeError from pandas 2.0 onwards.
# na_values="?" makes any "?" cell parse as NaN.
df_car = pd.read_csv(url, sep=',', names=col_names, na_values="?", header=None)

## Vendo número de linhas e colunas

In [4]:
# (number of rows, number of columns) of the loaded frame
df_car.shape

(205, 25)

## Examinando os dados

In [5]:
# Preview the first five rows of the loaded frame
df_car.head(5)

Unnamed: 0,symboling,normalized-losses,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,width,height,curb-weight,engine-type,num-of-cylinders,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,,gas,std,two,hatchback,rwd,front,94.5,171.2,65.5,52.4,2823,ohcv,six,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,54.3,2337,ohc,four,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,54.3,2824,ohc,five,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0


### Obtendo informações das colunas

In [6]:
# Per-column non-null counts and dtypes; columns with fewer non-null entries
# than rows contain missing values
df_car.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 25 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          205 non-null    int64  
 1   normalized-losses  164 non-null    float64
 2   fuel-type          205 non-null    object 
 3   aspiration         205 non-null    object 
 4   num-of-doors       203 non-null    object 
 5   body-style         205 non-null    object 
 6   drive-wheels       205 non-null    object 
 7   engine-location    205 non-null    object 
 8   wheel-base         205 non-null    float64
 9   length             205 non-null    float64
 10  width              205 non-null    float64
 11  height             205 non-null    float64
 12  curb-weight        205 non-null    int64  
 13  engine-type        205 non-null    object 
 14  num-of-cylinders   205 non-null    object 
 15  engine-size        205 non-null    int64  
 16  fuel-system        205 non

### Eliminando colunas não utilizadas

In [7]:
# List the available column names before selecting a subset
df_car.columns

Index(['symboling', 'normalized-losses', 'fuel-type', 'aspiration', 'num-of-doors', 'body-style', 'drive-wheels', 'engine-location', 'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type', 'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke', 'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price'], dtype='object')

In [8]:
# Keep only the four categorical columns that are encoded in the cells below.
# .copy() yields an independent frame rather than a selection of the original.
selected_columns = ['aspiration', 'num-of-doors', 'drive-wheels', 'num-of-cylinders']
df_car = df_car[selected_columns].copy()

### Verificando colunas selecionadas

In [9]:
# Confirm that only the selected columns remain
df_car.head()

Unnamed: 0,aspiration,num-of-doors,drive-wheels,num-of-cylinders
0,std,two,rwd,four
1,std,two,rwd,four
2,std,two,rwd,six
3,std,four,fwd,four
4,std,four,4wd,five


# Codificação de Componentes Ordinais

### Obtendo informações sobre o dataframe

In [10]:
# Check dtypes and non-null counts of the reduced frame
df_car.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   aspiration        205 non-null    object
 1   num-of-doors      203 non-null    object
 2   drive-wheels      205 non-null    object
 3   num-of-cylinders  205 non-null    object
dtypes: object(4)
memory usage: 6.5+ KB


### Descobrindo valores nas features ordinais com a função value_counts

In [11]:
# Distinct values of num-of-doors and how often each occurs.
# value_counts() skips NaN by default, so missing entries are not listed.
df_car['num-of-doors'].value_counts()

four    114
two      89
Name: num-of-doors, dtype: int64

### Criando mapeamento

In [12]:
# Spelled-out door counts -> their integer values.
door_mapper = dict(two=2, four=4)

### Utilizando replace do pandas para gerar nova coluna numérica

In [13]:
# new numeric column derived from num-of-doors

In [15]:
# Encode num-of-doors as numbers in a new 'doors' column.
# Series.map is used instead of Series.replace: replace on an object column
# relies on silent downcasting of the result, which emits a FutureWarning in
# pandas 2.2 and is removed in pandas 3.0 (the column would stay object dtype).
# map builds a numeric column directly; missing or unmapped values become NaN.
df_car['doors'] = df_car["num-of-doors"].map(door_mapper)

### Exibindo o dataframe

In [16]:
# Inspect the frame with the new 'doors' column
df_car.head()

Unnamed: 0,aspiration,num-of-doors,drive-wheels,num-of-cylinders,doors
0,std,two,rwd,four,2.0
1,std,two,rwd,four,2.0
2,std,two,rwd,six,2.0
3,std,four,fwd,four,4.0
4,std,four,4wd,five,4.0


#### Repetindo o processo com a var num-of-cylinders

In [17]:
# Distinct values of num-of-cylinders; each one needs an entry in the mapping
df_car['num-of-cylinders'].value_counts()

four      159
six        24
five       11
eight       5
two         4
twelve      1
three       1
Name: num-of-cylinders, dtype: int64

In [18]:
# Spelled-out cylinder counts -> their integer values, in ascending order.
cylinder_mapper = dict(
    two=2,
    three=3,
    four=4,
    five=5,
    six=6,
    eight=8,
    twelve=12,
)

In [19]:
# Encode num-of-cylinders as integers in a new 'cylinders' column.
# Series.map is used instead of Series.replace: replace on an object column
# relies on silent downcasting of the result, which emits a FutureWarning in
# pandas 2.2 and is removed in pandas 3.0 (the column would stay object dtype).
# Any value missing from cylinder_mapper becomes NaN rather than passing through.
df_car['cylinders'] = df_car['num-of-cylinders'].map(cylinder_mapper)

In [20]:
# Inspect the frame with the new 'cylinders' column
df_car.head()

Unnamed: 0,aspiration,num-of-doors,drive-wheels,num-of-cylinders,doors,cylinders
0,std,two,rwd,four,2.0,4
1,std,two,rwd,four,2.0,4
2,std,two,rwd,six,2.0,6
3,std,four,fwd,four,4.0,4
4,std,four,4wd,five,4.0,5


## Codificando Dados Categóricos Não-Ordinais

In [21]:
# we will use the get_dummies function

### Verificando o atributo drive-wheels

In [22]:
# Distinct categories of drive-wheels; get_dummies creates one column per category
df_car['drive-wheels'].value_counts()

fwd    120
rwd     76
4wd      9
Name: drive-wheels, dtype: int64

### Utilizando o get_dummies para adicionar novos componentes binários ao dataframe

In [23]:
# One-hot encode drive-wheels: the column is replaced by one indicator column
# per category (drive-wheels_<category>).
# dtype=int keeps the indicators as 0/1; since pandas 2.0 the default is bool,
# which would show True/False instead of the 0/1 values used in this notebook.
df_car = pd.get_dummies(df_car, columns=['drive-wheels'], dtype=int)

In [25]:
# Inspect the new drive-wheels_* indicator columns
df_car.head()

Unnamed: 0,aspiration,num-of-doors,num-of-cylinders,doors,cylinders,drive-wheels_4wd,drive-wheels_fwd,drive-wheels_rwd
0,std,two,four,2.0,4,0,0,1
1,std,two,four,2.0,4,0,0,1
2,std,two,six,2.0,6,0,0,1
3,std,four,four,4.0,4,0,1,0
4,std,four,five,4.0,5,1,0,0


### Utilizando o parâmetro drop_first para representar um atributo binário com uma única coluna

In [26]:
# We will work on the aspiration attribute, which has two possible values: std or turbo

In [27]:
# Distinct categories of aspiration and how often each occurs
df_car['aspiration'].value_counts()

std      168
turbo     37
Name: aspiration, dtype: int64

In [28]:
# One-hot encode aspiration. drop_first=True drops the indicator of the first
# category, so the two-valued attribute is represented by a single column.
# dtype=int keeps the indicator as 0/1; since pandas 2.0 the default is bool,
# which would show True/False instead of the 0/1 values used in this notebook.
df_car = pd.get_dummies(df_car, columns=['aspiration'], drop_first=True, dtype=int)

In [29]:
# Inspect the final encoded frame
df_car.head()

Unnamed: 0,num-of-doors,num-of-cylinders,doors,cylinders,drive-wheels_4wd,drive-wheels_fwd,drive-wheels_rwd,aspiration_turbo
0,two,four,2.0,4,0,0,1,0
1,two,four,2.0,4,0,0,1,0
2,two,six,2.0,6,0,0,1,0
3,four,four,4.0,4,0,1,0,0
4,four,five,4.0,5,1,0,0,0
