# Preparação do dataset
Preparação do dataset original [Diamonds](https://www.kaggle.com/shivam2503/diamonds) para utilização no modelo de regressão linear.

In [1]:
import mxnet as mx
import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype

In [2]:
traindata = pd.read_csv('diamonds.csv', header=0)

In [3]:
display(traindata)

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
5,6,0.24,Very Good,J,VVS2,62.8,57.0,336,3.94,3.96,2.48
6,7,0.24,Very Good,I,VVS1,62.3,57.0,336,3.95,3.98,2.47
7,8,0.26,Very Good,H,SI1,61.9,55.0,337,4.07,4.11,2.53
8,9,0.22,Fair,E,VS2,65.1,61.0,337,3.87,3.78,2.49
9,10,0.23,Very Good,H,VS1,59.4,61.0,338,4.00,4.05,2.39


In [4]:
traindata.cut = pd.Categorical(traindata.cut)
traindata['cut_code'] = traindata.cut.cat.codes

In [5]:
traindata['carat'].corr(traindata['price'])


0.9215913011934779

In [6]:
traindata.color = pd.Categorical(traindata.color)
traindata['color_code'] = traindata.color.cat.codes

In [7]:
traindata['color_code'].corr(traindata['price'])


0.17251092815346727

In [8]:
traindata['cut_code'].corr(traindata['price'])

0.039860290913308664

In [9]:
traindata.cut


0            Ideal
1          Premium
2             Good
3          Premium
4             Good
5        Very Good
6        Very Good
7        Very Good
8             Fair
9        Very Good
10            Good
11           Ideal
12         Premium
13           Ideal
14         Premium
15         Premium
16           Ideal
17            Good
18            Good
19       Very Good
20            Good
21       Very Good
22       Very Good
23       Very Good
24       Very Good
25       Very Good
26         Premium
27       Very Good
28       Very Good
29       Very Good
           ...    
53910      Premium
53911      Premium
53912      Premium
53913         Good
53914         Good
53915        Ideal
53916         Good
53917    Very Good
53918      Premium
53919        Ideal
53920    Very Good
53921    Very Good
53922    Very Good
53923        Ideal
53924        Ideal
53925        Ideal
53926        Ideal
53927         Good
53928      Premium
53929        Ideal
53930      Premium
53931      P

### Converter coluna 'clarity' em categorias

In [10]:
traindata.clarity.unique()

array(['SI2', 'SI1', 'VS1', 'VS2', 'VVS2', 'VVS1', 'I1', 'IF'],
      dtype=object)

In [11]:
clarity_types  = CategoricalDtype(categories=['I1', 'SI2', 'SI1', 'VS2', 'VS1', 
                                              'VVS2', 'VVS1', 'IF'], ordered=True)

In [12]:
traindata.clarity = traindata.clarity.astype(clarity_types)

In [13]:
traindata.clarity.unique()

[SI2, SI1, VS1, VS2, VVS2, VVS1, I1, IF]
Categories (8, object): [I1 < SI2 < SI1 < VS2 < VS1 < VVS2 < VVS1 < IF]

In [14]:
traindata['clarity_code'] = traindata.clarity.cat.codes

In [15]:
traindata[['clarity', 'clarity_code']]


Unnamed: 0,clarity,clarity_code
0,SI2,1
1,SI1,2
2,VS1,4
3,VS2,3
4,SI2,1
5,VVS2,5
6,VVS1,6
7,SI1,2
8,VS2,3
9,VS1,4


### Converter coluna 'cut' em categorias 

In [17]:
traindata.cut.unique()

[Ideal, Premium, Good, Very Good, Fair]
Categories (5, object): [Ideal, Premium, Good, Very Good, Fair]

In [18]:
cut_types  = CategoricalDtype(categories=['Fair', 'Good', 'Very Good', 'Premium', 'Ideal'], ordered=True)

In [19]:
traindata.cut = traindata.cut.astype(cut_types)

In [20]:
traindata.cut.unique()

[Ideal, Premium, Good, Very Good, Fair]
Categories (5, object): [Fair < Good < Very Good < Premium < Ideal]

In [21]:
traindata['cut_code'] = traindata.cut.cat.codes

In [22]:
traindata[['cut','cut_code']]

Unnamed: 0,cut,cut_code
0,Ideal,4
1,Premium,3
2,Good,1
3,Premium,3
4,Good,1
5,Very Good,2
6,Very Good,2
7,Very Good,2
8,Fair,0
9,Very Good,2


### Converter coluna 'color' em categorias

In [24]:
traindata.color.unique()

[E, I, J, H, F, G, D]
Categories (7, object): [E, I, J, H, F, G, D]

In [25]:
color_types  = CategoricalDtype(categories=['J', 'I', 'H', 'G', 'F', 'E', 'D'], ordered=True)

In [26]:
traindata.color = traindata.color.astype(color_types)

In [27]:
traindata.color.unique()

[E, I, J, H, F, G, D]
Categories (7, object): [J < I < H < G < F < E < D]

In [28]:
traindata['color_code'] = traindata.color.cat.codes

In [30]:
traindata[['color','color_code']]

Unnamed: 0,color,color_code
0,E,5
1,E,5
2,E,5
3,I,1
4,J,0
5,J,0
6,I,1
7,H,2
8,E,5
9,H,2


### Análise de correlação com o preço

In [16]:
traindata['clarity_code'].corr(traindata['price'])

-0.14680007107894996

In [23]:
traindata['cut_code'].corr(traindata['price'])

-0.05349066086810445

In [31]:
traindata['color_code'].corr(traindata['price'])

-0.17251092815346727

In [36]:
traindata['carat'].corr(traindata['price'])

0.9215913011934779

In [39]:
traindata['z'].corr(traindata['price'])

0.8612494438514481

## Gerar dataset para o modelo

In [34]:
display(traindata)

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,cut_code,color_code,clarity_code
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43,4,5,1
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31,3,5,2
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31,1,5,4
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63,3,1,3
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75,1,0,1
5,6,0.24,Very Good,J,VVS2,62.8,57.0,336,3.94,3.96,2.48,2,0,5
6,7,0.24,Very Good,I,VVS1,62.3,57.0,336,3.95,3.98,2.47,2,1,6
7,8,0.26,Very Good,H,SI1,61.9,55.0,337,4.07,4.11,2.53,2,2,2
8,9,0.22,Fair,E,VS2,65.1,61.0,337,3.87,3.78,2.49,0,5,3
9,10,0.23,Very Good,H,VS1,59.4,61.0,338,4.00,4.05,2.39,2,2,4


In [45]:
new_dataset = traindata[['carat', 'depth', 'table', 'x', 'y', 'z', 'cut_code', 'color_code', 'clarity_code', 'price']]

In [46]:
new_dataset.to_csv('diamond_prices.csv', index=False)