# 3. Cols volume and density

In [11]:
# Data handling
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

# Standarization
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from scipy import stats

# Encoding
from sklearn.preprocessing import LabelEncoder 
from sklearn.preprocessing import OneHotEncoder  
from sklearn.preprocessing import OrdinalEncoder

# Plots
import matplotlib.pyplot as plt
from matplotlib import style
import matplotlib.ticker as ticker
import seaborn as sns

# Preprocessing and modeling
plt.rcParams['figure.figsize'] = (10,8)
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score 
from sklearn.metrics import precision_score 
from sklearn.metrics import recall_score 
from sklearn.metrics import f1_score 
from sklearn.metrics import cohen_kappa_score
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor 
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn import tree

# Pepinillo
import pickle

# Warning configuration
import warnings
warnings.filterwarnings('ignore')

In [12]:
# Reads csv
df_train = pd.read_csv('../data/train.csv', index_col=0)
df_train.reset_index(drop = False, inplace = True)
df_train.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,0.3,Premium,D,SI2,62.4,58.0,4.31,4.28,2.68,6.353
1,1,1.01,Ideal,E,VVS2,62.7,56.0,6.42,6.46,4.04,9.183
2,2,0.72,Ideal,F,VS2,61.8,59.0,5.71,5.74,3.54,7.983
3,3,1.08,Very Good,G,SI2,63.2,57.0,6.54,6.5,4.12,8.371
4,4,0.36,Premium,G,VS1,62.3,59.0,4.5,4.55,2.82,6.588


In [13]:
df_test = pd.read_csv('../data/test.csv', index_col=0)
df_test.reset_index(drop = False, inplace = True)
df_test.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
0,0,0.32,Ideal,I,SI1,60.5,58.0,4.43,4.49,2.7
1,1,1.24,Premium,I,SI1,62.9,60.0,6.8,6.74,4.26
2,2,1.66,Premium,D,SI1,62.0,59.0,7.55,7.6,4.7
3,3,0.75,Premium,D,SI2,60.6,56.0,5.94,5.9,3.59
4,4,1.5,Fair,E,SI2,64.8,55.0,7.26,7.15,4.67


### Features
- id: only for test & sample submission files, id for prediction sample identification
- price: price in USD
- carat: weight of the diamond
- cut: quality of the cut (Fair, Good, Very Good, Premium, Ideal)
- color: diamond colour
- clarity: a measurement of how clear the diamond is
- x: length in mm
- y: width in mm
- z: depth in mm
- depth: total depth percentage = z / mean(x, y) = 2 * z / (x + y) (43--79)
- table: width of top of diamond relative to widest point (43--95)

### Info:
- 1 carat = 200mg

In [14]:
df_density = df_train.copy()
df_density['weight'] = df_density.carat * 200 # mg
df_density['volume'] = df_density.x * df_density.y * df_density.z # mm3
df_density['volume'] = df_density['volume'] / 3 # aprox for a diamond
df_density['density'] = df_density['weight']/df_density['volume'] #mg/mm3
df_density['density'] = df_density['density'] * 1000 #kg/m3
df_density.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price,weight,volume,density
0,0,0.3,Premium,D,SI2,62.4,58.0,4.31,4.28,2.68,6.353,60.0,16.479141,3640.966406
1,1,1.01,Ideal,E,VVS2,62.7,56.0,6.42,6.46,4.04,9.183,202.0,55.850576,3616.793496
2,2,0.72,Ideal,F,VS2,61.8,59.0,5.71,5.74,3.54,7.983,144.0,38.674972,3723.338184
3,3,1.08,Very Good,G,SI2,63.2,57.0,6.54,6.5,4.12,8.371,216.0,58.3804,3699.871875
4,4,0.36,Premium,G,VS1,62.3,59.0,4.5,4.55,2.82,6.588,72.0,19.2465,3740.939911


In [15]:
# mean density of a diamond = 3500

threshold = int(input('threshold'))/100

df_density = df_density[(df_density.density > 3500-3500 * threshold) & (df_density.density < 3500+3500 * threshold)]
df_density.drop(['x', 'y', 'z'], axis=1, inplace=True)
df_density.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,price,weight,volume,density
0,0,0.3,Premium,D,SI2,62.4,58.0,6.353,60.0,16.479141,3640.966406
1,1,1.01,Ideal,E,VVS2,62.7,56.0,9.183,202.0,55.850576,3616.793496
3,3,1.08,Very Good,G,SI2,63.2,57.0,8.371,216.0,58.3804,3699.871875
5,5,0.53,Ideal,F,VS1,61.4,57.0,7.496,106.0,28.641947,3700.86577
7,7,0.33,Ideal,E,VVS2,60.0,56.0,6.877,66.0,18.728832,3523.978431


In [16]:
df_density.shape

(29508, 11)

In [17]:
df_train = df_density

In [18]:
df_density = df_test.copy()
df_density['weight'] = df_density.carat * 200 # mg
df_density['volume'] = df_density.x * df_density.y * df_density.z # mm3
df_density['volume'] = df_density['volume'] / 3 # aprox for a diamond
df_density['density'] = df_density['weight']/df_density['volume'] #mg/mm3
df_density['density'] = df_density['density'] * 1000 #kg/m3
df_density.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,weight,volume,density
0,0,0.32,Ideal,I,SI1,60.5,58.0,4.43,4.49,2.7,64.0,17.90163,3575.093441
1,1,1.24,Premium,I,SI1,62.9,60.0,6.8,6.74,4.26,248.0,65.08144,3810.610214
2,2,1.66,Premium,D,SI1,62.0,59.0,7.55,7.6,4.7,332.0,89.895333,3693.183925
3,3,0.75,Premium,D,SI2,60.6,56.0,5.94,5.9,3.59,150.0,41.93838,3576.676066
4,4,1.5,Fair,E,SI2,64.8,55.0,7.26,7.15,4.67,300.0,80.80501,3712.641085


In [19]:
# mean density of a diamond = 3500

threshold = int(input('threshold'))/100

df_density = df_density[(df_density.density > 3500-3500 * threshold) & (df_density.density < 3500+3500 * threshold)]
df_density.drop(['x', 'y', 'z'], axis=1, inplace=True)
df_density.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,weight,volume,density
0,0,0.32,Ideal,I,SI1,60.5,58.0,64.0,17.90163,3575.093441
2,2,1.66,Premium,D,SI1,62.0,59.0,332.0,89.895333,3693.183925
3,3,0.75,Premium,D,SI2,60.6,56.0,150.0,41.93838,3576.676066
5,5,0.82,Ideal,H,VS2,61.5,56.0,164.0,45.310592,3619.462752
6,6,0.54,Ideal,G,SI2,61.8,54.0,108.0,29.9728,3603.266962


In [20]:
df_test = df_density

# standarization

In [21]:
def standardize_numeric_data(df, cols):
    
    robust = RobustScaler()
    robust.fit(df[cols])
    
    X_robust = robust.transform(df[cols])
    df[cols] = X_robust

    return df

In [22]:
numeric_cols = df_train.select_dtypes(include=np.number).drop(['id', 'price'], axis=1).columns
numeric_cols = df_test.select_dtypes(include=np.number).drop(['id'], axis=1).columns

In [23]:
df_train = standardize_numeric_data(df_train, numeric_cols)
df_test = standardize_numeric_data(df_test, numeric_cols)

In [24]:
df_train.head(1)

Unnamed: 0,id,carat,cut,color,clarity,depth,table,price,weight,volume,density
0,0,-0.393939,Premium,D,SI2,0.615385,0.5,6.353,-0.393939,-0.396789,-0.123521


In [25]:
df_test.head(1)

Unnamed: 0,id,carat,cut,color,clarity,depth,table,weight,volume,density
0,0,-0.378788,Ideal,I,SI1,-0.923077,0.5,-0.378788,-0.371497,-1.182855


# encoding

In [26]:
def custom_encoder(dataframe, column_name, encoding_order):

    encoding_dict = {value: index for index, value in enumerate(encoding_order)}
    dataframe[column_name + '_encoded'] = dataframe[column_name].map(encoding_dict)
    return dataframe


In [27]:
orden = ['Ideal', 'Premium', 'Very Good', 'Good', 'Fair']
#orden = ['Fair', 'Premium', 'Good', 'Very Good', 'Ideal']
df_train = custom_encoder(df_train, 'cut', orden)
df_test = custom_encoder(df_test, 'cut', orden)

In [28]:
orden = ['D', 'E', 'F', 'G', 'H', 'I', 'J']
#orden = ['J', 'I', 'H', 'G', 'F', 'D', 'E']
df_train = custom_encoder(df_train, 'color', orden)
df_test = custom_encoder(df_test, 'color', orden)

In [29]:
# https://www.diamonds.pro/education/clarity/
orden = ['IF', 'VVS1', 'VVS2', 'VS1', 'VS2', 'SI1', 'SI2', 'I1']
#orden = ['SI2', 'I1', 'SI1', 'VS2', 'VS1', 'VVS2', 'IF', 'VVS1']
df_train = custom_encoder(df_train, 'clarity', orden)
df_test = custom_encoder(df_test, 'clarity', orden)

In [30]:
df_train.drop(['cut', 'color', 'clarity'], axis=1, inplace=True)
df_test.drop(['cut', 'color', 'clarity'], axis=1, inplace=True)

In [31]:
df_train.head(1)

Unnamed: 0,id,carat,depth,table,price,weight,volume,density,cut_encoded,color_encoded,clarity_encoded
0,0,-0.393939,0.615385,0.5,6.353,-0.393939,-0.396789,-0.123521,1,0,6


In [32]:
df_test.head(1)

Unnamed: 0,id,carat,depth,table,weight,volume,density,cut_encoded,color_encoded,clarity_encoded
0,0,-0.378788,-0.923077,0.5,-0.378788,-0.371497,-1.182855,0,5,5


In [33]:
name = input('eda name')

In [34]:
df_train.to_csv(f'../data/clean_train_{name}.csv', index = False, sep = ',')
df_test.to_csv(f'../data/clean_test_{name}.csv', index = False, sep = ',')