# 1. Exploratory analysis (EDA), standarization and encoding

In [18]:
# Data handling
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

# Standarization
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from scipy import stats

# Encoding
from sklearn.preprocessing import LabelEncoder 
from sklearn.preprocessing import OneHotEncoder  
from sklearn.preprocessing import OrdinalEncoder

# Plots
import matplotlib.pyplot as plt
from matplotlib import style
import matplotlib.ticker as ticker
import seaborn as sns

# Preprocessing and modeling
plt.rcParams['figure.figsize'] = (10,8)
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score 
from sklearn.metrics import precision_score 
from sklearn.metrics import recall_score 
from sklearn.metrics import f1_score 
from sklearn.metrics import cohen_kappa_score
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor 
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn import tree

# Pepinillo
import pickle

# Warning configuration
import warnings
warnings.filterwarnings('ignore')

In [19]:
# Reads csv
df_train = pd.read_csv('../data/train.csv', index_col=0)
df_train.reset_index(drop = False, inplace = True)
df_train.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,0.3,Premium,D,SI2,62.4,58.0,4.31,4.28,2.68,6.353
1,1,1.01,Ideal,E,VVS2,62.7,56.0,6.42,6.46,4.04,9.183
2,2,0.72,Ideal,F,VS2,61.8,59.0,5.71,5.74,3.54,7.983
3,3,1.08,Very Good,G,SI2,63.2,57.0,6.54,6.5,4.12,8.371
4,4,0.36,Premium,G,VS1,62.3,59.0,4.5,4.55,2.82,6.588


In [20]:
df_test = pd.read_csv('../data/test.csv', index_col=0)
df_test.reset_index(drop = False, inplace = True)
df_test.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
0,0,0.32,Ideal,I,SI1,60.5,58.0,4.43,4.49,2.7
1,1,1.24,Premium,I,SI1,62.9,60.0,6.8,6.74,4.26
2,2,1.66,Premium,D,SI1,62.0,59.0,7.55,7.6,4.7
3,3,0.75,Premium,D,SI2,60.6,56.0,5.94,5.9,3.59
4,4,1.5,Fair,E,SI2,64.8,55.0,7.26,7.15,4.67


### Features
- id: only for test & sample submission files, id for prediction sample identification
- price: price in USD
- carat: weight of the diamond
- cut: quality of the cut (Fair, Good, Very Good, Premium, Ideal)
- color: diamond colour
- clarity: a measurement of how clear the diamond is
- x: length in mm
- y: width in mm
- z: depth in mm
- depth: total depth percentage = z / mean(x, y) = 2 * z / (x + y) (43--79)
- table: width of top of diamond relative to widest point (43--95)

# standarization

In [21]:
def standardize_numeric_data(df, cols):
    
    robust = RobustScaler()
    robust.fit(df[cols])
    
    X_robust = robust.transform(df[cols])
    df[cols] = X_robust

    return df

In [22]:
numeric_cols = df_train.select_dtypes(include=np.number).drop(['id', 'price'], axis=1).columns
numeric_cols = df_test.select_dtypes(include=np.number).drop(['id'], axis=1).columns

In [23]:
df_train = standardize_numeric_data(df_train, numeric_cols)
df_test = standardize_numeric_data(df_test, numeric_cols)

In [24]:
df_train.head(1)

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,-0.625,Premium,D,SI2,0.4,0.333333,-0.754098,-0.78022,-0.75,6.353


In [25]:
df_test.head(1)

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
0,0,-0.609375,Ideal,I,SI1,-1.0,0.333333,-0.707182,-0.683333,-0.741071


# encoding

In [26]:
def custom_encoder(dataframe, column_name, encoding_order):

    encoding_dict = {value: index for index, value in enumerate(encoding_order)}
    dataframe[column_name + '_encoded'] = dataframe[column_name].map(encoding_dict)
    return dataframe


In [27]:
orden = ['Ideal', 'Premium', 'Very Good', 'Good', 'Fair']
df_train = custom_encoder(df_train, 'cut', orden)
df_test = custom_encoder(df_test, 'cut', orden)

In [28]:
orden = ['D', 'E', 'F', 'G', 'H', 'I', 'J']
df_train = custom_encoder(df_train, 'color', orden)
df_test = custom_encoder(df_test, 'color', orden)

In [29]:
# https://www.diamonds.pro/education/clarity/
orden = ['IF', 'VVS1', 'VVS2', 'VS1', 'VS2', 'SI1', 'SI2', 'I1']
df_train = custom_encoder(df_train, 'clarity', orden)
df_test = custom_encoder(df_test, 'clarity', orden)

In [30]:
df_train.drop(['cut', 'color', 'clarity'], axis=1, inplace=True)
df_test.drop(['cut', 'color', 'clarity'], axis=1, inplace=True)

In [31]:
df_train.head(1)

Unnamed: 0,id,carat,depth,table,x,y,z,price,cut_encoded,color_encoded,clarity_encoded
0,0,-0.625,0.4,0.333333,-0.754098,-0.78022,-0.75,6.353,1,0,6


In [32]:
df_test.head(1)

Unnamed: 0,id,carat,depth,table,x,y,z,cut_encoded,color_encoded,clarity_encoded
0,0,-0.609375,-1.0,0.333333,-0.707182,-0.683333,-0.741071,0,5,5


In [33]:
name = input('eda name')

In [34]:
df_train.to_csv(f'../data/clean_train_{name}.csv', index = False, sep = ',')
df_test.to_csv(f'../data/clean_test_{name}.csv', index = False, sep = ',')