## 1. Diamond data loading

In [2]:
import pandas as pd
import numpy as np
import math

from lightgbm import LGBMRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [3]:
# load the .csv train and predict in their variables (diamond and diamonds_predict)
diamonds = pd.read_csv('../data/diamonds_train.csv')
diamonds_predict = pd.read_csv('../data/diamonds_predict.csv')

In [7]:
diamonds.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1.21,Premium,J,VS2,62.4,58.0,4268,6.83,6.79,4.25
1,0.32,Very Good,H,VS2,63.0,57.0,505,4.35,4.38,2.75
2,0.71,Fair,G,VS1,65.5,55.0,2686,5.62,5.53,3.65
3,0.41,Good,D,SI1,63.8,56.0,738,4.68,4.72,3.0
4,1.02,Ideal,G,SI1,60.5,59.0,4882,6.55,6.51,3.95


## 2. Change and add new columns in DFs

### Rows with x, y, z with zeros:

In [133]:
#for example, rows with x == 0:
filter1 = diamonds['x']==0
diamonds[filter1]

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
6465,0.71,Good,F,SI2,64.1,60.0,2130,0.0,0.0,0.0
14815,1.07,Ideal,F,SI2,61.6,56.0,4954,0.0,6.62,0.0
26192,2.25,Premium,H,SI2,62.8,59.0,18034,0.0,0.0,0.0
28029,0.71,Good,F,SI2,64.1,60.0,2130,0.0,0.0,0.0
34423,1.14,Fair,G,VS1,57.5,67.0,6381,0.0,0.0,0.0


In [134]:
def changes_zeros(x):
    '''
    This function calculates the mean value of the column 'x', 'y' or 'z' based on diamonds 
    that have the same type of cut, color, clarity and a similar carat 
    (for a small carat, we take +- 0.1 ; for a big carat, we take +- 0.3)
    '''
    for row in diamonds.index:
        if diamonds[x][row] == 0:
            if diamonds['carat'][row]>2.5:
                filter_carat_up = diamonds['carat']< diamonds['carat'][row]+0.3
                filter_carat_down = diamonds['carat']> diamonds['carat'][row]-0.3
            else:
                filter_carat_up = diamonds['carat']< diamonds['carat'][row]+0.1
                filter_carat_down = diamonds['carat']> diamonds['carat'][row]-0.1
                        
            filter_cut = diamonds['cut'] == diamonds['cut'][row]
            filter_color = diamonds['color'] == diamonds['color'][row]
            filter_clarity = diamonds['clarity'] == diamonds['clarity'][row]
            diamonds[x][row] = diamonds[filter_carat_up & filter_carat_down & 
                                        filter_cut & filter_color & filter_clarity][x].mean()


In [135]:
# apply the function in each column x, y, z of our diamond df
changes_zeros('x')
changes_zeros('y')
changes_zeros('z')
diamonds.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1.21,Premium,J,VS2,62.4,58.0,4268,6.83,6.79,4.25
1,0.32,Very Good,H,VS2,63.0,57.0,505,4.35,4.38,2.75
2,0.71,Fair,G,VS1,65.5,55.0,2686,5.62,5.53,3.65
3,0.41,Good,D,SI1,63.8,56.0,738,4.68,4.72,3.0
4,1.02,Ideal,G,SI1,60.5,59.0,4882,6.55,6.51,3.95


In [136]:
def changes_zeros_predict(x):
    '''
    This function calculates the mean value of the column 'x', 'y' or 'z' based on diamonds 
    that have the same type of cut, color, clarity and a similar carat 
    (for a small carat, we take +- 0.1 ; for a big carat, we take +- 0.3)
    '''
    for row in diamonds_predict.index:
        if diamonds_predict[x][row] == 0:
            if diamonds['carat'][row]>2.5:
                filter_carat_up = diamonds['carat']< diamonds['carat'][row]+0.3
                filter_carat_down = diamonds['carat']> diamonds['carat'][row]-0.3
            else:
                filter_carat_up = diamonds['carat']< diamonds['carat'][row]+0.1
                filter_carat_down = diamonds['carat']> diamonds['carat'][row]-0.1
                
            filter_cut = diamonds_predict['cut'] == diamonds_predict['cut'][row]
            filter_color = diamonds_predict['color'] == diamonds_predict['color'][row]
            filter_clarity = diamonds_predict['clarity'] == diamonds_predict['clarity'][row]
            diamonds_predict[x][row] = diamonds_predict[filter_carat_up & filter_carat_down & 
                                                        filter_cut & filter_color & filter_clarity][x].mean()


# apply the function in each column x, y, z of our diamond_predict df
changes_zeros_predict('x')
changes_zeros_predict('y')
changes_zeros_predict('z')
diamonds_predict.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
0,0,0.79,Very Good,F,SI1,62.7,60.0,5.82,5.89,3.67
1,1,1.2,Ideal,J,VS1,61.0,57.0,6.81,6.89,4.18
2,2,1.57,Premium,H,SI1,62.2,61.0,7.38,7.32,4.57
3,3,0.9,Very Good,F,SI1,63.8,54.0,6.09,6.13,3.9
4,4,0.5,Very Good,F,VS1,62.9,58.0,5.05,5.09,3.19


### Carat logarithmic

In [137]:
def carat_log(carat):
    '''
    this function gets the logarithmic value of the number you enter
    '''
    return math.log(carat)

In [138]:
# Apply the previous function on the carat column of the diamonds df 
diamonds['carat_log'] = diamonds[['carat']].apply(carat_log, axis=1)
diamonds.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,carat_log
0,1.21,Premium,J,VS2,62.4,58.0,4268,6.83,6.79,4.25,0.19062
1,0.32,Very Good,H,VS2,63.0,57.0,505,4.35,4.38,2.75,-1.139434
2,0.71,Fair,G,VS1,65.5,55.0,2686,5.62,5.53,3.65,-0.34249
3,0.41,Good,D,SI1,63.8,56.0,738,4.68,4.72,3.0,-0.891598
4,1.02,Ideal,G,SI1,60.5,59.0,4882,6.55,6.51,3.95,0.019803


In [139]:
# Apply the previous function on the carat column of the diamonds_predict df 
diamonds_predict['carat_log'] = diamonds_predict[['carat']].apply(carat_log, axis=1)
diamonds_predict.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,carat_log
0,0,0.79,Very Good,F,SI1,62.7,60.0,5.82,5.89,3.67,-0.235722
1,1,1.2,Ideal,J,VS1,61.0,57.0,6.81,6.89,4.18,0.182322
2,2,1.57,Premium,H,SI1,62.2,61.0,7.38,7.32,4.57,0.451076
3,3,0.9,Very Good,F,SI1,63.8,54.0,6.09,6.13,3.9,-0.105361
4,4,0.5,Very Good,F,VS1,62.9,58.0,5.05,5.09,3.19,-0.693147


### Shape of each diamond (very important)!!

In [140]:
def search_shape(x):
    '''
    this function returns the SHAPE of diamond based on its characteristics. 
    For this we need to know the type of 'cut', the values of 'table' and 'shape', 
    in addition to its measurements 'x' and 'y'
    
    Each diamond cut has specific table, depth and x / y values to determine the shape of the diamond.
    In these calculations, we have oversized the ranges of the "round" diamond shape 
    since it is the most common diamond on the planet (about 75% of the world's total diamonds)
    '''
    cut = x[0]                #type of cut of the diamond --> x[0]
    table = x[1]              #table of the diamond       --> x[1]
    depth = x[2]              #depth of the diamond       --> x[2]
    L_W = round(x[3]/x[4],4)  #proportion measured x/y  --> x[3]/x[4] Round the number with 4 decimals.
    
    # for a cut diamond Ideal or Premium:
    if cut == 'Ideal' or cut =='Premium':
        if (53<= table <=58) and (59<= depth <=62.5) and (0.90<= L_W <=1.05):
            return 'Round'
        elif (67<= table <=72) and (64<= depth <=75) and (0.90<= L_W <=1.05): 
            return 'Princess'
        elif (53<= table <=63) and (58<= depth <=62) and (1.35<= L_W <=1.50):
            return 'Oval'
        elif (53<= table <=63) and (58<= depth <=62) and (1.85<= L_W <=2.0):
            return 'Marquise'
        elif (53<= table <=63) and (58<= depth <=62) and (1.45<= L_W <=1.55):
            return 'Pear'
        elif (61<= table <=67) and (61<= depth <=67) and ((0.98<= L_W <=1.03) or (1.15<= L_W <=1.20)):
            return 'Cushion'
        elif (61<= table <=69) and (61<= depth <=67) and ((0.98<= L_W <=1.03) or (1.40<= L_W <=1.50)):
            return 'Esmerald'
        elif (61<= table <=69) and (61<= depth <=67) and (0.98<= L_W <=1.03):
            return 'Asscher'
        elif (61<= table <=69) and (61<= depth <=67) and ((0.98<= L_W <=1.03) or (1.20<= L_W <=1.30)):
            return 'Radiant'
        elif (53<= table <=63) and (58<= depth <=63) and (0.95<= L_W <=1.02):
            return 'Heart'
        #If the diamond does not meet any of the above ranges, 
        #we can include it within the 'Round' shape if it meets the following x/y ratio:
        elif 0.95<= L_W <=1.05:
            return 'Round'
        #If it does not meet any of the above, the diamond data may be wrong and 
        #we include it as an unknown form 'Other':
        else:
            return 'Other'
    
    # for a cut diamond Very Good:
    elif cut == 'Very Good':
        if ((52<=table<=55)or(56<=table<=60)) and ((57<=depth<=59)or(61<=depth<=63.5)) and (0.90<=L_W<=1.05):
            return 'Round'
        elif ((59<=table<=67)or(72<=table<=75)) and (64<= depth <=75) and (0.90<= L_W <=1.05): 
            return 'Princess'
        elif ((52<=table<=53)or(63<=table<=65)) and  ((56<=depth<=58)or(62<=depth<=66)) and ((1.30<=L_W<=1.34)or(1.51<=L_W<=1.55)):
            return 'Oval'
        elif ((52<=table<=53)or(64<=table<=65)) and ((56<=depth<=57.9)or(62.1<=depth<=66)) and ((1.75<=L_W<=1.84)or(2.01<=L_W<=2.15)):
            return 'Marquise'
        elif ((52<=table<=53)or(64<=table<=65)) and ((56<=depth<=57.9)or(62.1<=depth<=66)) and ((1.40<=L_W<=1.44)or(1.56<=L_W<=1.65)):
            return 'Pear'
        elif ((58<=table<=61)or(67<=table<=70)) and ((58<=depth<=60.9)or(67.1<=depth<=70)) and ((0.98<= L_W <=1.03)or(1.10<= L_W <=1.14)or(1.21<= L_W <=1.30)):
            return 'Cushion'
        elif ((57<=table<=61)or(69<=table<=72))and ((59<=depth<=60.9)or(67.1<=depth<=70)) and ((0.98<= L_W <=1.03) or (1.30<= L_W <=1.39)or(1.51<= L_W <=1.60)):
            return 'Esmerald'
        elif ((57<=table<=61)or(69<=table<=72))and ((59<=depth<=60.9)or(67.1<=depth<=70)) and (0.98<= L_W <=1.03):
            return 'Asscher'
        elif ((57<=table<=61)or(69<=table<=72))and ((59<=depth<=60.9)or(67.1<=depth<=70)) and ((0.98<= L_W <=1.03) or (1.15<= L_W <=1.19)or(1.31<= L_W <=1.35)):
            return 'Radiant'
        elif ((52<=table<=53)or(63<=table<=65))and ((56<=depth<=58)or(62.1<=depth<=66)) and ((0.89<=L_W<=0.95)or(1.02<=L_W<=1.05)):
            return 'Heart'
        #If the diamond does not meet any of the above ranges, 
        #we can include it within the 'Round' shape if it meets the following x/y ratio:
        elif 0.95<= L_W <=1.05:
            return 'Round'
        #If it does not meet any of the above, the diamond data may be wrong and 
        #we include it as an unknown form 'Other':
        else:
            return 'Other'
        
    
    # for a cut diamond Good:
    elif cut == 'Good':
        if ((51<=table<=55)or(58<=table<=64)) and ((56.5<=depth<=59)or(61<=depth<=66)) and (0.90<=L_W<=1.07):
            return 'Round'
        elif ((56<=table<=59)or(75<=table<=82)) and ((58<=depth<=63.9)or(75.1<=depth<=80)) and (0.90<= L_W <=1.05): 
            return 'Princess'
        elif ((51<=table<=52)or(65<=table<=68)) and ((53<=depth<=55.9)or(66<=depth<=71)) and ((1.25<=L_W<=1.30)or(1.55<=L_W<=1.60)):
            return 'Oval'
        elif ((51<=table<=52)or(65<=table<=68)) and ((53<=depth<=55.9)or(66<=depth<=71)) and ((1.65<=L_W<=1.75)or(2.15<=L_W<=2.30)):
            return 'Marquise'
        elif ((51<=table<=52)or(65<=table<=68)) and ((53<=depth<=55.9)or(66<=depth<=71)) and ((1.35<=L_W<=1.40)or(1.65<=L_W<=1.80)):
            return 'Pear'
        elif ((56<=table<=58)or(70<=table<=71)) and ((56<=depth<=58)or(70<=depth<=71)) and ((1.03<= L_W <=1.05)or(1.10<= L_W <=1.14)or(1.21<=L_W<=1.30)):
            return 'Cushion'
        elif ((54<=table<=57)or(72<=table<=74)) and ((57<=depth<=59)or(70<=depth<=74)) and ((1.03<= L_W <=1.05) or (1.20<= L_W <=1.30)or(1.60<=L_W<=1.80)):
            return 'Esmerald'
        elif ((54<=table<=57)or(72<=table<=74)) and ((57<=depth<=59)or(70<=depth<=74)) and (1.03<= L_W <=1.05):
            return 'Asscher'
        elif ((54<=table<=57)or(72<=table<=74)) and ((57<=depth<=59)or(70<=depth<=74)) and ((1.03<= L_W <=1.05) or (1.10<= L_W <=1.15)or(1.35<= L_W <=1.40)):
            return 'Radiant'
        elif ((51<=table<=52)or(65<=table<=68))and ((53<=depth<=55.9)or(66.1<=depth<=71)) and ((0.83<=L_W<=0.89)or(1.05<=L_W<=1.10)):
            return 'Heart'
        #If the diamond does not meet any of the above ranges, 
        #we can include it within the 'Round' shape if it meets the following x/y ratio:
        elif 0.95<= L_W <=1.05:
            return 'Round'
        #If it does not meet any of the above, the diamond data may be wrong and 
        #we include it as an unknown form 'Other':
        else:
            return 'Other'
        
    
    # for a cut diamond Fair:
    elif cut == 'Fair':
        if ((table<=51)or(64<=table)) and ((depth<=57.4)or(64.2<=depth)) and (0.90<=L_W):
            return 'Round'
        elif ((table<=56)or(82<=table)) and ((depth<=58)or(80<=depth)) and (0.90<=L_W): 
            return 'Princess'
        elif ((table<=51)or(68<=table)) and ((depth<=53)or(71<=depth)) and (1.20<=L_W):
            return 'Oval'
        elif ((table<=51)or(68<=table)) and ((depth<=53)or(71<=depth)) and (1.55<=L_W):
            return 'Marquise'
        elif ((table<=51)or(68<=table)) and ((depth<=53)or(71<=depth)) and (1.25<=L_W):
            return 'Pear'
        elif ((table<=56)or(71<=table)) and ((depth<=56)or(71<=depth)):
            return 'Cushion'
        elif ((table<=54)or(74<=table)) and ((depth<=57)or(74<=depth)) and (L_W<=1.20)or(1.80<=L_W):
            return 'Esmerald'
        elif ((table<=54)or(74<=table)) and ((depth<=57)or(74<=depth)) and (1.05<=L_W):
            return 'Asscher'
        elif ((table<=54)or(72<=table)) and ((depth<=57)or(74<=depth)):
            return 'Radiant'
        elif ((table<=51)or(68<=table))and ((depth<=53)or(71<=depth)):
            return 'Heart'
        #If the diamond does not meet any of the above ranges, 
        #we can include it within the 'Round' shape if it meets the following x/y ratio:
        elif 0.95<= L_W <=1.05:
            return 'Round'
        #If it does not meet any of the above, the diamond data may be wrong and 
        #we include it as an unknown form 'Other':
        else:
            return 'Other'

In [141]:
# Apply the previous function on the diamonds df 
diamonds['shape'] = diamonds[['cut','table','depth','x', 'y']].apply(search_shape, axis=1)

diamonds.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,carat_log,shape
0,1.21,Premium,J,VS2,62.4,58.0,4268,6.83,6.79,4.25,0.19062,Round
1,0.32,Very Good,H,VS2,63.0,57.0,505,4.35,4.38,2.75,-1.139434,Round
2,0.71,Fair,G,VS1,65.5,55.0,2686,5.62,5.53,3.65,-0.34249,Round
3,0.41,Good,D,SI1,63.8,56.0,738,4.68,4.72,3.0,-0.891598,Princess
4,1.02,Ideal,G,SI1,60.5,59.0,4882,6.55,6.51,3.95,0.019803,Heart


In [142]:
# Apply the previous function on the diamonds_predict df 
diamonds_predict['shape'] = diamonds_predict[['cut','table',
                                              'depth','x', 'y']].apply(search_shape, axis=1)

diamonds_predict.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,carat_log,shape
0,0,0.79,Very Good,F,SI1,62.7,60.0,5.82,5.89,3.67,-0.235722,Round
1,1,1.2,Ideal,J,VS1,61.0,57.0,6.81,6.89,4.18,0.182322,Round
2,2,1.57,Premium,H,SI1,62.2,61.0,7.38,7.32,4.57,0.451076,Cushion
3,3,0.9,Very Good,F,SI1,63.8,54.0,6.09,6.13,3.9,-0.105361,Round
4,4,0.5,Very Good,F,VS1,62.9,58.0,5.05,5.09,3.19,-0.693147,Round


### Volume of the diamond

In [143]:
def volume_shape(value):
    '''
    In this function we calculate the volume of the diamond, 
    taking into account the shape of the diamond.
    Each diamond shape can be approximated to a different pyramid:
    'Round' diamond: circular base pyramid
    'Heart' diamond: triangular base pyramid
    other diamonds: square / rectangular base pyramid
    '''
    x = value[0]     #--> x value
    y = value[1]     #--> y value
    z = value[2]     #--> z value
    shape = value[3] #--> shape of the diamond
    
    if shape == 'Round':
        return ((x/2)**2)*3.141592*z/3
    if shape == 'Heart':
        return x*x*z*(3**0.5)/12
    else:
        return x*y*z/3


In [144]:
# Apply the previous function on the diamonds df 
diamonds['volume'] = diamonds[['x','y','z','shape']].apply(volume_shape, axis=1)

diamonds.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,carat_log,shape,volume
0,1.21,Premium,J,VS2,62.4,58.0,4268,6.83,6.79,4.25,0.19062,Round,51.903766
1,0.32,Very Good,H,VS2,63.0,57.0,505,4.35,4.38,2.75,-1.139434,Round,13.623219
2,0.71,Fair,G,VS1,65.5,55.0,2686,5.62,5.53,3.65,-0.34249,Round,30.181028
3,0.41,Good,D,SI1,63.8,56.0,738,4.68,4.72,3.0,-0.891598,Princess,22.0896
4,1.02,Ideal,G,SI1,60.5,59.0,4882,6.55,6.51,3.95,0.019803,Heart,24.460148


In [145]:
# Apply the previous function on the diamonds_predict df 
diamonds_predict['volume'] = diamonds_predict[['x','y','z','shape']].apply(volume_shape, axis=1)

diamonds_predict.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,carat_log,shape,volume
0,0,0.79,Very Good,F,SI1,62.7,60.0,5.82,5.89,3.67,-0.235722,Round,32.544722
1,1,1.2,Ideal,J,VS1,61.0,57.0,6.81,6.89,4.18,0.182322,Round,50.75035
2,2,1.57,Premium,H,SI1,62.2,61.0,7.38,7.32,4.57,0.451076,Cushion,82.292904
3,3,0.9,Very Good,F,SI1,63.8,54.0,6.09,6.13,3.9,-0.105361,Round,37.867595
4,4,0.5,Very Good,F,VS1,62.9,58.0,5.05,5.09,3.19,-0.693147,Round,21.298155


### change the categorical columns

In [146]:
# The model does not accept categorical columns, 
# for this we are going to change the values of each categorical column for a numerical value.
# cut column: the best cut will be a 4 'Ideal, the worst a 0' Fair '
# color column: the best color will be a 6 'D', the worst a 0 'J'
# clarity column: the best clarity will be a 7 'IF', but a 0 'I1'
# shape column: the most common shape will be a 5 'Round', the least common a 0 'Other'

# change each column according to the scale we have defined in the diamonds df:
diamonds['cut'] = diamonds['cut'].map({'Fair':0,'Good':1,'Very Good':2,'Premium':3, 'Ideal':4})
diamonds['color'] = diamonds['color'].map({'J':0, 'I':1, 'H':2, 'G':3, 'F': 4, 'E': 5, 'D':6})
diamonds['clarity'] = diamonds['clarity'].map({'I1':0,'SI2':1,'SI1':2,'VS2':3,'VS1':4,'VVS2':5,
                                               'VVS1':6,'IF':7})
diamonds['shape'] = diamonds['shape'].map({'Other':0,'Esmerald':1,'Princess':2,'Cushion':3,
                                           'Heart':4,'Round':5})
diamonds.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,carat_log,shape,volume
0,1.21,3,0,3,62.4,58.0,4268,6.83,6.79,4.25,0.19062,5,51.903766
1,0.32,2,2,3,63.0,57.0,505,4.35,4.38,2.75,-1.139434,5,13.623219
2,0.71,0,3,4,65.5,55.0,2686,5.62,5.53,3.65,-0.34249,5,30.181028
3,0.41,1,6,2,63.8,56.0,738,4.68,4.72,3.0,-0.891598,2,22.0896
4,1.02,4,3,2,60.5,59.0,4882,6.55,6.51,3.95,0.019803,4,24.460148


In [147]:
# change each column according to the scale we have defined in the diamonds_predict df:
diamonds_predict['cut'] = diamonds_predict['cut'].map({'Fair':0,'Good':1,'Very Good':2,'Premium':3, 
                                                       'Ideal':4})
diamonds_predict['color'] = diamonds_predict['color'].map({'J':0, 'I':1, 'H':2, 'G':3, 'F': 4, 
                                                           'E': 5, 'D':6})
diamonds_predict['clarity'] = diamonds_predict['clarity'].map({'I1':0,'SI2':1,'SI1':2,'VS2':3,
                                                               'VS1':4,'VVS2':5,'VVS1':6,'IF':7})
diamonds_predict['shape'] = diamonds_predict['shape'].map({'Other':0,'Esmerald':1,'Princess':2,
                                                           'Cushion':3,'Heart':4,'Round':5})
diamonds_predict.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,carat_log,shape,volume
0,0,0.79,2,4,2,62.7,60.0,5.82,5.89,3.67,-0.235722,5,32.544722
1,1,1.2,4,0,4,61.0,57.0,6.81,6.89,4.18,0.182322,5,50.75035
2,2,1.57,3,2,2,62.2,61.0,7.38,7.32,4.57,0.451076,3,82.292904
3,3,0.9,2,4,2,63.8,54.0,6.09,6.13,3.9,-0.105361,5,37.867595
4,4,0.5,2,4,4,62.9,58.0,5.05,5.09,3.19,-0.693147,5,21.298155


##### I have tested the models with other characteristics, such as: density, x/y or x/z ratio, logarithm of the price ... but I have obtained worse results when testing the final model, therefore I have only applied the previous changes.

## 3. ML model...

#### We have tested various models to predict the best price for each diamond:
##### RandomForestRegressor
##### HistGradientBoostingRegressor
##### LGBMRegressor
##### AdaBoostRegressor
#### and after testing all of them, we have obtained the best result with the model: **LGBMRegressor**

In [157]:
# for FEATS, don't use the x,y,z because the error in the model is bigger.
TARGET = 'price'
FEATS = ['cut', 'color', 'clarity','carat_log','shape','depth','table','volume']

In [163]:
diamonds_predict[FEATS].head()

Unnamed: 0,cut,color,clarity,carat_log,shape,depth,table,volume
0,2,4,2,-0.235722,5,62.7,60.0,32.544722
1,4,0,4,0.182322,5,61.0,57.0,50.75035
2,3,2,2,0.451076,3,62.2,61.0,82.292904
3,2,4,2,-0.105361,5,63.8,54.0,37.867595
4,2,4,4,-0.693147,5,62.9,58.0,21.298155


In [164]:
# To introduce the numerical values of our columns into the model, 
#we first standardize these measures with StandardScaler ().

numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, FEATS)])

model= Pipeline(steps=[('preprocessor', preprocessor),
                       ('regressor', LGBMRegressor())])

In [None]:
# To obtain the best hyperparameters of our model, we test a wider range of values and then 
#we reduce until obtaining close but different values, in order to obtain the best results.

param_grid = {
    'regressor__num_leaves': [47,48,49,50,51,52],
    'regressor__n_estimators': [350,360,370,380,390,400,410,420,430],
    'regressor__min_data_in_leaf': [4,5,6,7,8,12,15],
    'regressor__max_depth': [5,6,7],
    'regressor__learning_rate': [0.02,0.03,0.04,0.045,0.05,0.055,0.06],
    'regressor__feature_fraction': [0.75,0.8,0.85,0.9,0.95,1],
    'regressor__bagging_frequency': [0.75,0.80,0.85,0.9,0.95,1],
    'regressor__bagging_fraction': [0.75,0.8,0.85,0.9,0.95,1],
}

grid_search = RandomizedSearchCV(model, 
                                 param_grid, 
                                 cv=5, 
                                 verbose=5, 
                                 scoring='neg_root_mean_squared_error', 
                                 n_jobs=-1,
                                 n_iter=400)

grid_search.fit(diamonds[FEATS], diamonds[TARGET])

In [None]:
#Once we have trained the model with different hyperparameter values, 
#we see which parameters have obtained the best score:
grid_search.best_params_

In [None]:
# see the best score that we have trained:
grid_search.best_score_

In [170]:
#for a score of 510 we obtain these parameters:
#(in public kaggle obtain 496.33 and in the private obtain 514.34)
model_best = Pipeline(steps=[('preprocessor', preprocessor),
                            ('regressor', LGBMRegressor(num_leaves=51,
                                                        n_estimators=370,
                                                        min_data_in_leaf=6,
                                                        max_depth=7,
                                                        learning_rate=0.045,
                                                        feature_fraction=0.85,
                                                        bagging_frequency=0.9,
                                                        bagging_fraction=0.8,
                                                       ))])


#for a score of 513 we obtain these parameters:
#(in public kaggle obtain 494.51 and in the private obtain 516.08)

# Pipeline(steps=[('preprocessor', preprocessor),
#                 ('regressor', LGBMRegressor(num_leaves=49,
#                                             n_estimators=380,
#                                             min_data_in_leaf=6,
#                                             max_depth=6,
#                                             learning_rate=0.04,
#                                             feature_fraction=0.9,
#                                             bagging_frequency=0.9,
#                                             bagging_fraction=0.95,             
#                                             ))])

In [171]:
#To test the results and verify that it did not commit a great overfitting, 
#I divided the df of diamonds into two parts, train and test, 
#and I saw the error that I obtained with the parameters of the best trained model.

diamonds_train, diamonds_test = train_test_split(diamonds)

model_best.fit(diamonds_train[FEATS], diamonds_train[TARGET]);

y_test_best = model_best.predict(diamonds_test[FEATS])
y_train_best = model_best.predict(diamonds_train[FEATS])


print(f"test error: {mean_squared_error(y_pred=y_test_best, y_true=diamonds_test[TARGET], squared=False)}")
print(f"train error:{mean_squared_error(y_pred=y_train_best, y_true=diamonds_train[TARGET], squared=False)}")


test error: 526.0445463195549
train error:412.04927019558556


#### when we have already achieved the best result of our model, we train it with all the diamonds we have in our df diamond, we predict the prices of the df diamonds_predict and save the results in a new df submission_df to save it and upload it to the kaggle competition!

In [99]:
model_best.fit(diamonds[FEATS], diamonds[TARGET]);

In [100]:
y_pred = model_best.predict(diamonds_predict[FEATS])

submission_df = pd.DataFrame({'id': diamonds_predict['id'], 'price': y_pred})
submission_df.head()

Unnamed: 0,id,price
0,0,2832.593813
1,1,5747.218639
2,2,9545.086506
3,3,4116.281068
4,4,1590.00048


In [101]:
# We check that the values of the predicted prices have a certain logic and 
#do not give very high or very small values ...
submission_df.describe()

Unnamed: 0,id,price
count,13485.0,13485.0
mean,6742.0,3951.796801
std,3892.928525,3958.438423
min,0.0,334.959253
25%,3371.0,939.927483
50%,6742.0,2459.66613
75%,10113.0,5313.072356
max,13484.0,18285.089257


In [102]:
submission_df.to_csv('../submissions/diamonds_standardscale_LGBM_FEATS_510.csv', index=False)