## Diamond Project

In [1]:
import pandas as pd
import numpy as np

In [2]:
diamond = pd.read_csv(('./diamonds-datamad0620/train.csv'))
diamond_test = pd.read_csv(('./diamonds-datamad0620/predict.csv'))


In [3]:
diamond.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,0.3,Very Good,F,VS2,62.8,56.0,4.29,4.31,2.7,605
1,1,0.34,Ideal,E,SI1,62.6,55.0,4.46,4.49,2.8,565
2,2,0.4,Very Good,D,SI1,60.3,62.0,4.7,4.75,2.85,720
3,3,0.4,Premium,H,VS1,61.8,59.2,4.72,4.74,2.92,793
4,4,0.9,Very Good,D,SI1,61.0,63.0,6.1,6.13,3.73,4381


In [4]:
diamond.shape

(40455, 11)

Features
- id: only for test & sample submission files, id for prediction sample identification
- price: price in USD
- carat: weight of the diamond
- cut: quality of the cut (Fair, Good, Very Good, Premium, Ideal)
- color: diamond colour, from J (worst) to D (best)
- clarity: a measurement of how clear the diamond is (I1 (worst), SI2, SI1, VS2, VS1, VVS2, VVS1, IF (best))
- x: length in mm
- y: width in mm
- z: depth in mm
- depth: total depth percentage = z / mean(x, y) = 2 * z / (x + y) (43--79)
- table: width of top of diamond relative to widest point (43--95)

In [5]:
diamond.info()
# no nun values 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40455 entries, 0 to 40454
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   id       40455 non-null  int64  
 1   carat    40455 non-null  float64
 2   cut      40455 non-null  object 
 3   color    40455 non-null  object 
 4   clarity  40455 non-null  object 
 5   depth    40455 non-null  float64
 6   table    40455 non-null  float64
 7   x        40455 non-null  float64
 8   y        40455 non-null  float64
 9   z        40455 non-null  float64
 10  price    40455 non-null  int64  
dtypes: float64(6), int64(2), object(3)
memory usage: 3.4+ MB


> Its important to pass the cut, clarity and color columns to numerical

In [6]:
clarity = ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF']
cut = ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']
color = ['J', 'H', 'I','E','G','F','D']

In [7]:
diamond["clarity"]=pd.Categorical(diamond["clarity"],ordered=True,categories=clarity)
diamond["cut"]=pd.Categorical(diamond['cut'],ordered=True,categories=cut)
diamond["color"]=pd.Categorical(diamond['color'],ordered=True,categories=color)

In [8]:
diamond['value_clarity'] = diamond["clarity"].cat.codes
diamond['value_cut'] = diamond['cut'].cat.codes
diamond['value_color'] = diamond['color'].cat.codes

diamond.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price,value_clarity,value_cut,value_color
0,0,0.3,Very Good,F,VS2,62.8,56.0,4.29,4.31,2.7,605,3,2,5
1,1,0.34,Ideal,E,SI1,62.6,55.0,4.46,4.49,2.8,565,2,4,3
2,2,0.4,Very Good,D,SI1,60.3,62.0,4.7,4.75,2.85,720,2,2,6
3,3,0.4,Premium,H,VS1,61.8,59.2,4.72,4.74,2.92,793,4,3,1
4,4,0.9,Very Good,D,SI1,61.0,63.0,6.1,6.13,3.73,4381,2,2,6


In [9]:
diamond= diamond.drop(columns=["cut","color",'clarity'])

> Lets start looking at other columns in the data set, like (x,y,z which i will rename): 


In [10]:
diamond = diamond.rename(columns={'x': 'length in mm', 'y': 'width in mm','z': 'depth in mm'})
diamond[['length in mm','width in mm','depth in mm']].describe()

Unnamed: 0,length in mm,width in mm,depth in mm
count,40455.0,40455.0,40455.0
mean,5.729391,5.733217,3.537644
std,1.121283,1.151076,0.709557
min,0.0,0.0,0.0
25%,4.71,4.72,2.91
50%,5.7,5.71,3.53
75%,6.54,6.54,4.04
max,10.23,58.9,31.8


> By doing the describe we can notice that width, depth and length have 0.0 min values. These is odd, I will replace them for there mean values.

> Also de max value of width looks to high. I will investigate on the matter

In [11]:

diamond["width in mm"]= diamond["width in mm"].replace([0.0], 5.73)
diamond["length in mm"]=diamond["length in mm"].replace([0.0], 5.73)
diamond["depth in mm"]=diamond["depth in mm"].replace([0.0], 3.54)

In [12]:
# min values changed! 
diamond[['length in mm','width in mm','depth in mm']].describe()

Unnamed: 0,length in mm,width in mm,depth in mm
count,40455.0,40455.0,40455.0
mean,5.730382,5.734067,3.539132
std,1.118747,1.148956,0.70584
min,3.73,3.68,1.07
25%,4.71,4.72,2.91
50%,5.7,5.71,3.53
75%,6.54,6.54,4.04
max,10.23,58.9,31.8



> diamond["length in mm"].unique() # after analyzing all the individual unique values I conclude that there is nothing out or the ordinary
> diamond["depth in mm"].unique()#after analyzing all the individual unique values I conclude that there is one value out of the ordinary
> diamond["width in mm"].unique()#after analyzing all the individual unique values I conclude that there are values out of the ordinary

In [13]:
display (diamond[diamond["width in mm"]>12])
diamond[diamond["depth in mm"]>10]
# these values must have been wrongly written. Im gonna drop them

Unnamed: 0,id,carat,depth,table,length in mm,width in mm,depth in mm,price,value_clarity,value_cut,value_color
12402,12402,0.51,61.8,55.0,5.15,31.8,5.12,2075,4,4,3
27676,27676,2.0,58.9,57.0,8.09,58.9,8.06,12210,1,3,1


Unnamed: 0,id,carat,depth,table,length in mm,width in mm,depth in mm,price,value_clarity,value_cut,value_color
38759,38759,0.51,61.8,54.7,5.12,5.15,31.8,1970,4,2,3


In [14]:
diamond["width in mm"]=diamond["width in mm"].replace([31.8], 5.73)
diamond["width in mm"]=diamond["width in mm"].replace([58.9], 5.73)
diamond["depth in mm"]=diamond["depth in mm"].replace([31.8], 3.53)

In [15]:
diamond[['length in mm','width in mm','depth in mm']].describe()

Unnamed: 0,length in mm,width in mm,depth in mm
count,40455.0,40455.0,40455.0
mean,5.730382,5.732109,3.538433
std,1.118747,1.110599,0.691713
min,3.73,3.68,1.07
25%,4.71,4.72,2.91
50%,5.7,5.71,3.53
75%,6.54,6.54,4.04
max,10.23,10.16,8.06


In [16]:
diamond.head()

Unnamed: 0,id,carat,depth,table,length in mm,width in mm,depth in mm,price,value_clarity,value_cut,value_color
0,0,0.3,62.8,56.0,4.29,4.31,2.7,605,3,2,5
1,1,0.34,62.6,55.0,4.46,4.49,2.8,565,2,4,3
2,2,0.4,60.3,62.0,4.7,4.75,2.85,720,2,2,6
3,3,0.4,61.8,59.2,4.72,4.74,2.92,793,4,3,1
4,4,0.9,61.0,63.0,6.1,6.13,3.73,4381,2,2,6


In [17]:
#diamond= diamond.drop(columns=["table"])

In [18]:
diamond.to_csv("inputs/clean_train.csv", header=True,index=False)

 > Hemos cambiado las columnas ya que habia valores que estaban fuera de rango.
## considerar si cargarse table??

## DIVIDE THE COLUMNS TO CREATE X and y 

In [19]:
X = diamond.drop(columns="price")
y = diamond["price"]


In [20]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
import math
from sklearn.metrics import mean_squared_error
from sklearn.experimental import enable_hist_gradient_boosting  
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.experimental import enable_hist_gradient_boosting  
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.datasets import load_diabetes
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn  import tree
from sklearn.model_selection import GridSearchCV


In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

> Training different models

In [23]:
models = {
   "DecisionTreeRegressor": DecisionTreeRegressor(),
    "RandomForest" : RandomForestRegressor(),
    "RandomForestN200" : RandomForestRegressor(n_estimators = 200),
    "RandomForestN900" : RandomForestRegressor(n_estimators = 900, ),
    "GradientBoosting" : GradientBoostingRegressor(n_estimators = 900),
    "HistGradinetBoostingClass2": HistGradientBoostingRegressor(),
    "ExtraTreeRegressor": ExtraTreesRegressor(n_estimators = 900,n_jobs =1, min_samples_split= 5, random_state=5, max_features = 0.9, warm_start=True),
    "neigbor":KNeighborsRegressor(),
    "tree":tree.DecisionTreeClassifier()
}


In [24]:
for name, model in models.items():
    print(f"Training {name}")
    model.fit(X_train,y_train)
print("He acabado :) ")

Training DecisionTreeRegressor
Training RandomForest
Training RandomForestN200
Training RandomForestN900
Training GradientBoosting
Training HistGradinetBoostingClass2
Training ExtraTreeRegressor
Training neigbor
Training tree
He acabado :) 


In [25]:
comparar= {modelName:model.predict(X_test) for modelName, model in models.items()}
df = pd.DataFrame(comparar)
df

Unnamed: 0,DecisionTreeRegressor,RandomForest,RandomForestN200,RandomForestN900,GradientBoosting,HistGradinetBoostingClass2,ExtraTreeRegressor,neigbor,tree
0,1656.0,1665.37,1661.840,1655.782222,1734.002712,1670.408342,1671.640000,1025.8,1449
1,1972.0,1983.82,1979.530,1982.434444,2019.557751,1895.382849,1933.201019,3243.8,1999
2,8887.0,7878.66,7915.480,7874.284444,8147.498317,8503.933951,7784.654074,4954.6,11268
3,3811.0,3516.47,3494.830,3528.167778,3815.040513,3697.714204,3562.909074,4816.2,3306
4,3703.0,3193.87,3150.500,3172.463333,3094.613514,3305.332751,3135.145278,4082.0,2852
...,...,...,...,...,...,...,...,...,...
8086,662.0,847.67,835.040,831.014444,816.140701,783.556229,884.838981,1709.4,942
8087,4362.0,4242.78,4137.260,4109.226667,4134.256257,3794.687974,4148.005278,2950.2,4619
8088,1883.0,1893.56,1896.825,1891.366667,1932.706995,1984.751711,1896.953519,1159.0,1883
8089,8979.0,7791.54,7932.295,7953.023333,10160.031667,10086.351755,8371.707130,7508.4,4612


In [26]:
printMetric= lambda label,value:print(f"\t {label}: {round(value,4)}")
for name,m in models.items():
    y_pred = m.predict(X_test)
    print (f"Analyzing -- {name}")
    
    printMetric ("THE RMSE IS:", math.sqrt(mean_squared_error(y_test,y_pred)))


Analyzing -- DecisionTreeRegressor
	 THE RMSE IS:: 789.2719
Analyzing -- RandomForest
	 THE RMSE IS:: 570.993
Analyzing -- RandomForestN200
	 THE RMSE IS:: 568.623
Analyzing -- RandomForestN900
	 THE RMSE IS:: 564.5493
Analyzing -- GradientBoosting
	 THE RMSE IS:: 556.3273
Analyzing -- HistGradinetBoostingClass2
	 THE RMSE IS:: 552.9788
Analyzing -- ExtraTreeRegressor
	 THE RMSE IS:: 542.0826
Analyzing -- neigbor
	 THE RMSE IS:: 3726.496
Analyzing -- tree
	 THE RMSE IS:: 1156.2219


## Grid Search of RandomForest 900

In [None]:

#rfc = RandomForestRegressor()

#grid = GridSearchCV(rfc,params,verbose=1)
#grid.fit(X_train,y_train)

In [None]:
#params = {
 #    "criterion":["mse", "mae"]
#}
#rfc = RandomForestRegressor()

#grid = GridSearchCV(rfc,params,verbose=1)
#grid.fit(X_train,y_train)

## APPLY CLEAN TO PREDICT DATASET

In [27]:
diamond_test.head()


Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
0,0,2.36,Ideal,I,SI2,60.8,54.0,8.68,8.57,5.24
1,1,2.04,Ideal,H,SI2,62.0,56.0,8.18,8.23,5.09
2,2,0.51,Ideal,I,SI1,61.7,54.0,5.18,5.19,3.2
3,3,0.3,Ideal,I,SI1,61.3,56.0,4.32,4.33,2.65
4,4,0.96,Fair,H,VS2,68.8,56.0,6.11,5.98,4.16


In [28]:
clarity = ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF']
cut = ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']
color = ['J', 'H', 'I','E','G','F','D']

In [29]:
diamond_test["clarity"]=pd.Categorical(diamond_test["clarity"],ordered=True,categories=clarity)
diamond_test["cut"]=pd.Categorical(diamond_test['cut'],ordered=True,categories=cut)
diamond_test["color"]=pd.Categorical(diamond_test['color'],ordered=True,categories=color)

In [30]:
diamond_test['value_clarity'] = diamond_test["clarity"].cat.codes
diamond_test['value_cut'] = diamond_test['cut'].cat.codes
diamond_test['value_color'] = diamond_test['color'].cat.codes

diamond_test.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,value_clarity,value_cut,value_color
0,0,2.36,Ideal,I,SI2,60.8,54.0,8.68,8.57,5.24,1,4,2
1,1,2.04,Ideal,H,SI2,62.0,56.0,8.18,8.23,5.09,1,4,1
2,2,0.51,Ideal,I,SI1,61.7,54.0,5.18,5.19,3.2,2,4,2
3,3,0.3,Ideal,I,SI1,61.3,56.0,4.32,4.33,2.65,2,4,2
4,4,0.96,Fair,H,VS2,68.8,56.0,6.11,5.98,4.16,3,0,1


In [31]:
diamond_test= diamond_test.drop(columns=["cut","color",'clarity'])

In [32]:
diamond_test = diamond_test.rename(columns={'x': 'length in mm', 'y': 'width in mm','z': 'depth in mm'})
diamond_test[['length in mm','width in mm','depth in mm']].describe()

Unnamed: 0,length in mm,width in mm,depth in mm
count,13485.0,13485.0,13485.0
mean,5.736456,5.738452,3.542003
std,1.123217,1.114912,0.69401
min,0.0,0.0,0.0
25%,4.72,4.73,2.92
50%,5.7,5.71,3.53
75%,6.53,6.53,4.03
max,10.74,10.54,6.98


In [33]:
diamond["width in mm"]= diamond["width in mm"].replace([0.0], 5.73)
diamond["length in mm"]=diamond["length in mm"].replace([0.0], 5.73)
diamond["depth in mm"]=diamond["depth in mm"].replace([0.0], 3.54)

In [34]:
diamond["width in mm"]=diamond["width in mm"].replace([31.8], 5.73)
diamond["width in mm"]=diamond["width in mm"].replace([58.9], 5.73)
diamond["depth in mm"]=diamond["depth in mm"].replace([31.8], 3.53)

In [35]:

diamond.describe()

Unnamed: 0,id,carat,depth,table,length in mm,width in mm,depth in mm,price,value_clarity,value_cut,value_color
count,40455.0,40455.0,40455.0,40455.0,40455.0,40455.0,40455.0,40455.0,40455.0,40455.0,40455.0
mean,20227.0,0.79715,61.746612,57.453561,5.730382,5.732109,3.538433,3928.715264,3.055321,2.904066,3.37098
std,11678.496907,0.472872,1.431006,2.235668,1.118747,1.110599,0.691713,3985.070609,1.6449,1.115369,1.755401
min,0.0,0.2,43.0,43.0,3.73,3.68,1.07,326.0,0.0,0.0,0.0
25%,10113.5,0.4,61.0,56.0,4.71,4.72,2.91,946.5,2.0,2.0,2.0
50%,20227.0,0.7,61.8,57.0,5.7,5.71,3.53,2398.0,3.0,3.0,4.0
75%,30340.5,1.04,62.5,59.0,6.54,6.54,4.04,5328.5,4.0,4.0,5.0
max,40454.0,4.5,79.0,95.0,10.23,10.16,8.06,18818.0,7.0,4.0,6.0


In [36]:
diamond_test.shape

(13485, 10)

## First model try!! RandomForestRegressor

In [59]:
model = RandomForestRegressor( n_estimators = 900)
model.fit(X, y)

RandomForestRegressor(n_estimators=900)

In [60]:
y_pred = model.predict(diamond_test)

In [61]:
y_pred = pd.DataFrame(y_pred, columns=["price"])
y_pred = y_pred.reset_index()
y_pred = y_pred.rename(columns={'index':'id'})

In [64]:
y_pred.head()


Unnamed: 0,id,price
0,0,14883.84
1,1,15723.211111
2,2,1029.131111
3,3,446.932222
4,4,3477.116667


In [65]:
y_pred.shape

(13485, 2)

In [46]:
y_pred.to_csv("./outputs/RandomForestRegressor1.csv", header= True, index=False)

## Second model - HistGradientBoosting

In [66]:
model = HistGradientBoostingRegressor()
model.fit(X, y)



HistGradientBoostingRegressor()

In [67]:
y_pred = model.predict(diamond_test)

In [68]:
y_pred = pd.DataFrame(y_pred, columns=["price"])
y_pred = y_pred.reset_index()
y_pred = y_pred.rename(columns={'index':'id'})

In [69]:
y_pred.head()

Unnamed: 0,id,price
0,0,16200.787449
1,1,15835.880476
2,2,1173.671991
3,3,459.821127
4,4,3219.336099


In [70]:
y_pred.to_csv("./outputs/HistGradeint.csv", header= True, index=False)

## third model -- GradientBoosting
   

In [71]:
model = GradientBoostingRegressor()
model.fit(X, y)

GradientBoostingRegressor()

In [72]:
y_pred = model.predict(diamond_test)

In [73]:
y_pred = pd.DataFrame(y_pred, columns=["price"])
y_pred = y_pred.reset_index()
y_pred = y_pred.rename(columns={'index':'id'})

In [74]:
y_pred.head()

Unnamed: 0,id,price
0,0,15385.45216
1,1,14956.534142
2,2,1303.19268
3,3,394.430098
4,4,3632.383214


In [75]:
y_pred.to_csv("./outputs/GradientBoosting.csv", header= True, index=False)

## FOURTH 

In [76]:
model = ExtraTreesRegressor()
model.fit(X, y)

ExtraTreesRegressor()

In [77]:
y_pred = model.predict(diamond_test)

In [78]:
y_pred = pd.DataFrame(y_pred, columns=["price"])
y_pred = y_pred.reset_index()
y_pred = y_pred.rename(columns={'index':'id'})

In [79]:
y_pred.head()

Unnamed: 0,id,price
0,0,14646.11
1,1,15447.86
2,2,1004.5
3,3,457.92
4,4,3846.59


In [None]:
y_pred.to_csv("./outputs/ExtraTreesRegressor1.csv", header= True, index=False)