## Diamond Project

In [1]:
import pandas as pd
import numpy as np

In [2]:
diamond = pd.read_csv(('./diamonds-datamad0620/train.csv'))
diamond_test = pd.read_csv(('./diamonds-datamad0620/predict.csv'))


In [3]:
diamond.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,0.3,Very Good,F,VS2,62.8,56.0,4.29,4.31,2.7,605
1,1,0.34,Ideal,E,SI1,62.6,55.0,4.46,4.49,2.8,565
2,2,0.4,Very Good,D,SI1,60.3,62.0,4.7,4.75,2.85,720
3,3,0.4,Premium,H,VS1,61.8,59.2,4.72,4.74,2.92,793
4,4,0.9,Very Good,D,SI1,61.0,63.0,6.1,6.13,3.73,4381


In [4]:
diamond.shape

(40455, 11)

Features
- id: only for test & sample submission files, id for prediction sample identification
- price: price in USD
- carat: weight of the diamond
- cut: quality of the cut (Fair, Good, Very Good, Premium, Ideal)
- color: diamond colour, from J (worst) to D (best)
- clarity: a measurement of how clear the diamond is (I1 (worst), SI2, SI1, VS2, VS1, VVS2, VVS1, IF (best))
- x: length in mm
- y: width in mm
- z: depth in mm
- depth: total depth percentage = z / mean(x, y) = 2 * z / (x + y) (43--79)
- table: width of top of diamond relative to widest point (43--95)

In [5]:
diamond.info()
# no nun values 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40455 entries, 0 to 40454
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   id       40455 non-null  int64  
 1   carat    40455 non-null  float64
 2   cut      40455 non-null  object 
 3   color    40455 non-null  object 
 4   clarity  40455 non-null  object 
 5   depth    40455 non-null  float64
 6   table    40455 non-null  float64
 7   x        40455 non-null  float64
 8   y        40455 non-null  float64
 9   z        40455 non-null  float64
 10  price    40455 non-null  int64  
dtypes: float64(6), int64(2), object(3)
memory usage: 3.4+ MB


> Its important to pass the cut, clarity and color columns to numerical

In [6]:
clarity = ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF']
cut = ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']
color = ['J', 'H', 'I','E','G','F','D']

In [7]:
diamond["clarity"]=pd.Categorical(diamond["clarity"],ordered=True,categories=clarity)
diamond["cut"]=pd.Categorical(diamond['cut'],ordered=True,categories=cut)
diamond["color"]=pd.Categorical(diamond['color'],ordered=True,categories=color)

In [8]:
diamond['value_clarity'] = diamond["clarity"].cat.codes
diamond['value_cut'] = diamond['cut'].cat.codes
diamond['value_color'] = diamond['color'].cat.codes

diamond.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price,value_clarity,value_cut,value_color
0,0,0.3,Very Good,F,VS2,62.8,56.0,4.29,4.31,2.7,605,3,2,5
1,1,0.34,Ideal,E,SI1,62.6,55.0,4.46,4.49,2.8,565,2,4,3
2,2,0.4,Very Good,D,SI1,60.3,62.0,4.7,4.75,2.85,720,2,2,6
3,3,0.4,Premium,H,VS1,61.8,59.2,4.72,4.74,2.92,793,4,3,1
4,4,0.9,Very Good,D,SI1,61.0,63.0,6.1,6.13,3.73,4381,2,2,6


In [9]:
diamond= diamond.drop(columns=["cut","color",'clarity'])

> Lets start looking at other columns in the data set, like (x,y,z which i will rename): 


In [10]:
diamond = diamond.rename(columns={'x': 'length in mm', 'y': 'width in mm','z': 'depth in mm'})
diamond[['length in mm','width in mm','depth in mm']].describe()

Unnamed: 0,length in mm,width in mm,depth in mm
count,40455.0,40455.0,40455.0
mean,5.729391,5.733217,3.537644
std,1.121283,1.151076,0.709557
min,0.0,0.0,0.0
25%,4.71,4.72,2.91
50%,5.7,5.71,3.53
75%,6.54,6.54,4.04
max,10.23,58.9,31.8


> By doing the describe we can notice that width, depth and length have 0.0 min values. These is odd, I will replace them for there mean values.

> Also de max value of width looks to high. I will investigate on the matter

In [11]:

diamond["width in mm"]= diamond["width in mm"].replace([0.0], 5.73)
diamond["length in mm"]=diamond["length in mm"].replace([0.0], 5.73)
diamond["depth in mm"]=diamond["depth in mm"].replace([0.0], 3.54)

In [12]:
# min values changed! 
diamond[['length in mm','width in mm','depth in mm']].describe()

Unnamed: 0,length in mm,width in mm,depth in mm
count,40455.0,40455.0,40455.0
mean,5.730382,5.734067,3.539132
std,1.118747,1.148956,0.70584
min,3.73,3.68,1.07
25%,4.71,4.72,2.91
50%,5.7,5.71,3.53
75%,6.54,6.54,4.04
max,10.23,58.9,31.8



> diamond["length in mm"].unique() # after analyzing all the individual unique values I conclude that there is nothing out or the ordinary
> diamond["depth in mm"].unique()#after analyzing all the individual unique values I conclude that there is one value out of the ordinary
> diamond["width in mm"].unique()#after analyzing all the individual unique values I conclude that there are values out of the ordinary

In [13]:
display (diamond[diamond["width in mm"]>12])
diamond[diamond["depth in mm"]>10]
# these values must have been wrongly written. Im gonna drop them

Unnamed: 0,id,carat,depth,table,length in mm,width in mm,depth in mm,price,value_clarity,value_cut,value_color
12402,12402,0.51,61.8,55.0,5.15,31.8,5.12,2075,4,4,3
27676,27676,2.0,58.9,57.0,8.09,58.9,8.06,12210,1,3,1


Unnamed: 0,id,carat,depth,table,length in mm,width in mm,depth in mm,price,value_clarity,value_cut,value_color
38759,38759,0.51,61.8,54.7,5.12,5.15,31.8,1970,4,2,3


In [14]:
diamond["width in mm"]=diamond["width in mm"].replace([31.8], 5.73)
diamond["width in mm"]=diamond["width in mm"].replace([58.9], 5.73)
diamond["depth in mm"]=diamond["depth in mm"].replace([31.8], 3.53)

In [15]:
diamond[['length in mm','width in mm','depth in mm']].describe()

Unnamed: 0,length in mm,width in mm,depth in mm
count,40455.0,40455.0,40455.0
mean,5.730382,5.732109,3.538433
std,1.118747,1.110599,0.691713
min,3.73,3.68,1.07
25%,4.71,4.72,2.91
50%,5.7,5.71,3.53
75%,6.54,6.54,4.04
max,10.23,10.16,8.06


In [16]:
diamond.head()

Unnamed: 0,id,carat,depth,table,length in mm,width in mm,depth in mm,price,value_clarity,value_cut,value_color
0,0,0.3,62.8,56.0,4.29,4.31,2.7,605,3,2,5
1,1,0.34,62.6,55.0,4.46,4.49,2.8,565,2,4,3
2,2,0.4,60.3,62.0,4.7,4.75,2.85,720,2,2,6
3,3,0.4,61.8,59.2,4.72,4.74,2.92,793,4,3,1
4,4,0.9,61.0,63.0,6.1,6.13,3.73,4381,2,2,6


 > Hemos cambiado las columnas ya que habia valores que estaban fuera de rango.
## considerar si cargarse table??

## DIVIDE THE COLUMNS TO CREATE X and y 

In [17]:
X = diamond.drop(columns="price")
y = diamond["price"]


In [69]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
import math
from sklearn.metrics import mean_squared_error
from sklearn.experimental import enable_hist_gradient_boosting  
from sklearn.ensemble import HistGradientBoostingClassifier

In [70]:
X_train, X_test, y_train, Y_test = train_test_split(X, y, test_size=0.2, random_state=0)

> Training different models

In [81]:
models = {
   #"DecisionTreeRegressor": DecisionTreeRegressor(),
    #"RandomForest" : RandomForestRegressor(),
    #"RandomForestN200" : RandomForestRegressor(n_estimators = 200),
    "RandomForestN900" : RandomForestRegressor(n_estimators = 2500),
    #"GradientBoosting" : GradientBoostingRegressor(n_estimators = 900),
    #"HistGradinetBoostingClass": HistGradientBoostingClassifier(n_estimators = 900)
 
}

In [None]:
for name, model in models.items():
    print(f"Training {name}")
    model.fit(X_train,y_train)
print("He acabado :) ")

Training RandomForestN900


In [22]:
comparar= {modelName:model.predict(X_test) for modelName, model in models.items()}
df = pd.DataFrame(comparar)
df

Unnamed: 0,DecisionTreeRegressor,RandomForest,RandomForestN200,RandomForestN900,GradientBoosting
0,1656.0,1652.26,1654.895,1653.957778,1723.766809
1,1972.0,1957.98,1972.080,1981.211111,2000.145530
2,8820.0,7828.61,8034.835,7886.158889,8047.919192
3,3811.0,3522.90,3566.425,3528.408889,3599.564718
4,3703.0,3176.60,3201.910,3159.207778,3152.246696
...,...,...,...,...,...
8086,662.0,833.16,816.980,835.147778,807.189096
8087,4619.0,4228.80,4102.530,4133.003333,3912.814439
8088,1883.0,1904.97,1892.665,1895.206667,1959.587478
8089,8707.0,8061.35,7888.890,7931.085556,8947.250428


In [23]:
printMetric= lambda label,value:print(f"\t {label}: {round(value,4)}")
for name,m in models.items():
    y_pred = m.predict(X)
    print (f"Analyzing -- {name}")
    
    printMetric ("THE RMSE IS:", math.sqrt(mean_squared_error(y,y_pred)))


Analyzing -- DecisionTreeRegressor
	 THE RMSE IS:: 352.0947
Analyzing -- RandomForest
	 THE RMSE IS:: 315.8324
Analyzing -- RandomForestN200
	 THE RMSE IS:: 314.1056
Analyzing -- RandomForestN900
	 THE RMSE IS:: 312.9187
Analyzing -- GradientBoosting
	 THE RMSE IS:: 633.2111


## APPLY CLEAN TO PREDICT DATASET

In [24]:
diamond_test.head()


Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
0,0,2.36,Ideal,I,SI2,60.8,54.0,8.68,8.57,5.24
1,1,2.04,Ideal,H,SI2,62.0,56.0,8.18,8.23,5.09
2,2,0.51,Ideal,I,SI1,61.7,54.0,5.18,5.19,3.2
3,3,0.3,Ideal,I,SI1,61.3,56.0,4.32,4.33,2.65
4,4,0.96,Fair,H,VS2,68.8,56.0,6.11,5.98,4.16


In [25]:
clarity = ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF']
cut = ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']
color = ['J', 'H', 'I','E','G','F','D']

In [26]:
diamond_test["clarity"]=pd.Categorical(diamond_test["clarity"],ordered=True,categories=clarity)
diamond_test["cut"]=pd.Categorical(diamond_test['cut'],ordered=True,categories=cut)
diamond_test["color"]=pd.Categorical(diamond_test['color'],ordered=True,categories=color)

In [27]:
diamond_test['value_clarity'] = diamond_test["clarity"].cat.codes
diamond_test['value_cut'] = diamond_test['cut'].cat.codes
diamond_test['value_color'] = diamond_test['color'].cat.codes

diamond_test.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,value_clarity,value_cut,value_color
0,0,2.36,Ideal,I,SI2,60.8,54.0,8.68,8.57,5.24,1,4,2
1,1,2.04,Ideal,H,SI2,62.0,56.0,8.18,8.23,5.09,1,4,1
2,2,0.51,Ideal,I,SI1,61.7,54.0,5.18,5.19,3.2,2,4,2
3,3,0.3,Ideal,I,SI1,61.3,56.0,4.32,4.33,2.65,2,4,2
4,4,0.96,Fair,H,VS2,68.8,56.0,6.11,5.98,4.16,3,0,1


In [28]:
diamond_test= diamond_test.drop(columns=["cut","color",'clarity'])

In [29]:
diamond_test = diamond_test.rename(columns={'x': 'length in mm', 'y': 'width in mm','z': 'depth in mm'})
diamond_test[['length in mm','width in mm','depth in mm']].describe()

Unnamed: 0,length in mm,width in mm,depth in mm
count,13485.0,13485.0,13485.0
mean,5.736456,5.738452,3.542003
std,1.123217,1.114912,0.69401
min,0.0,0.0,0.0
25%,4.72,4.73,2.92
50%,5.7,5.71,3.53
75%,6.53,6.53,4.03
max,10.74,10.54,6.98


In [30]:
diamond["width in mm"]= diamond["width in mm"].replace([0.0], 5.73)
diamond["length in mm"]=diamond["length in mm"].replace([0.0], 5.73)
diamond["depth in mm"]=diamond["depth in mm"].replace([0.0], 3.54)

In [31]:
diamond["width in mm"]=diamond["width in mm"].replace([31.8], 5.73)
diamond["width in mm"]=diamond["width in mm"].replace([58.9], 5.73)
diamond["depth in mm"]=diamond["depth in mm"].replace([31.8], 3.53)

In [32]:
#diamond_one = diamond_test[(diamond_test["width in mm"]<=12)]
#diamond_predict = diamond_one[(diamond_test["depth in mm"]<=10)]

In [33]:
diamond_test.shape

(13485, 10)

## First model try!! RandomForestRegressor

In [34]:
model = RandomForestRegressor( n_estimators = 900)
model.fit(X, y)

RandomForestRegressor(n_estimators=900)

In [50]:
y_pred = model.predict(diamond_test)

In [58]:
y_pred = pd.DataFrame(y_pred, columns=["price"])
y_pred = y_pred.reset_index()
y_pred = y_pred.rename(columns={'index':'id'})

In [59]:
y_pred.head()


Unnamed: 0,id,price
0,0,14953.352222
1,1,15715.883333
2,2,1034.164444
3,3,452.847778
4,4,3483.603333


In [60]:
y_pred.shape

(13485, 2)

In [65]:
y_pred.to_csv("./outputs/RandomForestRegressor.csv", header= True, index=False)