# LIBRARIES

In [1]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

In [2]:
dtrain = pd.read_csv("Data/train.csv")


# EXPLORE THE DATASET

In [3]:
dtrain.shape

(40455, 11)

In [4]:
dtrain.head(1)

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,0.3,Premium,D,SI2,62.4,58.0,4.31,4.28,2.68,6.353


In [5]:
dtrain.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40455 entries, 0 to 40454
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   id       40455 non-null  int64  
 1   carat    40455 non-null  float64
 2   cut      40455 non-null  object 
 3   color    40455 non-null  object 
 4   clarity  40455 non-null  object 
 5   depth    40455 non-null  float64
 6   table    40455 non-null  float64
 7   x        40455 non-null  float64
 8   y        40455 non-null  float64
 9   z        40455 non-null  float64
 10  price    40455 non-null  float64
dtypes: float64(7), int64(1), object(3)
memory usage: 3.4+ MB


In [6]:
(dtrain.color.unique())

array(['D', 'E', 'F', 'G', 'H', 'I', 'J'], dtype=object)

In [7]:
dtrain.cut.unique()

array(['Premium', 'Ideal', 'Very Good', 'Fair', 'Good'], dtype=object)

In [8]:
(dtrain.clarity.unique())

array(['SI2', 'VVS2', 'VS2', 'VS1', 'SI1', 'VVS1', 'IF', 'I1'],
      dtype=object)

In [9]:
# Le damos un valor dependiendo de la claridad, cuanto mayor el numero, mayor claridad

In [10]:
#primero vamos a ver la relacion entre las 4cs. Lo mas importante a la hora de determinar el precio de los diamantes.

In [11]:
clrarity_ranking = {'SI2': 2, 'VVS2': 6, 'VS2': 4, 'VS1': 5,'SI1': 3, 'VVS1': 7, 'IF': 8,'I1':1}

In [12]:
color_ranking = {'D': 7,  'E': 6, 'F': 5, 'G': 4,'H': 3, 'I': 2, 'J': 1,}    

In [13]:
cut_ranking = {'Premium':4,'Ideal': 5,'Very Good':3,'Fair': 2,'Good': 1,}    

In [14]:
def transfromar_a_categoricas(df):
    df.clarity = df.clarity.map(clrarity_ranking)
    df.color = df.color.map(color_ranking)
    df.cut = df.cut.map(cut_ranking)
    return df
    
# Transformamos las variables a categoricas.

In [15]:
df_train = transfromar_a_categoricas(dtrain)

In [16]:
X = df_train.drop('price', axis=1)
y = df_train.price

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [18]:
X_train.shape

(32364, 10)

In [19]:
X_test.shape

(8091, 10)

In [20]:
y_test.shape

(8091,)

In [21]:
y_train.shape

(32364,)

# PREDICTIONS

In [22]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error as mse
from sklearn import metrics

In [23]:
dtrain.drop('id',axis = 1, inplace=True)

In [24]:
dtrain.corr()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
carat,1.0,-0.112795,-0.293307,-0.353255,0.023585,0.185478,0.974648,0.944419,0.947469,0.920064
cut,-0.112795,1.0,0.012338,0.174997,-0.186604,-0.412244,-0.106041,-0.103462,-0.125538,-0.072587
color,-0.293307,0.012338,1.0,-0.023444,-0.047839,-0.028516,-0.272222,-0.263498,-0.268702,-0.157426
clarity,-0.353255,0.174997,-0.023444,1.0,-0.066904,-0.161419,-0.371745,-0.355509,-0.364293,-0.213649
depth,0.023585,-0.186604,-0.047839,-0.066904,1.0,-0.299203,-0.029867,-0.03396,0.089474,-0.003292
table,0.185478,-0.412244,-0.028516,-0.161419,-0.299203,1.0,0.200374,0.186788,0.153726,0.162804
x,0.974648,-0.106041,-0.272222,-0.371745,-0.029867,0.200374,1.0,0.967315,0.965496,0.957668
y,0.944419,-0.103462,-0.263498,-0.355509,-0.03396,0.186788,0.967315,1.0,0.941176,0.928909
z,0.947469,-0.125538,-0.268702,-0.364293,0.089474,0.153726,0.965496,0.941176,1.0,0.930014
price,0.920064,-0.072587,-0.157426,-0.213649,-0.003292,0.162804,0.957668,0.928909,0.930014,1.0


In [25]:
X = df_train.drop('price', axis=1)
y = df_train.price

# DECISION TREE REGRESSION

In [26]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import plot_tree
from sklearn.metrics import mean_squared_error

In [27]:
results = []

for depth in range(1, 30):
    model = DecisionTreeRegressor(max_depth=depth, random_state=222)
    model.fit(X_train, y_train)
    
    result = {
        "model": model,
        "depth": depth,
        "train_error": mean_squared_error(y_train, model.predict(X_train)),
        "test_error": mean_squared_error(y_test, model.predict(X_test))
    }
    
    results.append(result)

In [28]:
results_df = pd.DataFrame(results)

In [29]:
results_df

Unnamed: 0,model,depth,train_error,test_error
0,"DecisionTreeRegressor(max_depth=1, random_stat...",1,0.2836509,0.288411
1,"DecisionTreeRegressor(max_depth=2, random_stat...",2,0.1241126,0.124528
2,"DecisionTreeRegressor(max_depth=3, random_stat...",3,0.0734994,0.07293
3,"DecisionTreeRegressor(max_depth=4, random_stat...",4,0.05324591,0.053636
4,"DecisionTreeRegressor(max_depth=5, random_stat...",5,0.03937477,0.039973
5,"DecisionTreeRegressor(max_depth=6, random_stat...",6,0.02844959,0.029545
6,"DecisionTreeRegressor(max_depth=7, random_stat...",7,0.02126514,0.023429
7,"DecisionTreeRegressor(max_depth=8, random_stat...",8,0.01696827,0.019708
8,"DecisionTreeRegressor(max_depth=9, random_stat...",9,0.01366796,0.0172
9,"DecisionTreeRegressor(max_depth=10, random_sta...",10,0.0109166,0.015632


In [30]:
results_df.test_error.min()
# Depth must be 12

0.013943631054637041

In [31]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

In [32]:
gs = GridSearchCV(
    estimator=DecisionTreeRegressor(),
    param_grid={
        "max_depth": [16,14,15],
        "min_samples_split": [20, 30, 40],
    },
    cv=5,
    verbose=3,
    scoring="neg_mean_squared_error",
    return_train_score=True
)

In [33]:
gs.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 1/5] END max_depth=16, min_samples_split=20;, score=(train=-0.006, test=-0.013) total time=   0.1s
[CV 2/5] END max_depth=16, min_samples_split=20;, score=(train=-0.005, test=-0.013) total time=   0.1s
[CV 3/5] END max_depth=16, min_samples_split=20;, score=(train=-0.006, test=-0.014) total time=   0.1s
[CV 4/5] END max_depth=16, min_samples_split=20;, score=(train=-0.006, test=-0.014) total time=   0.1s
[CV 5/5] END max_depth=16, min_samples_split=20;, score=(train=-0.005, test=-0.013) total time=   0.1s
[CV 1/5] END max_depth=16, min_samples_split=30;, score=(train=-0.007, test=-0.012) total time=   0.1s
[CV 2/5] END max_depth=16, min_samples_split=30;, score=(train=-0.007, test=-0.013) total time=   0.1s
[CV 3/5] END max_depth=16, min_samples_split=30;, score=(train=-0.007, test=-0.014) total time=   0.1s
[CV 4/5] END max_depth=16, min_samples_split=30;, score=(train=-0.007, test=-0.014) total time=   0.1s
[CV 5/5] END 

GridSearchCV(cv=5, estimator=DecisionTreeRegressor(),
             param_grid={'max_depth': [16, 14, 15],
                         'min_samples_split': [20, 30, 40]},
             return_train_score=True, scoring='neg_mean_squared_error',
             verbose=3)

In [34]:
grid_search_results = pd.DataFrame(gs.cv_results_)
# we only keep some of the information
grid_search_results = grid_search_results[['param_max_depth', 'param_min_samples_split',
       'mean_test_score', 'mean_train_score']]

In [35]:
grid_search_results.sort_values("mean_test_score", ascending=False)

Unnamed: 0,param_max_depth,param_min_samples_split,mean_test_score,mean_train_score
1,16,30,-0.013138,-0.006854
3,14,20,-0.013233,-0.006154
7,15,30,-0.01328,-0.00701
4,14,30,-0.013295,-0.007282
6,15,20,-0.013429,-0.005726
8,15,40,-0.013442,-0.008075
2,16,40,-0.013462,-0.007982
0,16,20,-0.01348,-0.005475
5,14,40,-0.013529,-0.008254


In [36]:
gs1 = GridSearchCV(
    estimator=DecisionTreeRegressor(),
    param_grid={
        "max_depth": [16,14,15],
        "min_samples_split": [35, 30, 25],
    },
    cv=5,
    verbose=3,
    scoring="neg_mean_squared_error",
    return_train_score=True
)

In [37]:
gs1.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 1/5] END max_depth=16, min_samples_split=35;, score=(train=-0.007, test=-0.012) total time=   0.1s
[CV 2/5] END max_depth=16, min_samples_split=35;, score=(train=-0.007, test=-0.013) total time=   0.1s
[CV 3/5] END max_depth=16, min_samples_split=35;, score=(train=-0.007, test=-0.014) total time=   0.1s
[CV 4/5] END max_depth=16, min_samples_split=35;, score=(train=-0.008, test=-0.014) total time=   0.1s
[CV 5/5] END max_depth=16, min_samples_split=35;, score=(train=-0.007, test=-0.013) total time=   0.1s
[CV 1/5] END max_depth=16, min_samples_split=30;, score=(train=-0.007, test=-0.013) total time=   0.1s
[CV 2/5] END max_depth=16, min_samples_split=30;, score=(train=-0.007, test=-0.013) total time=   0.1s
[CV 3/5] END max_depth=16, min_samples_split=30;, score=(train=-0.007, test=-0.014) total time=   0.1s
[CV 4/5] END max_depth=16, min_samples_split=30;, score=(train=-0.007, test=-0.014) total time=   0.1s
[CV 5/5] END 

GridSearchCV(cv=5, estimator=DecisionTreeRegressor(),
             param_grid={'max_depth': [16, 14, 15],
                         'min_samples_split': [35, 30, 25]},
             return_train_score=True, scoring='neg_mean_squared_error',
             verbose=3)

In [38]:
grid_search_results = pd.DataFrame(gs1.cv_results_)
# we only keep some of the information
grid_search_results = grid_search_results[['param_max_depth', 'param_min_samples_split',
       'mean_test_score', 'mean_train_score']]

In [39]:
grid_search_results.sort_values("mean_test_score", ascending=False)

Unnamed: 0,param_max_depth,param_min_samples_split,mean_test_score,mean_train_score
4,14,30,-0.013136,-0.007282
3,14,35,-0.013211,-0.007753
8,15,25,-0.013219,-0.0064
1,16,30,-0.01329,-0.006854
0,16,35,-0.013301,-0.007408
7,15,30,-0.013304,-0.00701
6,15,35,-0.013307,-0.00753
2,16,25,-0.013313,-0.006206
5,14,25,-0.013347,-0.006744


In [40]:
gs2 = GridSearchCV(
    estimator=DecisionTreeRegressor(),
    param_grid={
        "max_depth": [16,17,15],
        "min_samples_split": [35, 38, 32],
    },
    cv=5,
    verbose=3,
    scoring="neg_mean_squared_error",
    return_train_score=True
)

In [41]:
gs2.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 1/5] END max_depth=16, min_samples_split=35;, score=(train=-0.007, test=-0.012) total time=   0.1s
[CV 2/5] END max_depth=16, min_samples_split=35;, score=(train=-0.007, test=-0.013) total time=   0.1s
[CV 3/5] END max_depth=16, min_samples_split=35;, score=(train=-0.007, test=-0.014) total time=   0.1s
[CV 4/5] END max_depth=16, min_samples_split=35;, score=(train=-0.008, test=-0.014) total time=   0.1s
[CV 5/5] END max_depth=16, min_samples_split=35;, score=(train=-0.007, test=-0.013) total time=   0.1s
[CV 1/5] END max_depth=16, min_samples_split=38;, score=(train=-0.008, test=-0.013) total time=   0.1s
[CV 2/5] END max_depth=16, min_samples_split=38;, score=(train=-0.008, test=-0.013) total time=   0.1s
[CV 3/5] END max_depth=16, min_samples_split=38;, score=(train=-0.008, test=-0.014) total time=   0.1s
[CV 4/5] END max_depth=16, min_samples_split=38;, score=(train=-0.008, test=-0.014) total time=   0.1s
[CV 5/5] END 

GridSearchCV(cv=5, estimator=DecisionTreeRegressor(),
             param_grid={'max_depth': [16, 17, 15],
                         'min_samples_split': [35, 38, 32]},
             return_train_score=True, scoring='neg_mean_squared_error',
             verbose=3)

In [42]:
grid_search_results = pd.DataFrame(gs2.cv_results_)
# we only keep some of the information
grid_search_results = grid_search_results[['param_max_depth', 'param_min_samples_split',
       'mean_test_score', 'mean_train_score']]

In [43]:
grid_search_results.sort_values("mean_test_score", ascending=False)

Unnamed: 0,param_max_depth,param_min_samples_split,mean_test_score,mean_train_score
2,16,32,-0.013209,-0.007071
8,15,32,-0.013232,-0.007214
0,16,35,-0.013279,-0.007408
6,15,35,-0.013318,-0.00753
5,17,32,-0.013352,-0.007015
1,16,38,-0.013414,-0.007775
3,17,35,-0.013416,-0.007361
7,15,38,-0.013425,-0.007876
4,17,38,-0.01343,-0.007734


In [44]:
gs3 = GridSearchCV(
    estimator=DecisionTreeRegressor(),
    param_grid={
        "max_depth": [16,17,15],
        "min_samples_split": [32, 34, 30],
    },
    cv=5,
    verbose=3,
    scoring="neg_mean_squared_error",
    return_train_score=True
)

In [45]:
gs3.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 1/5] END max_depth=16, min_samples_split=32;, score=(train=-0.007, test=-0.012) total time=   0.1s
[CV 2/5] END max_depth=16, min_samples_split=32;, score=(train=-0.007, test=-0.013) total time=   0.1s
[CV 3/5] END max_depth=16, min_samples_split=32;, score=(train=-0.007, test=-0.014) total time=   0.1s
[CV 4/5] END max_depth=16, min_samples_split=32;, score=(train=-0.007, test=-0.014) total time=   0.1s
[CV 5/5] END max_depth=16, min_samples_split=32;, score=(train=-0.007, test=-0.013) total time=   0.1s
[CV 1/5] END max_depth=16, min_samples_split=34;, score=(train=-0.007, test=-0.012) total time=   0.1s
[CV 2/5] END max_depth=16, min_samples_split=34;, score=(train=-0.007, test=-0.013) total time=   0.1s
[CV 3/5] END max_depth=16, min_samples_split=34;, score=(train=-0.007, test=-0.014) total time=   0.1s
[CV 4/5] END max_depth=16, min_samples_split=34;, score=(train=-0.007, test=-0.014) total time=   0.1s
[CV 5/5] END 

GridSearchCV(cv=5, estimator=DecisionTreeRegressor(),
             param_grid={'max_depth': [16, 17, 15],
                         'min_samples_split': [32, 34, 30]},
             return_train_score=True, scoring='neg_mean_squared_error',
             verbose=3)

In [46]:

grid_search_results = pd.DataFrame(gs3.cv_results_)
# we only keep some of the information
grid_search_results = grid_search_results[['param_max_depth', 'param_min_samples_split',
       'mean_test_score', 'mean_train_score']]

In [47]:
grid_search_results.sort_values("mean_test_score", ascending=False)

Unnamed: 0,param_max_depth,param_min_samples_split,mean_test_score,mean_train_score
2,16,30,-0.013133,-0.006854
6,15,32,-0.013276,-0.007214
8,15,30,-0.013279,-0.00701
1,16,34,-0.013313,-0.007299
0,16,32,-0.013316,-0.007071
5,17,30,-0.013326,-0.006789
4,17,34,-0.013338,-0.007249
3,17,32,-0.013345,-0.007015
7,15,34,-0.013378,-0.007428
