In [98]:
# imports 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split


from sklearn import linear_model
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor


from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler


from sklearn.metrics import mean_squared_error


In [103]:
# Load the cleaned training data; the first CSV column is used as the row index.
df= pd.read_csv('../data/train/diamonds_train_cleaned.csv', index_col=[0])
df

Unnamed: 0,depth,table,x,y,z,price,carat,cut,color,clarity,city
0,62.4,58.0,6.83,6.79,4.25,4268,1.21,Premium,J,VS2,Dubai
1,61.6,58.0,6.40,6.35,3.93,3513,1.02,Premium,J,VS2,Dubai
2,62.3,58.0,5.86,5.80,3.63,1792,0.77,Premium,J,VS2,Dubai
3,59.6,60.0,7.58,7.48,4.49,7553,1.51,Premium,J,VS2,Dubai
4,60.2,62.0,5.40,5.33,3.23,1176,0.57,Premium,J,VS2,Dubai
...,...,...,...,...,...,...,...,...,...,...,...
40450,62.2,54.0,5.24,5.27,3.27,2729,0.54,Ideal,F,IF,Surat
40451,61.9,54.0,5.22,5.25,3.24,2802,0.53,Ideal,F,IF,Surat
40452,62.3,55.0,4.30,4.34,2.69,886,0.30,Ideal,F,IF,Surat
40453,60.9,55.0,4.15,4.23,2.55,768,0.26,Ideal,F,IF,Surat


Como hemos visto que x, y y z están muy correlacionadas entre sí, vamos a probar a eliminarlas.

In [104]:
#df.drop(labels=['index_id','carat'], axis=1, inplace= True)

In [105]:
# Summarise the frame: column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40425 entries, 0 to 40454
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   depth    40425 non-null  float64
 1   table    40425 non-null  float64
 2   x        40425 non-null  float64
 3   y        40425 non-null  float64
 4   z        40425 non-null  float64
 5   price    40425 non-null  int64  
 6   carat    40425 non-null  float64
 7   cut      40425 non-null  object 
 8   color    40425 non-null  object 
 9   clarity  40425 non-null  object 
 10  city     40425 non-null  object 
dtypes: float64(6), int64(1), object(4)
memory usage: 3.7+ MB


In [106]:
# categorical features: summarise each one by its distinct values

cat_cols = ['cut', 'color', 'clarity', 'city']


def _summarise_levels(column_name):
    """Return one summary record (name, number of distinct values, values) for a column of `df`."""
    levels = df[column_name].unique()
    return {
        "categorical_variable": column_name,
        "number_of_possible_values": len(levels),
        "values": levels,
    }


cat_list = [_summarise_levels(name) for name in cat_cols]

# Columns with the most distinct values come first.
categories = (
    pd.DataFrame(cat_list)
    .sort_values(by="number_of_possible_values", ascending=False)
    .reset_index(drop=True)
)
categories

Unnamed: 0,categorical_variable,number_of_possible_values,values
0,city,13,"[Dubai, Luxembourg, New York City, Antwerp, Ma..."
1,clarity,8,"[VS2, VVS2, SI1, VS1, SI2, I1, VVS1, IF]"
2,color,7,"[J, E, I, G, D, H, F]"
3,cut,5,"[Premium, Very Good, Fair, Good, Ideal]"


Por el momento, no vamos a tener en cuenta la columna de la ciudad, ya que hay demasiados valores.

In [107]:
# One-hot encode cut/color/clarity (first level of each dropped) and leave `city` out.
encoded_columns = ['cut', 'color', 'clarity']
df_one_hot_encoding = (
    pd.get_dummies(df, columns=encoded_columns, drop_first=True)
    .drop(columns='city')
)
df_one_hot_encoding

Unnamed: 0,depth,table,x,y,z,price,carat,cut_Good,cut_Ideal,cut_Premium,...,color_H,color_I,color_J,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,62.4,58.0,6.83,6.79,4.25,4268,1.21,0,0,1,...,0,0,1,0,0,0,0,1,0,0
1,61.6,58.0,6.40,6.35,3.93,3513,1.02,0,0,1,...,0,0,1,0,0,0,0,1,0,0
2,62.3,58.0,5.86,5.80,3.63,1792,0.77,0,0,1,...,0,0,1,0,0,0,0,1,0,0
3,59.6,60.0,7.58,7.48,4.49,7553,1.51,0,0,1,...,0,0,1,0,0,0,0,1,0,0
4,60.2,62.0,5.40,5.33,3.23,1176,0.57,0,0,1,...,0,0,1,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40450,62.2,54.0,5.24,5.27,3.27,2729,0.54,0,1,0,...,0,0,0,1,0,0,0,0,0,0
40451,61.9,54.0,5.22,5.25,3.24,2802,0.53,0,1,0,...,0,0,0,1,0,0,0,0,0,0
40452,62.3,55.0,4.30,4.34,2.69,886,0.30,0,1,0,...,0,0,0,1,0,0,0,0,0,0
40453,60.9,55.0,4.15,4.23,2.55,768,0.26,0,1,0,...,0,0,0,1,0,0,0,0,0,0


In [108]:
# List the column names produced by the one-hot encoding step.
df_one_hot_encoding.columns

Index(['depth', 'table', 'x', 'y', 'z', 'price', 'carat', 'cut_Good',
       'cut_Ideal', 'cut_Premium', 'cut_Very Good', 'color_E', 'color_F',
       'color_G', 'color_H', 'color_I', 'color_J', 'clarity_IF', 'clarity_SI1',
       'clarity_SI2', 'clarity_VS1', 'clarity_VS2', 'clarity_VVS1',
       'clarity_VVS2'],
      dtype='object')

In [111]:
# Feature columns taken from the one-hot encoded training frame.
x_cols = ['depth', 'table', 'carat', 'x', 'y', 'z', 'cut_Good',
          'cut_Ideal', 'cut_Premium', 'cut_Very Good', 'color_E', 'color_F',
          'color_G', 'color_H', 'color_I', 'color_J', 'clarity_IF', 'clarity_SI1',
          'clarity_SI2', 'clarity_VS1', 'clarity_VS2', 'clarity_VVS1',
          'clarity_VVS2']

# Assigning a new column directly onto `df_one_hot_encoding[x_cols]` triggers
# pandas' SettingWithCopyWarning; `assign` returns a new, independent frame
# with `dimensions` (the sum of x, y and z) appended as the last column.
X = df_one_hot_encoding[x_cols].assign(
    dimensions=lambda frame: frame['x'] + frame['y'] + frame['z']
)

# Target variable.
y = df_one_hot_encoding['price']
print(X.shape, y.shape)

(40425, 24) (40425,)


In [112]:
# Remove the raw x/y/z columns from the feature matrix.
# Rebinding `X` (instead of mutating it in place) together with errors='ignore'
# makes this cell safe to re-run: a second execution no longer raises KeyError
# because the columns are already gone.
X = X.drop(columns=['x', 'y', 'z'], errors='ignore')
X

Unnamed: 0,depth,table,carat,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,color_E,color_F,color_G,...,color_I,color_J,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2,dimensions
0,62.4,58.0,1.21,0,0,1,0,0,0,0,...,0,1,0,0,0,0,1,0,0,17.87
1,61.6,58.0,1.02,0,0,1,0,0,0,0,...,0,1,0,0,0,0,1,0,0,16.68
2,62.3,58.0,0.77,0,0,1,0,0,0,0,...,0,1,0,0,0,0,1,0,0,15.29
3,59.6,60.0,1.51,0,0,1,0,0,0,0,...,0,1,0,0,0,0,1,0,0,19.55
4,60.2,62.0,0.57,0,0,1,0,0,0,0,...,0,1,0,0,0,0,1,0,0,13.96
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40450,62.2,54.0,0.54,0,1,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,13.78
40451,61.9,54.0,0.53,0,1,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,13.71
40452,62.3,55.0,0.30,0,1,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,11.33
40453,60.9,55.0,0.26,0,1,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,10.93


In [113]:
# Train and test datasets: hold out 15% of the rows, using a fixed seed

split_parts = train_test_split(X, y, test_size=0.15, random_state=42)
X_train, X_test, y_train, y_test = split_parts
print(f"X_train: {X_train.shape}, X_test: {X_test.shape}, y_train: {y_train.shape}, y_test: {y_test.shape}")


X_train: (34361, 21), X_test: (6064, 21), y_train: (34361,), y_test: (6064,)


In [114]:
# Model definition
# A fixed random_state makes the forest, and every metric computed from it,
# reproducible across kernel restarts; 42 matches the seed used for the split.
model = RandomForestRegressor(random_state=42)


In [115]:
# Model training
# Fit on the training split only, so the held-out split can be used for evaluation.
model.fit(X_train, y_train)

RandomForestRegressor()

In [116]:
# Model predictions
# Predict prices for the held-out split; these are compared against y_test.

y_pred = model.predict(X_test)
y_pred

array([ 561.9       , 1224.35      , 3933.975     , ..., 1099.71333333,
        532.575     , 1465.12      ])

In [119]:
# RMSE: square root of the mean squared error on the held-out split

mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
rmse

620.5121473019392

In [120]:
# Retrain the model on the original dataframe (without the train/test split)
# so that more data is available for training.
model.fit(X,y)

RandomForestRegressor()

In [125]:
#Model prediction Test Dataframe (aka real prediction):
# NOTE: despite the `diamonds_train` name, this frame is read from the test CSV;
# the first CSV column is used as the row index.
diamonds_train=pd.read_csv('../data/test/diamonds_test.csv', index_col=[0])
diamonds_train

Unnamed: 0_level_0,carat,cut,color,clarity,depth,table,x,y,z,city
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0.79,Very Good,F,SI1,62.7,60.0,5.82,5.89,3.67,Amsterdam
1,1.20,Ideal,J,VS1,61.0,57.0,6.81,6.89,4.18,Surat
2,1.57,Premium,H,SI1,62.2,61.0,7.38,7.32,4.57,Kimberly
3,0.90,Very Good,F,SI1,63.8,54.0,6.09,6.13,3.90,Kimberly
4,0.50,Very Good,F,VS1,62.9,58.0,5.05,5.09,3.19,Amsterdam
...,...,...,...,...,...,...,...,...,...,...
13480,0.57,Ideal,E,SI1,61.9,56.0,5.35,5.32,3.30,Amsterdam
13481,0.71,Ideal,I,VS2,62.2,55.0,5.71,5.73,3.56,New York City
13482,0.70,Ideal,F,VS1,61.6,55.0,5.75,5.71,3.53,Tel Aviv
13483,0.70,Very Good,F,SI2,58.8,57.0,5.85,5.89,3.45,Surat


In [126]:
#diamonds_train.drop(labels='carat', axis=1, inplace= True)

In [127]:
#We must order the columns as per the train dataframe columns order:
train_column_order = [
    'depth', 'table', 'carat', 'x', 'y', 'z',
    'cut', 'color', 'clarity', 'city',
]
diamonds_train = diamonds_train[train_column_order]

In [128]:
# One-hot encode cut/color/clarity (first level of each dropped) and leave `city` out,
# using the same get_dummies arguments as for the training frame.
test_encoded_columns = ['cut', 'color', 'clarity']
diamonds_train_encoded = (
    pd.get_dummies(diamonds_train, columns=test_encoded_columns, drop_first=True)
    .drop(columns='city')
)
diamonds_train_encoded

Unnamed: 0_level_0,depth,table,carat,x,y,z,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,...,color_H,color_I,color_J,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,62.7,60.0,0.79,5.82,5.89,3.67,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
1,61.0,57.0,1.20,6.81,6.89,4.18,0,1,0,0,...,0,0,1,0,0,0,1,0,0,0
2,62.2,61.0,1.57,7.38,7.32,4.57,0,0,1,0,...,1,0,0,0,1,0,0,0,0,0
3,63.8,54.0,0.90,6.09,6.13,3.90,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
4,62.9,58.0,0.50,5.05,5.09,3.19,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13480,61.9,56.0,0.57,5.35,5.32,3.30,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
13481,62.2,55.0,0.71,5.71,5.73,3.56,0,1,0,0,...,0,1,0,0,0,0,0,1,0,0
13482,61.6,55.0,0.70,5.75,5.71,3.53,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
13483,58.8,57.0,0.70,5.85,5.89,3.45,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0


In [131]:
# Feature columns, in the same order used to build the training matrix.
x_cols = ['depth', 'table', 'carat', 'x', 'y', 'z', 'cut_Good',
          'cut_Ideal', 'cut_Premium', 'cut_Very Good', 'color_E', 'color_F',
          'color_G', 'color_H', 'color_I', 'color_J', 'clarity_IF', 'clarity_SI1',
          'clarity_SI2', 'clarity_VS1', 'clarity_VS2', 'clarity_VVS1',
          'clarity_VVS2']

# Build the feature matrix as a new frame instead of assigning into / dropping
# from `diamonds_train_encoded[x_cols]` in place: that pattern triggers pandas'
# SettingWithCopyWarning. `dimensions` (x + y + z) ends up as the last column
# and the raw x/y/z columns are removed.
X_diamonds = (
    diamonds_train_encoded[x_cols]
    .assign(dimensions=lambda frame: frame['x'] + frame['y'] + frame['z'])
    .drop(columns=['x', 'y', 'z'])
)


In [132]:
# Predict prices for the test-set feature matrix (this overwrites the earlier hold-out y_pred).
y_pred = model.predict(X_diamonds)
y_pred


array([2958.88 , 5407.855, 9344.64 , ..., 3405.895, 2190.53 ,  961.055])

In [133]:
# Build the submission table with one row per test diamond.
# The `id` column is taken from the index of the feature matrix the predictions
# were made on, rather than regenerated as 0..n-1 with reset_index(): the two
# only coincide when the ids in the test file are consecutive and start at 0.
# The isinstance guard keeps the cell re-runnable after `y_pred` has already
# been converted to a DataFrame.
predicted_prices = y_pred['price'].to_numpy() if isinstance(y_pred, pd.DataFrame) else y_pred
y_pred = pd.DataFrame({'id': X_diamonds.index, 'price': predicted_prices})
y_pred

Unnamed: 0,id,price
0,0,2958.880000
1,1,5407.855000
2,2,9344.640000
3,3,4182.340000
4,4,1742.550000
...,...,...
13480,13480,1652.600000
13481,13481,2381.305667
13482,13482,3405.895000
13483,13483,2190.530000


In [134]:
# Write the id/price table to CSV; index=False leaves the DataFrame index out of the file.
y_pred.to_csv('../data/prediction/price_prediction.csv', index= False)

Quitando el carat, el RMSE sube a 1648, por lo que el carat sí que tiene peso en la predicción.