In [2]:
# imports 
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split


from sklearn import linear_model
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor


from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler


from sklearn.metrics import mean_squared_error


In [81]:
df= pd.read_csv('../data/train/diamonds_train_cleaned.csv', index_col=[0])
df

Unnamed: 0,depth,table,x,y,z,price,carat,cut,color,clarity,city
0,62.4,58.0,6.83,6.79,4.25,4268,1.21,Premium,J,VS2,Dubai
1,61.6,58.0,6.40,6.35,3.93,3513,1.02,Premium,J,VS2,Dubai
2,62.3,58.0,5.86,5.80,3.63,1792,0.77,Premium,J,VS2,Dubai
3,59.6,60.0,7.58,7.48,4.49,7553,1.51,Premium,J,VS2,Dubai
4,60.2,62.0,5.40,5.33,3.23,1176,0.57,Premium,J,VS2,Dubai
...,...,...,...,...,...,...,...,...,...,...,...
40450,62.2,54.0,5.24,5.27,3.27,2729,0.54,Ideal,F,IF,Surat
40451,61.9,54.0,5.22,5.25,3.24,2802,0.53,Ideal,F,IF,Surat
40452,62.3,55.0,4.30,4.34,2.69,886,0.30,Ideal,F,IF,Surat
40453,60.9,55.0,4.15,4.23,2.55,768,0.26,Ideal,F,IF,Surat


In [83]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40425 entries, 0 to 40454
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   depth    40425 non-null  float64
 1   table    40425 non-null  float64
 2   x        40425 non-null  float64
 3   y        40425 non-null  float64
 4   z        40425 non-null  float64
 5   price    40425 non-null  int64  
 6   carat    40425 non-null  float64
 7   cut      40425 non-null  object 
 8   color    40425 non-null  object 
 9   clarity  40425 non-null  object 
 10  city     40425 non-null  object 
dtypes: float64(6), int64(1), object(4)
memory usage: 3.7+ MB


In [84]:
# Labels are the values we want to predict
target = np.array(df['price'])
# Remove the labels from the features
# axis 1 refers to the columns
features= df.drop('price', axis = 1)
# Saving feature names for later use
feature_list = list(features.columns)
# Convert to numpy array
#features = np.array(features)


In [85]:
# categorical features

cat_cols=['cut', 'color','clarity', 'city']
cat_list = []
for col in cat_cols:
    cat = df[col].unique()
    cat_num = len(cat)
    cat_dict = {"categorical_variable":col,
                "number_of_possible_values":cat_num,
                "values":cat}
    cat_list.append(cat_dict)
    
categories = pd.DataFrame(cat_list).sort_values(by="number_of_possible_values",
                                                ascending=False).reset_index(drop=True)
categories

Unnamed: 0,categorical_variable,number_of_possible_values,values
0,city,13,"[Dubai, Luxembourg, New York City, Antwerp, Ma..."
1,clarity,8,"[VS2, VVS2, SI1, VS1, SI2, I1, VVS1, IF]"
2,color,7,"[J, E, I, G, D, H, F]"
3,cut,5,"[Premium, Very Good, Fair, Good, Ideal]"


Por el momento, no vamos a tener en cuenta la columna de la ciudad, ya que hay demasiados valores.

In [86]:
df_one_hot_encoding = pd.get_dummies(features, 
                                    columns=['cut', 'color','clarity'], 
                                    drop_first=True)
df_one_hot_encoding.drop(labels='city', axis=1, inplace=True)
df_one_hot_encoding.head()

Unnamed: 0,depth,table,x,y,z,carat,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,...,color_H,color_I,color_J,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,62.4,58.0,6.83,6.79,4.25,1.21,0,0,1,0,...,0,0,1,0,0,0,0,1,0,0
1,61.6,58.0,6.4,6.35,3.93,1.02,0,0,1,0,...,0,0,1,0,0,0,0,1,0,0
2,62.3,58.0,5.86,5.8,3.63,0.77,0,0,1,0,...,0,0,1,0,0,0,0,1,0,0
3,59.6,60.0,7.58,7.48,4.49,1.51,0,0,1,0,...,0,0,1,0,0,0,0,1,0,0
4,60.2,62.0,5.4,5.33,3.23,0.57,0,0,1,0,...,0,0,1,0,0,0,0,1,0,0


In [87]:
# Train and test datasets

X_train, X_test, y_train, y_test = train_test_split(df_one_hot_encoding, target, test_size=0.15, random_state=42)
print(f"X_train: {X_train.shape}, X_test: {X_test.shape}, y_train: {y_train.shape}, y_test: {y_test.shape}")


X_train: (34361, 23), X_test: (6064, 23), y_train: (34361,), y_test: (6064,)


In [88]:
df_one_hot_encoding.columns

Index(['depth', 'table', 'x', 'y', 'z', 'carat', 'cut_Good', 'cut_Ideal',
       'cut_Premium', 'cut_Very Good', 'color_E', 'color_F', 'color_G',
       'color_H', 'color_I', 'color_J', 'clarity_IF', 'clarity_SI1',
       'clarity_SI2', 'clarity_VS1', 'clarity_VS2', 'clarity_VVS1',
       'clarity_VVS2'],
      dtype='object')

In [101]:
# Model definition

#model = linear_model.Lasso()
#model = ElasticNet()
#model = Ridge()
#model = SVR()
#model = SGDRegressor()
#model = LinearRegression()
model=RandomForestRegressor()



In [102]:
# Model training
model.fit(X_train, y_train)

RandomForestRegressor()

In [103]:
# Model predictions

y_pred = model.predict(X_test)
y_pred

array([ 560.62, 1245.35, 3993.18, ..., 1081.2 ,  571.55, 1439.87])

In [104]:
# Visual check

rmse = mean_squared_error(y_test, y_pred)**0.5
rmse

631.870764632403

In [99]:
model.fit(df_one_hot_encoding,target) #Volvemos a entrenar el modelo con el datframe original (sin la separacion de train y test), para tener mas datos en el entreno

  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive


Lasso()

In [105]:
# Get numerical feature importances
importances = list(model.feature_importances_)

# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 4)) for feature, importance in zip(feature_list, importances)]

# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];


Variable: y                    Importance: 0.4768
Variable: carat                Importance: 0.4113
Variable: x                    Importance: 0.0068
Variable: z                    Importance: 0.0068
Variable: depth                Importance: 0.0059
Variable: table                Importance: 0.0031
Variable: color                Importance: 0.001
Variable: clarity              Importance: 0.0004
Variable: city                 Importance: 0.0004
Variable: cut                  Importance: 0.0003


Tal y como vemos, las únicas dos columnas que aportan un alto valor son los quilates y la dimensión y, por lo que pasamos a hacer de nuevo los cáclulos solo con estas dos columnas.

In [122]:
# New random forest with only the two most important variables

model_most_important = RandomForestRegressor()

# Extract the two most important features
X_train_important = X_train[['y','carat']]
X_test_important = X_test[['y','carat']]

# Train the random forest
model_most_important.fit(X_train_important, y_train)

# Make predictions and determine the error
predictions = model_most_important.predict(X_test_important)


In [123]:
rmse = mean_squared_error(y_test, predictions)**0.5
rmse

1429.8055426040135

Tal y como vemos, el RMSE empeora bastante, por lo que descartamos la idea de quedarnos solo con las dos columnas mas importantes.
Además, una vez subido a Kaggle también vemos que el resultado es bastante malo.
