In [1]:
import numpy as np
import pandas as pd

from scipy.stats import trim_mean   # conda install scipy
from statsmodels import robust      # conda install -c conda-forge statsmodels 
import wquantiles                   # pip install wquantiles

import matplotlib.pylab as plt
import seaborn as sns

from sklearn.impute import SimpleImputer

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE

from sklearn import linear_model
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from random import randint

from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [2]:
# Read each exported diamonds table into its own DataFrame.
df_city = pd.read_csv('../../data/diamonds_city_202208201409.csv')
df_clarity = pd.read_csv('../../data/diamonds_clarity_202208201409.csv')
df_color = pd.read_csv('../../data/diamonds_color_202208201409.csv')
df_cut = pd.read_csv('../../data/diamonds_cut_202208201409.csv')
df_dimensions = pd.read_csv('../../data/diamonds_dimensions_202208201409.csv')
df_properties = pd.read_csv('../../data/diamonds_properties_202208201409.csv')
df_transactional = pd.read_csv('../../data/diamonds_transactional_202208201409.csv')

# Starting from the properties table, left-join the other tables one at a time,
# each on its own key column, in this fixed order.
df_diamonds = df_properties
for joined_table, join_key in (
    (df_clarity, "clarity_id"),
    (df_color, "color_id"),
    (df_cut, "cut_id"),
    (df_dimensions, "index_id"),
    (df_transactional, "index_id"),
    (df_city, "city_id"),
):
    df_diamonds = df_diamonds.merge(joined_table, on=join_key, how="left")

# Store the price column as float.
df_diamonds = df_diamonds.astype({"price": float})

In [3]:
# Reorder the columns so the ten feature columns come first and the price target comes last.
feature_columns = ['carat', 'depth', 'table', 'x', 'y', 'z', 'city', 'cut', 'color', 'clarity']
df_diamonds = df_diamonds[feature_columns + ['price']]
df_diamonds_features = df_diamonds[feature_columns]
df_diamonds_price = df_diamonds['price']

In [4]:
# Bin the carat values into discrete categories used only for stratified sampling:
# each category spans 0.35 carat, and np.ceil rounds up so the categories are whole numbers.
carat_cat = np.ceil(df_diamonds["carat"] / 0.35)

# Merge every category of 5 and above into category 5.
# The capped Series is assigned back to the frame explicitly. A chained
# `df_diamonds["carat_cat"].where(..., inplace=True)` acts on a temporary Series and
# leaves the frame unchanged when pandas Copy-on-Write is active.
df_diamonds["carat_cat"] = carat_cat.where(carat_cat < 5, 5.0)

In [5]:
#df_diamonds

In [6]:
# Run the split. Creates one split and sets aside 20% of the dataset for the test set,
# stratified on carat_cat.
split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state = 42)

# n_splits is 1, so take the single (train, test) pair of index arrays directly.
train_index, test_index = next(split.split(df_diamonds, df_diamonds["carat_cat"]))

# split() yields positional row indices, so rows are selected with .iloc; label-based .loc
# would only agree while the frame keeps a default 0..n-1 index.
# .copy() makes both sets independent frames, so later edits to them cannot be flagged as
# writes to a slice of df_diamonds.
strat_train_set = df_diamonds.iloc[train_index].copy()
strat_test_set = df_diamonds.iloc[test_index].copy()

In [7]:
# We now have our train set and test set, both stratified. From here on
# carat_cat is no longer needed, so it is dropped from both sets.
# Each frame is rebound to the result of a non-inplace drop: this avoids a loop variable
# named `set` (which would shadow the builtin for the rest of the session), and
# errors="ignore" keeps the cell safe to run a second time.
strat_train_set = strat_train_set.drop(columns = ["carat_cat"], errors = "ignore")
strat_test_set = strat_test_set.drop(columns = ["carat_cat"], errors = "ignore")

In [8]:
# Take an independent copy of the stratified training split to work on, and display it.
diamonds_train = strat_train_set.copy()
diamonds_train

Unnamed: 0,carat,depth,table,x,y,z,city,cut,color,clarity,price
3437,1.01,60.7,61.0,6.42,6.46,3.91,Luxembourg,Very Good,G,VS2,5759.0
10093,1.50,62.0,61.0,7.37,7.24,4.53,Antwerp,Premium,J,VS1,7832.0
29292,0.71,60.5,58.0,5.75,5.72,3.47,Antwerp,Premium,F,VS1,2839.0
20717,0.44,61.9,59.0,4.90,4.86,3.02,Kimberly,Premium,D,SI1,1089.0
6585,1.14,62.0,55.0,6.71,6.72,4.16,Paris,Ideal,H,VS2,7059.0
...,...,...,...,...,...,...,...,...,...,...,...
14808,0.72,59.2,63.0,5.77,5.86,3.44,Zurich,Very Good,F,VVS2,3952.0
12722,0.27,62.2,56.0,4.15,4.18,2.59,Antwerp,Ideal,H,VVS2,527.0
31561,0.33,61.9,57.0,4.42,4.47,2.75,Dubai,Ideal,D,VS2,781.0
23164,1.50,59.7,56.0,7.42,7.46,4.44,Amsterdam,Very Good,F,VS2,13537.0


In [9]:
# Take an independent copy of the stratified test split to work on, and display it.
diamonds_test = strat_test_set.copy()
diamonds_test

Unnamed: 0,carat,depth,table,x,y,z,city,cut,color,clarity,price
32013,1.11,62.5,58.0,6.64,6.61,4.14,Dubai,Premium,H,SI1,5433.0
27305,0.61,61.9,56.0,5.41,5.45,3.36,Zurich,Ideal,I,VVS1,2048.0
13618,0.46,62.0,55.0,4.96,4.98,3.08,London,Ideal,F,IF,1950.0
34983,1.54,61.8,59.0,7.40,7.35,4.56,Luxembourg,Premium,H,VS2,10897.0
15445,1.70,60.1,58.0,7.78,7.74,4.66,London,Premium,E,SI1,15804.0
...,...,...,...,...,...,...,...,...,...,...,...
14458,0.84,61.0,56.0,6.13,6.10,3.73,Kimberly,Ideal,G,SI2,2879.0
17183,0.70,62.7,54.0,5.66,5.70,3.56,London,Ideal,F,VVS1,4021.0
37430,1.00,62.4,57.0,6.37,6.42,3.99,Luxembourg,Ideal,F,SI2,5798.0
38955,2.01,59.8,63.0,8.14,8.11,4.86,Antwerp,Very Good,I,VS1,17068.0


In [10]:
# Separate the training features (every column except price) from the price label.
X_train = diamonds_train.drop("price", axis = 1)
y_train = diamonds_train["price"].copy()
X_train

Unnamed: 0,carat,depth,table,x,y,z,city,cut,color,clarity
3437,1.01,60.7,61.0,6.42,6.46,3.91,Luxembourg,Very Good,G,VS2
10093,1.50,62.0,61.0,7.37,7.24,4.53,Antwerp,Premium,J,VS1
29292,0.71,60.5,58.0,5.75,5.72,3.47,Antwerp,Premium,F,VS1
20717,0.44,61.9,59.0,4.90,4.86,3.02,Kimberly,Premium,D,SI1
6585,1.14,62.0,55.0,6.71,6.72,4.16,Paris,Ideal,H,VS2
...,...,...,...,...,...,...,...,...,...,...
14808,0.72,59.2,63.0,5.77,5.86,3.44,Zurich,Very Good,F,VVS2
12722,0.27,62.2,56.0,4.15,4.18,2.59,Antwerp,Ideal,H,VVS2
31561,0.33,61.9,57.0,4.42,4.47,2.75,Dubai,Ideal,D,VS2
23164,1.50,59.7,56.0,7.42,7.46,4.44,Amsterdam,Very Good,F,VS2


In [11]:
# Same separation for the test split: features in X_test, price label in y_test.
X_test = diamonds_test.drop("price", axis = 1)
y_test = diamonds_test["price"].copy()
X_test

Unnamed: 0,carat,depth,table,x,y,z,city,cut,color,clarity
32013,1.11,62.5,58.0,6.64,6.61,4.14,Dubai,Premium,H,SI1
27305,0.61,61.9,56.0,5.41,5.45,3.36,Zurich,Ideal,I,VVS1
13618,0.46,62.0,55.0,4.96,4.98,3.08,London,Ideal,F,IF
34983,1.54,61.8,59.0,7.40,7.35,4.56,Luxembourg,Premium,H,VS2
15445,1.70,60.1,58.0,7.78,7.74,4.66,London,Premium,E,SI1
...,...,...,...,...,...,...,...,...,...,...
14458,0.84,61.0,56.0,6.13,6.10,3.73,Kimberly,Ideal,G,SI2
17183,0.70,62.7,54.0,5.66,5.70,3.56,London,Ideal,F,VVS1
37430,1.00,62.4,57.0,6.37,6.42,3.99,Luxembourg,Ideal,F,SI2
38955,2.01,59.8,63.0,8.14,8.11,4.86,Antwerp,Very Good,I,VS1


## Numeric

In [None]:
# Numeric training features: everything except the four text columns.
X_train_num = X_train.drop(columns = ["cut", "color", "clarity", "city"])

In [None]:
# Standardize the numeric training features with scikit-learn's StandardScaler.
df_cols_mix = list(X_train_num.columns)
scaler_stand = StandardScaler()
X_train_num_stand = scaler_stand.fit_transform(X_train_num)
# The scaled values are wrapped back into a DataFrame with the original row index as well
# as the column names. With a fresh 0..n-1 index, the later column-wise concat with the
# dummy-encoded categoricals (which keep the original row labels) would pair up
# different rows and introduce NaNs.
X_train_num_stand = pd.DataFrame(X_train_num_stand, columns=df_cols_mix, index=X_train_num.index)
X_train_num_stand

In [None]:
# Numeric test features: everything except the four text columns.
X_test_num = X_test.drop(columns = ["cut", "color", "clarity", "city"])

In [None]:
df_test_cols_mix = list(X_test_num.columns)
# Reuse scaler_stand, which was fitted on the training features, and only transform here.
# Fitting a second scaler on the test split would scale it with its own mean and standard
# deviation, so train and test would no longer be on the same scale.
X_test_num_stand = scaler_stand.transform(X_test_num)
# Keep the original row index so the frame stays aligned with the other test frames.
X_test_num_stand = pd.DataFrame(X_test_num_stand, columns=df_test_cols_mix, index=X_test_num.index)
X_test_num_stand

## Categorical

In [None]:
# Keep only the three categorical attributes that get encoded in this section.
cat_columns = ["cut", "color", "clarity"]
X_train_cat = X_train[cat_columns]
X_train_cat

In [None]:
# Dummy-encode the three categorical columns; drop_first=True omits the first level of
# each attribute, so k levels produce k-1 indicator columns.
X_train_cat_dumm = pd.get_dummies(X_train_cat, columns=['cut', 'color', 'clarity'], drop_first=True)
X_train_cat_dumm

In [None]:
# Categorical attributes of the test features (same three columns as for training).
X_test_cat = X_test[["cut", "color", "clarity"]]

In [None]:
# Dummy-encode the test categoricals the same way as the training ones, then align the
# result to the training dummy columns. A level missing from the test split becomes an
# all-zero column and a level not present in the training dummies is dropped, so both
# frames share one column layout.
X_test_cat_dumm = pd.get_dummies(X_test_cat, columns=['cut', 'color', 'clarity'], drop_first=True)
X_test_cat_dumm = X_test_cat_dumm.reindex(columns=X_train_cat_dumm.columns, fill_value=0)
X_test_cat_dumm

In [None]:
# Combine the scaled numeric features and the dummy-encoded categoricals column-wise.
# pd.concat(axis=1) pairs rows by index label, so the numeric part is first given the same
# index as the categorical part. Both were derived from the same training rows in the same
# order, and the assignment raises if their lengths ever differ.
X_train_num_aligned = X_train_num_stand.copy()
X_train_num_aligned.index = X_train_cat_dumm.index
X_train = pd.concat([X_train_num_aligned, X_train_cat_dumm], axis=1)
X_train

In [None]:
# Perform the one-hot encoding on the category attributes of the training set.
# X_train_cat (the cut / color / clarity columns of the training features) is the input:
# no `diamonds_cat` is defined in the visible notebook, so that name would raise a
# NameError on a fresh kernel.
cat_encoder = OneHotEncoder()
diamonds_cat_encoded = cat_encoder.fit_transform(X_train_cat)

# Convert the encoded categories to a dense array and preview the first rows
pd.DataFrame(diamonds_cat_encoded.toarray()).head()

In [None]:
# Instead of scaling the numeric attributes and encoding the category attributes by hand
# and then merging the results, scikit-learn's ColumnTransformer applies both steps and
# concatenates their outputs, giving a single transformer for the whole dataset.
from sklearn.compose import ColumnTransformer

# Column names come from X_train_num (the numeric training features). No `diamonds_num`
# is defined in the visible notebook, so that name would raise a NameError on a fresh kernel.
num_attribs = list(X_train_num.columns)
cat_attribs = ["cut", "color", "clarity"]

# Transformer for our dataset; columns listed in neither group are not passed through.
pipeline = ColumnTransformer([
    ("num", StandardScaler(), num_attribs), # Perform feature scaling on numeric attributes
    ("cat", OneHotEncoder(), cat_attribs) # Perform one-hot encoding on the category attributes
])

In [None]:
# Raw training features and price labels taken from the stratified training split.
# Neither `diamonds` nor `diamond_labels` is defined elsewhere in the visible notebook;
# `diamond_labels` is needed as a default argument of display_model_performance.
diamonds = strat_train_set.drop("price", axis = 1)
diamond_labels = strat_train_set["price"].copy()

# Fit the transformer on the training features and produce the dataset fed to the models
diamonds_ready = pipeline.fit_transform(diamonds)

# Preview
pd.DataFrame(diamonds_ready).head()

In [None]:
# Test features (price removed) and the matching price labels
X_test = strat_test_set.drop(columns = "price")
y_test = strat_test_set["price"].copy()

# Accumulators shared by every model run, one entry appended per run:
#   models_rmse    - RMSE on the training data
#   cvs_rmse_mean  - mean cross-validation RMSE
#   tests_rmse     - RMSE on the test set
#   tests_accuracy - test score as a percentage
#   models         - model names
models_rmse, cvs_rmse_mean, tests_rmse, tests_accuracy, models = [], [], [], [], []

def display_model_performance(model_name, model, diamonds = diamonds_ready, labels = diamond_labels,
                              models_rmse = models_rmse, cvs_rmse_mean = cvs_rmse_mean, tests_rmse = tests_rmse,
                              tests_accuracy = tests_accuracy, pipeline = pipeline, X_test = X_test,
                              y_test = y_test, cv = True):
    """Fit `model` on the training data, then print, record and plot its performance.

    The model is fitted once, on `diamonds` / `labels`, and is then evaluated on the
    training data, optionally with 10-fold cross-validation, and on the test set.
    It is never fitted on the test data.

    All default arguments are evaluated once, when this cell runs; rebinding the
    notebook-level names afterwards does not change the defaults.

    Parameters
    ----------
    model_name : label appended to the notebook-level `models` list.
    model : estimator exposing fit / predict / score.
    diamonds, labels : prepared training features and training labels.
    models_rmse, cvs_rmse_mean, tests_rmse, tests_accuracy : lists that receive one
        entry per call (the shared notebook-level lists by default).
    pipeline : fitted transformer applied to `X_test` before prediction.
    X_test, y_test : raw test features and test labels.
    cv : when true (default) run 10-fold cross-validation; when false skip it and
        record NaN so the result lists stay the same length.

    Returns
    -------
    None. Results are printed, appended to the lists and shown as two scatter plots.
    """
    # Fit on the training data only
    model.fit(diamonds, labels)

    # Performance on the data the model was fitted on
    train_predictions = model.predict(diamonds)
    model_rmse = np.sqrt(mean_squared_error(labels, train_predictions))
    print("RMSE: %.4f" %model_rmse)
    models_rmse.append(model_rmse)

    # Cross validation (scores are negated MSE, hence the sign flip before the root)
    if cv:
        cv_score = cross_val_score(model, diamonds, labels, scoring = "neg_mean_squared_error", cv = 10)
        cv_rmse_mean = np.sqrt(-cv_score).mean()
        print("CV-RMSE: %.4f" %cv_rmse_mean)
    else:
        cv_rmse_mean = np.nan
    cvs_rmse_mean.append(cv_rmse_mean)

    print("--- Test Performance ---")

    X_test_prepared = pipeline.transform(X_test)

    # Predict with the model fitted on the training data. It is deliberately not re-fitted
    # on the test set, so these numbers measure performance on unseen data.
    test_predictions = model.predict(X_test_prepared)

    test_model_rmse = np.sqrt(mean_squared_error(y_test, test_predictions))
    print("RMSE: %.4f" %test_model_rmse)
    tests_rmse.append(test_model_rmse)

    # Reported as "accuracy": the model's own score() as a percentage. For scikit-learn
    # regressors score() is the R^2 coefficient, not a classification accuracy.
    test_accuracy = round(model.score(X_test_prepared, y_test) * 100, 2)
    print("Accuracy:", str(test_accuracy)+"%")
    tests_accuracy.append(test_accuracy)

    # Spot check: compare predictions and labels on a random run of consecutive test rows.
    # randint includes both ends, so the upper bound leaves room for a full sample.
    sample_size = 7
    start = randint(0, max(len(y_test) - sample_size, 0))
    some_data = X_test.iloc[start:start + sample_size]
    some_labels = y_test.iloc[start:start + sample_size]
    some_data_prepared = pipeline.transform(some_data)
    print("Predictions:\t", model.predict(some_data_prepared))
    print("Labels:\t\t", list(some_labels))

    models.append(model_name)

    # Training plot: uses the data passed in, not the notebook-level globals
    plt.scatter(labels, train_predictions)
    plt.xlabel("Actual")
    plt.ylabel("Predicted")
    x_lim = plt.xlim()
    y_lim = plt.ylim()
    plt.plot(x_lim, y_lim, "k--")
    plt.show()

    print("------- Test -------")
    plt.scatter(y_test, test_predictions)
    plt.xlabel("Actual")
    plt.ylabel("Predicted")
    # Same reference line as the training plot, so the two figures are comparable
    plt.plot(x_lim, y_lim, "k--")
    plt.show()

In [None]:
# Baseline random forest with default hyperparameters: fitted on the prepared training
# data and scored on the prepared test split.
regressor = RandomForestRegressor()
hyperparameters = regressor.get_params()
# X_test holds the raw test features, including text columns, so it cannot be fitted on
# directly and must go through the fitted pipeline before prediction.
# Assumes diamonds_ready holds the training rows in the same order as y_train - TODO confirm.
regressor.fit(diamonds_ready, y_train)
y_pred = regressor.predict(pipeline.transform(X_test))
# RMSE as the square root of the MSE, the same way display_model_performance computes it
# (the `squared=False` keyword is deprecated in recent scikit-learn releases).
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
rmse

In [None]:
# Linear Regression
# No `normalize` argument: it was removed from LinearRegression in scikit-learn 1.2, and
# the numeric features are already standardized by the pipeline's StandardScaler.
lin_reg = LinearRegression()
display_model_performance("Linear Regression", lin_reg)

In [None]:
# Random Forest Regression
# A small forest (10 trees) with a fixed random_state so repeated runs build the same forest.

forest_reg = RandomForestRegressor(n_estimators = 10, random_state = 42)
display_model_performance("Random Forest Regression", forest_reg)

# Test

In [None]:
# Load diamonds_test.csv and discard its 'id' column.
df_test = pd.read_csv('../../data/diamonds_test.csv').drop(columns='id')
df_test

In [None]:
# Numeric columns of df_test: everything except the four text columns.
df_test_num = df_test.drop(columns = ["cut", "color", "clarity", "city"])

In [None]:
# Perform the feature scaling on the numeric attributes of df_test.
# The scaler is fitted on the training numerics and only applied to df_test, so df_test is
# scaled with the training mean and standard deviation rather than its own.
# Assumes df_test_num has the same numeric columns as X_train_num - TODO confirm.
num_scaler = StandardScaler().fit(X_train_num)
df_test_num_scaled = num_scaler.transform(df_test_num[X_train_num.columns])

In [None]:
# Categorical attributes of df_test (same three columns as for the training data).
df_test_cat = df_test[["cut", "color", "clarity"]]

In [None]:
# Perform the one-hot encoding on the category attributes of df_test.
# The encoder learns its categories from the training categoricals and is only applied to
# df_test, so the encoded columns match the training layout. handle_unknown="ignore"
# encodes a level absent from training as all zeros instead of raising.
cat_encoder = OneHotEncoder(handle_unknown = "ignore").fit(X_train_cat)
df_test_cat_encoded = cat_encoder.transform(df_test_cat)

In [None]:
# Column groups for the transformer: every column of df_test_num, plus the three categoricals.
num_attribs = list(df_test_num.columns)
cat_attribs = ["cut", "color", "clarity"]

# A new, unfitted transformer bound to `pipeline`:
# feature scaling for the numeric group, one-hot encoding for the categorical group.
numeric_step = ("num", StandardScaler(), num_attribs)
categorical_step = ("cat", OneHotEncoder(), cat_attribs)
pipeline = ColumnTransformer([numeric_step, categorical_step])

In [None]:
# Fit the transformer on the training features, then only transform df_test with it.
# Fitting on df_test itself would scale and encode it with its own statistics, which no
# longer matches the representation the models are trained on.
# Assumes the columns in num_attribs / cat_attribs also exist in strat_train_set - TODO confirm.
pipeline.fit(strat_train_set.drop("price", axis = 1))
df_test_ready = pipeline.transform(df_test)

# Preview
pd.DataFrame(df_test_ready).head()

In [None]:
# X_test_1 is the transformed diamonds_test.csv feature matrix.
X_test_1 = df_test_ready
# NOTE(review): these prices come from strat_test_set (the stratified split of the merged
# tables), not from df_test, so y_test_1 is not row-aligned with X_test_1 - confirm intent
# before pairing the two.
y_test_1 = strat_test_set["price"].copy()

In [None]:
# NOTE(review): same model settings and same call as the earlier "Random Forest Regression"
# run; display_model_performance appends to its result lists on every call, so this adds a
# second entry for the same model name - confirm the repeat is intended.
forest_reg = RandomForestRegressor(n_estimators = 10, random_state = 42)
display_model_performance("Random Forest Regression", forest_reg)

In [None]:
# Random forest used for the predictions on df_test_ready.
regressor = RandomForestRegressor()
hyperparameters = regressor.get_params()
# Train on the prepared training data. X_test holds raw test-split features, including
# text columns, so the model can neither be fitted on it nor learn the same representation
# as df_test_ready from it.
# Assumes diamonds_ready holds the training rows in the same order as y_train - TODO confirm.
regressor.fit(diamonds_ready, y_train)

In [None]:
# Predict a price for every row of the transformed diamonds_test.csv data.
predictions_test = regressor.predict(df_test_ready)

In [None]:
# One 'Price' column holding the predictions, with the row index named 'id'.
solucion = pd.DataFrame({'Price': predictions_test}).rename_axis('id')
solucion

In [None]:
# Write the predictions to results.csv; the index (named 'id') is written as the first column.
solucion.to_csv('../../data/results.csv')