In [19]:
# imports 
import numpy as np
import pandas as pd

In [20]:
# Load the training data; the first CSV column becomes the row index.
df = pd.read_csv(
    '../data/train/diamonds_train.csv',
    index_col=[0],
)
# Preview the first five rows, transposed so each original column is a row.
df.head(n=5).transpose()

Unnamed: 0,0,1,2,3,4
index_id,5feceb66ffc86f38d952786c6d696c79c2dbc239dd4e91...,41667f6e2629360aecaf00b20f8732e3310417ebd54b24...,01f8667f50d52677bea23231a74156e4f92360d7bc3db6...,c3867352aab641358faec75d733af012dbe2259a014ea8...,0da4b104c4d8589fcb96a03aa0787549a2631935b0f499...
depth,62.4,61.6,62.3,59.6,60.2
table,58.0,58.0,58.0,60.0,62.0
x,6.83,6.4,5.86,7.58,5.4
y,6.79,6.35,5.8,7.48,5.33
z,4.25,3.93,3.63,4.49,3.23
price,4268,3513,1792,7553,1176
carat,1.21,1.02,0.77,1.51,0.57
cut,Premium,Premium,Premium,Premium,Premium
color,J,J,J,J,J


In [21]:
# Columns routed through the numeric preprocessing branch.
numerical_columns = [
    'carat',
    'depth',
    'table',
    'x',
    'y',
    'z',
]
# Columns routed through the categorical preprocessing branch.
categorical_columns = [
    'cut',
    'color',
    'clarity',
]
# Full model input: numeric columns first, then categorical ones.
FEATS = [*numerical_columns, *categorical_columns]
# Column the model is trained to predict.
TARGET = 'price'


In [22]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder



# Categorical branch: replace missing values with the constant label
# 'missing', then one-hot encode (handle_unknown='ignore' so that categories
# absent at fit time do not raise at transform time).
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)


In [23]:
# Numeric branch: fill missing values with the column median, then standardize.
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)



In [24]:
# Route each column group to its own sub-pipeline: 'num' handles
# numerical_columns and 'cat' handles categorical_columns.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_columns),
        ('cat', categorical_transformer, categorical_columns),
    ]
)

In [25]:
# Sanity check: fit the preprocessor on the loaded frame and preview the first
# five rows of the transformed matrix.
pd.DataFrame(
    data=preprocessor.fit_transform(X=df),
).head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,0.867006,0.452019,0.247981,0.978807,0.921985,1.022657,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.467458,-0.106755,0.247981,0.596394,0.538254,0.563582,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,-0.058262,0.382172,0.247981,0.116154,0.05859,0.133198,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1.49787,-1.503688,1.143433,1.645806,1.523746,1.366964,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,-0.478838,-1.084608,2.038886,-0.292939,-0.351305,-0.440646,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [26]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the rows for evaluation; the fixed random_state keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df[FEATS],
    df[TARGET],
    test_size=0.15,
    random_state=42,
)

In [31]:
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor


# End-to-end model: the shared preprocessor followed by an XGBoost regressor.
# The final step is labelled 'regressor' because it wraps an XGBRegressor,
# which is neither a random forest nor a classifier.
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', XGBRegressor())])


In [32]:
# Fit the whole pipeline on the training split; the trailing semicolon
# suppresses the estimator repr in the cell output.
model.fit(X=X_train, y=y_train);

In [33]:
# Predict the target for the held-out rows and display the resulting array.
y_pred = model.predict(X=X_test)
y_pred

array([8484.328  ,  492.47852, 8066.5923 , ..., 1324.2136 ,  939.4298 ,
       1600.7781 ], dtype=float32)

In [34]:
from sklearn.metrics import mean_squared_error

# Root mean squared error on the hold-out split: the square root of the MSE
# between the true and predicted values.
rmse = mean_squared_error(y_true=y_test, y_pred=y_pred) ** 0.5
rmse

612.4052165844137

In [249]:
# Load the test dataframe on which the final ("real") predictions are made;
# the first CSV column becomes the row index.
# NOTE(review): despite its name, `diamonds_train` holds the TEST file. The
# name is kept because the prediction cell below reads it.
diamonds_train = pd.read_csv(
    '../data/test/diamonds_test.csv',
    index_col=[0],
)
diamonds_train

Unnamed: 0_level_0,carat,cut,color,clarity,depth,table,x,y,z,city
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0.79,Very Good,F,SI1,62.7,60.0,5.82,5.89,3.67,Amsterdam
1,1.20,Ideal,J,VS1,61.0,57.0,6.81,6.89,4.18,Surat
2,1.57,Premium,H,SI1,62.2,61.0,7.38,7.32,4.57,Kimberly
3,0.90,Very Good,F,SI1,63.8,54.0,6.09,6.13,3.90,Kimberly
4,0.50,Very Good,F,VS1,62.9,58.0,5.05,5.09,3.19,Amsterdam
...,...,...,...,...,...,...,...,...,...,...
13480,0.57,Ideal,E,SI1,61.9,56.0,5.35,5.32,3.30,Amsterdam
13481,0.71,Ideal,I,VS2,62.2,55.0,5.71,5.73,3.56,New York City
13482,0.70,Ideal,F,VS1,61.6,55.0,5.75,5.71,3.53,Tel Aviv
13483,0.70,Very Good,F,SI2,58.8,57.0,5.85,5.89,3.45,Surat


In [250]:
# Retrain the model on the original dataframe (without the train/test split)
# so that more data is available for training.
model.fit(X=df[FEATS], y=df[TARGET]);

In [251]:
# Predict on the loaded test frame and display the resulting array.
y_pred = model.predict(X=diamonds_train)
y_pred


array([2965.23, 5541.44, 9662.17, ..., 3123.41, 2219.53,  908.79])

In [252]:
# Turn the predictions into a two-column frame: the positional row index is
# moved into a column and renamed to 'id', next to the 'price' column.
y_pred = (
    pd.DataFrame(y_pred, columns=['price'])
    .reset_index()
    .rename(columns={"index": "id"})
)
y_pred

Unnamed: 0,id,price
0,0,2965.23
1,1,5541.44
2,2,9662.17
3,3,3679.10
4,4,1714.64
...,...,...
13480,13480,1749.73
13481,13481,2365.75
13482,13482,3123.41
13483,13483,2219.53


In [253]:
# Write the predictions to disk, leaving out the DataFrame index.
y_pred.to_csv(
    path_or_buf='../data/prediction/price_prediction.csv',
    index=False,
)