In [40]:
import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split

from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE

from sklearn import linear_model
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [2]:
df_city = pd.read_csv('../../data/diamonds_city_202208201409.csv')
df_clarity = pd.read_csv('../../data/diamonds_clarity_202208201409.csv')
df_color = pd.read_csv('../../data/diamonds_color_202208201409.csv')
df_cut = pd.read_csv('../../data/diamonds_cut_202208201409.csv')
df_dimensions = pd.read_csv('../../data/diamonds_dimensions_202208201409.csv')
df_properties = pd.read_csv('../../data/diamonds_properties_202208201409.csv')
df_transactional = pd.read_csv('../../data/diamonds_transactional_202208201409.csv')

In [3]:
df_diamonds = df_properties.merge(
    df_clarity, on="clarity_id", how="left").merge(
    df_color, on="color_id", how="left").merge(
    df_cut, on="cut_id", how="left").merge(
    df_dimensions, on="index_id", how="left").merge(
    df_transactional, on="index_id", how="left").merge(df_city, on="city_id", how="left")

In [4]:
#reordenamos el orden de las columnas
df_diamonds = df_diamonds[['carat', 'cut', 'color','clarity','depth','table','x', 'y', 'z','city','price', ]]

In [5]:
df_diamonds.info() #compuesto por las 40.455 filas y 16 columnas sin nulos.
df_diamonds.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40455 entries, 0 to 40454
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    40455 non-null  float64
 1   cut      40455 non-null  object 
 2   color    40455 non-null  object 
 3   clarity  40455 non-null  object 
 4   depth    40455 non-null  float64
 5   table    40455 non-null  float64
 6   x        40455 non-null  float64
 7   y        40455 non-null  float64
 8   z        40455 non-null  float64
 9   city     40455 non-null  object 
 10  price    40455 non-null  int64  
dtypes: float64(6), int64(1), object(4)
memory usage: 3.7+ MB


Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,city,price
0,1.21,Premium,J,VS2,62.4,58.0,6.83,6.79,4.25,Kimberly,4268
1,0.32,Very Good,H,VS2,63.0,57.0,4.35,4.38,2.75,Antwerp,505
2,0.71,Fair,G,VS1,65.5,55.0,5.62,5.53,3.65,London,2686
3,0.41,Good,D,SI1,63.8,56.0,4.68,4.72,3.0,New York City,738
4,1.02,Ideal,G,SI1,60.5,59.0,6.55,6.51,3.95,Dubai,4882


In [6]:
df_diamonds.describe()

Unnamed: 0,carat,depth,table,x,y,z,price
count,40455.0,40455.0,40455.0,40455.0,40455.0,40455.0,40455.0
mean,0.797706,61.752841,57.446133,5.729392,5.732819,3.537154,3928.444469
std,0.475544,1.431725,2.233535,1.124453,1.14665,0.697062,3992.416147
min,0.2,43.0,43.0,0.0,0.0,0.0,326.0
25%,0.4,61.0,56.0,4.71,4.72,2.91,945.0
50%,0.7,61.8,57.0,5.69,5.71,3.52,2397.0
75%,1.04,62.5,59.0,6.54,6.54,4.035,5331.0
max,4.5,79.0,95.0,10.23,58.9,8.06,18823.0


In [7]:
df_diamonds.isna().sum()

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
x          0
y          0
z          0
city       0
price      0
dtype: int64

# CASE 1. Just converting every object to numeric

In [70]:
df_2 = pd.get_dummies(df_diamonds, columns=['cut', 'color', 'clarity', 'city'], drop_first=True)
df_2

KeyError: "None of [Index(['cut', 'color', 'clarity', 'city'], dtype='object')] are in the [columns]"

In [8]:
df_diamonds['cut'].value_counts()

Ideal        16220
Premium      10260
Very Good     9095
Good          3663
Fair          1217
Name: cut, dtype: int64

In [9]:
df_cut = ['Ideal', 'Premium', 'Very Good', 'Good', 'Fair']

# Your code here
for i in df_cut:
    df_diamonds[i] = 0

# Print your new dataframe to check whether new columns have been created:

df_diamonds

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,city,price,Ideal,Premium,Very Good,Good,Fair
0,1.21,Premium,J,VS2,62.4,58.0,6.83,6.79,4.25,Kimberly,4268,0,0,0,0,0
1,0.32,Very Good,H,VS2,63.0,57.0,4.35,4.38,2.75,Antwerp,505,0,0,0,0,0
2,0.71,Fair,G,VS1,65.5,55.0,5.62,5.53,3.65,London,2686,0,0,0,0,0
3,0.41,Good,D,SI1,63.8,56.0,4.68,4.72,3.00,New York City,738,0,0,0,0,0
4,1.02,Ideal,G,SI1,60.5,59.0,6.55,6.51,3.95,Dubai,4882,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40450,1.34,Ideal,G,VS1,62.7,57.0,7.10,7.04,4.43,New York City,10070,0,0,0,0,0
40451,2.02,Good,F,SI2,57.1,60.0,8.31,8.25,4.73,Zurich,12615,0,0,0,0,0
40452,1.01,Ideal,H,SI1,62.7,56.0,6.37,6.42,4.01,Tel Aviv,5457,0,0,0,0,0
40453,0.33,Ideal,J,VS1,61.9,54.3,4.45,4.47,2.76,Paris,456,0,0,0,0,0


In [10]:
# Your code here
# austin_fixed['Events'].str.contains('rain', case=False).astype(int)
for i in df_cut:
    df_diamonds[i] = df_diamonds['cut'].str.contains(i, case=False).astype(int)
df_diamonds

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,city,price,Ideal,Premium,Very Good,Good,Fair
0,1.21,Premium,J,VS2,62.4,58.0,6.83,6.79,4.25,Kimberly,4268,0,1,0,0,0
1,0.32,Very Good,H,VS2,63.0,57.0,4.35,4.38,2.75,Antwerp,505,0,0,1,1,0
2,0.71,Fair,G,VS1,65.5,55.0,5.62,5.53,3.65,London,2686,0,0,0,0,1
3,0.41,Good,D,SI1,63.8,56.0,4.68,4.72,3.00,New York City,738,0,0,0,1,0
4,1.02,Ideal,G,SI1,60.5,59.0,6.55,6.51,3.95,Dubai,4882,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40450,1.34,Ideal,G,VS1,62.7,57.0,7.10,7.04,4.43,New York City,10070,1,0,0,0,0
40451,2.02,Good,F,SI2,57.1,60.0,8.31,8.25,4.73,Zurich,12615,0,0,0,1,0
40452,1.01,Ideal,H,SI1,62.7,56.0,6.37,6.42,4.01,Tel Aviv,5457,1,0,0,0,0
40453,0.33,Ideal,J,VS1,61.9,54.3,4.45,4.47,2.76,Paris,456,1,0,0,0,0


In [11]:
df_diamonds.drop(columns=['cut'], inplace=True)
df_diamonds

Unnamed: 0,carat,color,clarity,depth,table,x,y,z,city,price,Ideal,Premium,Very Good,Good,Fair
0,1.21,J,VS2,62.4,58.0,6.83,6.79,4.25,Kimberly,4268,0,1,0,0,0
1,0.32,H,VS2,63.0,57.0,4.35,4.38,2.75,Antwerp,505,0,0,1,1,0
2,0.71,G,VS1,65.5,55.0,5.62,5.53,3.65,London,2686,0,0,0,0,1
3,0.41,D,SI1,63.8,56.0,4.68,4.72,3.00,New York City,738,0,0,0,1,0
4,1.02,G,SI1,60.5,59.0,6.55,6.51,3.95,Dubai,4882,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40450,1.34,G,VS1,62.7,57.0,7.10,7.04,4.43,New York City,10070,1,0,0,0,0
40451,2.02,F,SI2,57.1,60.0,8.31,8.25,4.73,Zurich,12615,0,0,0,1,0
40452,1.01,H,SI1,62.7,56.0,6.37,6.42,4.01,Tel Aviv,5457,1,0,0,0,0
40453,0.33,J,VS1,61.9,54.3,4.45,4.47,2.76,Paris,456,1,0,0,0,0


In [12]:
df_diamonds['color'].value_counts()

G    8455
E    7325
F    7177
H    6277
D    5049
I    4032
J    2140
Name: color, dtype: int64

In [13]:
df_color = ['D', 'E', 'F', 'G', 'H', 'I', 'J']
# Your code here
for i in df_color:
    df_diamonds[i] = 0

In [14]:
for i in df_color:
    df_diamonds[i] = df_diamonds['color'].str.contains(i, case=False).astype(int)

In [15]:
df_diamonds.drop(columns=['color'], inplace=True)
df_diamonds

Unnamed: 0,carat,clarity,depth,table,x,y,z,city,price,Ideal,...,Very Good,Good,Fair,D,E,F,G,H,I,J
0,1.21,VS2,62.4,58.0,6.83,6.79,4.25,Kimberly,4268,0,...,0,0,0,0,0,0,0,0,0,1
1,0.32,VS2,63.0,57.0,4.35,4.38,2.75,Antwerp,505,0,...,1,1,0,0,0,0,0,1,0,0
2,0.71,VS1,65.5,55.0,5.62,5.53,3.65,London,2686,0,...,0,0,1,0,0,0,1,0,0,0
3,0.41,SI1,63.8,56.0,4.68,4.72,3.00,New York City,738,0,...,0,1,0,1,0,0,0,0,0,0
4,1.02,SI1,60.5,59.0,6.55,6.51,3.95,Dubai,4882,1,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40450,1.34,VS1,62.7,57.0,7.10,7.04,4.43,New York City,10070,1,...,0,0,0,0,0,0,1,0,0,0
40451,2.02,SI2,57.1,60.0,8.31,8.25,4.73,Zurich,12615,0,...,0,1,0,0,0,1,0,0,0,0
40452,1.01,SI1,62.7,56.0,6.37,6.42,4.01,Tel Aviv,5457,1,...,0,0,0,0,0,0,0,1,0,0
40453,0.33,VS1,61.9,54.3,4.45,4.47,2.76,Paris,456,1,...,0,0,0,0,0,0,0,0,0,1


In [16]:
df_diamonds['clarity'].value_counts()

SI1     9749
VS2     9248
SI2     6929
VS1     6066
VVS2    3799
VVS1    2774
IF      1327
I1       563
Name: clarity, dtype: int64

In [17]:
df_clarity = ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF']
# Your code here
for i in df_clarity:
    df_diamonds[i] = 0

In [18]:
for i in df_clarity:
    df_diamonds[i] = df_diamonds['clarity'].str.contains(i, case=False).astype(int)

In [19]:
df_diamonds.drop(columns=['clarity'], inplace=True)
df_diamonds

Unnamed: 0,carat,depth,table,x,y,z,city,price,Ideal,Premium,...,I,J,I1,SI2,SI1,VS2,VS1,VVS2,VVS1,IF
0,1.21,62.4,58.0,6.83,6.79,4.25,Kimberly,4268,0,1,...,0,1,0,0,0,1,0,0,0,0
1,0.32,63.0,57.0,4.35,4.38,2.75,Antwerp,505,0,0,...,0,0,0,0,0,1,0,0,0,0
2,0.71,65.5,55.0,5.62,5.53,3.65,London,2686,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0.41,63.8,56.0,4.68,4.72,3.00,New York City,738,0,0,...,0,0,1,0,1,0,0,0,0,0
4,1.02,60.5,59.0,6.55,6.51,3.95,Dubai,4882,1,0,...,0,0,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40450,1.34,62.7,57.0,7.10,7.04,4.43,New York City,10070,1,0,...,0,0,0,0,0,0,1,0,0,0
40451,2.02,57.1,60.0,8.31,8.25,4.73,Zurich,12615,0,0,...,0,0,0,1,0,0,0,0,0,0
40452,1.01,62.7,56.0,6.37,6.42,4.01,Tel Aviv,5457,1,0,...,0,0,1,0,1,0,0,0,0,0
40453,0.33,61.9,54.3,4.45,4.47,2.76,Paris,456,1,0,...,0,1,0,0,0,0,1,0,0,0


In [20]:
df_diamonds['city'].value_counts()

Antwerp          5062
Surat            5047
Kimberly         5040
Zurich           2594
Paris            2579
Las Vegas        2558
Amsterdam        2544
Tel Aviv         2533
Luxembourg       2524
Madrid           2504
New York City    2495
Dubai            2488
London           2487
Name: city, dtype: int64

In [21]:
df_city = ['Antwerp', 'Surat', 'Kimberly', 'Zurich', 'Paris', 'Las Vegas', 'Amsterdam', 'Tel Aviv', 'Luxembourg', 'Madrid', 
          'New York City', 'Dubai', 'London']
# Your code here
for i in df_city:
    df_diamonds[i] = 0

In [22]:
for i in df_city:
    df_diamonds[i] = df_diamonds['city'].str.contains(i, case=False).astype(int)

In [23]:
df_diamonds.drop(columns=['city'], inplace=True)
df_diamonds

Unnamed: 0,carat,depth,table,x,y,z,price,Ideal,Premium,Very Good,...,Zurich,Paris,Las Vegas,Amsterdam,Tel Aviv,Luxembourg,Madrid,New York City,Dubai,London
0,1.21,62.4,58.0,6.83,6.79,4.25,4268,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0.32,63.0,57.0,4.35,4.38,2.75,505,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0.71,65.5,55.0,5.62,5.53,3.65,2686,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0.41,63.8,56.0,4.68,4.72,3.00,738,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,1.02,60.5,59.0,6.55,6.51,3.95,4882,1,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40450,1.34,62.7,57.0,7.10,7.04,4.43,10070,1,0,0,...,0,0,0,0,0,0,0,1,0,0
40451,2.02,57.1,60.0,8.31,8.25,4.73,12615,0,0,0,...,1,0,0,0,0,0,0,0,0,0
40452,1.01,62.7,56.0,6.37,6.42,4.01,5457,1,0,0,...,0,0,0,0,1,0,0,0,0,0
40453,0.33,61.9,54.3,4.45,4.47,2.76,456,1,0,0,...,0,1,0,0,0,0,0,0,0,0


In [24]:
df_diamonds.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40455 entries, 0 to 40454
Data columns (total 40 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   carat          40455 non-null  float64
 1   depth          40455 non-null  float64
 2   table          40455 non-null  float64
 3   x              40455 non-null  float64
 4   y              40455 non-null  float64
 5   z              40455 non-null  float64
 6   price          40455 non-null  int64  
 7   Ideal          40455 non-null  int32  
 8   Premium        40455 non-null  int32  
 9   Very Good      40455 non-null  int32  
 10  Good           40455 non-null  int32  
 11  Fair           40455 non-null  int32  
 12  D              40455 non-null  int32  
 13  E              40455 non-null  int32  
 14  F              40455 non-null  int32  
 15  G              40455 non-null  int32  
 16  H              40455 non-null  int32  
 17  I              40455 non-null  int32  
 18  J     

In [25]:
X_columns = ['carat', 'depth', 'table', 'x', 'y', 'z', 'Ideal', 'Premium', 'Very Good', 'Good', 'Fair',
            'D', 'E', 'F', 'G', 'H', 'I', 'J', 'I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF',
            'Antwerp', 'Surat', 'Kimberly', 'Zurich', 'Paris', 'Las Vegas', 'Amsterdam', 'Tel Aviv', 'Luxembourg', 'Madrid', 
          'New York City', 'Dubai', 'London']
y_column = 'price'

In [26]:
X_diamonds = df_diamonds[X_columns]
y_diamonds = df_diamonds[y_column]

In [30]:
x_train, x_test, y_train, y_test = train_test_split(X_diamonds, y_diamonds, test_size=0.2, random_state=42)
y_train

32121     1577
9831     10685
33128     1885
6199       637
19661     2874
         ...  
6265      2739
11284      984
38158      530
860       4273
15795      844
Name: price, Length: 32364, dtype: int64

In [33]:
# Model definition

model = linear_model.Lasso()
#model = ElasticNet()
#model = Ridge()
#model = SVR()
#model = SGDRegressor()
#model = LinearRegression()

hyperparameters = model.get_params()

print(type(model), '\n')
print('Model hyperparameters:', hyperparameters, '\n')

<class 'sklearn.linear_model._coordinate_descent.Lasso'> 

Model hyperparameters: {'alpha': 1.0, 'copy_X': True, 'fit_intercept': True, 'max_iter': 1000, 'normalize': 'deprecated', 'positive': False, 'precompute': False, 'random_state': None, 'selection': 'cyclic', 'tol': 0.0001, 'warm_start': False} 



In [35]:
# Model training

model.fit(x_train, y_train)

print('Model:', model, '\n')
print('Model hyperparameters:', hyperparameters, '\n')
print('Model coefficients:', model.coef_, '\n')

Model: Lasso() 

Model hyperparameters: {'alpha': 1.0, 'copy_X': True, 'fit_intercept': True, 'max_iter': 1000, 'normalize': 'deprecated', 'positive': False, 'precompute': False, 'random_state': None, 'selection': 'cyclic', 'tol': 0.0001, 'warm_start': False} 

Model coefficients: [ 1.10723797e+04 -6.76275118e+01 -2.84473713e+01 -9.60115958e+02
 -4.28014835e+00 -0.00000000e+00  1.69384765e+02  1.01735012e+02
  1.35512216e+02 -5.95123849e+01 -6.12411637e+02  4.78911446e+02
  2.73875455e+02  2.03424891e+02 -0.00000000e+00 -4.93308884e+02
 -9.33814573e+02 -1.84401902e+03 -4.24437666e+03 -1.56067384e+03
  3.62516995e+03  0.00000000e+00  2.94723431e+02  6.56965652e+02
  4.38732566e+02  1.01347446e+03  1.86200797e+01 -2.06784536e+01
  7.94280238e+00  0.00000000e+00  1.06728955e+01  0.00000000e+00
 -7.45801376e+00 -9.33577808e-01 -1.96781934e+00  0.00000000e+00
  0.00000000e+00 -0.00000000e+00 -2.83441240e+01] 



In [37]:
predictions = model.predict(x_test)

In [38]:
check = pd.DataFrame({'Ground truth':y_test, 'Predictions':predictions, 'Diff':y_test-predictions})
check

Unnamed: 0,Ground truth,Predictions,Diff
17775,2970,3574.221539,-604.221539
13506,3004,3229.068412,-225.068412
4325,838,1269.198191,-431.198191
37870,6468,6133.539329,334.460671
21321,633,806.941094,-173.941094
...,...,...,...
3781,4764,5475.659988,-711.659988
26959,756,332.580070,423.419930
15529,2690,3339.247328,-649.247328
36333,3992,4536.683290,-544.683290


In [44]:
# RMSE calculation

rmse = mean_squared_error(predictions, y_test, squared=False)
rmse

1125.4033879045087

# TEST

In [45]:
df_test = pd.read_csv('../../data/diamonds_test.csv')
df_test

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,city
0,0,0.79,Very Good,F,SI1,62.7,60.0,5.82,5.89,3.67,Amsterdam
1,1,1.20,Ideal,J,VS1,61.0,57.0,6.81,6.89,4.18,Surat
2,2,1.57,Premium,H,SI1,62.2,61.0,7.38,7.32,4.57,Kimberly
3,3,0.90,Very Good,F,SI1,63.8,54.0,6.09,6.13,3.90,Kimberly
4,4,0.50,Very Good,F,VS1,62.9,58.0,5.05,5.09,3.19,Amsterdam
...,...,...,...,...,...,...,...,...,...,...,...
13480,13480,0.57,Ideal,E,SI1,61.9,56.0,5.35,5.32,3.30,Amsterdam
13481,13481,0.71,Ideal,I,VS2,62.2,55.0,5.71,5.73,3.56,New York City
13482,13482,0.70,Ideal,F,VS1,61.6,55.0,5.75,5.71,3.53,Tel Aviv
13483,13483,0.70,Very Good,F,SI2,58.8,57.0,5.85,5.89,3.45,Surat


In [46]:
for i in df_cut:
    df_test[i] = 0
for i in df_cut:
    df_test[i] = df_test['cut'].str.contains(i, case=False).astype(int)
df_test.drop(columns=['cut'], inplace=True)
for i in df_color:
    df_test[i] = 0
for i in df_color:
    df_test[i] = df_test['color'].str.contains(i, case=False).astype(int)
df_test.drop(columns=['color'], inplace=True)
for i in df_clarity:
    df_test[i] = 0
for i in df_clarity:
    df_test[i] = df_test['clarity'].str.contains(i, case=False).astype(int)
df_test.drop(columns=['clarity'], inplace=True)
for i in df_city:
    df_test[i] = 0
for i in df_city:
    df_test[i] = df_test['city'].str.contains(i, case=False).astype(int)
df_test.drop(columns=['city'], inplace=True)

In [52]:
df_test_1 = df_test.drop(columns=['id'])
df_test_1

Unnamed: 0,carat,depth,table,x,y,z,Ideal,Premium,Very Good,Good,...,Zurich,Paris,Las Vegas,Amsterdam,Tel Aviv,Luxembourg,Madrid,New York City,Dubai,London
0,0.79,62.7,60.0,5.82,5.89,3.67,0,0,1,1,...,0,0,0,1,0,0,0,0,0,0
1,1.20,61.0,57.0,6.81,6.89,4.18,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1.57,62.2,61.0,7.38,7.32,4.57,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0.90,63.8,54.0,6.09,6.13,3.90,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
4,0.50,62.9,58.0,5.05,5.09,3.19,0,0,1,1,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13480,0.57,61.9,56.0,5.35,5.32,3.30,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
13481,0.71,62.2,55.0,5.71,5.73,3.56,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
13482,0.70,61.6,55.0,5.75,5.71,3.53,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
13483,0.70,58.8,57.0,5.85,5.89,3.45,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0


In [48]:
X_columns_test = ['carat', 'depth', 'table', 'x', 'y', 'z', 'Ideal', 'Premium', 'Very Good', 'Good', 'Fair',
            'D', 'E', 'F', 'G', 'H', 'I', 'J', 'I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF',
            'Antwerp', 'Surat', 'Kimberly', 'Zurich', 'Paris', 'Las Vegas', 'Amsterdam', 'Tel Aviv', 'Luxembourg', 'Madrid', 
          'New York City', 'Dubai', 'London']

In [49]:
X_diamonds_test = df_test[X_columns_test]

In [54]:
predictions_1 = model.predict(df_test_1)

In [56]:
predictions_1

array([ 3591.40747857,  6323.24782707, 10073.73075884, ...,
        3893.33847756,  1960.49165966,   755.7751882 ])

In [68]:
solucion = pd.DataFrame(predictions, columns = ['Price'])
solucion.index.names = ['id']
solucion

Unnamed: 0_level_0,Price
id,Unnamed: 1_level_1
0,3591.407479
1,6323.247827
2,10073.730759
3,4660.805484
4,2080.430147
...,...
13480,1940.905119
13481,2570.775485
13482,3893.338478
13483,1960.491660


In [69]:
solucion.to_csv('../../data/results.csv')