In [1]:
import pickle
import pandas as pd
from sklearn import model_selection
from sklearn.linear_model import LinearRegression, RidgeCV
from sklearn.metrics import root_mean_squared_error
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA



In [2]:
#Load the model
# SECURITY: pickle.load can execute arbitrary code contained in the file, so this
# path must only ever point at a model artifact produced by a trusted source.
filename = '../models/xgboost_002.sav'
# Use a context manager so the file handle is closed as soon as the model is read
# (the bare open(...) call left the handle open until garbage collection).
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [3]:
# Read the test CSV into a DataFrame and preview its first rows.
test_csv_path = '../dataset/diamonds_test.csv'
df_test = pd.read_csv(test_csv_path)

df_test.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,city
0,0,0.79,Very Good,F,SI1,62.7,60.0,5.82,5.89,3.67,Amsterdam
1,1,1.2,Ideal,J,VS1,61.0,57.0,6.81,6.89,4.18,Surat
2,2,1.57,Premium,H,SI1,62.2,61.0,7.38,7.32,4.57,Kimberly
3,3,0.9,Very Good,F,SI1,63.8,54.0,6.09,6.13,3.9,Kimberly
4,4,0.5,Very Good,F,VS1,62.9,58.0,5.05,5.09,3.19,Amsterdam


In [4]:
# Encoding: keep the numeric measurements as they are and turn the three
# categorical grades into numeric codes.
df_test_num = df_test.loc[:, ['carat', 'depth', 'table', 'x', 'y', 'z']]

# Hand-written ordinal codes, one dict per categorical column (1.0 upward in the listed order).
clarity_mapped = {'I1':1.0, 'SI2':2.0, 'SI1':3.0, 'VS2':4.0, 'VS1':5.0, 'VVS2':6.0, 'VVS1':7.0, 'IF':8.0}
color_mapped = {'J':1.0, 'I':2.0, 'H':3.0, 'G':4.0, 'F':5.0, 'E':6.0, 'D':7.0}
cut_mapped = {'Fair':1.0, 'Good':2.0, 'Very Good':3.0, 'Premium':4.0, 'Ideal':5.0}

# Build the encoded frame directly from the source columns; only the *_encoded
# columns are kept. A category missing from its dict maps to NaN (Series.map behaviour).
df_diamonds_cat = pd.DataFrame({
    'clarity_encoded': df_test['clarity'].map(clarity_mapped),
    'color_encoded': df_test['color'].map(color_mapped),
    'cut_encoded': df_test['cut'].map(cut_mapped),
})

df_diamonds_cat.head()

Unnamed: 0,clarity_encoded,color_encoded,cut_encoded
0,3.0,5.0,3.0
1,5.0,1.0,5.0
2,3.0,3.0,4.0
3,3.0,5.0,3.0
4,5.0,5.0,3.0


In [5]:
# Combine the numeric and encoded-categorical frames side by side,
# matching rows on their index labels (inner join on the index).
encoded_df = df_test_num.merge(
    df_diamonds_cat,
    how='inner',
    left_index=True,
    right_index=True,
)

encoded_df

Unnamed: 0,carat,depth,table,x,y,z,clarity_encoded,color_encoded,cut_encoded
0,0.79,62.7,60.0,5.82,5.89,3.67,3.0,5.0,3.0
1,1.20,61.0,57.0,6.81,6.89,4.18,5.0,1.0,5.0
2,1.57,62.2,61.0,7.38,7.32,4.57,3.0,3.0,4.0
3,0.90,63.8,54.0,6.09,6.13,3.90,3.0,5.0,3.0
4,0.50,62.9,58.0,5.05,5.09,3.19,5.0,5.0,3.0
...,...,...,...,...,...,...,...,...,...
13480,0.57,61.9,56.0,5.35,5.32,3.30,3.0,6.0,5.0
13481,0.71,62.2,55.0,5.71,5.73,3.56,4.0,2.0,5.0
13482,0.70,61.6,55.0,5.75,5.71,3.53,5.0,5.0,5.0
13483,0.70,58.8,57.0,5.85,5.89,3.45,2.0,5.0,3.0


In [6]:
# Feature engineering: append three derived columns (quality, volume, proportion)
# to encoded_df.

# quality: weighted sum of carat and the encoded clarity, color and cut grades.
# NOTE(review): weight_cut is defined but never used -- the cut term below is scaled
# by weight_color (0.35), so the effective weights sum to 1.10. The arithmetic is
# deliberately left unchanged: the loaded model was presumably trained on this exact
# computation (TODO confirm against the training notebook), and changing it only here
# would feed the model inputs it was not fitted on.
weight_carat = 0.20
weight_clarity = 0.20
weight_color = 0.35
weight_cut = 0.20

carat_term = weight_carat * encoded_df['carat']
clarity_term = weight_clarity * encoded_df['clarity_encoded']
color_term = weight_color * encoded_df['color_encoded']
cut_term = weight_color * encoded_df['cut_encoded']
encoded_df['quality'] = carat_term + clarity_term + color_term + cut_term

# volume: despite the name this is not x*y*z; it is 0.33*x + 0.33*y + 0.33*z,
# i.e. roughly the mean of the three dimensions.
dimension_weight = 0.33
encoded_df['volume'] = (encoded_df['x'] * dimension_weight +
                        encoded_df['y'] * dimension_weight +
                        encoded_df['z'] * dimension_weight)

# proportion: the average of depth and table (half of each).
half = 0.5
encoded_df['proportion'] = encoded_df['depth'] * half + encoded_df['table'] * half

encoded_df


Unnamed: 0,carat,depth,table,x,y,z,clarity_encoded,color_encoded,cut_encoded,quality,volume,proportion
0,0.79,62.7,60.0,5.82,5.89,3.67,3.0,5.0,3.0,3.558,5.0754,61.35
1,1.20,61.0,57.0,6.81,6.89,4.18,5.0,1.0,5.0,3.340,5.9004,59.00
2,1.57,62.2,61.0,7.38,7.32,4.57,3.0,3.0,4.0,3.364,6.3591,61.60
3,0.90,63.8,54.0,6.09,6.13,3.90,3.0,5.0,3.0,3.580,5.3196,58.90
4,0.50,62.9,58.0,5.05,5.09,3.19,5.0,5.0,3.0,3.900,4.3989,60.45
...,...,...,...,...,...,...,...,...,...,...,...,...
13480,0.57,61.9,56.0,5.35,5.32,3.30,3.0,6.0,5.0,4.564,4.6101,58.95
13481,0.71,62.2,55.0,5.71,5.73,3.56,4.0,2.0,5.0,3.392,4.9500,58.60
13482,0.70,61.6,55.0,5.75,5.71,3.53,5.0,5.0,5.0,4.640,4.9467,58.30
13483,0.70,58.8,57.0,5.85,5.89,3.45,2.0,5.0,3.0,3.340,5.0127,57.90


In [7]:
#Scaling
# NOTE(review): the scaler is fitted on the test features themselves. If the model
# was trained on data scaled with a scaler fitted on the training set, that fitted
# scaler should be persisted and reused here with .transform() instead -- TODO confirm
# against the training notebook.
scaler = RobustScaler()
scaled = scaler.fit_transform(encoded_df)
# fit_transform returns a bare array; carry the column labels over from encoded_df so
# the feature names always follow the actual column order rather than a hand-typed list.
X_temp = pd.DataFrame(scaled, columns=encoded_df.columns)

In [8]:
# Give the scaled columns their feature names: integer label i is renamed to the
# i-th entry of this list (0 -> 'carat', ..., 11 -> 'proportion'). Integer labels
# that are not present in X_temp are simply left untouched by rename.
scaled_column_names = [
    'carat',
    'depth',
    'table',
    'x',
    'y',
    'z',
    'clarity_encoded',
    'color_encoded',
    'cut_encoded',
    'quality',
    'volume',
    'proportion',
]
X_temp = X_temp.rename(columns=dict(enumerate(scaled_column_names)))

In [9]:
# X_test_df is a second name for the same object as X_temp (no copy is made);
# preview the first five rows of the final feature matrix.
X_test_df = X_temp
X_test_df.head(n=5)

Unnamed: 0,carat,depth,table,x,y,z,clarity_encoded,color_encoded,cut_encoded,quality,volume,proportion
0,0.140625,0.533333,1.0,0.066667,0.094444,0.125,-0.5,0.333333,-0.5,-0.324864,0.089936,1.233333
1,0.78125,-0.6,0.0,0.616667,0.65,0.580357,0.5,-1.0,0.5,-0.522686,0.625268,-0.333333
2,1.359375,0.2,1.333333,0.933333,0.888889,0.928571,-0.5,-0.333333,0.0,-0.500907,0.922912,1.4
3,0.3125,1.266667,-1.0,0.216667,0.227778,0.330357,-0.5,0.333333,-0.5,-0.3049,0.248394,-0.4
4,-0.3125,0.666667,0.333333,-0.361111,-0.35,-0.303571,0.5,0.333333,-0.5,-0.014519,-0.349036,0.633333


In [11]:
# Model predictions: run the unpickled model on the scaled feature frame.
# NOTE(review): presumably the model expects these 12 columns in this exact order --
# confirm against the training notebook.

predictions = loaded_model.predict(X_test_df)

In [12]:
# Assemble the submission frame with columns ['id', 'price'].
predictions_df = pd.DataFrame(predictions, columns=['price'])

# Take the ids from the test file itself rather than from the row position, so the
# submission stays correct even if the file's ids are not 0..n-1 in order.
# .to_numpy() assigns positionally (predictions are in df_test row order) and raises
# ValueError if the number of predictions does not match the number of test rows.
predictions_df.insert(0, 'id', df_test['id'].to_numpy())



In [13]:
# Preview the first rows of the submission frame before it is exported.
predictions_df.head()

Unnamed: 0,id,price
0,0,2816.865723
1,1,5535.553711
2,2,9722.40625
3,3,3927.839844
4,4,1641.020752


In [14]:
#Export submissions
# Forward slashes, like the other paths in this notebook: they work on Windows and
# POSIX alike, whereas a backslash path is treated as a single literal file name on
# Linux/macOS. index=False keeps the DataFrame index out of the CSV.
predictions_df.to_csv('../submissions/xgboost_004.csv', index=False)