In [1]:
import pandas as pd
from xgboost import XGBRegressor
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt

In [2]:
# Load the diamonds dataset; the first CSV column is used as the row index.
diamond_df = pd.read_csv(filepath_or_buffer="diamonds.csv", index_col=0)

In [3]:
# Number of rows per distinct value of the `clarity` column.
diamond_df["clarity"].value_counts()

SI1     13065
VS2     12258
SI2      9194
VS1      8171
VVS2     5066
VVS1     3655
IF       1790
I1        741
Name: clarity, dtype: int64

In [4]:
# Number of rows per distinct value of the `color` column.
diamond_df["color"].value_counts()

G    11292
E     9797
F     9542
H     8304
D     6775
I     5422
J     2808
Name: color, dtype: int64

In [5]:
# Number of rows per distinct value of the `clarity` column.
# NOTE(review): this repeats the clarity count from an earlier cell, while the
# `cut` column (also label-encoded later) is never inspected -- presumably
# `cut` was intended here; confirm and regenerate the output.
diamond_df["clarity"].value_counts()

SI1     13065
VS2     12258
SI2      9194
VS1      8171
VVS2     5066
VVS1     3655
IF       1790
I1        741
Name: clarity, dtype: int64

In [6]:
# Preview the first five rows (five is the default for `head`).
diamond_df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [7]:
# Count missing values per column (`isnull` is an alias of `isna`).
diamond_df.isnull().sum()

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
dtype: int64

In [8]:
# label encoding
def encoder_categorical_variables(dataframe):
    """Label-encode the ``cut``, ``color`` and ``clarity`` columns as integers.

    Each column is replaced by the integer code given in the fixed mappings
    below. The input frame is left untouched; a new, encoded frame is
    returned.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing the string columns ``cut``, ``color`` and
        ``clarity``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``dataframe`` with the three columns replaced by their
        integer codes.

    Raises
    ------
    ValueError
        If a column holds a non-null value that is missing from its mapping.
        ``Series.map`` would otherwise turn such values into NaN silently,
        which is what happens when this function is run a second time on an
        already-encoded frame.
    """
    ordinal_mappings = {
        "cut": {"Fair": 0, "Good": 1, "Very Good": 2, "Premium": 3, "Ideal": 4},
        "color": {"J": 0, "I": 1, "H": 2, "G": 3, "F": 4, "E": 5, "D": 6},
        "clarity": {"I1": 0, "SI2": 1, "SI1": 2, "VS2": 3, "VS1": 4,
                    "VVS2": 5, "VVS1": 6, "IF": 7},
    }

    # Work on a copy so the caller's frame is not mutated as a side effect.
    encoded = dataframe.copy()
    for column, mapping in ordinal_mappings.items():
        values = encoded[column]
        # Pre-existing missing values are passed through; only non-null values
        # outside the mapping are treated as an error.
        unknown = values[values.notna() & ~values.isin(list(mapping))]
        if not unknown.empty:
            raise ValueError(
                f"Column {column!r} contains values with no encoding: "
                f"{unknown.unique().tolist()}"
            )
        encoded[column] = values.map(mapping)
    return encoded

In [9]:
def cleaner(dataframe):
    """Drop upper-tail outliers and zero-sized rows, returning a filtered frame.

    For each of ``depth``, ``table``, ``x``, ``y`` and ``z`` (in that order),
    only rows strictly below that column's 99th percentile are kept. Each
    percentile is computed on the rows that survived the previous filters.
    Afterwards, rows whose ``x``, ``y`` or ``z`` equals 0 are dropped.
    """
    # Upper-tail trimming, one column at a time on the already-trimmed frame.
    for column in ("depth", "table", "x", "y", "z"):
        cutoff = dataframe[column].quantile(0.99)
        dataframe = dataframe[dataframe[column] < cutoff]

    # Rows with a dimension equal to zero are removed by index label.
    for column in ("x", "y", "z"):
        zero_labels = dataframe.loc[dataframe[column] == 0].index
        dataframe = dataframe.drop(zero_labels)

    return dataframe

In [10]:
# Label-encode the categorical columns, then count missing values per column
# to confirm the encoding left none behind.
diamond_df = encoder_categorical_variables(dataframe=diamond_df)
diamond_df.isnull().sum()

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
dtype: int64

In [11]:
# Apply the outlier / zero-dimension filters and display the resulting frame.
diamond_df = cleaner(dataframe=diamond_df)
diamond_df

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,4,5,1,61.5,55.0,326,3.95,3.98,2.43
2,0.21,3,5,2,59.8,61.0,326,3.89,3.84,2.31
4,0.29,3,1,3,62.4,58.0,334,4.20,4.23,2.63
5,0.31,1,0,1,63.3,58.0,335,4.34,4.35,2.75
6,0.24,2,0,5,62.8,57.0,336,3.94,3.96,2.48
...,...,...,...,...,...,...,...,...,...,...
53936,0.72,4,6,2,60.8,57.0,2757,5.75,5.76,3.50
53937,0.72,1,6,2,63.1,55.0,2757,5.69,5.75,3.61
53938,0.70,2,6,2,62.8,60.0,2757,5.66,5.68,3.56
53939,0.86,3,2,1,61.0,58.0,2757,6.15,6.12,3.74


In [12]:
def preprocessing(dataframe):
    """Split the frame into train, validation and test features and targets.

    ``price`` is the target; every other column is a feature. 20% of the rows
    are held out as the test set, then 25% of the remainder becomes the
    validation set (i.e. a 60/20/20 split). Both splits use
    ``random_state=0``.

    Returns
    -------
    tuple
        ``(X_train, X_test, X_val, y_train, y_test, y_val)`` -- note that the
        test split comes before the validation split in the returned order.
    """
    target = dataframe.price
    features = dataframe.drop(columns=["price"])

    # First carve off the test set ...
    features_rest, X_test, target_rest, y_test = train_test_split(
        features, target, test_size=0.2, random_state=0
    )
    # ... then split what is left into train and validation.
    X_train, X_val, y_train, y_val = train_test_split(
        features_rest, target_rest, test_size=0.25, random_state=0
    )

    return X_train, X_test, X_val, y_train, y_test, y_val

In [14]:
# Unpack in the order preprocessing() returns: train, test, validation.
(
    X_train, X_test, X_val,
    y_train, y_test, y_val,
) = preprocessing(dataframe=diamond_df)

In [15]:
# Base estimator and the hyperparameter grid to search over:
# 3 learning rates x 3 depths x 3 min_child_weight x 3 subsample = 81 combinations.
xgb1 = XGBRegressor()
parameters = dict(
    objective=["reg:squarederror"],
    learning_rate=[0.0001, 0.001, 0.01],
    max_depth=[3, 5, 7],
    min_child_weight=[3, 5, 7],
    subsample=[0.1, 0.5, 1.0],
    n_estimators=[500],
)

In [16]:
# Exhaustive search over `parameters` with 3-fold cross-validation.
# `scoring` is not set, so the estimator's own `score` method is used.
xgb_grid = GridSearchCV(
    estimator=xgb1,
    param_grid=parameters,
    cv=3,
    n_jobs=-1,
    verbose=0,
)

In [17]:
# Run the grid search on the training split only.
xgb_grid.fit(X=X_train, y=y_train)

In [19]:
# Best mean cross-validated score, then the parameter set that achieved it.
print(xgb_grid.best_score_, xgb_grid.best_params_, sep="\n")

0.9830998400873936
{'learning_rate': 0.01, 'max_depth': 7, 'min_child_weight': 3, 'n_estimators': 500, 'objective': 'reg:squarederror', 'subsample': 0.5}


In [20]:
# Take the estimator built with the best grid-search parameters and set up
# the (train, validation) pairs to monitor while refitting it.
xgb_cv = xgb_grid.best_estimator_
eval_set = [
    (X_train, y_train),
    (X_val, y_val),
]

In [23]:
# Refit the best grid-search estimator with early stopping, monitoring MAE on
# the evaluation sets. Per the XGBoost docs, when `eval_set` has several
# entries the last one (here the validation split) drives early stopping.
#
# `eval_metric` and `early_stopping_rounds` are set on the estimator instead of
# being passed to `fit()`: as `fit()` keyword arguments they were deprecated in
# XGBoost 1.6 and removed in 2.0, where the previous call raises a TypeError.
# Setting them via `set_params` requires XGBoost >= 1.6.
xgb_cv.set_params(eval_metric="mae", early_stopping_rounds=50)
fit_model = xgb_cv.fit(X_train, y_train, eval_set=eval_set, verbose=False)



In [24]:
# Validation-set metrics; predict once and reuse the result for all three.
val_predictions = fit_model.predict(X_val)
print("MAE:", mean_absolute_error(y_val, val_predictions))
print("MSE:", mean_squared_error(y_val, val_predictions))
print("r2:", r2_score(y_val, val_predictions))

MAE: 236.82000413599295
MSE: 207937.1320800911
r2: 0.9832106556488737


In [25]:
# Held-out test-set metrics; predict once and reuse the result for all three.
test_predictions = fit_model.predict(X_test)
print("MAE:", mean_absolute_error(y_test, test_predictions))
print("MSE:", mean_squared_error(y_test, test_predictions))
print("r2:", r2_score(y_test, test_predictions))

MAE: 234.23262758784773
MSE: 191062.4449782614
r2: 0.9845281907014979


In [27]:
# Persist the fitted model to disk in XGBoost's JSON format.
fit_model.save_model(fname="xgb_model.json")