In [2]:
import pandas as pd

data = pd.read_csv("Diamonds Prices2022.csv")
df = data.copy()
df.head()

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [3]:
df.drop(['Unnamed: 0'],axis=1,inplace=True)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53943 entries, 0 to 53942
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53943 non-null  float64
 1   cut      53943 non-null  object 
 2   color    53943 non-null  object 
 3   clarity  53943 non-null  object 
 4   depth    53943 non-null  float64
 5   table    53943 non-null  float64
 6   price    53943 non-null  int64  
 7   x        53943 non-null  float64
 8   y        53943 non-null  float64
 9   z        53943 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 4.1+ MB


In [5]:
for i in ['cut','color','clarity']:
    print("Column: ",i)
    print(df[i].unique())

Column:  cut
['Ideal' 'Premium' 'Good' 'Very Good' 'Fair']
Column:  color
['E' 'I' 'J' 'H' 'F' 'G' 'D']
Column:  clarity
['SI2' 'SI1' 'VS1' 'VS2' 'VVS2' 'VVS1' 'I1' 'IF']


In [6]:
from sklearn.preprocessing import LabelEncoder

LE = LabelEncoder()
label_dict = {}
for i in ['cut','color','clarity']:
    df[i] = LE.fit_transform(df[i])
    label_dict[i] = {label:encoded_label for label, encoded_label in zip(LE.classes_,LE.transform(LE.classes_))}

In [7]:
df

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,2,1,3,61.5,55.0,326,3.95,3.98,2.43
1,0.21,3,1,2,59.8,61.0,326,3.89,3.84,2.31
2,0.23,1,1,4,56.9,65.0,327,4.05,4.07,2.31
3,0.29,3,5,5,62.4,58.0,334,4.20,4.23,2.63
4,0.31,1,6,3,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
53938,0.86,3,4,3,61.0,58.0,2757,6.15,6.12,3.74
53939,0.75,2,0,3,62.2,55.0,2757,5.83,5.87,3.64
53940,0.71,3,1,2,60.5,55.0,2756,5.79,5.74,3.49
53941,0.71,3,2,2,59.8,62.0,2756,5.74,5.73,3.43


In [8]:
label_dict

{'cut': {'Fair': 0, 'Good': 1, 'Ideal': 2, 'Premium': 3, 'Very Good': 4},
 'color': {'D': 0, 'E': 1, 'F': 2, 'G': 3, 'H': 4, 'I': 5, 'J': 6},
 'clarity': {'I1': 0,
  'IF': 1,
  'SI1': 2,
  'SI2': 3,
  'VS1': 4,
  'VS2': 5,
  'VVS1': 6,
  'VVS2': 7}}

In [9]:
df.isnull().sum()

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
dtype: int64

In [10]:
X = df.drop('price',axis=1)
y = df['price']

In [11]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,AdaBoostRegressor, BaggingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
import time
import warnings
warnings.filterwarnings('ignore')

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=37)

models = [
    LinearRegression(),Ridge(),Lasso(),ElasticNet(),DecisionTreeRegressor(),RandomForestRegressor(),GradientBoostingRegressor(),AdaBoostRegressor(),BaggingRegressor(),SVR(),KNeighborsRegressor(),MLPRegressor(),XGBRegressor(),LGBMRegressor(),CatBoostRegressor()
]

scores_df = pd.DataFrame(columns=['Model','Score','Time'])

for model in models:
    start = time.time()
    model.fit(X_train,y_train)
    end = time.time()
    scores_df = scores_df.append({'Model':type(model).__name__,'Score':model.score(X_test,y_test),'Time':end-start},ignore_index=True)
    print(f"{type(model).__name__} - {model.score(X_test,y_test)}")









LinearRegression - 0.8806629572432463
Ridge - 0.8806711903773219
Lasso - 0.880672127409462
ElasticNet - 0.7954153885574508
DecisionTreeRegressor - 0.9657918392472474
RandomForestRegressor - 0.9807835284669509
GradientBoostingRegressor - 0.9698539539116358
AdaBoostRegressor - 0.8816775425957075
BaggingRegressor - 0.9793411539864835
SVR - -0.11753519267443391
KNeighborsRegressor - 0.9462256011217939
MLPRegressor - 0.9132770767741102
XGBRegressor - 0.9805460958613403
LGBMRegressor - 0.9806072906878899
Learning rate set to 0.074219
0:	learn: 3737.1553425	total: 64.3ms	remaining: 1m 4s
1:	learn: 3497.7738545	total: 68.6ms	remaining: 34.2s
2:	learn: 3274.1686459	total: 72.8ms	remaining: 24.2s
3:	learn: 3075.2674931	total: 77.3ms	remaining: 19.3s
4:	learn: 2882.2126658	total: 81.8ms	remaining: 16.3s
5:	learn: 2706.7499816	total: 86.2ms	remaining: 14.3s
6:	learn: 2548.0325885	total: 90.6ms	remaining: 12.8s
7:	learn: 2404.3092718	total: 95.1ms	remaining: 11.8s
8:	learn: 2269.6671044	total: 99.6

In [12]:
scores_df

Unnamed: 0,Model,Score,Time
0,LinearRegression,0.880663,0.019541
1,Ridge,0.880671,0.008794
2,Lasso,0.880672,0.311709
3,ElasticNet,0.795415,0.042992
4,DecisionTreeRegressor,0.965792,0.238421
5,RandomForestRegressor,0.980784,16.715528
6,GradientBoostingRegressor,0.969854,3.566566
7,AdaBoostRegressor,0.881678,2.031472
8,BaggingRegressor,0.979341,1.709987
9,SVR,-0.117535,94.844481


In [13]:
scores_df.sort_values(by="Score",ascending=False)

Unnamed: 0,Model,Score,Time
14,CatBoostRegressor,0.981899,4.741094
5,RandomForestRegressor,0.980784,16.715528
13,LGBMRegressor,0.980607,0.161229
12,XGBRegressor,0.980546,0.583343
8,BaggingRegressor,0.979341,1.709987
6,GradientBoostingRegressor,0.969854,3.566566
4,DecisionTreeRegressor,0.965792,0.238421
10,KNeighborsRegressor,0.946226,0.06449
11,MLPRegressor,0.913277,25.558056
7,AdaBoostRegressor,0.881678,2.031472


In [14]:
best = CatBoostRegressor()
best.fit(X_train,y_train)

Learning rate set to 0.074219
0:	learn: 3737.1553425	total: 6.54ms	remaining: 6.54s
1:	learn: 3497.7738545	total: 11.3ms	remaining: 5.66s
2:	learn: 3274.1686459	total: 16.1ms	remaining: 5.33s
3:	learn: 3075.2674931	total: 20.3ms	remaining: 5.06s
4:	learn: 2882.2126658	total: 24.9ms	remaining: 4.95s
5:	learn: 2706.7499816	total: 29.4ms	remaining: 4.87s
6:	learn: 2548.0325885	total: 33.7ms	remaining: 4.78s
7:	learn: 2404.3092718	total: 38.9ms	remaining: 4.83s
8:	learn: 2269.6671044	total: 43.7ms	remaining: 4.81s
9:	learn: 2145.4140087	total: 48.2ms	remaining: 4.77s
10:	learn: 2031.2157322	total: 52.9ms	remaining: 4.76s
11:	learn: 1924.1641030	total: 57.5ms	remaining: 4.74s
12:	learn: 1826.9764782	total: 62.3ms	remaining: 4.73s
13:	learn: 1738.4546889	total: 66.9ms	remaining: 4.71s
14:	learn: 1659.7449086	total: 72ms	remaining: 4.73s
15:	learn: 1583.2658908	total: 77.3ms	remaining: 4.75s
16:	learn: 1512.0314763	total: 82.1ms	remaining: 4.75s
17:	learn: 1448.5023414	total: 88ms	remaining: 

<catboost.core.CatBoostRegressor at 0x20677731910>

In [15]:
best.save_model("model.cbm")