In [168]:
import pandas as pd

In [169]:
df = pd.read_csv("https://raw.githubusercontent.com/krishnaik06/DiamondPricePrediction1/main/notebooks/data/gemstone.csv")
df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [170]:
df = df.drop(labels=['id'],axis=1)

In [171]:
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [172]:
## Independent & Dependent features

X = df.drop(labels=['price'],axis=1)
y= df[['price']]

In [173]:
y

Unnamed: 0,price
0,13619
1,13387
2,2772
3,666
4,14453
...,...
193568,1130
193569,2874
193570,3036
193571,681


In [174]:
## Define which columns should be ordinal encoded and which should be scaled

In [175]:
categorical_cols = X.select_dtypes(include='object').columns
numerical_cols= X.select_dtypes(exclude='object').columns

In [176]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [177]:
cut_categories= ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']
color_categories=['D','E','F','G','H','I','J']
clarity_categories=['I1','SI2','SI1','VS2','VS1','VVS2','VVS1','IF']

In [178]:
num_pipeline = Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='median')),
        ('scaler',StandardScaler())
    ]
)

In [179]:
cat_pipeline = Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='most_frequent')),
        ('ordinalencoder',OrdinalEncoder(categories=[cut_categories, color_categories, clarity_categories])),
         ('scaler',StandardScaler())

    ]
)

In [180]:
##Combining numerical pipeline and categorical pipeline both

preprocessor= ColumnTransformer([('num_pipeline',num_pipeline,numerical_cols),('cat_pipeline',cat_pipeline,categorical_cols)])

In [181]:
preprocessor

In [182]:
## Train Test Split

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.30, random_state=42)

In [183]:
X_train

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
11504,0.41,Ideal,E,VVS2,60.6,56.0,4.85,4.80,2.93
95284,1.23,Very Good,H,VS1,59.9,59.0,6.91,7.01,4.19
184777,1.70,Premium,H,VS2,62.0,58.0,7.61,7.66,4.74
5419,0.33,Ideal,F,VVS1,61.2,56.0,4.47,4.44,2.73
45466,0.33,Very Good,I,SI1,62.1,58.0,4.41,4.45,2.75
...,...,...,...,...,...,...,...,...,...
119879,0.50,Very Good,E,SI1,60.2,61.0,5.11,5.15,3.09
103694,1.91,Very Good,F,SI1,62.3,62.0,7.85,7.79,4.87
131932,1.22,Premium,G,VS2,62.8,58.0,6.82,6.74,4.26
146867,0.31,Very Good,G,VVS1,61.1,56.0,4.37,4.40,2.67


In [184]:
X_test

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
14868,0.50,Ideal,D,SI1,62.1,57.0,5.05,5.08,3.14
165613,2.00,Very Good,G,SI2,59.5,57.0,8.08,8.15,4.89
96727,0.27,Premium,E,VVS1,60.5,59.0,4.19,4.16,2.52
145593,0.32,Premium,I,VVS1,61.2,59.0,4.43,4.44,2.71
118689,1.19,Ideal,H,SI1,62.5,56.0,6.77,6.81,4.23
...,...,...,...,...,...,...,...,...,...
39151,0.91,Very Good,I,SI2,62.4,59.0,6.18,6.13,3.83
32423,0.51,Ideal,D,VS2,62.4,56.0,5.13,5.11,3.19
17876,0.41,Ideal,G,VVS1,61.8,56.0,4.79,4.77,2.95
72938,1.21,Premium,I,SI1,61.1,60.0,6.88,6.79,4.18


In [185]:
y_train

Unnamed: 0,price
11504,1181
95284,7418
184777,12755
5419,1020
45466,445
...,...
119879,1410
103694,15064
131932,7209
146867,816


In [186]:
y_test

Unnamed: 0,price
14868,1355
165613,14691
96727,844
145593,707
118689,5797
...,...
39151,2974
32423,1875
17876,967
72938,5656


In [187]:
preprocessor.fit_transform(X_train)

array([[-0.82314374, -1.12998781, -0.64189666, ...,  0.87410007,
        -0.93674681,  1.35074594],
       [ 0.94502267, -1.77782269,  0.92190185, ..., -1.13764403,
         0.91085333,  0.68445511],
       [ 1.9584839 ,  0.16568195,  0.40063568, ..., -0.13177198,
         0.91085333,  0.01816428],
       ...,
       [ 0.92345966,  0.90606467,  0.40063568, ..., -0.13177198,
         0.29498662,  0.01816428],
       [-1.03877378, -0.66724861, -0.64189666, ..., -1.13764403,
         0.29498662,  2.01703677],
       [-1.03877378, -0.01941373,  0.92190185, ..., -1.13764403,
         0.29498662, -1.31441737]])

In [188]:
X_train= pd.DataFrame(preprocessor.fit_transform(X_train), columns=preprocessor.get_feature_names_out())

In [189]:
X_test = pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [190]:
X_train.head()

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,-0.823144,-1.129988,-0.641897,-0.780451,-0.835103,-0.876024,0.8741,-0.936747,1.350746
1,0.945023,-1.777823,0.921902,1.073226,1.166389,0.946633,-1.137644,0.910853,0.684455
2,1.958484,0.165682,0.400636,1.703116,1.755063,1.742237,-0.131772,0.910853,0.018164
3,-0.995648,-0.574701,-0.641897,-1.122391,-1.161138,-1.165334,0.8741,-0.32088,2.017037
4,-0.995648,0.25823,0.400636,-1.176382,-1.152082,-1.136403,-1.137644,1.52672,-0.648127


In [191]:
## Model training

In [192]:
from sklearn.linear_model import LinearRegression, Lasso , Ridge, ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score 

In [193]:
regression = LinearRegression()

In [194]:
regression.fit(X_train,y_train)

In [195]:
regression.coef_

array([[ 6432.97591819,  -132.34206204,   -70.48787525, -1701.38593925,
         -494.17005097,   -76.32351645,    68.80035873,  -464.67990411,
          652.10059539]])

In [196]:
regression.intercept_

array([3976.8787389])

In [197]:
import numpy as np 

In [217]:
# adjustedR2 = 1-(((1- r2score)*(N-1))/(N-p-1))
# N is the number of points in your data sample.
# p is the number of independent regressors, i.e. the number of variables in your model, excluding the constant.


N= len(y_train)

p=X_train.shape[1]

In [218]:
df.shape

(193573, 10)

In [219]:
def evaluate_model(true,predicted,X_train,y_train):
    mae = mean_absolute_error(true,predicted)
    mse = mean_squared_error(true,predicted)
    rmse= np.sqrt(mean_squared_error(true,predicted))
    r2score= r2_score(true,predicted)

    N= len(y_train)
    p=X_train.shape[1]

    adjustedR2 = 1-(((1- r2score)*(N-1))/(N-p-1))


    return mae,rmse, r2score,adjustedR2

In [211]:
## Train Multiple Models

In [212]:
models={
    'LinearRegression':LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'ElasticNet': ElasticNet()
}

In [220]:
model_list=[]
r2_list =[]

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)

    #make predictions
    y_pred = model.predict(X_test)
    mae,rmse,r2score, adjustedRsquare= evaluate_model(y_test,y_pred,X_train,y_train)
    model_list.append(list(models.keys())[i])
    r2_list.append(r2score)


    print(list(models.keys())[i])

    print("Model Training Performance")
    print("RMSE :",rmse)
    print("mae :",mae)
    print("R2Score :",r2score*100)
    print("adjustedRsquare :",adjustedRsquare*100 )


    print('='*35)
    print('\n')



LinearRegression
Model Training Performance
RMSE : 1014.6296630375463
mae : 675.0758270067483
R2Score : 93.62906819996049
adjustedRsquare : 93.62864501033017


Lasso
Model Training Performance
RMSE : 1014.6591302750638
mae : 676.2421173665509
R2Score : 93.62869814082755
adjustedRsquare : 93.62827492661603


Ridge
Model Training Performance
RMSE : 1014.6343233534415
mae : 675.1077629781366
R2Score : 93.6290096749163
adjustedRsquare : 93.62858648139847


ElasticNet
Model Training Performance
RMSE : 1533.3541245902313
mae : 1060.9432977143008
R2Score : 85.44967219374031
adjustedRsquare : 85.44870568710698




(193573, 10)