In [1]:
import pandas as pd

df=pd.read_csv("./data/gemstone.csv")
df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [2]:
df.drop(['id'],axis=1,inplace=True)

In [3]:
# Independent and Dependent Features 

X=df.drop('price',axis=1)
y=df['price']

In [6]:
cat_cols=X.select_dtypes(include='object').columns
num_cols=X.select_dtypes(exclude='object').columns

In [19]:
X['cut'].unique()
X['color'].unique()
X['clarity'].unique()

array(['VS2', 'SI2', 'VS1', 'SI1', 'IF', 'VVS2', 'VVS1', 'I1'],
      dtype=object)

In [20]:
# ordinal values for ranking 
cut_types=[ 'Good', 'Fair','Very Good','Premium', 'Ideal',]
color_types=[ 'D','E', 'F', 'G', 'H', 'I', 'J']
clarity_type=[ 'I1', 'SI2', 'SI1', 'VS2','VS1',  'VVS2', 'VVS1','IF']

### PIPELINE 

In [21]:
from sklearn.impute import SimpleImputer   ## handling missing values if any 
from sklearn.preprocessing import StandardScaler  ## for scaling down the numerical data
from sklearn.preprocessing import OrdinalEncoder  ## for encoding the ordinal values 

from sklearn.pipeline import Pipeline     # for building pipelines 
from sklearn.compose import ColumnTransformer    # for combining the pipelines together 

In [23]:
num_pipeline=Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='median')),
         ('scaler',StandardScaler())
    ]
)


cat_pipeline=Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='most_frequent')),
        ('ordinalencoder',OrdinalEncoder(categories=[cut_types,color_types,clarity_type])),
        ('scaler',StandardScaler())
    ]
)

In [24]:
preprocessor=ColumnTransformer(
   [ ('numerical_pipeline',num_pipeline,num_cols),
     ('categorical_pipeline',cat_pipeline,cat_cols)]
)

In [25]:
preprocessor

In [26]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=0.25,random_state=22)

In [27]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.fit_transform(X_test),columns=preprocessor.get_feature_names_out())


In [28]:
X_train.head()

Unnamed: 0,numerical_pipeline__carat,numerical_pipeline__depth,numerical_pipeline__table,numerical_pipeline__x,numerical_pipeline__y,numerical_pipeline__z,categorical_pipeline__cut,categorical_pipeline__color,categorical_pipeline__clarity
0,-1.145771,1.644915,-0.118734,-1.5448,-1.522906,-1.454139,-2.763991,-0.935013,2.015308
1,-0.411946,0.720878,-2.204473,-0.274977,-0.26295,-0.208772,0.822923,-0.318907,0.683317
2,-0.994689,-0.757581,0.924136,-1.148543,-1.124071,-1.179,-0.073805,-0.935013,2.015308
3,1.983779,-0.018352,-0.118734,1.805373,1.731225,1.760646,0.822923,0.913306,0.017322
4,-0.045033,0.536071,1.967005,0.040228,0.054305,0.095329,-2.763991,-0.935013,0.683317


In [98]:
from sklearn.linear_model import Lasso,LinearRegression,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
import numpy as np

In [99]:
linear_model=LinearRegression()
linear_model.fit(X_train,y_train)




In [100]:
lasso_model=Lasso()
lasso_model.fit(X_train,y_train)



In [101]:

ridge_model=Ridge()
ridge_model.fit(X_train,y_train)



In [102]:
elastic_model=ElasticNet()
elastic_model.fit(X_train,y_train)

In [103]:
y_pred_linear=linear_model.predict(X_test)
y_pred_lasso=lasso_model.predict(X_test)
y_pred_ridge=ridge_model.predict(X_test)
y_pred_elastic=elastic_model.predict(X_test)

0.9368899906474544

0.9368779016150411

0.9368899598553743

In [113]:
from sklearn.metrics import r2_score as raccuracy

In [114]:
def evaluate_model(true,pred):
    mse=mean_squared_error(true,pred)
    r2=raccuracy(true,pred)
    mae=mean_absolute_error(true,pred)
    rmse=np.sqrt(mse)
    return (mse,rmse,mae,r2)
    


In [109]:
y_pred_elastic

array([-239.01239004, -143.20371019, -160.19432016, ...,  485.81866159,
        987.93913834,  175.3788611 ])

In [115]:
models={
    'LinearRegression':LinearRegression(),
    'LassoRegression':Lasso(),
    'RidgeRegression':Ridge(),
    'ElasticNet':ElasticNet()
}
model_list=[]
for i in range(len(list(models))):
   model= list(models.values())[i]
   model.fit(X_train,y_train)
   y_pred=model.predict(X_test)
   mse,rmse,mae,r2=evaluate_model(y_test,y_pred)
   model_list.append(list(models.keys())[i])
   print('Model Training Performance',model_list[i])
   print("MSE :",mse)
   print("RMSE : ",rmse)
   print("MAE :",mae)
   print("R2 :", r2)
  

Model Training Performance LinearRegression
MSE : 1013114.5732728026
RMSE :  1006.5359274625038
MAE : 673.0935742651536
R2 : 0.9368899906474544
Model Training Performance LassoRegression
MSE : 1013308.6403477459
RMSE :  1006.6323262978126
MAE : 674.422765666446
R2 : 0.9368779016150411
Model Training Performance RidgeRegression
MSE : 1013115.0675827469
RMSE :  1006.5361730125485
MAE : 673.1243954435049
R2 : 0.9368899598553743
Model Training Performance ElasticNet
MSE : 2297560.79899289
RMSE :  1515.7706947269069
MAE : 1060.9151381098718
R2 : 0.8568779017322069
