In [1]:
import pandas as pd

In [2]:
# Load the raw diamond dataset.
# The location is expressed relative to the notebook instead of an absolute,
# user-specific home directory so the notebook can run on other machines.
# NOTE(review): assumes the kernel's working directory is the `notebooks/`
# folder (the one containing `data/diamond.csv`) -- adjust DATA_DIR if the
# notebook is launched from somewhere else.
from pathlib import Path

DATA_DIR = Path('data')
df = pd.read_csv(DATA_DIR / 'diamond.csv')

In [3]:
# Remove the 'id' column from the frame before features and target are split.
# Note: this cell rebinds `df`, so running it a second time on the already
# trimmed frame raises KeyError because 'id' is no longer present.
df = df.drop(columns=['id'])

In [4]:
# Separate the target from the predictors.
# `y` is selected with a one-element list so it stays a single-column
# DataFrame (2-D) rather than becoming a Series.
y = df[['price']]
# Everything except the target column is used as a feature.
x = df.drop(columns=['price'])

In [5]:
# Partition the feature columns by dtype: object-typed columns are handled as
# categorical, every other column is handled as numerical.
categorical_columns = x.select_dtypes(include=['object']).columns
numerical_columns = x.select_dtypes(exclude=['object']).columns

In [7]:
# Display the object-dtype feature columns selected above; their order matters
# later because the ordinal category lists are supplied positionally.
categorical_columns

Index(['cut', 'color', 'clarity'], dtype='object')

In [61]:
# Explicit rank order for every ordinal feature.
# The position of a label inside its list is the integer code the
# OrdinalEncoder assigns to it (first entry -> 0, second -> 1, ...).
cut_categories = [
    'Fair',
    'Good',
    'Very Good',
    'Premium',
    'Ideal',
]
color_categories = [
    'D', 'E', 'F', 'G', 'H', 'I', 'J',
]
clarity_categories = [
    'I1',
    'SI2', 'SI1',
    'VS2', 'VS1',
    'VVS2', 'VVS1',
    'IF',
]

In [62]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder


In [67]:
# Numerical features: fill missing values with the column median, then
# standardise.
numerical_steps = [
    ('SimpleImputer', SimpleImputer(strategy='median')),
    ('StandardScaler', StandardScaler()),
]
numerical_pipeline = Pipeline(steps=numerical_steps)

# Categorical features: fill missing values with the most frequent label,
# map labels to their rank using the custom orderings, then standardise.
# The category lists are matched to columns by position, so their order must
# follow the order of `categorical_columns` (cut, color, clarity).
ordinal_categories = [cut_categories, color_categories, clarity_categories]
categorical_steps = [
    ('SimpleImputer', SimpleImputer(strategy='most_frequent')),
    ('OrdinalEncoder', OrdinalEncoder(categories=ordinal_categories)),
    ('StandardScaler', StandardScaler()),
]
categorical_pipeline = Pipeline(steps=categorical_steps)

# Route each group of columns through its own pipeline. The transformer names
# ('num_pipeline', 'cat_pipeline') become prefixes of the output feature names.
preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', numerical_pipeline, numerical_columns),
        ('cat_pipeline', categorical_pipeline, categorical_columns),
    ]
)

In [68]:
from sklearn.model_selection import train_test_split

# Hold out part of the data for evaluation.
# A fixed seed makes the shuffle -- and therefore every score reported below --
# reproducible across kernel restarts; without it each run produces a
# different split. test_size=0.25 spells out scikit-learn's default so the
# train/test proportions are unchanged.
RANDOM_STATE = 42
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=RANDOM_STATE
)

In [69]:
# Fit the preprocessing on the training split only, then reuse the fitted
# transformer on the test split so no test-set statistics leak into training.
train_array = preprocessor.fit_transform(x_train)
# Feature names are only available once the preprocessor has been fitted.
feature_names = preprocessor.get_feature_names_out()

# Wrap the transformed arrays back into labelled DataFrames.
X_train = pd.DataFrame(train_array, columns=feature_names)
X_test = pd.DataFrame(preprocessor.transform(x_test), columns=feature_names)

In [71]:
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score


In [72]:
# Candidate linear regressors to compare, each with its default
# hyper-parameters, keyed by the label used when reporting results.
models = dict([
    ('LinearRegression', LinearRegression()),
    ('Lasso', Lasso()),
    ('Ridge', Ridge()),
    ('ElasticNet', ElasticNet()),
])

In [73]:
# Fit every candidate model on the preprocessed training data and report its
# R^2 score on the held-out test data.
# `models_lst` and `r2_score_lst` are filled in step with each other so that
# entry k of one corresponds to entry k of the other after the loop.
r2_score_lst = []
models_lst = []
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)

    # Keep the results so the models can be compared after the loop instead
    # of only being printed.
    models_lst.append(model_name)
    r2_score_lst.append(r2)

    print('_'*10+model_name+'_'*10)
    print('r2_score = ',r2)

__________LinearRegression__________
r2_score =  0.9374130172842492
__________Lasso__________
r2_score =  0.9373909640325021
__________Ridge__________
r2_score =  0.9374137651764457
__________ElasticNet__________
r2_score =  0.8555734878782347
