In [1]:
import pandas as pd
import numpy as np

In [2]:
data= pd.read_csv("/Users/abhishekjha/Desktop/Dup_ML_project/notebooks/data/gemstone.csv")

In [3]:
data.drop(labels='id', axis=1, inplace=True)

In [4]:
data

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,0.70,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.50,2772
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,1.70,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453
...,...,...,...,...,...,...,...,...,...,...
193568,0.31,Ideal,D,VVS2,61.1,56.0,4.35,4.39,2.67,1130
193569,0.70,Premium,G,VVS2,60.3,58.0,5.75,5.77,3.47,2874
193570,0.73,Very Good,F,SI1,63.1,57.0,5.72,5.75,3.62,3036
193571,0.34,Very Good,D,SI1,62.9,55.0,4.45,4.49,2.81,681


In [5]:
X= data.drop(labels=['price'],axis=1)

In [6]:
Y= data[['price']]

In [7]:
categorical_col=X.select_dtypes(include='object').columns
categorical_col

Index(['cut', 'color', 'clarity'], dtype='object')

In [8]:
Numerical_col= X.select_dtypes(exclude='object').columns
Numerical_col

Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object')

In [9]:
# Define the custom ranking for each ordinal variable of categorical columns
cut_categories = ['Fair', 'Good', 'Very Good','Premium','Ideal']
color_categories = ['D', 'E', 'F', 'G', 'H', 'I', 'J']
clarity_categories = ['I1','SI2','SI1','VS2','VS1','VVS2','VVS1','IF']


In [10]:
from sklearn.impute import SimpleImputer ## HAndling Missing Values
from sklearn.preprocessing import StandardScaler # HAndling Feature Scaling
from sklearn.preprocessing import OrdinalEncoder # Ordinal Encoding
## pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [11]:
num_pipeline= Pipeline(
    steps=[
            ('imputer',SimpleImputer()),
            ('scalar',StandardScaler())
           ]
)

In [12]:
cat_pipeline= Pipeline(
    steps= [
        ('imputer',SimpleImputer(strategy='most_frequent')),
        ('OrdinalEncoder',OrdinalEncoder(categories=[cut_categories,color_categories,clarity_categories]))
    ]
)

In [13]:
preprocessor= ColumnTransformer([
    ('num_pipeline',num_pipeline,Numerical_col),
    ('cat_pipeline',cat_pipeline,categorical_col)
]
)

In [14]:
# Train test split

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test= train_test_split(X,Y,test_size=.30,random_state=40)

In [15]:
preprocessor.fit_transform(X_train)

array([[ 0.45410567, -0.29654463, -0.63784219, ...,  4.        ,
         2.        ,  2.        ],
       [ 3.02766057, -1.58999042,  1.96548435, ...,  3.        ,
         4.        ,  1.        ],
       [ 0.54061172,  1.45884607, -1.15850749, ...,  2.        ,
         4.        ,  1.        ],
       ...,
       [ 2.61675685,  1.82840201, -0.63784219, ...,  2.        ,
         4.        ,  1.        ],
       [ 0.45410567,  0.99690115, -1.6791728 , ...,  1.        ,
         3.        ,  4.        ],
       [-0.88673805, -1.12804549,  2.48614965, ...,  2.        ,
         1.        ,  5.        ]])

In [16]:
preprocessor.transform(X_test)

array([[ 3.33043173,  0.16540029, -0.63784219, ...,  4.        ,
         4.        ,  1.        ],
       [-0.36770177,  0.16540029, -0.11717688, ...,  4.        ,
         2.        ,  2.        ],
       [ 0.8650094 , -0.20415565, -1.15850749, ...,  4.        ,
         4.        ,  2.        ],
       ...,
       [-0.71372596, -0.85087854, -0.63784219, ...,  4.        ,
         1.        ,  4.        ],
       [ 1.12452754,  0.81212318,  0.40348843, ...,  3.        ,
         4.        ,  1.        ],
       [-0.9732441 , -0.75848956,  0.40348843, ...,  3.        ,
         1.        ,  3.        ]])

In [17]:
preprocessor.get_feature_names_out()

array(['num_pipeline__carat', 'num_pipeline__depth',
       'num_pipeline__table', 'num_pipeline__x', 'num_pipeline__y',
       'num_pipeline__z', 'cat_pipeline__cut', 'cat_pipeline__color',
       'cat_pipeline__clarity'], dtype=object)

In [18]:
X_train= pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())

In [19]:
X_test=pd.DataFrame(preprocessor.fit_transform(X_test),columns=preprocessor.get_feature_names_out())

In [20]:
"""linear regression
ridge regression = Overfitting
lasso regression = Select correct feature
elastic net  -= Is is a balance orf ridge and lasso
"""


'linear regression\nridge regression = Overfitting\nlasso regression = Select correct feature\nelastic net  -= Is is a balance orf ridge and lasso\n'

In [21]:
#Model Training
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score, mean_absolute_error,mean_squared_error


In [22]:
import numpy as np

def evaluate_model(true, predicted):
    mae= mean_absolute_error(true, predicted)
    mse= mean_squared_error( true, predicted)
    rmse= np.sqrt(mean_squared_error(true, predicted))
    r2= r2_score(true, predicted)
    return mae,mse, rmse, r2

    

In [23]:
# Now we will create a dictionary so that we can get all the regression class from the imported library

Models= {
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso':Lasso(),
    'ElasticNet': ElasticNet()
}

In [24]:
trained_model_list=[]
model_list=[]
r2_list=[]

In [25]:
list(Models)

['LinearRegression', 'Ridge', 'Lasso', 'ElasticNet']

In [26]:
model_list = []
r2_list = []

for model_key, model_value in Models.items():
    model_value.fit(X_train, y_train)
    y_pred = model_value.predict(X_test)

    mae, mse, rmse, r2 = evaluate_model(y_test, y_pred)
    model_result = {'Model': model_key, 'MAE': mae, 'MSE': mse, 'RMSE': rmse, 'R2': r2}
    model_list.append(model_result)

    # Print the results for each model
    print(f"Model Training Performance - {model_key}")
    print("RMSE:", rmse)
    print("MAE:", mae)
    print("R2 score", r2 * 100)
    print('=' * 30)
    print('\n')

    r2_list.append(r2)

# Print the list of all model names outside the loop
print("List of Model Names:", list(Models.keys()))


Model Training Performance - LinearRegression
RMSE: 1017.1388396871133
MAE: 673.1381700533112
R2 score 93.63473228301328


Model Training Performance - Ridge
RMSE: 1017.1401057899576
MAE: 673.161964226691
R2 score 93.6347164364282


Model Training Performance - Lasso
RMSE: 1015.6188074408999
MAE: 673.831093958044
R2 score 93.65374282954916


Model Training Performance - ElasticNet
RMSE: 1521.3723072256082
MAE: 1051.3089846673818
R2 score 85.7594352986082


List of Model Names: ['LinearRegression', 'Ridge', 'Lasso', 'ElasticNet']


In [27]:
model_list

[{'Model': 'LinearRegression',
  'MAE': 673.1381700533112,
  'MSE': 1034571.4192000472,
  'RMSE': 1017.1388396871133,
  'R2': 0.9363473228301328},
 {'Model': 'Ridge',
  'MAE': 673.161964226691,
  'MSE': 1034573.9948064062,
  'RMSE': 1017.1401057899576,
  'R2': 0.9363471643642821},
 {'Model': 'Lasso',
  'MAE': 673.831093958044,
  'MSE': 1031481.5620276757,
  'RMSE': 1015.6188074408999,
  'R2': 0.9365374282954916},
 {'Model': 'ElasticNet',
  'MAE': 1051.3089846673818,
  'MSE': 2314573.69719297,
  'RMSE': 1521.3723072256082,
  'R2': 0.857594352986082}]