In [25]:
import pandas as pd 
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [13]:
# Load the raw dataset from the project-relative data folder and preview
# the first three rows.
csv_path = "./data/cubic_zirconia.csv"
data = pd.read_csv(csv_path)
data.head(3)

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1,0.3,Ideal,E,SI1,62.1,58.0,4.27,4.29,2.66,499
1,2,0.33,Premium,G,IF,60.8,58.0,4.42,4.46,2.7,984
2,3,0.9,Very Good,E,VVS2,62.2,60.0,6.04,6.12,3.78,6289


In [14]:
# Remove every column whose name starts with "Unnamed" (e.g. the
# 'Unnamed: 0' column visible in the preview above the cell).
unnamed_cols = data.columns[data.columns.str.contains('^Unnamed', na=False)]
data = data.drop(columns=unnamed_cols)

In [15]:
# Report how many fully duplicated rows exist, remove them, then report again
# to confirm none are left.
n_duplicates_before = data.duplicated().sum()
print(f"Duplicates before: {n_duplicates_before}")

data = data.drop_duplicates()

n_duplicates_after = data.duplicated().sum()
print(f"Duplicates after: {n_duplicates_after}")

Duplicates before: 34
Duplicates after: 0


In [16]:
# Count missing values per column (isna is the same check as isnull).
data.isna().sum()

carat        0
cut          0
color        0
clarity      0
depth      697
table        0
x            0
y            0
z            0
price        0
dtype: int64

In [17]:
# Separate the regression target ('price') from the predictor columns.
y = data['price']
X = data.drop(columns=['price'])

In [18]:
# Preview the first three rows of the feature frame (target column removed).
X.head(3)

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,0.3,Ideal,E,SI1,62.1,58.0,4.27,4.29,2.66
1,0.33,Premium,G,IF,60.8,58.0,4.42,4.46,2.7
2,0.9,Very Good,E,VVS2,62.2,60.0,6.04,6.12,3.78


In [19]:
# Preview the first three values of the target series ('price').
y.head(3)

0     499
1     984
2    6289
Name: price, dtype: int64

In [20]:
# Split the feature columns by dtype: object-typed columns are treated as
# categorical, everything else as numeric.
cat_cols = X.select_dtypes(include='object').columns
num_cols = X.select_dtypes(exclude='object').columns

# Show both groups so the split can be checked by eye.
for label, cols in (('cat_cols: ', cat_cols), ('num_cols: ', num_cols)):
    print(label, cols)

cat_cols:  Index(['cut', 'color', 'clarity'], dtype='object')
num_cols:  Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object')


In [21]:
# Explicit orderings for the ordinal features. The position of a label in its
# list is the integer code it is mapped to (first entry -> 0, last -> n-1).
# NOTE(review): cut and clarity look ordered worst -> best while color 'D'..'J'
# looks best -> worst under common gem grading -- confirm this is intended.
cut_categories = [
    'Fair',
    'Good',
    'Very Good',
    'Premium',
    'Ideal',
]
color_categories = [
    'D',
    'E',
    'F',
    'G',
    'H',
    'I',
    'J',
]
clarity_categories = [
    'I1',
    'SI2',
    'SI1',
    'VS2',
    'VS1',
    'VVS2',
    'VVS1',
    'IF',
]

In [24]:
# Numeric branch: fill missing values with SimpleImputer's default strategy
# (the mean, per the scikit-learn docs), then standardize each column.
# The null summary above shows 'depth' is the numeric column with gaps.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler()),
])

In [26]:
# Categorical branch: fill missing labels with the most frequent one, then map
# each label to its position in the matching ranking list.
# The ranking lists are passed in the same order as cat_cols
# (cut, color, clarity); OrdinalEncoder pairs them with columns by position.
ordinal_rankings = [cut_categories, color_categories, clarity_categories]

cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinalencoder', OrdinalEncoder(categories=ordinal_rankings)),
])

In [27]:
# Route each column group through its own pipeline. The transformer names
# become the prefixes of the output feature names
# (e.g. 'num_pipeline__carat', 'cat_pipeline__cut').
preprocessor = ColumnTransformer(transformers=[
    ('num_pipeline', num_pipeline, num_cols),
    ('cat_pipeline', cat_pipeline, cat_cols),
])

In [28]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=30,
)

In [29]:
# Fit the preprocessor on the training split and display the transformed
# array. The result is only shown here, not stored; the fitted state of
# `preprocessor` is what later cells rely on.
preprocessor.fit_transform(X_train)

array([[ 2.57516974,  0.10749241, -0.20045303, ...,  4.        ,
         4.        ,  1.        ],
       [-0.87815872,  0.10749241, -1.10204094, ...,  4.        ,
         2.        ,  3.        ],
       [-0.8363002 , -0.89741318,  0.25034093, ...,  2.        ,
         0.        ,  3.        ],
       ...,
       [ 0.46131414,  0.17927138, -0.65124699, ...,  4.        ,
         0.        ,  5.        ],
       [-1.04559283,  1.04061903, -0.20045303, ...,  1.        ,
         0.        ,  1.        ],
       [-1.04559283,  0.        , -1.10204094, ...,  4.        ,
         4.        ,  5.        ]])

In [30]:
# Apply the already-fitted preprocessor to the test split (transform only,
# no re-fit) and display the resulting array.
preprocessor.transform(X_test)

array([[ 0.8314473 ,  0.40583225, -1.09651681, ...,  4.        ,
         3.        ,  2.        ],
       [-0.91272079,  0.83585126, -0.65477855, ...,  2.        ,
         1.        ,  2.        ],
       [ 1.60896802,  0.40583225,  2.87912748, ...,  1.        ,
         6.        ,  3.        ],
       ...,
       [-0.93373487,  0.40583225,  1.11217446, ...,  2.        ,
         1.        ,  4.        ],
       [ 0.55826435,  0.33416242,  0.22869795, ...,  2.        ,
         4.        ,  2.        ],
       [-0.57649562,  0.54917192, -0.65477855, ...,  3.        ,
         1.        ,  3.        ]])

In [31]:
# Show the output column names of the fitted preprocessor; each name is
# prefixed with the transformer it came from.
preprocessor.get_feature_names_out()

array(['num_pipeline__carat', 'num_pipeline__depth',
       'num_pipeline__table', 'num_pipeline__x', 'num_pipeline__y',
       'num_pipeline__z', 'cat_pipeline__cut', 'cat_pipeline__color',
       'cat_pipeline__clarity'], dtype=object)

It's common practice to call `fit_transform` on the training data (`X_train`) and only `transform` on the test data (`X_test`) when applying preprocessing techniques like scaling or encoding. Here's why:

fit_transform:

Used on the training data (X_train) because it performs two steps:
fit: Analyzes the data to learn parameters (such as the mean and standard deviation used for standardization).
transform: Applies those learned parameters to the data.
This ensures every preprocessing parameter is estimated from the training data alone, so the model is trained on features scaled and encoded with the training set's own statistics.
transform:

Used on the test data (X_test) because it only applies the parameters already learned from the training data; nothing is re-fitted.
Reusing the training-set parameters keeps the train and test features consistent and prevents "data leakage," where information from the test data influences the model.

In [32]:
# Replace the raw splits with their preprocessed versions, as DataFrames
# labelled with the preprocessor's output feature names.
# NOTE: this cell overwrites X_train / X_test, so it cannot be re-run without
# first re-running the train/test split cell. The new frames also get a fresh
# 0..n-1 index, so they line up with y_train / y_test by position only.
train_matrix = preprocessor.fit_transform(X_train)
test_matrix = preprocessor.transform(X_test)
feature_names = preprocessor.get_feature_names_out()

X_train = pd.DataFrame(train_matrix, columns=feature_names)
X_test = pd.DataFrame(test_matrix, columns=feature_names)

In [33]:
# Preview the preprocessed training features.
X_train.head(3)

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,2.57517,0.107492,-0.200453,2.13679,2.115004,2.04448,4.0,4.0,1.0
1,-0.878159,0.107492,-1.102041,-0.925939,-0.987762,-0.89272,4.0,2.0,3.0
2,-0.8363,-0.897413,0.250341,-0.863796,-0.81787,-0.878994,2.0,0.0,3.0


In [34]:
# Preview the preprocessed test features.
X_test.head(3)

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,0.817112,0.394608,-1.102041,0.911699,0.961526,0.946461,4.0,3.0,2.0
1,-0.920017,0.825282,-0.651247,-1.076856,-1.041412,-0.933895,2.0,1.0,2.0
2,1.591494,0.394608,2.955105,1.488735,1.453319,1.454295,1.0,6.0,3.0


In [35]:
## Model Training

from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [42]:

def evaluate_model(true, predicted):
    """Compute regression metrics for a set of predictions.

    Parameters
    ----------
    true : array-like
        Ground-truth target values.
    predicted : array-like
        Predicted target values, aligned with ``true``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2_square)``: mean absolute error, root mean squared
        error, and the R^2 score exactly as returned by ``r2_score`` (a
        fraction, not a percentage).
    """
    mae = mean_absolute_error(true, predicted)
    # Compute the MSE once and derive the RMSE from it instead of calling
    # mean_squared_error a second time.
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [49]:
# Train multiple models
#
# Fit each candidate linear model on the preprocessed training split and score
# it on the held-out test split.

models = {
    'LinearRegression': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'ElasticNet': ElasticNet()
}

trained_model_list = []  # fitted estimators, in the same order as model_list
model_list = []          # model names
r2_list = []             # R^2 on the test split (fraction, not percent)

# Iterate over (name, estimator) pairs directly instead of rebuilding
# list(models.keys()) / list(models.values()) on every pass.
for model_name, model in models.items():
    model.fit(X_train, y_train)
    trained_model_list.append(model)

    # make predictions
    y_pred = model.predict(X_test)

    # validation test score
    mae, rmse, r2_square = evaluate_model(y_test, y_pred)

    print(model_name)
    model_list.append(model_name)

    # NOTE: despite the heading text, the metrics below are computed on the
    # test split (y_test vs. predictions for X_test), not on training data.
    print('Model Training Performance')
    print("RMSE:",rmse)
    print("MAE:",mae)
    print("R2 score",r2_square * 100)

    r2_list.append(r2_square)

    print('='*35)
    print('\n')

# NOTE(review): the recorded output ends with a coordinate-descent warning
# line; if that is a convergence warning, consider raising max_iter on the
# affected model -- confirm which estimator triggers it.

LinearRegression
Model Training Performance
RMSE: 1473.1005053210968
MAE: 820.6706774924796
R2 score 86.82984247130878


Lasso
Model Training Performance
RMSE: 1304.8029224725942
MAE: 817.4104203142693
R2 score 89.66724685137312


Ridge
Model Training Performance
RMSE: 1453.661246759983
MAE: 820.4582077880251
R2 score 87.17513984874417


ElasticNet
Model Training Performance
RMSE: 1646.2943219315525
MAE: 1075.6714684341237
R2 score 83.55093664846285




  model = cd_fast.enet_coordinate_descent(
