# **Feature Engineering** 

In [56]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import joblib

In [57]:
# Read the cleaned CSV into a DataFrame. The path is relative, so it is
# resolved against the notebook's current working directory.
cleaned_csv_path = '../data/processed_data/cleaned_data.csv'
data = pd.read_csv(cleaned_csv_path)

In [58]:
# Bare expression at the end of the cell: the notebook renders the freshly
# loaded frame as a table so its columns can be inspected.
data

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price,carat_bin
0,0,0.30,Ideal,E,SI1,62.1,58.0,4.27,4.29,2.66,499,Small
1,1,0.33,Premium,G,IF,60.8,58.0,4.42,4.46,2.70,984,Small
2,2,0.90,Very Good,E,VVS2,62.2,60.0,6.04,6.12,3.78,6289,Large
3,3,0.42,Ideal,F,VS1,61.6,56.0,4.82,4.80,2.96,1082,Medium
4,4,0.31,Ideal,F,VVS1,60.4,59.0,4.35,4.43,2.65,779,Small
...,...,...,...,...,...,...,...,...,...,...,...,...
26917,26962,1.11,Premium,G,SI1,62.3,58.0,6.61,6.52,4.09,5408,Extra Large
26918,26963,0.33,Ideal,H,IF,61.9,55.0,4.44,4.42,2.74,1114,Small
26919,26964,0.51,Premium,E,VS2,61.7,58.0,5.12,5.15,3.17,1656,Medium
26920,26965,0.27,Very Good,F,VVS2,61.8,56.0,4.19,4.20,2.60,682,Small


In [59]:
# Remove two columns that are not used as features below, mutating `data`
# in place. 'Unnamed: 0' is presumably a row index saved by an earlier
# to_csv call -- confirm against the notebook that wrote the CSV.
columns_to_discard = ['Unnamed: 0', 'carat_bin']
data.drop(columns_to_discard, axis=1, inplace=True)

In [60]:
# Re-display the frame to confirm 'Unnamed: 0' and 'carat_bin' are gone.
data

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,0.30,Ideal,E,SI1,62.1,58.0,4.27,4.29,2.66,499
1,0.33,Premium,G,IF,60.8,58.0,4.42,4.46,2.70,984
2,0.90,Very Good,E,VVS2,62.2,60.0,6.04,6.12,3.78,6289
3,0.42,Ideal,F,VS1,61.6,56.0,4.82,4.80,2.96,1082
4,0.31,Ideal,F,VVS1,60.4,59.0,4.35,4.43,2.65,779
...,...,...,...,...,...,...,...,...,...,...
26917,1.11,Premium,G,SI1,62.3,58.0,6.61,6.52,4.09,5408
26918,0.33,Ideal,H,IF,61.9,55.0,4.44,4.42,2.74,1114
26919,0.51,Premium,E,VS2,61.7,58.0,5.12,5.15,3.17,1656
26920,0.27,Very Good,F,VVS2,61.8,56.0,4.19,4.20,2.60,682


## STEP 1: Log-transform the Target Variable (price)

In [61]:
# Store the natural log of `price` in a new `log_price` column; that column
# is used as the regression target (`y`) in the scaling step further down.
price_column = data.loc[:, 'price']
data['log_price'] = np.log(price_column)

## STEP 2: Encode Ordered Categorical Features

In [62]:
# Ordinal encodings for the three ordered categorical columns. Each list is
# written from lowest to highest grade, and the rank assigned to a label is
# its 1-based position in that list.
_cut_grades = ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']
_color_grades = ['J', 'I', 'H', 'G', 'F', 'E', 'D']
_clarity_grades = ['I3', 'I2', 'I1', 'SI2', 'SI1',
                   'VS2', 'VS1', 'VVS2', 'VVS1', 'IF', 'FL']

cut_order = {grade: rank for rank, grade in enumerate(_cut_grades, start=1)}
color_order = {grade: rank for rank, grade in enumerate(_color_grades, start=1)}
clarity_order = {grade: rank for rank, grade in enumerate(_clarity_grades, start=1)}

In [63]:
# Replace each categorical label with its integer rank, column by column.
# Series.map yields NaN for any label missing from the mapping, so running
# this cell a second time (on already-encoded integers) would blank the
# columns out.
for column_name, rank_by_label in (
    ('cut', cut_order),
    ('color', color_order),
    ('clarity', clarity_order),
):
    data[column_name] = data[column_name].map(rank_by_label)

In [64]:
# Inspect the frame after ordinal encoding and the added `log_price` column.
data

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price,log_price
0,0.30,5,6,5,62.1,58.0,4.27,4.29,2.66,499,6.212606
1,0.33,4,4,10,60.8,58.0,4.42,4.46,2.70,984,6.891626
2,0.90,3,6,8,62.2,60.0,6.04,6.12,3.78,6289,8.746557
3,0.42,5,5,7,61.6,56.0,4.82,4.80,2.96,1082,6.986566
4,0.31,5,5,9,60.4,59.0,4.35,4.43,2.65,779,6.658011
...,...,...,...,...,...,...,...,...,...,...,...
26917,1.11,4,4,5,62.3,58.0,6.61,6.52,4.09,5408,8.595635
26918,0.33,5,3,10,61.9,55.0,4.44,4.42,2.74,1114,7.015712
26919,0.51,4,6,6,61.7,58.0,5.12,5.15,3.17,1656,7.412160
26920,0.27,3,5,8,61.8,56.0,4.19,4.20,2.60,682,6.525030


## STEP 3: Drop Multicollinear Features

In [65]:
# Drop the x, y and z columns in place; this notebook treats them as
# multicollinear with the features that remain.
data.drop(columns=['x', 'y', 'z'], inplace=True)

## STEP 4: Log-transform Skewed Numeric Features

In [66]:
# Add the natural log of `carat` as a new `log_carat` column; the raw
# `carat` column is kept for now and removed in a later step.
carat_column = data.loc[:, 'carat']
data['log_carat'] = np.log(carat_column)

## STEP 5: Save an Unscaled Copy of the Data

In [67]:
# Build a separate frame without the two log-derived columns, so it still
# carries the raw `price` and `carat`. `data` itself is left untouched.
log_derived_columns = ['log_price', 'log_carat']
data_without_log = data.drop(columns=log_derived_columns)

In [68]:
# Preview the copy that excludes the log-transformed columns.
data_without_log

Unnamed: 0,carat,cut,color,clarity,depth,table,price
0,0.30,5,6,5,62.1,58.0,499
1,0.33,4,4,10,60.8,58.0,984
2,0.90,3,6,8,62.2,60.0,6289
3,0.42,5,5,7,61.6,56.0,1082
4,0.31,5,5,9,60.4,59.0,779
...,...,...,...,...,...,...,...
26917,1.11,4,4,5,62.3,58.0,5408
26918,0.33,5,3,10,61.9,55.0,1114
26919,0.51,4,6,6,61.7,58.0,1656
26920,0.27,3,5,8,61.8,56.0,682


In [69]:
# Write the unscaled frame to the current working directory.
# index=True is the pandas default and is spelled out to make the behavior
# visible: the row index is written as an unnamed first column, which
# read_csv later loads as 'Unnamed: 0'.
# NOTE(review): switch to index=False if that column is unwanted, but check
# whatever reads this file first.
data_without_log.to_csv("unscaled_processed_data.csv", index=True)

## STEP 6: Drop Original Columns After Transformation

In [70]:
# The log-transformed versions replace the raw columns from here on, so the
# originals are removed from `data` in place.
data.drop(columns=['price', 'carat'], inplace=True)

## STEP 7: Feature Scaling

In [71]:
# Separate the target (`log_price`) from the feature matrix (every other
# column still present in `data`).
target_column = 'log_price'
y = data[target_column]
X = data.drop(columns=target_column)

In [72]:
# Hold out 20% of the rows for testing; the fixed seed makes the shuffle,
# and therefore the split, reproducible between runs.
holdout_fraction = 0.2
split_seed = 42
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=holdout_fraction,
    random_state=split_seed,
)

In [73]:
# Learn the per-feature mean and standard deviation from the training rows
# only, then apply that same fitted transform to both splits so no
# test-set statistics leak into the scaling.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [74]:
# Persist the scaled feature arrays together with the targets as a single
# 4-tuple, in the order (X_train_scaled, X_test_scaled, y_train, y_test).
# NOTE(review): the filename is spelled 'sacled' (probably meant 'scaled').
# It is kept as-is because loaders elsewhere presumably use this exact
# name -- confirm before renaming.
# NOTE(review): the fitted `scaler` is not saved here; verify it is
# persisted somewhere if new data must be scaled at inference time.
split_bundle = (X_train_scaled, X_test_scaled, y_train, y_test)
joblib.dump(split_bundle, 'sacled_train_test_split.pkl')

['sacled_train_test_split.pkl']