In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder,OneHotEncoder, OrdinalEncoder, StandardScaler
import pickle
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the raw diamonds dataset from disk and preview its first five rows
df = pd.read_csv(filepath_or_buffer='diamonds.csv')
df.head(n=5)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


## Feature Definitions

- carat
    - Weight of the diamond (in carats). A primary factor influencing price.
- cut 
    - Quality of the diamond's cut. Ordinal categories: Fair, Good, Very Good, Premium, Ideal.
    - Use ordinal (integer) encoding for tree-based models, since the categories are ordered.
    - Use one-hot encoding for linear and distance-based models, because these models treat inputs as continuous: encoding the categories as 1–5 would incorrectly imply equal spacing and a linear relationship.
- color
    - Diamond color grading from best (D) to worst (J). Ordinal categories: D, E, F, G, H, I, J.
    - Use the same encoding approach as for cut.
-   clarity
    - Measure of diamond inclusions (internal flaws) and blemishes. Categories: IF (Internally Flawless), VVS1, VVS2, VS1, VS2, SI1, SI2, I1.
    - It is an ordinal categorical variable.
    - IF > VVS1 > VVS2 > VS1 > VS2 > SI1 > SI2 > I1
- x
    - Length of the diamond (in mm).
- y
    - Width of the diamond (in mm).
- z
    - Depth (height) of the diamond (in mm).
- depth
    - Total depth percentage = (z / mean(x, y)) × 100. Affects brilliance and appearance.
- table
    - Width of the diamond’s top facet as a percentage of the average diameter. Impacts how light reflects internally.
- price
    - Price of the diamond in USD. Target variable (to be converted to INR).

In [19]:
# Smallest value in every column -- a quick sanity check on the value ranges
df.min(axis=0)

carat       0.2
cut        Fair
color         D
clarity      I1
depth      43.0
table      43.0
price       326
x           0.0
y           0.0
z           0.0
dtype: object

In [20]:
# Largest value in every column -- a quick sanity check on the value ranges
df.max(axis=0)

carat           5.01
cut        Very Good
color              J
clarity         VVS2
depth           79.0
table           95.0
price          18823
x              10.74
y               58.9
z               31.8
dtype: object

In [21]:
# Preview the last five rows of the frame
df.tail(n=5)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.5
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
53937,0.7,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74
53939,0.75,Ideal,D,SI2,62.2,55.0,2757,5.83,5.87,3.64


In [22]:
# True if at least one cell anywhere in the frame is missing, False otherwise
df.isna().any().any()

False

In [23]:
# Report how many fully duplicated rows the frame contains.
# The count is computed once and reused for both the test and the message.
duplicate_count = int(df.duplicated().sum())
if duplicate_count:
    print(f'Duplicate Rows {duplicate_count}')
else:
    print("No Duplicated Rows")

Dulplicate Rows 146


In [24]:
# Drop fully duplicated rows, then verify that none remain.
# Reassigning (instead of inplace=True) yields a fresh frame, so re-running
# this cell never mutates an object that another name may still reference.
df = df.drop_duplicates()
remaining_duplicates = int(df.duplicated().sum())
if remaining_duplicates:
    print(f'Duplicate Rows {remaining_duplicates}')
else:
    print("No Duplicated Rows")

No Duplicated Rows


In [25]:
# Report the current size of the frame
rows = len(df.index)
columns = len(df.columns)
print(f"{rows} Rows and {columns} columns")

53794 Rows and 10 columns


## Feature Engineering

In [26]:
# Rows whose x, y and z are ALL 0: show them, then remove them from the frame.
all_dims_zero = (df[['x', 'y', 'z']] == 0).all(axis=1)  # computed once, reused below
rows_with_zeros = df[all_dims_zero]
print(rows_with_zeros)
mask_to_keep = ~all_dims_zero
# .copy() makes df an independent frame, so later in-place writes
# (e.g. df.loc[..., 'z'] = ...) do not target a slice of the old frame.
df = df[mask_to_keep].copy()
df.shape

       carat        cut color clarity  depth  table  price    x    y    z
11963   1.00  Very Good     H     VS2   63.3   53.0   5139  0.0  0.0  0.0
15951   1.14       Fair     G     VS1   57.5   67.0   6381  0.0  0.0  0.0
24520   1.56      Ideal     G     VS2   62.2   54.0  12800  0.0  0.0  0.0
26243   1.20    Premium     D    VVS1   62.1   59.0  15686  0.0  0.0  0.0
27429   2.25    Premium     H     SI2   62.8   59.0  18034  0.0  0.0  0.0
49556   0.71       Good     F     SI2   64.1   60.0   2130  0.0  0.0  0.0


(53788, 10)

In [27]:
# Show the rows in which at least one of x, y, z is recorded as 0
rows_with_zeros = df.loc[df[['x', 'y', 'z']].eq(0).any(axis=1)]
rows_with_zeros

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
2207,1.0,Premium,G,SI2,59.1,59.0,3142,6.55,6.48,0.0
2314,1.01,Premium,H,I1,58.1,59.0,3167,6.66,6.6,0.0
4791,1.1,Premium,G,SI2,63.0,59.0,3696,6.5,6.47,0.0
5471,1.01,Premium,F,SI2,59.2,58.0,3837,6.5,6.47,0.0
10167,1.5,Good,G,I1,64.0,61.0,4731,7.15,7.04,0.0
11182,1.07,Ideal,F,SI2,61.6,56.0,4954,0.0,6.62,0.0
13601,1.15,Ideal,G,VS2,59.2,56.0,5564,6.88,6.83,0.0
24394,2.18,Premium,H,SI2,59.4,61.0,12631,8.49,8.45,0.0
26123,2.25,Premium,I,SI1,61.3,58.0,15397,8.52,8.42,0.0
27112,2.2,Premium,H,SI1,61.2,59.0,17265,8.42,8.37,0.0


In [28]:
# Recover z where it is recorded as 0 but x and y are known, by inverting
# depth = z / mean(x, y) * 100. Rows whose x or y is also 0 are left untouched.
z_recoverable = df['z'].eq(0) & df['x'].ne(0) & df['y'].ne(0)
z_from_depth = (df['depth'] / 100) * (df['x'] + df['y']) / 2
# The right-hand side is aligned on the index, so only the masked rows are written
df.loc[z_recoverable, 'z'] = z_from_depth



In [29]:
# Re-check: list any rows that still have a 0 in x, y or z
df.loc[df[['x', 'y', 'z']].eq(0).any(axis=1)]

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
11182,1.07,Ideal,F,SI2,61.6,56.0,4954,0.0,6.62,0.0


In [30]:
# Remove the rows that still contain a 0 in x, y or z, then display whatever
# zero rows are left -- the displayed frame is expected to be empty.
has_zero_dim = (df[['x', 'y', 'z']] == 0).any(axis=1)
mask_to_keep = ~has_zero_dim
# .copy() makes df an independent frame, so the new columns added afterwards
# are not written onto a slice of the previous frame.
df = df[mask_to_keep].copy()
df[(df[['x', 'y', 'z']] == 0).any(axis=1)]

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z


In [31]:
# Smallest value in every column, to confirm x, y and z no longer contain 0
df.min(axis=0)

carat       0.2
cut        Fair
color         D
clarity      I1
depth      43.0
table      43.0
price       326
x          3.73
y          3.68
z          1.07
dtype: object

In [32]:
# Largest value in every column
df.max(axis=0)

carat           5.01
cut        Very Good
color              J
clarity         VVS2
depth           79.0
table           95.0
price          18823
x              10.74
y               58.9
z               31.8
dtype: object

### Deriving New Columns
- Volume = x * y * z
- Price per Carat = price / carat
- Dimension Ratio = (x + y) / (2 * z)
- Carat Category: Low (< 0.5), Medium (0.5 to < 1.5), High (≥ 1.5)

In [12]:
# --- Derived numeric features ---
# volume = x * y * z
df['volume'] = df[['x', 'y', 'z']].prod(axis=1)
# NOTE(review): price_per_carat is computed from price, which is the target
# variable; confirm it is never used as an input feature of a price model,
# otherwise it leaks the target.
df['price_per_carat'] = df['price'] / df['carat']
# dim_ratio = (x + y) / (2 * z)
df['dim_ratio'] = df[['x', 'y']].sum(axis=1) / (2 * df['z'])

# --- Carat bucket: Low (< 0.5), Medium (0.5 <= carat < 1.5), High (>= 1.5) ---
conditions = [
    df['carat'] < 0.5,
    (df['carat'] >= 0.5) & (df['carat'] < 1.5),
    df['carat'] >= 1.5
]

choices = ['Low', 'Medium', 'High']

# np.select's implicit default is the integer 0, which NumPy 2.x refuses to
# combine with string choices (TypeError: no common dtype). Pass a string
# default explicitly; it is only used for rows matching no condition
# (e.g. a missing carat value).
df['carat_category'] = np.select(conditions, choices, default='Unknown')

In [13]:
# Ordinal-encode the categorical columns with explicit category orders.
# Integer codes 0, 1, 2, ... follow the position of each label in its list.
# NOTE(review): clarity is listed from I1 up to IF, while color is listed
# D..J (described above as best to worst), so the code direction differs
# between columns -- confirm this is intended.
clarity_order = [['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF']]
color_order = [['D','E','F','G','H','I','J']]
carat_order = [['Low', 'Medium', 'High']]

# One encoder per column; all four are kept as separate objects for later reuse
cutencoder = OrdinalEncoder(categories=[['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']])
clarityencoder = OrdinalEncoder(categories=clarity_order)
colorencoder = OrdinalEncoder(categories=color_order)
caretencoder = OrdinalEncoder(categories=carat_order)

# Fit each encoder on its source column and store the codes in a new column
for source_col, target_col, encoder in (
    ('cut', 'cut_encoded', cutencoder),
    ('clarity', 'clarity_encoded', clarityencoder),
    ('color', 'color_encoded', colorencoder),
    ('carat_category', 'carat_encoded', caretencoder),
):
    df[target_col] = encoder.fit_transform(df[[source_col]])

In [14]:
# Persist the fitted encoders in a single pickle file, keyed by feature name,
# so the same category-to-code mappings can be loaded again later
encoders = dict(
    cut=cutencoder,
    clarity=clarityencoder,
    color=colorencoder,
    carat=caretencoder,
)

with open('encoders.pkl', mode='wb') as file:
    pickle.dump(encoders, file)

In [15]:
# Write the processed frame to CSV. The row index is written as the first,
# unnamed column (to_csv's default, spelled out here).
# NOTE(review): switch to index=False if no downstream reader needs the index.
df.to_csv('Processed_data.csv', index=True)