In [1]:
import pandas as pd 

In [3]:
# Load the gemstone dataset into a DataFrame.
# The path uses "/" because the previous backslash form (r"data\gemstone.csv")
# only resolves on Windows; a forward slash works on every platform.
# read_csv already returns a DataFrame, so no extra pd.DataFrame wrapper is needed.
df1 = pd.read_csv("data/gemstone.csv")

In [4]:
# Preview the first 11 rows to sanity-check the loaded columns and values.
df1.head(n=11)

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453
5,5,1.51,Very Good,J,SI1,62.8,58.0,7.34,7.29,4.59,7506
6,6,0.74,Ideal,E,VS2,61.8,57.0,5.76,5.79,3.57,3229
7,7,1.34,Premium,G,SI2,62.5,57.0,7.0,7.05,4.38,6224
8,8,0.3,Ideal,F,IF,62.0,56.0,4.35,4.37,2.7,886
9,9,0.3,Good,J,VS1,63.6,57.0,4.26,4.28,2.72,421


In [5]:
# Remove the "id" column from the working frame.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps
# this cell safe to re-run: a second execution no longer raises KeyError
# when "id" has already been dropped.
df1 = df1.drop(columns=["id"], errors="ignore")

Segregate the input features (X) and the output variable (Y)

In [6]:
# Split the frame into the target ("price") and the predictors (every other column).
Y = df1["price"]
X = df1.drop(columns="price")

In [7]:
# Partition the feature names by dtype: object-typed columns go to the
# categorical group, columns of every other dtype go to the numerical group.
categorical_columns = X.select_dtypes(include=["object"]).columns
numerical_columns = X.select_dtypes(exclude=["object"]).columns

In [8]:
# Display the names of the object-typed (categorical) feature columns.
categorical_columns

Index(['cut', 'color', 'clarity'], dtype='object')

In [9]:
# Frequency of each "cut" level (value_counts lists the most frequent first by default).
df1["cut"].value_counts()

cut
Ideal        92454
Premium      49910
Very Good    37566
Good         11622
Fair          2021
Name: count, dtype: int64

In [10]:
# Frequency of each "color" level, rarest first.
# ascending=True sorts once inside value_counts, replacing the previous
# "sort descending, then re-sort ascending" chain.
df1["color"].value_counts(ascending=True)

color
J     6456
I    17514
D    24286
H    30799
F    34258
E    35869
G    44391
Name: count, dtype: int64

In [11]:
# Frequency of each "clarity" level, rarest first.
# ascending=True sorts once inside value_counts, replacing the previous
# "sort descending, then re-sort ascending" chain.
df1["clarity"].value_counts(ascending=True)

clarity
I1        512
IF       4219
VVS1    10628
VVS2    15762
SI2     30484
VS1     30669
VS2     48027
SI1     53272
Name: count, dtype: int64

In [12]:
# Explicit level order for each categorical feature. The position of a level
# in its list is the integer an ordinal encoder assigns to it (first -> 0).
# NOTE(review): cut and clarity look like they run worst-to-best, whereas
# color D..J is conventionally best-to-worst — confirm the direction is intended.
cut_categories = [
    "Fair",
    "Good",
    "Very Good",
    "Premium",
    "Ideal",
]
color_categories = [
    "D",
    "E",
    "F",
    "G",
    "H",
    "I",
    "J",
]
clarity_categories = [
    "I1",
    "SI2",
    "SI1",
    "VS2",
    "VS1",
    "VVS2",
    "VVS1",
    "IF",
]

The ordered category lists for `cut`, `color` and `clarity` are now defined.

Create the preprocessing pipeline

In [13]:
from sklearn.impute import SimpleImputer # Missing values
from sklearn.preprocessing import StandardScaler # Feature scaling (Numerical datatypes)
from sklearn.preprocessing import OrdinalEncoder # To rank the categorical variables
#Pipeline 
from sklearn.pipeline import Pipeline # To club everything together 
from sklearn.compose import ColumnTransformer # Begin the work 

In [14]:
# Numerical branch: fill missing values with the column median, then standardise.
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Ordered level list for each categorical column, keyed by column name.
# Looking the lists up by name (rather than relying on a hard-coded position)
# keeps the encoder's `categories` argument aligned with `categorical_columns`
# whatever order those columns appear in the frame; a categorical column with
# no defined order fails immediately with a KeyError naming that column.
ordinal_categories = {
    "cut": cut_categories,
    "color": color_categories,
    "clarity": clarity_categories,
}

# Categorical branch: fill missing values with the most frequent level,
# map levels to their ordinal position, then standardise the encoded values.
cat_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        (
            "OrdinalEncoder",
            OrdinalEncoder(
                categories=[ordinal_categories[col] for col in categorical_columns]
            ),
        ),
        ("scaler", StandardScaler()),
    ]
)

# Route each column group through its own branch.
preprocessor = ColumnTransformer(
    [
        ("num_pipeline", num_pipeline, numerical_columns),
        ("cat_pipeline", cat_pipeline, categorical_columns),
    ]
)