#### Model Training 

In [2]:
import pandas as pd
data_set=pd.read_csv('./data/gemstone.csv')

In [4]:
data_set.head(10)

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453
5,5,1.51,Very Good,J,SI1,62.8,58.0,7.34,7.29,4.59,7506
6,6,0.74,Ideal,E,VS2,61.8,57.0,5.76,5.79,3.57,3229
7,7,1.34,Premium,G,SI2,62.5,57.0,7.0,7.05,4.38,6224
8,8,0.3,Ideal,F,IF,62.0,56.0,4.35,4.37,2.7,886
9,9,0.3,Good,J,VS1,63.6,57.0,4.26,4.28,2.72,421


In [7]:
data_set=data_set.drop(labels=['id'],axis=1)

In [14]:
# Independent and dependent features 
X=data_set.drop(labels=['price'],axis=1)
y=data_set[['price']] # Here double bracket is applied for getting the coloumn name and data.

In [15]:
X.head(10)

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05
2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71
4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77
5,1.51,Very Good,J,SI1,62.8,58.0,7.34,7.29,4.59
6,0.74,Ideal,E,VS2,61.8,57.0,5.76,5.79,3.57
7,1.34,Premium,G,SI2,62.5,57.0,7.0,7.05,4.38
8,0.3,Ideal,F,IF,62.0,56.0,4.35,4.37,2.7
9,0.3,Good,J,VS1,63.6,57.0,4.26,4.28,2.72


In [16]:
y.head(10)

Unnamed: 0,price
0,13619
1,13387
2,2772
3,666
4,14453
5,7506
6,3229
7,6224
8,886
9,421


In [None]:
# CAtegorial and ordinal data set diviion of the independent variable. 

In [17]:
categorical_columns= X.select_dtypes(include='object').columns
numericals_columns=X.select_dtypes(exclude='object').columns

In [18]:
categorical_columns

Index(['cut', 'color', 'clarity'], dtype='object')

In [19]:
numericals_columns

Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object')

In [55]:
cut_categories = ['Fair', 'Good', 'Very Good','Premium','Ideal']
color_categories = ['D', 'E', 'F', 'G', 'H', 'I', 'J']
clarity_categories = ['I1','SI2','SI1','VS2','VS1','VVS2','VVS1','IF']

## sklearn.impute - Machine Learning Technique to handle the missing values. ## StandardScaler - ML Tech for Feature Scaling (Z-Score)

In [34]:
from sklearn.impute import SimpleImputer # Used for Handing Missing Values
from sklearn.preprocessing import StandardScaler # Used for Feature Scaling.(Z-Score)
from sklearn.preprocessing import OrdinalEncoder # Used for Ranking the Categorial Data.
# All these Data convertion has to be done in order . Therefore Pipeline is Important
from sklearn.pipeline import Pipeline 
from sklearn.compose import ColumnTransformer # All These need to be in Group. This is the Workflow. 

In [65]:
# Numerical pipeline
num_pipeline=Pipeline(
    steps=[('Imputer',SimpleImputer(strategy='median')),
    ('FScaler',StandardScaler())
    ]
)
# Categorical Pipeline 
cat_pipeline=Pipeline(steps=
    [
    ('Imputer',SimpleImputer(strategy='most_frequent')),
    ('OrdinalEnco',OrdinalEncoder(categories=[cut_categories,color_categories,clarity_categories])),
    ('FScaler',StandardScaler())
    ]
)

In [66]:
num_pipeline

In [67]:
cat_pipeline

In [68]:
preprocessor=ColumnTransformer([
    ('num_pipeline',num_pipeline,numericals_columns),
    ('cat_pipeline',cat_pipeline,categorical_columns)
])

In [69]:
preprocessor

#### Now its time for Train Test Split. 

In [70]:
from sklearn.model_selection import train_test_split # Libraries Imported.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.30,shuffle=True,random_state=42)

In [71]:
preprocessor.fit_transform(X_train)

array([[-0.82314374, -1.12998781, -0.64189666, ...,  0.87410007,
        -0.93674681,  1.35074594],
       [ 0.94502267, -1.77782269,  0.92190185, ..., -1.13764403,
         0.91085333,  0.68445511],
       [ 1.9584839 ,  0.16568195,  0.40063568, ..., -0.13177198,
         0.91085333,  0.01816428],
       ...,
       [ 0.92345966,  0.90606467,  0.40063568, ..., -0.13177198,
         0.29498662,  0.01816428],
       [-1.03877378, -0.66724861, -0.64189666, ..., -1.13764403,
         0.29498662,  2.01703677],
       [-1.03877378, -0.01941373,  0.92190185, ..., -1.13764403,
         0.29498662, -1.31441737]])

In [72]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())


In [80]:
y_train

Unnamed: 0,price
11504,1181
95284,7418
184777,12755
5419,1020
45466,445
...,...
119879,1410
103694,15064
131932,7209
146867,816


In [74]:
X_train.head(10)

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,-0.823144,-1.129988,-0.641897,-0.780451,-0.835103,-0.876024,0.8741,-0.936747,1.350746
1,0.945023,-1.777823,0.921902,1.073226,1.166389,0.946633,-1.137644,0.910853,0.684455
2,1.958484,0.165682,0.400636,1.703116,1.755063,1.742237,-0.131772,0.910853,0.018164
3,-0.995648,-0.574701,-0.641897,-1.122391,-1.161138,-1.165334,0.8741,-0.32088,2.017037
4,-0.995648,0.25823,0.400636,-1.176382,-1.152082,-1.136403,-1.137644,1.52672,-0.648127
5,1.527224,0.073134,-1.163163,1.433163,1.50148,1.467392,0.8741,0.294987,-0.648127
6,1.548787,-0.574701,0.400636,1.532146,1.510536,1.452926,-0.131772,-0.936747,0.018164
7,-0.499699,0.25823,-1.163163,-0.393518,-0.364164,-0.355265,0.8741,0.294987,0.018164
8,-0.542825,-0.389605,-0.641897,-0.447509,-0.418503,-0.456523,0.8741,0.294987,0.684455
9,0.233444,0.998613,-1.163163,0.36235,0.396584,0.454805,-1.137644,0.910853,-0.648127


#### Model Training 

In [77]:
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [78]:
regression=LinearRegression() # Calling the Function - initializing. 

In [79]:
regression

In [81]:
regression.fit(X_train,y_train) #Modelling has been completed. 

In [82]:
regression.coef_

array([[ 6432.97591819,  -132.34206204,   -70.48787525, -1701.38593925,
         -494.17005097,   -76.32351645,    68.80035873,  -464.67990411,
          652.10059539]])

In [83]:
regression.intercept_

array([3976.8787389])

In [84]:
import numpy as np
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [99]:
## Train multiple models

models={
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'Elasticnet':ElasticNet()
}

In [105]:
models['Lasso']

In [85]:
## Train multiple models

models={
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'Elasticnet':ElasticNet()
}
trained_model_list=[] 
model_list=[]
r2_list=[]

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)

    #Make Predictions
    y_pred=model.predict(X_test)

    mae, rmse, r2_square=evaluate_model(y_test,y_pred) # Running the model....:D

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model Training Performance')
    print("RMSE:",rmse)
    print("MAE:",mae)
    print("R2 score",r2_square*100)

    r2_list.append(r2_square)
    
    print('='*35)
    print('\n')

LinearRegression
Model Training Performance
RMSE: 1014.6296630375463
MAE: 675.0758270067483
R2 score 93.62906819996049


Lasso
Model Training Performance
RMSE: 1014.659130275064
MAE: 676.2421173665508
R2 score 93.62869814082755


Ridge
Model Training Performance
RMSE: 1014.6343233534411
MAE: 675.1077629781329
R2 score 93.62900967491632


Elasticnet
Model Training Performance
RMSE: 1533.3541245902313
MAE: 1060.9432977143008
R2 score 85.44967219374031


