# MODEL TRAINING

In [212]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder


from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split

In [213]:
# Read the gemstone dataset from disk and preview its first rows.
# NOTE(review): this is an absolute, machine-specific Windows path — the cell
# will only run where this exact location exists.
gemstone_csv_path = 'C:\\Users\\himan\\OneDrive\\Desktop\\Diamond_prediction\\Notebook\\Data\\gemstone.csv'
data = pd.read_csv(gemstone_csv_path)
data.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [214]:
# Remove the row-identifier column; it carries no predictive signal.
# errors='ignore' keeps this cell idempotent: because `data` is reassigned in
# place, re-running the cell would otherwise raise KeyError once 'id' is gone.
data = data.drop(columns='id', errors='ignore')

In [215]:
# Preview the frame again to confirm the 'id' column is no longer present.
data.head(n=5)

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [216]:
# Features: every column except the target.
X = data.drop(columns=['price'])
# Target: selected with a list so that Y stays a one-column DataFrame (2-D).
Y = data[['price']]



In [217]:
# Show which feature columns are non-object (numeric) dtypes.
X.select_dtypes(exclude=['object']).columns

Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object')

In [218]:
# Partition the feature columns by dtype: object columns are treated as
# categorical, everything else as numerical.
numerical_cols = X.select_dtypes(exclude=['object']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

In [219]:
# Custom rank order for each ordinal feature. The position of a level in its
# list is the integer code the ordinal encoder assigns to it (0, 1, 2, ...).
cut_cat = [
    'Fair',
    'Good',
    'Very Good',
    'Premium',
    'Ideal',
]
color_cat = list('DEFGHIJ')
clarity_cat = [
    'I1',
    'SI2',
    'SI1',
    'VS2',
    'VS1',
    'VVS2',
    'VVS1',
    'IF',
]

In [220]:
# Numeric features: fill missing values with the column median, then
# standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scalar', StandardScaler()),
]
numarical_pipeline = Pipeline(steps=numeric_steps)

# Categorical features: fill missing values with the most frequent level, map
# each level to its position in the ranking lists, then standardise the codes.
# The `categories` lists are matched to the input columns by position, so this
# relies on categorical_cols being ordered cut, color, clarity.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal_encoding', OrdinalEncoder(categories=[cut_cat, color_cat, clarity_cat])),
    ('scalar', StandardScaler()),
]
categorical_pipepline = Pipeline(steps=categorical_steps)

# Route each column group through its own pipeline. The transformer names
# given here become the prefixes of the output feature names.
preprocessor = ColumnTransformer(
    transformers=[
        ('numarical_pipepline', numarical_pipeline, numerical_cols),
        ('categorical_pipeline', categorical_pipepline, categorical_cols),
    ]
)
 

In [221]:
## Train / test split

# Hold out 33% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.33,
    random_state=42,
)

In [222]:
# Fit the preprocessor on the training split only, then apply the already
# fitted transformer to the test split so no test statistics leak into it.
train_matrix = preprocessor.fit_transform(X_train)
test_matrix = preprocessor.transform(X_test)

# Feature names are only available once the transformer has been fitted.
feature_names = preprocessor.get_feature_names_out()

# NOTE: the rebuilt frames get a fresh 0..n-1 index, while y_train / y_test
# are not modified here.
X_train = pd.DataFrame(train_matrix, columns=feature_names)
X_test = pd.DataFrame(test_matrix, columns=feature_names)

In [223]:
# Preview the preprocessed (imputed, encoded, scaled) training features.
X_train.head(n=5)

Unnamed: 0,numarical_pipepline__carat,numarical_pipepline__depth,numarical_pipepline__table,numarical_pipepline__x,numarical_pipepline__y,numarical_pipepline__z,categorical_pipeline__cut,categorical_pipeline__color,categorical_pipeline__clarity
0,2.626061,-2.888129,0.400868,2.233112,2.216066,1.856561,-0.130933,1.525655,-1.314696
1,-0.845291,0.164716,0.922458,-0.915966,-0.908068,-0.890852,-0.130933,-0.937159,-0.648656
2,-0.845291,-1.500472,1.96564,-0.843987,-0.899013,-0.963153,-0.130933,-0.321455,-0.648656
3,-0.694363,-0.667878,-0.642314,-0.637048,-0.636405,-0.673951,0.874463,-0.937159,-1.314696
4,1.548002,-0.482857,1.444049,1.477333,1.455407,1.393839,-0.130933,1.525655,0.683424


In [224]:
## Model training

from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,accuracy_score,mean_absolute_error,confusion_matrix,mean_squared_error

In [225]:
# Baseline: ordinary least squares fitted on the preprocessed training split.
regression = LinearRegression()
regression.fit(X=X_train, y=y_train)


In [226]:
# Display the fitted coefficients of the baseline linear model.
regression.coef_

array([[ 6432.59272318,  -133.11853452,   -70.36485019, -1713.18964719,
         -490.48291102,   -68.02812257,    68.36709467,  -464.25812278,
          651.94096231]])

In [227]:
# Display the fitted intercept of the baseline linear model.
regression.intercept_

array([3979.27372333])

In [228]:
def eval_model(true,predicted):
    """Compute regression error metrics for a set of predictions.

    Parameters
    ----------
    true : array-like
        Ground-truth target values.
    predicted : array-like
        Predicted target values, aligned with ``true``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2_square)``: mean absolute error, root mean squared
        error, and the R² score.
    """
    mae = mean_absolute_error(true, predicted)
    # MSE is evaluated once and only its square root is kept; the plain MSE
    # is not part of the return value.
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [232]:
# Candidate regressors, keyed by the label printed in the report below.
# All of them use their default hyperparameters.
models = {
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'ElasticNet':ElasticNet()
}

# Parallel result lists: entry k of each list belongs to the k-th model.
trained_model_list = []  # fitted estimator objects
model_list = []          # model names
r2_list = []             # R² scores as fractions (not percentages)

for model_name, model in models.items():

    # Fit on the training split and keep the fitted estimator for later use.
    model.fit(X_train,y_train)
    trained_model_list.append(model)

    # NOTE: the metrics below are computed on the held-out test split
    # (X_test / y_test), even though the printed header says "Training".
    y_pred = model.predict(X_test)

    mae, rmse, r2_square=eval_model(y_test,y_pred)

    print(model_name)
    model_list.append(model_name)

    print('Model Training Performance')
    print('RMSE',rmse)

    print('MAE',mae)
    # R² is printed as a percentage but stored as a fraction.
    print('r2_square',r2_square*100)
    r2_list.append(r2_square)

    print("-"*40)
    print('\n')

LinearRegression
Model Training Performance
RMSE 1014.274933056864
MAE 674.7352796098307
r2_square 93.63893549824441
----------------------------------------


Lasso
Model Training Performance
RMSE 1014.3366158273099
MAE 675.8986621286324
r2_square 93.63816178295377
----------------------------------------


Ridge
Model Training Performance
RMSE 1014.2792052203266
MAE 674.7687088427467
r2_square 93.63888191205453
----------------------------------------


ElasticNet
Model Training Performance
RMSE 1534.0746306294475
MAE 1061.3169023914195
r2_square 85.4483784776376
----------------------------------------


