In [18]:
# import lib
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler,OneHotEncoder,OrdinalEncoder
#https://machinelearningmastery.com/one-hot-encoding-for-categorical-data/
#StandardScaler Fearure scalling
#OneHotEncoder categorical to numeric
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer # Handling missing values


import warnings
warnings.filterwarnings('ignore')

In [19]:
df=pd.read_csv('./data/gemstone.csv')
df.head(5)

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [20]:
df.drop('id',axis=1,inplace=True)

In [21]:
# we are going to predict price

X=df.drop('price',axis=1)
y=df[['price']]


In [22]:
numeric_col=X.select_dtypes(exclude='object').columns
numeric_col

Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object')

In [23]:
categorical_col=X.select_dtypes(include='object').columns
categorical_col

Index(['cut', 'color', 'clarity'], dtype='object')

In [24]:
cat_col=['cut', 'color', 'clarity']
def unique_item(df,cat_col):
    for col in cat_col:
        print(col,df[col].unique())
unique_item(df,cat_col)

cut ['Premium' 'Very Good' 'Ideal' 'Good' 'Fair']
color ['F' 'J' 'G' 'E' 'D' 'H' 'I']
clarity ['VS2' 'SI2' 'VS1' 'SI1' 'IF' 'VVS2' 'VVS1' 'I1']


In [25]:
cut_categories = ['Fair', 'Good', 'Very Good','Premium','Ideal']
color_categories = ['D', 'E', 'F', 'G', 'H', 'I', 'J']
clarity_categories = ['I1','SI2','SI1','VS2','VS1','VVS2','VVS1','IF']

In [26]:
numeric=Pipeline(
    steps=[
        ('simpleimputer',SimpleImputer()),
        ('standardscaler',StandardScaler())
    ]
        
)

In [27]:
catogorical=Pipeline(
    steps=[
    ('simpleImputer',SimpleImputer(strategy='most_frequent')),
    ('ordinalEncoder',OrdinalEncoder(categories=[cut_categories,color_categories,clarity_categories]))
])

In [28]:
preprocessing=ColumnTransformer(
    [
        ('numeric',numeric,numeric_col),
        ('categorical',catogorical,categorical_col)
    ]
)

In [29]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20,random_state=33)

In [30]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((154858, 9), (38715, 9), (154858, 1), (38715, 1))

In [31]:
preprocessing.fit_transform(X_train)

array([[ 0.47389841,  2.84587302,  1.44488788, ...,  0.        ,
         2.        ,  1.        ],
       [-0.71458028,  0.35109098, -1.1600719 , ...,  4.        ,
         2.        ,  1.        ],
       [ 0.45228971,  0.25869165,  0.92389593, ...,  3.        ,
         5.        ,  3.        ],
       ...,
       [ 1.5759423 ,  0.25869165,  0.92389593, ...,  3.        ,
         3.        ,  2.        ],
       [-0.56331936, -1.77409372,  1.96587984, ...,  1.        ,
         1.        ,  3.        ],
       [-0.97388473,  0.25869165,  0.40290397, ...,  3.        ,
         1.        ,  1.        ]])

In [32]:
preprocessing.transform(X_test)

array([[ 0.23620267,  2.38387635,  0.92389593, ...,  0.        ,
         3.        ,  1.        ],
       [-1.03871084, -0.48050303, -1.1600719 , ...,  4.        ,
         0.        ,  4.        ],
       [ 0.62515934,  0.62828898, -0.11808799, ...,  4.        ,
         2.        ,  1.        ],
       ...,
       [-0.06631918,  0.25869165, -0.63907995, ...,  4.        ,
         1.        ,  1.        ],
       [ 0.9708986 ,  0.16629231,  0.92389593, ...,  3.        ,
         3.        ,  3.        ],
       [-1.01710214, -0.29570436, -1.1600719 , ...,  4.        ,
         3.        ,  2.        ]])

In [33]:
preprocessing.get_feature_names_out()

array(['numeric__carat', 'numeric__depth', 'numeric__table', 'numeric__x',
       'numeric__y', 'numeric__z', 'categorical__cut',
       'categorical__color', 'categorical__clarity'], dtype=object)

In [34]:
preprocessing_file="preprocessor.pkl"
with open(preprocessing_file,'wb') as file:
    pickle.dump(preprocessing,file)

In [35]:
X_train=pd.DataFrame(preprocessing.fit_transform(X_train),columns=preprocessing.get_feature_names_out())
X_train

Unnamed: 0,numeric__carat,numeric__depth,numeric__table,numeric__x,numeric__y,numeric__z,categorical__cut,categorical__color,categorical__clarity
0,0.473898,2.845873,1.444888,0.472847,0.507875,0.747811,0.0,2.0,1.0
1,-0.714580,0.351091,-1.160072,-0.698675,-0.743720,-0.687520,4.0,2.0,1.0
2,0.452290,0.258692,0.923896,0.617034,0.571362,0.617327,3.0,5.0,3.0
3,1.230203,0.073893,-0.639080,1.247853,1.233437,1.240753,4.0,4.0,2.0
4,-0.757798,-0.018506,-1.160072,-0.725710,-0.716511,-0.716516,4.0,3.0,6.0
...,...,...,...,...,...,...,...,...,...
154853,-0.952276,-1.219698,-0.118088,-1.014084,-1.052084,-1.122468,4.0,0.0,2.0
154854,-0.801015,0.073893,-1.681064,-0.833850,-0.807206,-0.803506,4.0,0.0,2.0
154855,1.575942,0.258692,0.923896,1.500181,1.451106,1.487224,3.0,3.0,2.0
154856,-0.563319,-1.774094,1.965880,-0.410300,-0.380939,-0.542537,1.0,1.0,3.0


In [36]:
#X_test=pd.DataFrame(preprocessing.fit(X_test),columns=preprocessing.get_feature_names_out())
X_test=pd.DataFrame(preprocessing.transform(X_test),columns=preprocessing.get_feature_names_out())
#X_test

In [37]:
from sklearn.linear_model import LinearRegression

linear_model=LinearRegression()


In [38]:
linear_model.fit(X_train,y_train)

In [39]:
pred=linear_model.predict(X_test)
pred

array([[3229.38060236],
       [1385.26829929],
       [5525.93225617],
       ...,
       [2962.86748392],
       [7680.76207209],
       [-342.14367263]])

In [40]:
from sklearn.metrics import mean_absolute_error,r2_score

mean_absolute_error(y_test,pred)
res=r2_score(y_test,pred)

In [41]:
res*100

93.73057561200795

In [42]:
pred=np.squeeze(pred,axis=(1,))
# Reduce diementions



In [43]:
y_test=np.squeeze(y_test.values,axis=(1,))
y_test


array([3292,  826, 4588, ..., 2833, 8719,  612], dtype=int64)

In [44]:
pd.DataFrame({
    "Actual_Values":y_test,
    "Predicted_Values":pred
})

Unnamed: 0,Actual_Values,Predicted_Values
0,3292,3229.380602
1,826,1385.268299
2,4588,5525.932256
3,814,1221.814777
4,720,183.780634
...,...,...
38710,2170,1877.790389
38711,6462,6663.690010
38712,2833,2962.867484
38713,8719,7680.762072


In [45]:
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet

def metrics(y_test,y_pred):
    MAE=mean_absolute_error(y_test,y_pred)
    MSE=mean_squared_error(y_test,y_pred)
    RMSE=np.sqrt(MSE)
    r2=r2_score(y_test,y_pred)
    return MAE,MSE,RMSE,r2


In [46]:
models={
    'LinearModel':LinearRegression(),
    'LassoModel':Lasso(),
    'Ridge_Model':Ridge(),
    'ElasticNetModel':ElasticNet()
}

In [47]:
list(models)

['LinearModel', 'LassoModel', 'Ridge_Model', 'ElasticNetModel']

In [48]:
#print models
for i in range(len(models)):
   print(list(models.values())[i])

LinearRegression()
Lasso()
Ridge()
ElasticNet()


In [49]:
#Do prediction of each and every model
r2_list=[]
for i in range(len(models)):
    model=list(models.values())[i]
    model.fit(X_train,y_train)
    y_pred=model.predict(X_test)
    MAE,MSE,RMSE,r2=metrics(y_test,y_pred)

    print(list(models.keys())[i])
    print("*"*20)
    print("MAE:",MAE)
    print("MSE:",MSE)
    print("RMSE:",RMSE)
    print("R2:",r2*100)

    r2_list.append(r2*100)
    print("__"*20)


LinearModel
********************
MAE: 672.8083557210307
MSE: 1014906.577848457
RMSE: 1007.4257182782545
R2: 93.73057561200795
________________________________________


LassoModel
********************
MAE: 674.1792014919832
MSE: 1015333.9617594971
RMSE: 1007.6378127876588
R2: 93.72793551569426
________________________________________
Ridge_Model
********************
MAE: 672.8371488780335
MSE: 1014916.6019331969
RMSE: 1007.4306933646586
R2: 93.7305136898146
________________________________________
ElasticNetModel
********************
MAE: 1052.0981699314675
MSE: 2288515.8593858182
RMSE: 1512.7841417022516
R2: 85.86305631050664
________________________________________


In [50]:
print(r2_list)

[93.73057561200795, 93.72793551569426, 93.7305136898146, 85.86305631050664]


In [51]:
r2_list=[]
import pickle

for i in range(len(models)):
    model=list(models.values())[i] # Getting the each model
    model.fit(X_train,y_train)     # train model
    y_pred=model.predict(X_test)   # predict model
    
    
    MAE,MSE,RMSE,r2=metrics(y_test,y_pred)

    #converting model to bin file
    model_name=list(models.keys())[i]
    model_file_name=f"{model_name}.pkl"
    with open(model_file_name,'wb') as file:
        pickle.dump(model,file)
        
    #print the metrics
    print(model_name)
    print("*"*20)
    print("MAE:",MAE)
    print("MSE:",MSE)
    print("RMSE:",RMSE)
    print("R2:",r2*100)

    r2_list.append(r2*100)
    print("__"*20)

LinearModel
********************
MAE: 672.8083557210307
MSE: 1014906.577848457
RMSE: 1007.4257182782545
R2: 93.73057561200795
________________________________________
LassoModel
********************
MAE: 674.1792014919832
MSE: 1015333.9617594971
RMSE: 1007.6378127876588
R2: 93.72793551569426
________________________________________
Ridge_Model
********************
MAE: 672.8371488780335
MSE: 1014916.6019331969
RMSE: 1007.4306933646586
R2: 93.7305136898146
________________________________________
ElasticNetModel
********************
MAE: 1052.0981699314675
MSE: 2288515.8593858182
RMSE: 1512.7841417022516
R2: 85.86305631050664
________________________________________


In [52]:
n="erter"
print("{}num".format(n))
print(f"{n}num")

erternum
erternum


In [53]:
import pandas as pd
col_names=['carat','cut','color','clarity','depth','table','x','y','z']
val=[1,2,3,4,5,6,7,8,9]
pd.DataFrame([val],columns=col_names)


Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,1,2,3,4,5,6,7,8,9
