In [158]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error

# The main purpose of this example is to show how to handle an ordinal variable
# (here, the diamond `cut` grade) when fitting a regression model.

In [139]:
# Read the diamonds sample (path is relative to the notebook's working directory)
# and preview the first rows.
df = pd.read_csv('diamonds.csv')
df.head()

Unnamed: 0,carat,cut,depth,table,price,x,y,z
0,0.66,Ideal,62.8,57.0,1901,5.58,5.53,3.49
1,1.06,Ideal,61.9,56.0,13588,6.51,6.54,4.04
2,0.74,Good,63.1,58.0,2824,5.73,5.75,3.62
3,1.0,Good,64.0,53.0,3763,6.34,6.28,4.04
4,1.5,Premium,61.0,60.0,9820,7.33,7.27,4.45


In [144]:
# Preview the last rows of the raw frame as well.
df.tail()

Unnamed: 0,carat,cut,depth,table,price,x,y,z
45,1.18,Premium,59.7,58.0,9537,6.94,6.9,4.13
46,0.41,Very Good,63.5,54.0,1061,4.75,4.7,3.0
47,0.41,Ideal,60.8,56.0,961,4.82,4.79,2.92
48,0.31,Ideal,61.7,57.0,698,4.34,4.32,2.67
49,0.91,Premium,61.9,61.0,4138,6.12,6.1,3.78


In [140]:
# Response: price, kept as a one-column DataFrame (double brackets).
target = df[['price']]
# Regressors: every remaining column of the raw frame.
predictors = df.drop(columns='price')

In [141]:
# List the distinct cut grades, in order of first appearance, before encoding them.
predictors['cut'].unique()

array(['Ideal', 'Good', 'Premium', 'Very Good', 'Fair'], dtype=object)

In [142]:
# Ordinal encoding of cut quality: worst (0) to best (4).
cut_rank = {'Fair': 0, 'Good': 1, 'Very Good': 2, 'Premium': 3, 'Ideal': 4}

# Map from the untouched raw column (df['cut']) rather than from predictors['cut'],
# so re-running this cell gives the same result instead of mapping the already
# encoded integers to NaN.
predictors['cut'] = df['cut'].map(cut_rank)

# .map() yields NaN for any label missing from cut_rank; fail loudly instead of
# passing NaNs on to the regression.
assert predictors['cut'].notna().all(), "found cut value(s) not covered by cut_rank"

In [143]:
# Check the encoded `cut` column on the first rows.
predictors.head()

Unnamed: 0,carat,cut,depth,table,x,y,z
0,0.66,4,62.8,57.0,5.58,5.53,3.49
1,1.06,4,61.9,56.0,6.51,6.54,4.04
2,0.74,1,63.1,58.0,5.73,5.75,3.62
3,1.0,1,64.0,53.0,6.34,6.28,4.04
4,1.5,3,61.0,60.0,7.33,7.27,4.45


In [146]:
# Check the encoded `cut` column on the last rows.
predictors.tail()

Unnamed: 0,carat,cut,depth,table,x,y,z
45,1.18,3,59.7,58.0,6.94,6.9,4.13
46,0.41,2,63.5,54.0,4.75,4.7,3.0
47,0.41,4,60.8,56.0,4.82,4.79,2.92
48,0.31,4,61.7,57.0,4.34,4.32,2.67
49,0.91,3,61.9,61.0,6.12,6.1,3.78


In [147]:
# Preview the response column.
target.head()

Unnamed: 0,price
0,1901
1,13588
2,2824
3,3763
4,9820


In [148]:
# sm.OLS does not add an intercept on its own; add_constant inserts a `const`
# column of ones so the fitted model includes one.
predictors_const = sm.add_constant(predictors)
predictors_const.head()

Unnamed: 0,const,carat,cut,depth,table,x,y,z
0,1.0,0.66,4,62.8,57.0,5.58,5.53,3.49
1,1.0,1.06,4,61.9,56.0,6.51,6.54,4.04
2,1.0,0.74,1,63.1,58.0,5.73,5.75,3.62
3,1.0,1.0,1,64.0,53.0,6.34,6.28,4.04
4,1.0,1.5,3,61.0,60.0,7.33,7.27,4.45


In [149]:
# Ordinary least squares: price regressed on the intercept plus all predictors.
mod = sm.OLS(endog=target, exog=predictors_const)
res = mod.fit()

# Coefficient table and fit diagnostics.
res.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.781
Model:,OLS,Adj. R-squared:,0.744
Method:,Least Squares,F-statistic:,21.34
Date:,"Sun, 02 Aug 2020",Prob (F-statistic):,6.09e-12
Time:,22:14:09,Log-Likelihood:,-448.88
No. Observations:,50,AIC:,913.8
Df Residuals:,42,BIC:,929.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.457e+05,7.28e+04,-2.002,0.052,-2.93e+05,1167.426
carat,1.026e+04,3978.488,2.579,0.013,2232.444,1.83e+04
cut,103.2927,325.762,0.317,0.753,-554.122,760.707
depth,2466.7708,1232.210,2.002,0.052,-19.929,4953.471
table,-89.5177,170.308,-0.526,0.602,-433.212,254.177
x,1.024e+04,7905.483,1.295,0.202,-5718.182,2.62e+04
y,1.41e+04,7739.262,1.822,0.076,-1516.696,2.97e+04
z,-4.1e+04,2.07e+04,-1.983,0.054,-8.27e+04,722.072

0,1,2,3
Omnibus:,22.821,Durbin-Watson:,2.314
Prob(Omnibus):,0.0,Jarque-Bera (JB):,36.932
Skew:,1.454,Prob(JB):,9.56e-09
Kurtosis:,6.046,Cond. No.,21900.0


In [150]:
# In-sample predictions: the model is evaluated on the same rows it was fitted on.
y_pred = res.predict(predictors_const)

In [153]:
# Root Mean Squared Error.
# The `squared=False` flag of mean_squared_error is deprecated in scikit-learn 1.4
# and removed in 1.6, so take the square root of the MSE explicitly; this works on
# every scikit-learn version.
rmse = np.sqrt(mean_squared_error(target, y_pred))
rmse

1917.0886830509412

In [155]:
# Column-wise mean of the target, returned as a Series indexed by 'price'.
# DataFrame.mean() is used instead of np.mean(target): np.mean on a DataFrame
# forwards to DataFrame.mean(axis=None), whose result type (per-column Series
# vs. single scalar) differs across pandas versions.
mean_price = target.mean()
mean_price

price    4569.66
dtype: float64

In [157]:
# RMSE expressed as a percentage of the mean price.
rmse / mean_price * 100

price    41.952545
dtype: float64

In [None]:
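# The RMSE is about 42% of the mean diamond price. Note that it is computed on the
# same 50 rows the model was fitted on (in-sample), so it likely understates the
# error to expect on unseen diamonds.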