# Importing the Modules

In [8]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn import metrics
import statsmodels.api as sm
import statsmodels.formula.api as smapi
import sklearn

# Importing Data

In [2]:
# Read the exported diamonds table from disk and preview its first rows.
csv_path = '../data/export_diamonds.csv'
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,carat,cut,color,clarity,price,x,y,z,volume
0,-1.469676,2,1,3,5.786897,3.95,3.98,2.43,2.544276
1,-1.560648,3,1,2,5.786897,3.89,3.84,2.31,2.442517
2,-1.469676,1,1,4,5.78996,4.05,4.07,2.31,2.540995
3,-1.237874,3,5,5,5.811141,4.2,4.23,2.63,2.745658
4,-1.171183,1,6,3,5.814131,4.34,4.35,2.75,2.851039


In [4]:
# Assign the features as X and the target as y.
# The CSV carries an "Unnamed: 0" column that runs 0, 1, 2, ... in the
# preview, and price never decreases along it there. Keeping it as a feature
# would leak target information, so it is removed together with the target.
# errors="ignore" keeps the cell working if the column is already absent.
X = data.drop(columns=["price"]).drop(columns=["Unnamed: 0"], errors="ignore")
y = data["price"]

# Split into train and test sets: 25% held out, fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Creating models and fitting to data

In [5]:
# Build one pipeline (standard scaler + model) for each candidate regressor.
# NOTE: step names are kept exactly as before, including the "rf_classifier"
# label reused on the KNeighbors and XGBoost pipelines, so that any lookup of
# a step by name keeps working.

# Fixed seed for the estimators that accept one, so repeated runs of the
# notebook produce the same scores. 42 is the seed used by train_test_split.
RANDOM_STATE = 42

pipeline_lr = Pipeline([("scalar1", StandardScaler()),
                        ("lr_classifier", LinearRegression())])

pipeline_dt = Pipeline([("scalar2", StandardScaler()),
                        ("dt_classifier", DecisionTreeRegressor(random_state=RANDOM_STATE))])

pipeline_rf = Pipeline([("scalar3", StandardScaler()),
                        ("rf_classifier", RandomForestRegressor(random_state=RANDOM_STATE))])

pipeline_kn = Pipeline([("scalar4", StandardScaler()),
                        ("rf_classifier", KNeighborsRegressor())])

pipeline_xgb = Pipeline([("scalar5", StandardScaler()),
                         ("rf_classifier", XGBRegressor(random_state=RANDOM_STATE))])

# List of all the pipelines; the order must match the keys of pipe_dict below.
pipelines = [pipeline_lr, pipeline_dt, pipeline_rf, pipeline_kn, pipeline_xgb]

# Position in `pipelines` -> human-readable model name, for ease of reference.
pipe_dict = {0: "LinearRegression", 1: "DecisionTree", 2: "RandomForest", 3: "KNeighbors", 4: "XGBRegressor"}

# Fit every pipeline on the training split so each is ready for prediction.
for pipe in pipelines:
    pipe.fit(X_train, y_train)



In [14]:
# 10-fold cross-validation of every pipeline on the training split.
# cross_val_score is called without `scoring`, so each pipeline is scored
# with its estimator's default scorer (R^2 for scikit-learn regressors).
cv_results_score = []   # per-fold score arrays, in the same order as `pipelines`
mean_scores = {}        # model name -> mean cross-validation score
for i, model in enumerate(pipelines):
    cv_score = cross_val_score(model, X_train, y_train, cv=10)
    # Append to the list initialised above; the previous target name was
    # never defined and raised NameError on a fresh kernel.
    cv_results_score.append(cv_score)
    mean_score = cv_score.mean()
    print("%s: %f " % (pipe_dict[i], mean_score))
    mean_scores[pipe_dict[i]] = mean_score

LinearRegression: 0.956903 
DecisionTree: 0.984723 
RandomForest: 0.990881 
KNeighbors: 0.987803 
XGBRegressor: 0.992033 


In [16]:
# Pick the model name whose mean cross-validation score is the largest.
top_entry = max(mean_scores.items(), key=lambda entry: entry[1])
best_regressor = top_entry[0]
print("The best regressor is", best_regressor)

The best regressor is XGBRegressor


# Evaluate the best model on test data

In [18]:
# Model prediction on test data.
# Look the fitted pipeline up by the name stored in `best_regressor` instead
# of hard-coding one model, so the prediction always follows the
# cross-validation result. `pipe_dict` maps each position in `pipelines`
# to its model name.
fitted_by_name = {pipe_dict[i]: pipe for i, pipe in enumerate(pipelines)}
best_pipeline = fitted_by_name[best_regressor]
pred = best_pipeline.predict(X_test)

In [19]:
# Score the test-set predictions: R^2, adjusted R^2, MAE, MSE and RMSE.
n_obs = len(y_test)
n_features = X_test.shape[1]
r2_test = metrics.r2_score(y_test, pred)
mse_test = metrics.mean_squared_error(y_test, pred)

print("R^2:", r2_test)
# Adjusted R^2 rescales (1 - R^2) by (n - 1) / (n - p - 1), where n is the
# number of test rows and p the number of feature columns.
print("Adjusted R^2:", 1 - (1 - r2_test) * (n_obs - 1) / (n_obs - n_features - 1))
print("MAE:", metrics.mean_absolute_error(y_test, pred))
print("MSE:", mse_test)
print("RMSE:", np.sqrt(mse_test))

R^2: 0.9921318549708055
Adjusted R^2: 0.9921271823288165
MAE: 0.06448008058022564
MSE: 0.00801906872634557
RMSE: 0.08954925307530806
