In [4]:

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import io
from google.colab import drive 
drive.mount('/content/gdrive')
%matplotlib inline

avo = pd.read_csv('gdrive/My Drive/Colab Notebooks/avocado.csv')

Mounted at /content/gdrive


In [5]:

## Parsing Date to datetime
def parse_date(d):
     return pd.to_datetime(d, format='%Y-%m-%d')

avo['Date'] = avo['Date'].apply(parse_date)

In [6]:
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
#import dtale

In [7]:

y_ = avo[["AveragePrice"]]
X_ = avo.drop(["AveragePrice"], axis=1)
print(X_.shape)
print(y_.shape)

(18249, 13)
(18249, 1)


## Preprocessing

In [8]:
## https://towardsdatascience.com/categorical-encoding-techniques-93ebd18e1f24

## For categorical features, if ordinal, apply ordinal encoder, and for nominal features, use one hot encoder.

from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

ord_ = OrdinalEncoder()
ohe = OneHotEncoder(drop='first', sparse = False)
scaler = StandardScaler()

#dummify region column
region=pd.get_dummies(X_['region'], drop_first=True)
X_ = pd.concat( (X_.drop(["region"], axis=1).reset_index(drop = True), region), axis = 1 )

#Binary feature encoding; replacing the type {conventional, organic} into {0,1}
X_['type'] = X_['type'].astype(object)
X_['type']=X_['type'].replace({'conventional':0, 'organic':1})

## use standard scaler to standardize numeric columns

X_[['Total Volume', '4046', '4225', '4770', 'Total Bags', 'Small Bags', 'Large Bags', 'XLarge Bags']]=scaler.fit_transform(X_[['Total Volume', '4046', '4225', '4770', 'Total Bags', 'Small Bags', 'Large Bags', 'XLarge Bags']])

In [9]:
#extract only month and year from the data
X_['year']=X_['Date'].dt.year.astype('float')
X_['month']=X_['Date'].dt.month.astype('float')

In [10]:
X_=X_.drop('Date', axis=1)

In [11]:
np.random.seed(0)
from sklearn.model_selection import train_test_split

Xtrain, Xtest, ytrain, ytest = train_test_split( X_,y_, test_size = 0.2  )

In [12]:
from joblib import load

In [13]:
#Rdanfom Forest; bigger tree
rf_params={'max_depth': 15,
           'max_features': 5,
           'max_leaf_nodes': None,
           'min_impurity_decrease': 0.0,
           'min_samples_leaf': 3,
           'min_samples_split': 6,
           'n_estimators': 2000}

xgb_params={'alpha': 0.01}

#Random Forest; smaller tree
rf_params2 = {'max_depth': 8,
              'max_features': 11,
              'max_leaf_nodes': None,
              'min_impurity_decrease': 0.0,
              'min_samples_leaf': 3,
              'min_samples_split': 6,
              'n_estimators': 2500}


In [14]:
from sklearn.ensemble import RandomForestRegressor
from xgboost import  XGBRegressor
from sklearn.linear_model import LinearRegression

In [15]:
model_rf  = RandomForestRegressor(criterion='mse', oob_score=False, **rf_params, n_jobs=-1)


model_xgb = XGBRegressor(booster = "gblinear", objective = "reg:squarederror",
                               n_jobs = -1,
                               **xgb_params)

model_linear = LinearRegression()

model_rf2  = RandomForestRegressor(criterion='mse', oob_score=False, **rf_params2)

In [None]:
model_rf.fit(Xtrain, ytrain)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=15, max_features=5, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=3,
                      min_samples_split=6, min_weight_fraction_leaf=0.0,
                      n_estimators=2000, n_jobs=-1, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [None]:
model_rf.fit(Xtrain, ytrain)
model_xgb.fit(Xtrain, ytrain)

XGBRegressor(alpha=0.01, base_score=0.5, booster='gblinear',
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             gamma=0, importance_type='gain', learning_rate=0.1,
             max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
             n_estimators=100, n_jobs=-1, nthread=None,
             objective='reg:squarederror', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
             subsample=1, verbosity=1)

In [None]:
model_rf2.fit(Xtrain, ytrain)
model_linear.fit(Xtrain, ytrain)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [None]:
from joblib import dump

In [None]:
dump(model_rf, 'gdrive/My Drive/Colab Notebooks/models/RandomForest_1.joblib')
dump(model_xgb, 'gdrive/My Drive/Colab Notebooks/models/XGBoost.joblib')
dump(model_rf2, 'gdrive/My Drive/Colab Notebooks/models/RandomForest_2.joblib')
dump(model_linear, 'gdrive/My Drive/Colab Notebooks/models/Linear.joblib')

['gdrive/My Drive/Colab Notebooks/models/Linear.joblib']

## Ensemble model building

In [16]:
from joblib import load

In [17]:
m_rf = load('gdrive/My Drive/Colab Notebooks/models/RandomForest_1.joblib')
m_xgb = load('gdrive/My Drive/Colab Notebooks/models/XGBoost.joblib')
m_rf2 = load('gdrive/My Drive/Colab Notebooks/models/RandomForest_2.joblib')
m_lnr = load('gdrive/My Drive/Colab Notebooks/models/Linear.joblib')

In [None]:
models = [m_rf, m_xgb, m_rf2, m_lnr]

In [None]:
from functools import reduce

def ensemble(X_, models, threshold = None):
    if threshold == None:
        threshold = (len(models))
    predictions = []
    for model in models:
        predictions.append(pd.Series(model.predict(X_).reshape(-1)))
    
    y_sum = reduce(lambda x, y: x+y, predictions)
    return y_sum.apply(lambda x: x/threshold)

In [None]:
y_pred = ensemble(Xtest, models)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score 
import statsmodels.api as sm

# model evaluation 
mse = mean_squared_error(ytest,y_pred) 
  
rmse = np.sqrt(mean_squared_error(ytest, y_pred)) 
r2 = r2_score(ytest, y_pred) 
  
# printing values 
print('MSE:',mse) 
print('Root mean squared error: ', rmse) 
print("R2 score : %.2f" % r2_score(ytest,y_pred)) #0.77

MSE: 0.05845076789912308
Root mean squared error:  0.24176593618440767
R2 score : 0.64


In [None]:
#m_rf.predict(Xtest)
# m_xgb = load('gdrive/My Drive/Colab Notebooks/models/XGBoost.joblib')
# m_rf2 = load('gdrive/My Drive/Colab Notebooks/models/RandomForest_2.joblib')
# m_lnr = load('gdrive/My Drive/Colab Notebooks/models/Linear.joblib')

## Grid Searching - Random Forest

In [19]:

grid_param={'max_depth': [10, 15, 20],
           'max_features': [5, 10],
           'min_samples_split': [6, 9],
           'n_estimators': [1500, 2000, 2500]}


In [20]:
from sklearn.model_selection import GridSearchCV

gd_sr = GridSearchCV(estimator=model_rf,
                     param_grid=grid_param,
                     #scoring='accuracy',
                     #cv=5,
                     n_jobs=-1)

In [None]:
gd_sr.fit(Xtrain, ytrain)

GridSearchCV(cv=None, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=15,
                                             max_features=5,
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=3,
                                             min_samples_split=6,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=2000, n_jobs=-1,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='deprecated', n_jobs=-1

## GridSearch - XGBoost

In [21]:
            param_grid={"learning_rate": (0.05, 0.10, 0.15),
                        "max_depth": [ 3, 4, 5, 6, 8],
                        "min_child_weight": [ 1, 3, 5, 7],
                        "gamma":[ 0.0, 0.1, 0.2],
                        "colsample_bytree":[ 0.3, 0.4],}

In [22]:
from sklearn.model_selection import GridSearchCV

gd_sr_xgb = GridSearchCV(estimator=model_xgb,
                     param_grid=param_grid,
                     scoring='neg_mean_squared_error', verbose=0, n_jobs=-1)
                     #cv=5




In [23]:
gd_sr_xgb.fit(Xtrain, ytrain)

GridSearchCV(cv=None, error_score=nan,
             estimator=XGBRegressor(alpha=0.01, base_score=0.5,
                                    booster='gblinear', colsample_bylevel=1,
                                    colsample_bynode=1, colsample_bytree=1,
                                    gamma=0, importance_type='gain',
                                    learning_rate=0.1, max_delta_step=0,
                                    max_depth=3, min_child_weight=1,
                                    missing=None, n_estimators=100, n_jobs=-1,
                                    nthread=None, objective='reg:squarederror',
                                    random_s...da=1,
                                    scale_pos_weight=1, seed=None, silent=None,
                                    subsample=1, verbosity=1),
             iid='deprecated', n_jobs=-1,
             param_grid={'colsample_bytree': [0.3, 0.4],
                         'gamma': [0.0, 0.1, 0.2],
                         'le

In [26]:
best_parameters = gd_sr_xgb.best_params_
print(best_parameters) 

{'colsample_bytree': 0.3, 'gamma': 0.0, 'learning_rate': 0.15, 'max_depth': 3, 'min_child_weight': 1}


## XGBoost model with best parameters

In [27]:
            xgb_params3={"learning_rate": 0.15,
                        "max_depth": 3,
                        "min_child_weight": 1,
                        "gamma": 0.0,
                        "colsample_bytree":0.3}

In [29]:
model_xgb3 = XGBRegressor(booster = "gblinear", objective = "reg:squarederror",
                               n_jobs = -1,
                               **xgb_params3)

In [30]:
model_xgb3.fit(Xtrain, ytrain)

XGBRegressor(base_score=0.5, booster='gblinear', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.3, gamma=0.0,
             importance_type='gain', learning_rate=0.15, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=-1, nthread=None, objective='reg:squarederror',
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=None, silent=None, subsample=1, verbosity=1)

In [34]:
from sklearn.metrics import mean_squared_error, r2_score 
import statsmodels.api as sm

m_xgb_pred3 = model_xgb3.predict(Xtest)

# model evaluation 
mse = mean_squared_error(ytest,m_xgb_pred3) 
  
rmse = np.sqrt(mean_squared_error(ytest, m_xgb_pred3)) 
r2 = r2_score(ytest, m_xgb_pred3) 
  
# printing values 
print('MSE: %.3f'% mse) 
print('Root mean squared error: %.3f' % rmse)
print("R2 score : %.2f" % r2_score(ytest,m_xgb_pred3))

MSE: 0.132
Root mean squared error: 0.363
R2 score : 0.19


## Random Forest regressor model with best parameters

In [24]:
best_parameters = gd_sr.best_params_
print(best_parameters) 
#{'max_depth': 20, 'max_features': 10, 'min_samples_split': 6, 'n_estimators': 2000}

AttributeError: ignored

In [25]:
best_result = gd_sr.best_score_
print(best_result)

AttributeError: ignored

In [None]:
rf3_params={'max_depth': 20,
           'max_features': 10,
           'max_leaf_nodes': None,
           'min_impurity_decrease': 0.0,
           'min_samples_leaf': 3,
           'min_samples_split': 6,
           'n_estimators': 2000}

In [None]:
model_rf3  = RandomForestRegressor(criterion='mse', oob_score=False, **rf3_params, n_jobs=-1)

In [None]:
model_rf3.fit(Xtrain, ytrain)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score 
import statsmodels.api as sm

model_rf3_pred = model_rf3.predict(Xtest)

# model evaluation 
mse = mean_squared_error(ytest,model_rf3_pred) 
  
rmse = np.sqrt(mean_squared_error(ytest, model_rf3_pred)) 
r2 = r2_score(ytest, model_rf3_pred) 
  
# printing values 
print('MSE: %.3f' % mse) 
print('Root mean squared error: %.3f' % rmse)
print("R2 score : %.2f" % r2_score(ytest,model_rf3_pred))

## Cross validation with Random Forest Model 1

In [None]:
from sklearn.model_selection import cross_val_score
all_accuracies = cross_val_score(estimator=model_rf, X=Xtrain, y=ytrain, cv=5)

In [None]:
print(all_accuracies) ## [0.75635479 0.76686489 0.77316028 0.76032498 0.77959182]

[0.75635479 0.76686489 0.77316028 0.76032498 0.77959182]


In [None]:
print(all_accuracies.mean()) ##0.7672593511417624

0.7672593511417624
