In [45]:
#Import libraries

import pandas as pd
from numpy.lib.arraysetops import unique
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as mtick
import datetime as dt
import category_encoders as ce 
from sklearn.metrics import mean_squared_error,r2_score,accuracy_score
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lasso
from sklearn.preprocessing import RobustScaler


In [35]:
#Read csv file

dataset = pd.read_csv('CropProduction.csv')
df=dataset.copy()

In [36]:
#Convert date column to ordinal
df['Date'] = pd.to_datetime(df['Date'])
df['Date'] = df['Date'].map(dt.datetime.toordinal)
#Convert Area to numeric
df = df.replace(',','', regex=True)
df['Area'] = df['Area'].astype('float64')

df

Unnamed: 0,Date,Place,Area,Crop,Production
0,735629,Atok,1774.99,Cabbage,922.928
1,735657,Atok,1774.99,Cabbage,2153.498
2,735688,Atok,1774.99,Cabbage,3384.069
3,735718,Atok,1774.99,Cabbage,2153.498
4,735749,Atok,1774.99,Cabbage,3691.711
...,...,...,...,...,...
6559,737302,Tublay,8.60,Carrots,2.413
6560,737332,Tublay,8.60,Carrots,3.235
6561,737363,Tublay,8.60,Carrots,6.984
6562,737393,Tublay,8.60,Carrots,21.670


In [37]:
#Craete copy of dataset
df_subset = df.copy()
df_subset

Unnamed: 0,Date,Place,Area,Crop,Production
0,735629,Atok,1774.99,Cabbage,922.928
1,735657,Atok,1774.99,Cabbage,2153.498
2,735688,Atok,1774.99,Cabbage,3384.069
3,735718,Atok,1774.99,Cabbage,2153.498
4,735749,Atok,1774.99,Cabbage,3691.711
...,...,...,...,...,...
6559,737302,Tublay,8.60,Carrots,2.413
6560,737332,Tublay,8.60,Carrots,3.235
6561,737363,Tublay,8.60,Carrots,6.984
6562,737393,Tublay,8.60,Carrots,21.670


In [38]:
df_subset.drop(df_subset[df_subset['Production']==260610.0].index, inplace = True)
df_subset.drop(df_subset[df_subset['Production']==0.000000].index, inplace = True)

In [39]:
#Convert categorical features to binary using BinaryEncoder

categorical_features = list(df_subset.columns[df_subset.dtypes == object])
encoder = ce.BinaryEncoder(cols=categorical_features)
encoder_df = encoder.fit_transform(df_subset)

encoder_df.head()

  elif pd.api.types.is_categorical(cols):


Unnamed: 0,Date,Place_0,Place_1,Place_2,Place_3,Place_4,Area,Crop_0,Crop_1,Crop_2,Crop_3,Crop_4,Production
0,735629,0,0,0,0,1,1774.99,0,0,0,0,1,922.928
1,735657,0,0,0,0,1,1774.99,0,0,0,0,1,2153.498
2,735688,0,0,0,0,1,1774.99,0,0,0,0,1,3384.069
3,735718,0,0,0,0,1,1774.99,0,0,0,0,1,2153.498
4,735749,0,0,0,0,1,1774.99,0,0,0,0,1,3691.711


In [59]:
#Scale dataset

from sklearn.preprocessing import MinMaxScaler,RobustScaler,MaxAbsScaler
    
scaler = MaxAbsScaler()
categorical_features = list(df_subset.columns[df_subset.dtypes == float])
encoder_df[categorical_features] = scaler.fit_transform(encoder_df[categorical_features])
encoder_df.head()

Unnamed: 0,Date,Place_0,Place_1,Place_2,Place_3,Place_4,Area,Crop_0,Crop_1,Crop_2,Crop_3,Crop_4,Production
0,735629,0,0,0,0,1,0.42839,0,0,0,0,1,0.075389
1,735657,0,0,0,0,1,0.42839,0,0,0,0,1,0.175908
2,735688,0,0,0,0,1,0.42839,0,0,0,0,1,0.276427
3,735718,0,0,0,0,1,0.42839,0,0,0,0,1,0.175908
4,735749,0,0,0,0,1,0.42839,0,0,0,0,1,0.301557


In [68]:
# Split the dataset into train and test. The default size of the split ratio is 3:1

def data_split(df, label):
    
    from sklearn.model_selection import train_test_split

    X = df.drop(label, axis=1)
    Y = df[label]

    x_train, x_test, y_train, y_test = train_test_split(X,Y,test_size=0.2,random_state=20)
    
    return x_train, x_test, y_train, y_test

#apply the function
X_train, X_test, Y_train, Y_test = data_split(encoder_df, 'Production')

x_train = X_train.values
x_test = X_test.values
y_train = Y_train.values
y_test = Y_test.values


In [69]:
from sklearn.model_selection import GridSearchCV

class EstimatorSelectionHelper:

    def __init__(self, models, params):
        if not set(models.keys()).issubset(set(params.keys())):
            missing_params = list(set(models.keys()) - set(params.keys()))
            raise ValueError("Some estimators are missing parameters: %s" % missing_params)
        self.models = models
        self.params = params
        self.keys = models.keys()
        self.grid_searches = {}

    def fit(self, X, y, cv=3, n_jobs=-1, verbose=1, scoring=None, refit=False):
        for key in self.keys:
            print("\nRunning GridSearchCV for %s." % key)
            model = self.models[key]
            params = self.params[key]
            gs = GridSearchCV(model, params, cv=cv, n_jobs=n_jobs,
                              verbose=verbose, scoring=scoring, refit=refit,
                              return_train_score=True)
            gs.fit(X,y)
            self.grid_searches[key] = gs
            print('\nMAE: %.3f' % gs.best_score_)
            print('Config: %s' % gs.best_params_)


In [70]:
models1 = {
    'krr': KernelRidge(),
    'lasso': Lasso(),
    'enet': ElasticNet()
}

params1 = {
    'krr': [{'kernel': ['linear'], 'alpha': [0, 1, 0.01], 'gamma': [0.001, 0.0001]},
            {'kernel': ['rbf'], 'alpha': [0, 1, 0.01], 'gamma': [0.001, 0.0001]}],
    'lasso': { 'alpha': [5, 0.5, 0.05, 0.005, 0.0005, 1, 0.1, 0.01,0.001, 0.0001, 0 ]},
    'enet':  { 'alpha': [5, 0.5, 0.05, 0.005, 0.0005, 1, 0.1, 0.01,0.001, 0.0001, 0 ],
                        'l1_ratio':[0.001, 0.1, 100] }
}

In [71]:
helper1 = EstimatorSelectionHelper(models1, params1)
helper1.fit(x_train, y_train, scoring='neg_mean_absolute_error', n_jobs=-1)


Running GridSearchCV for krr.
Fitting 3 folds for each of 12 candidates, totalling 36 fits

MAE: -0.026
Config: {'alpha': 0.01, 'gamma': 0.001, 'kernel': 'linear'}

Running GridSearchCV for lasso.
Fitting 3 folds for each of 11 candidates, totalling 33 fits

MAE: -0.025
Config: {'alpha': 0.0005}

Running GridSearchCV for enet.
Fitting 3 folds for each of 33 candidates, totalling 99 fits

MAE: -0.025
Config: {'alpha': 0.0001, 'l1_ratio': 0.1}
 -0.04069812 -0.04185149         nan -0.02838426 -0.02793062         nan
 -0.02540969 -0.0253041          nan -0.05043376 -0.05490635         nan
 -0.0430345  -0.05044682         nan -0.03156257 -0.0309675          nan
 -0.02562068 -0.02542851         nan -0.02530298 -0.02527978         nan
 -0.02528433 -0.02528433         nan]
 -0.04060087 -0.04177339         nan -0.02831612 -0.02786116         nan
 -0.02537053 -0.02526167         nan -0.05037061 -0.05484372         nan
 -0.04296508 -0.05035183         nan -0.03148423 -0.03090109         nan
 -0.

## KERNEL RIDGE REGRESSION

In [72]:
krr = KernelRidge(kernel ='linear', alpha=0.01, gamma = 0.001)
krr.fit(x_train,y_train)

preds1 = krr.predict(x_test)
y_pred1 = np.exp(preds1)-1
rsme_valid1 = np.sqrt(mean_squared_error(y_test,preds1))


train_score = krr.score(x_train,y_train)
test_score = krr.score(x_test,y_test)

print('RSME:', rsme_valid1)
print(f'Training Accuracy of our model is: %.2f%% ' %(train_score*100.0))
print(f'Test Accuracy of our model is:  %.2f%% ' %  (test_score*100.0))

RSME: 0.058292669430268944
Training Accuracy of our model is: 64.50% 
Test Accuracy of our model is:  65.31% 


## ENET REGRESSION

In [73]:
enet = ElasticNet(alpha=0.0001, l1_ratio=0.1)
# fit model
enet.fit(x_train, y_train)

enet_pred = enet.predict(x_test)
rsme_enet = np.sqrt(mean_squared_error(y_test,enet_pred))

train_score = enet.score(x_train,y_train)
test_score = enet.score(x_test,y_test)

print('RSME:', rsme_enet)
print(f'Training Accuracy of our model is: %.2f%% ' %(train_score*100.0))
print(f'Test Accuracy of our model is:  %.2f%% ' %  (test_score*100.0))

RSME: 0.05835496445756635
Training Accuracy of our model is: 64.61% 
Test Accuracy of our model is:  65.24% 


## LASSO REGRESSION

In [74]:

#Initializing the Lasso Regressor with Normalization Factor as True
lasso = Lasso(alpha = 0.0005,normalize=True)

#Fitting the Training data to the Lasso regressor
lasso.fit(x_train,y_train)

#Predicting for X_test
lasso_pred =lasso.predict(x_test)

rsme_lasso = np.sqrt(mean_squared_error(y_test,lasso_pred))

train_score = lasso.score(x_train,y_train)
test_score = lasso.score(x_test,y_test)

print('RSME:', rsme_lasso)
print(f'Training Accuracy of our model is: %.2f%% ' %(train_score*100.0))
print(f'Test Accuracy of our model is:  %.2f%% ' %  (test_score*100.0))

RSME: 0.06980691152308349
Training Accuracy of our model is: 51.36% 
Test Accuracy of our model is:  50.25% 


In [25]:
# Define estimators
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression, LassoLars
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import f1_score, r2_score

estimator_list = [
    ('krr',krr),
    ('lasso',lasso),
    ('enet',enet)]

# Build stack model
stack_model = StackingRegressor(estimators=estimator_list, final_estimator=LinearRegression(),cv=10)

# Train stacked model
stack_model.fit(x_train, y_train)

# Make predictions
y_train_pred = stack_model.predict(x_train)
y_test_pred = stack_model.predict(x_test)

# Training set model performance
stack_model_train_accuracy = stack_model.score(x_train,y_train)*100.0
stack_model_train_rsme = np.sqrt(mean_squared_error(y_train,y_train_pred)) 
stack_model_train_r2 = r2_score(y_train, y_train_pred) # Calculate F1-score

# Test set model performance
stack_model_test_accuracy = stack_model.score(x_test,y_test)*100.0 # Calculate Accuracy
stack_model_test_rsme = np.sqrt(mean_squared_error(y_test,y_test_pred)) # Calculate MCC
stack_model_test_r2 = r2_score(y_test, y_test_pred) # Calculate F1-score

print('\nModel performance for Training set')
print('- Accuracy: %s' % stack_model_train_accuracy)
print('- RSME: %s' % stack_model_train_rsme)
print('- R2 score: %s' % stack_model_train_r2)
print('----------------------------------')
print('Model performance for Test set')
print('- Accuracy: %s' % stack_model_test_accuracy)
print('- RMSE: %s' % stack_model_test_rsme)
print('- R2 score: %s' % stack_model_test_r2)


Model performance for Training set
- Accuracy: 64.00020388138083
- RSME: 0.060281404132719346
- R2 score: 0.6400020388138082
----------------------------------
Model performance for Test set
- Accuracy: 66.25871497562773
- RMSE: 0.05909663740813565
- R2 score: 0.6625871497562773
