# Ensemble Testing (Under Construction)

We have created a Python module, `homogeneous_ensemble.py`, that makes the homogeneous ensemble callable. It is imported below as `he`.

In [21]:
import homogeneous_ensemble as he

In [16]:
import numpy as np
import pandas as pd

import random
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

import time

# Load the abalone data and one-hot encode the categorical "Sex" column.
abalone = pd.read_csv("abalone.csv")
oe_style = OneHotEncoder()
oe_results = oe_style.fit_transform(abalone[["Sex"]])

# `categories_` is a list with one array per encoded input column. Only "Sex"
# is encoded, so take its single array to get plain string column labels.
# Passing the whole list produced tuple labels such as ("F",), which left the
# frame with a mix of tuple and string column names.
# The explicit index keeps the join aligned row-for-row with the source frame.
sex_dummies = pd.DataFrame(
    oe_results.toarray(),
    columns=oe_style.categories_[0],
    index=abalone.index,
)

# Indicator columns go first, followed by the original columns minus "Sex".
abalone = sex_dummies.join(abalone).drop("Sex", axis=1)
abalone.shape

(4174, 11)

In [17]:
# Split the encoded data into two frames with the project helper.
# NOTE(review): 0.2 is presumably the held-out fraction, and the shuffled row
# labels in the preview suggest a random split — no seed is set in this
# notebook, so confirm reproducibility in homogeneous_ensemble.py.
training, valid = he.split_train_test(abalone, 0.2)
# Preview the first rows of the training portion.
training.head()

Unnamed: 0,"(F,)","(I,)","(M,)",Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight,Rings
2547,0.0,0.0,1.0,0.5,0.365,0.13,0.5945,0.309,0.1085,0.1535,9
3441,0.0,0.0,1.0,0.595,0.46,0.155,1.03,0.4275,0.207,0.3305,10
2644,0.0,0.0,1.0,0.62,0.475,0.195,1.3585,0.5935,0.3365,0.3745,10
3640,0.0,0.0,1.0,0.625,0.495,0.155,1.025,0.46,0.1945,0.34,9
414,0.0,1.0,0.0,0.22,0.165,0.055,0.0545,0.0215,0.012,0.02,5


In [4]:

# Build a tiny ensemble as a smoke test; the call returns one weight and one
# fitted predictor per member.
# NOTE(review): judging by the benchmark loop later in this notebook, the
# arguments look like (data, base-learner index, number of members) — confirm
# against homogeneous_ensemble.py.
weights, predictors = he.homog_ens(training, 1, 2)
print(weights)

[0.18865622891558753, 0.1736265470034139]


In [5]:
# Show the fitted intercept of each of the two ensemble members.
for member in (0, 1):
    print(predictors[member].intercept_)

[9.84838181]
[10.04619023]


In [6]:
# The last column is the target; every column before it is a feature.
Y_valid = valid.iloc[:, -1]
X_valid = valid.iloc[:, :-1]

# Preview the first five feature rows.
X_valid.head(5)

Unnamed: 0,"(F,)","(I,)","(M,)",Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight
4107,1.0,0.0,0.0,0.625,0.475,0.16,1.3335,0.605,0.2875,0.319
632,0.0,1.0,0.0,0.155,0.105,0.05,0.0175,0.005,0.0035,0.005
1091,1.0,0.0,0.0,0.755,0.625,0.21,2.505,1.1965,0.513,0.6785
235,0.0,0.0,1.0,0.56,0.45,0.175,1.011,0.3835,0.2065,0.37
3376,0.0,1.0,0.0,0.4,0.315,0.1,0.3225,0.143,0.0735,0.091


In [7]:
# One prediction vector per ensemble member, in the same order as `predictors`.
predictions = [member.predict(X_valid) for member in predictors]



In [8]:
# First five predictions of the first ensemble member.
predictions[0][:5]

array([10.03240638,  4.40735365, 11.74204177, 12.08912217,  7.11018509])

In [9]:
# Validation MSE of the first member alone (arguments named to match
# scikit-learn's y_true / y_pred order; the value is unchanged).
mean_squared_error(y_true=Y_valid, y_pred=predictions[0])

5.257305588451197

In [10]:
# Validation MSE of the second member alone (arguments named to match
# scikit-learn's y_true / y_pred order; the value is unchanged).
mean_squared_error(y_true=Y_valid, y_pred=predictions[1])

5.266829304960987

In [11]:
# Accumulate the weighted sum of the member predictions together with the
# total weight; the next cell divides one by the other.
num = 0
weight_sum = 0
for j, member_weight in enumerate(weights):
    num += member_weight * predictions[j]
    weight_sum += member_weight
print(weight_sum)

0.36228277591900143


In [12]:
# Normalise the weighted sum into a weighted average and score it on the
# validation target.
guess = num / weight_sum
mean_squared_error(y_true=Y_valid, y_pred=guess)

5.260438510105208

In [19]:
# Results table: one row per algorithm, filled in by the benchmark cell.
# Row order matters — the benchmark writes rows 0-2 from its ensemble loop and
# rows 3 and 4 for the two library baselines.
# MSE and Time start as float 0.0 (not int 0) so that assigning float results
# through .loc later does not need an implicit int -> float upcast.
cols = ["Algorithm", "MSE", "Time"]
table_1 = pd.DataFrame(
    {
        "Algorithm": ["DecisionTree", "SVR", "kNN", "RandomForest", "Gradient"],
        "MSE": 0.0,
        "Time": 0.0,
    },
    columns=cols,
)
table_1

Unnamed: 0,Algorithm,MSE,Time
0,DecisionTree,0,0
1,SVR,0,0
2,kNN,0,0
3,RandomForest,0,0
4,Gradient,0,0


In [14]:
import warnings
# Silence every FutureWarning from here on so the benchmark output stays
# readable. NOTE(review): this is a blanket filter — it also hides deprecation
# notices that may point at real problems; consider narrowing it.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [25]:
# Benchmark: fill table_1 with validation MSE and wall-clock time for the three
# homogeneous ensembles (rows 0-2) and two scikit-learn baselines (rows 3-4).
n = 256  # ensemble size; also used as n_estimators for the baselines

# Scores and timings are floats; make sure the target columns are float before
# assigning so no implicit dtype upcast is needed.
table_1 = table_1.astype({"MSE": float, "Time": float})

# --- Homogeneous ensembles -------------------------------------------------
for i in range(0, 3):
    # The timed region covers building the ensemble, predicting with every
    # member and combining the predictions.
    # NOTE(review): the baseline timings below cover only split + fit, so the
    # "Time" column is not measured identically across rows — confirm which
    # definition is intended.
    t0 = time.perf_counter()
    weights, predictors = he.homog_ens(training, i, n)

    predictions = [p.predict(X_valid) for p in predictors]

    # Weighted average of the member predictions: sum(w_j * pred_j) / sum(w_j).
    guess = np.average(predictions, axis=0, weights=weights)
    t1 = time.perf_counter()

    table_1.loc[i, "MSE"] = mean_squared_error(Y_valid, guess)
    table_1.loc[i, "Time"] = t1 - t0

# --- Library baselines -----------------------------------------------------
# The baselines are fitted on plain arrays, so predict on an array as well
# instead of handing the models a labelled DataFrame.
X_valid_arr = X_valid.to_numpy()

# Keys are the table_1 row each baseline reports into.
# NOTE(review): these are classifiers scored with a regression metric (MSE);
# kept as in the original experiment — confirm this is intended.
baselines = {
    3: RandomForestClassifier(n_estimators=n, max_depth=5, max_features=None, bootstrap=False),
    4: GradientBoostingClassifier(n_estimators=n),
}

for row, model in baselines.items():
    t0 = time.perf_counter()
    # Each baseline is fitted on a fresh sub-split of `training`; the held-out
    # part (test_set) is not used in this cell.
    train_set, test_set = he.split_train_test(training, 0.2)
    X_vars = train_set.iloc[:, :-1]
    X_labels = train_set.iloc[:, -1]
    Y = X_labels.to_numpy()
    X = X_vars.to_numpy()

    model.fit(X, Y)
    t1 = time.perf_counter()

    table_1.loc[row, "MSE"] = mean_squared_error(Y_valid, model.predict(X_valid_arr))
    table_1.loc[row, "Time"] = t1 - t0

# Keep the fitted baselines available under their original names.
rf, gb = baselines[3], baselines[4]

table_1

Unnamed: 0,Algorithm,MSE,Time
0,DecisionTree,6.818293,0.922262
1,SVR,5.256181,225.777503
2,kNN,5.691817,14.651755
3,RandomForest,6.08753,1.900617
4,Gradient,2.7506,29.933503
