# Ensemble Testing (Under Construction)

We have created a python module that makes the homogeneous ensemble callable.  The module is `homogeneous_ensemble.py`.

In [1]:
import homogeneous_ensemble as he

In [2]:
import numpy as np
import pandas as pd

import random
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder

abalone = pd.read_csv("abalone.csv")
oe_style = OneHotEncoder()
oe_results = oe_style.fit_transform(abalone[["Sex"]])

abalone = pd.DataFrame(oe_results.toarray(), columns=oe_style.categories_).join(abalone)
abalone = abalone.drop("Sex", axis=1)
abalone.shape

(4174, 11)

In [3]:
training, valid = he.split_train_test(abalone, 0.2)
training.head()

Unnamed: 0,"(F,)","(I,)","(M,)",Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight,Rings
1222,1.0,0.0,0.0,0.595,0.465,0.155,1.026,0.4645,0.112,0.305,12
1674,0.0,1.0,0.0,0.515,0.42,0.15,0.6725,0.2555,0.1335,0.235,10
2922,0.0,0.0,1.0,0.65,0.525,0.19,1.4995,0.6265,0.4005,0.395,14
1400,0.0,1.0,0.0,0.385,0.29,0.09,0.2365,0.1,0.0505,0.076,8
652,0.0,1.0,0.0,0.15,0.1,0.025,0.015,0.0045,0.004,0.005,2


In [4]:

weights, predictors = he.homog_ens(training, 1, 2)
print(weights)

[0.20331099039442488, 0.1753999372973465]


In [5]:
print(predictors)

[SVR(), SVR()]


In [6]:
X_valid = valid.iloc[:,0:-1]
Y_valid = valid.iloc[:,-1]

X_valid.iloc[:5,:]

Unnamed: 0,"(F,)","(I,)","(M,)",Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight
364,0.0,0.0,1.0,0.59,0.5,0.165,1.1045,0.4565,0.2425,0.34
947,0.0,0.0,1.0,0.69,0.505,0.2,1.872,0.893,0.4015,0.48
3874,0.0,1.0,0.0,0.48,0.37,0.125,0.5435,0.244,0.101,0.165
3732,0.0,1.0,0.0,0.28,0.215,0.07,0.124,0.063,0.0215,0.03
3749,1.0,0.0,0.0,0.515,0.4,0.125,0.615,0.2865,0.123,0.1765


In [7]:
predictions = []
for p in predictors:
    predictions.append(p.predict(X_valid))



In [8]:
predictions

[array([11.11037499, 11.26303576,  8.29669276,  5.53635246,  9.51735716,
         9.88822893,  8.94081024, 10.46987948, 11.91686915, 10.0476903 ,
         9.59532717,  9.28148905, 12.30192927, 11.59902512, 11.55832062,
         9.4634063 , 13.79818862, 11.24632323,  8.12541274, 11.20482156,
        11.01940587, 10.27725909, 10.16957567, 10.16633958, 10.41436135,
        10.41822474,  7.48260097,  9.21127967, 10.61131965,  9.03081224,
        10.99279938, 12.11359487, 11.07334853, 10.5744444 , 14.11073283,
         9.02645265,  5.80082173, 10.12397604,  5.5808783 ,  5.29636841,
        10.18180893,  5.01970455, 11.63477784,  6.5254457 , 10.75039933,
         8.59234753, 11.11222593,  6.96607938,  9.2952124 ,  7.96390531,
         9.93399693,  9.51313897,  8.88731914, 10.7776438 , 10.08406329,
         7.06774473,  6.17264605,  7.8817148 ,  9.5061861 ,  7.21299339,
         8.3554825 , 10.39699568, 10.00784174,  9.40734398,  7.56461177,
        12.52930201,  9.86800617,  9.59135464,  9.5

In [9]:
mean_squared_error(predictions[0], Y_valid)

6.351949818112758

In [10]:
mean_squared_error(predictions[1], Y_valid)

6.351949818112758

In [43]:
#weights[0]*predictions[0]
num = 0
j = 0
weight_sum = 0
while (j < len(weights)):
        num += weights[j]*predictions[j]
        weight_sum += weights[j]
        j+=1
print(weight_sum)

0.39740873394728493


In [44]:
guess = num / weight_sum
mean_squared_error(guess, Y_valid)

5.413576826184167

In [19]:
table_1 = [["DecisionTree",0],["SVR",0],["kNN",0]]
cols = ["Algorithm", "MSE"]
table_1 = pd.DataFrame(table_1, columns=cols)
table_1

Unnamed: 0,Algorithm,MSE
0,DecisionTree,0
1,SVR,0
2,kNN,0


In [21]:
table_1.loc[2,"MSE"] = mean_squared_error(predictors[0].predict(X_valid), Y_valid)
table_1



Unnamed: 0,Algorithm,MSE
0,DecisionTree,7.089928
1,SVR,0.0
2,kNN,7.089928


In [None]:
for i in range(0,3):
    weights, predictors = he.homog_ens(training, 0, 2)
    predictions = []
    for p in predictors:
        predictions.append(p.predict(X_valid))
    table_1.loc[i,"MSE"] = mean_squared_error(predictors[0].predict(X_valid), Y_valid)

In [29]:
#  Sampling of Data
train_set, test_set = he.split_train_test(abalone, 0.2)
X_vars = train_set.iloc[:,:-1]
X_labels = train_set.iloc[:,-1]
Y = X_labels.to_numpy()
X = X_vars.to_numpy()

X[:,:5]

array([[0.   , 0.   , 1.   , 0.645, 0.51 ],
       [1.   , 0.   , 0.   , 0.545, 0.4  ],
       [1.   , 0.   , 0.   , 0.6  , 0.48 ],
       ...,
       [0.   , 0.   , 1.   , 0.64 , 0.505],
       [1.   , 0.   , 0.   , 0.57 , 0.435],
       [1.   , 0.   , 0.   , 0.625, 0.485]])