# Ensemble Testing (Under Construction)

We have created a python module that makes the homogeneous ensemble callable.  The module is `homogeneous_ensemble.py`.

In [1]:
import homogeneous_ensemble as he

In [2]:
import numpy as np
import pandas as pd

import random
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

import time

abalone = pd.read_csv("abalone.csv")
oe_style = OneHotEncoder()
oe_results = oe_style.fit_transform(abalone[["Sex"]])

abalone = pd.DataFrame(oe_results.toarray(), columns=oe_style.categories_).join(abalone)
abalone = abalone.drop("Sex", axis=1)
abalone.shape

(4174, 11)

In [3]:
training, valid = he.split_train_test(abalone, 0.2)
training.head()

Unnamed: 0,"(F,)","(I,)","(M,)",Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight,Rings
3723,1.0,0.0,0.0,0.635,0.495,0.175,1.2355,0.5205,0.3085,0.347,10
2810,1.0,0.0,0.0,0.47,0.36,0.1,0.4705,0.1635,0.089,0.1385,8
1909,0.0,0.0,1.0,0.32,0.24,0.08,0.18,0.08,0.0385,0.055,6
573,0.0,1.0,0.0,0.475,0.365,0.115,0.49,0.223,0.1235,0.1335,9
1176,0.0,1.0,0.0,0.53,0.425,0.13,0.7675,0.419,0.1205,0.21,9


In [4]:

weights, predictors = he.homog_ens(training, 1, 2)
print(weights)

[0.18105734369480506, 0.19776533402395963]


In [5]:
print(predictors[0].intercept_)
print(predictors[1].intercept_)

[9.71078372]
[9.81107547]


In [6]:
X_valid = valid.iloc[:,0:-1]
Y_valid = valid.iloc[:,-1]

X_valid.iloc[:5,:]

Unnamed: 0,"(F,)","(I,)","(M,)",Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight
2704,0.0,0.0,1.0,0.645,0.505,0.185,1.463,0.592,0.3905,0.416
1629,0.0,0.0,1.0,0.635,0.495,0.195,1.172,0.445,0.3115,0.3475
2164,0.0,1.0,0.0,0.5,0.395,0.12,0.537,0.2165,0.1085,0.1785
614,0.0,0.0,1.0,0.54,0.415,0.145,0.74,0.2635,0.168,0.245
752,0.0,0.0,1.0,0.415,0.305,0.1,0.325,0.156,0.0505,0.091


In [7]:
predictions = []
for p in predictors:
    predictions.append(p.predict(X_valid))



In [8]:
predictions[0][0:5]

array([11.81926833, 11.69837018,  8.62896118, 10.80138569,  8.14701265])

In [9]:
mean_squared_error(predictions[0], Y_valid)

5.425042535403968

In [10]:
mean_squared_error(predictions[1], Y_valid)

5.305178755980408

In [11]:
#weights[0]*predictions[0]
num = 0
j = 0
weight_sum = 0
while (j < len(weights)):
        num += weights[j]*predictions[j]
        weight_sum += weights[j]
        j+=1
print(weight_sum)

0.37882267771876466


In [12]:
guess = num / weight_sum
mean_squared_error(guess, Y_valid)

5.359120590331483

In [13]:
table_1 = [["DecisionTree",0,0],["SVR",0,0],["kNN",0,0],["RandomForest",0,0],["Gradient",0,0]]
cols = ["Algorithm", "MSE", "Time"]
table_1 = pd.DataFrame(table_1, columns=cols)
table_1

Unnamed: 0,Algorithm,MSE,Time
0,DecisionTree,0,0
1,SVR,0,0
2,kNN,0,0
3,RandomForest,0,0
4,Gradient,0,0


In [14]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning) # setting ignore as a parameter and further adding category

In [20]:
n = 256

for i in range(0,3):
    
    #  Training
    t0 = time.time()
    weights, predictors = he.homog_ens(training, i, n)
    
    predictions = []
    for p in predictors:
        predictions.append(p.predict(X_valid))
    
    
    num = 0
    j = 0
    weight_sum = 0
    while (j < len(weights)):
        num += weights[j]*predictions[j]
        weight_sum += weights[j]
        j+=1
    #print(weight_sum)
    
    guess = num / weight_sum
    
    t1 = time.time()
    
    # Predicting
    #predictions = []
    #for p in predictors:
    #   predictions.append(p.predict(X_valid))
        
        
    table_1.loc[i,"MSE"] = mean_squared_error(guess, Y_valid)
    table_1.loc[i,"Time"] = t1-t0
    
t0 = time.time()
train_set = training  #, test_set = he.split_train_test(training, 0.2)
X_vars = train_set.iloc[:,:-1]
X_labels = train_set.iloc[:,-1]
Y = X_labels.to_numpy()
X = X_vars.to_numpy()

rf = RandomForestClassifier(n_estimators=n, max_depth=5, max_features=None, bootstrap=False)
rf.fit(X, Y)
t1 = time.time()


table_1.loc[3,"MSE"] = mean_squared_error(rf.predict(X_valid), Y_valid)
table_1.loc[3,"Time"] = t1-t0

t0 = time.time()
train_set = training #, test_set = he.split_train_test(training, 0.2)
X_vars = train_set.iloc[:,:-1]
X_labels = train_set.iloc[:,-1]
Y = X_labels.to_numpy()
X = X_vars.to_numpy()

gb = GradientBoostingClassifier(n_estimators=n)
gb.fit(X, Y)
t1 = time.time()


table_1.loc[4,"MSE"] = mean_squared_error(gb.predict(X_valid), Y_valid)
table_1.loc[4,"Time"] = t1-t0

table_1

Unnamed: 0,Algorithm,MSE,Time
0,DecisionTree,6.967057,0.970813
1,SVR,5.370882,165.74317
2,kNN,6.283108,22.006831
3,RandomForest,6.699041,2.18254
4,Gradient,6.3753,39.572611
