# Ensemble

In [1]:
import numpy as np
import csv
import pandas as pd

from collections import Counter
from numpy import linalg as LA

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Convert the whitespace-delimited yeast.data file into a comma-separated
# yeast.csv file.
with open("/content/drive/MyDrive/CIn/Mestrado/2021.1/Aprendizagem de Máquina/Projeto - Francisco/yeast.data") as input_file:
    lines = input_file.readlines()
    # Split every record on runs of whitespace to obtain its individual fields.
    newLines = [line.strip().split() for line in lines]

# newline='' hands line-ending control to the csv module, as its documentation
# requires; without it some platforms write an extra blank line after each row.
with open("/content/drive/MyDrive/CIn/Mestrado/2021.1/Aprendizagem de Máquina/Projeto - Francisco/yeast.csv", 'w', newline='') as test_file:
    file_writer = csv.writer(test_file)
    file_writer.writerows(newLines)

In [3]:
# Load the converted CSV into a DataFrame. Column names are passed explicitly
# through `names`, so every line of the file is read as a data row.
column_names = ["sequence_name", "mcg", "gvh", "alm", "mit", "erl", "pox", "vac", "nuc", "protein_local"]
data = pd.read_csv(
    "/content/drive/MyDrive/CIn/Mestrado/2021.1/Aprendizagem de Máquina/Projeto - Francisco/yeast.csv",
    names=column_names,
)
print(data)

     sequence_name   mcg   gvh   alm   mit  erl  pox   vac   nuc protein_local
0       ADT1_YEAST  0.58  0.61  0.47  0.13  0.5  0.0  0.48  0.22           MIT
1       ADT2_YEAST  0.43  0.67  0.48  0.27  0.5  0.0  0.53  0.22           MIT
2       ADT3_YEAST  0.64  0.62  0.49  0.15  0.5  0.0  0.53  0.22           MIT
3       AAR2_YEAST  0.58  0.44  0.57  0.13  0.5  0.0  0.54  0.22           NUC
4       AATM_YEAST  0.42  0.44  0.48  0.54  0.5  0.0  0.48  0.22           MIT
...            ...   ...   ...   ...   ...  ...  ...   ...   ...           ...
1479    YUR1_YEAST  0.81  0.62  0.43  0.17  0.5  0.0  0.53  0.22           ME2
1480    ZIP1_YEAST  0.47  0.43  0.61  0.40  0.5  0.0  0.48  0.47           NUC
1481    ZNRP_YEAST  0.67  0.57  0.36  0.19  0.5  0.0  0.56  0.22           ME2
1482    ZUO1_YEAST  0.43  0.40  0.60  0.16  0.5  0.0  0.53  0.39           NUC
1483    G6PD_YEAST  0.65  0.54  0.54  0.13  0.5  0.0  0.53  0.22           CYT

[1484 rows x 10 columns]


In [4]:
# Feature matrix: the eight columns from "mcg" through "nuc" (label slicing is inclusive).
X = data.loc[:, "mcg":"nuc"].values
# Target vector: the class label stored in the last column.
y = data["protein_local"].values

# Bayesian Gaussian Classifier

In [5]:
def probability_priori(y):
  """Estimate the prior probability of every class label found in `y`.

  Returns a Counter mapping each distinct label to its relative frequency
  (occurrences divided by the total number of labels). Because the result is
  a Counter, looking up a label that never occurred yields 0.
  """

  counts = Counter(y)

  # Float start value so the total (and every ratio below) is a float.
  n_samples = sum(counts.values(), 0.0)

  return Counter({label: count / n_samples for label, count in counts.items()})

def mi(X,y):
  """Compute the mean feature vector of every class.

  X is a 2-D array of samples (one row each) and y the matching 1-D array of
  class labels.

  Returns a tuple (pos_mean, concat):
    pos_mean -- list of (label, mean_vector) tuples, ordered as np.unique
                orders the labels.
    concat   -- X with y appended as an extra last column.
  """

  label_column = np.expand_dims(y, axis=1)
  concat = np.hstack((X, label_column))

  # The last column of the stacked array holds the class of each row.
  row_labels = concat[:,-1]

  pos_mean = []
  for label in np.unique(row_labels):
    class_rows = concat[row_labels==label, 0:-1]
    pos_mean.append((label, np.mean(class_rows, axis=0)))

  return pos_mean,concat

def sigma_square(X,y):
  """Estimate one scalar variance per class.

  For each class the rows belonging to it are centred on the class mean and
  the squared norm of the centred block is divided by (number of features *
  number of rows), i.e. the squared deviation averaged over every entry.

  Returns a list of [label, variance] pairs in np.unique(y) order.
  """

  per_class_means, stacked = mi(X,y)
  mean_by_class = dict(per_class_means)

  # Last column of the stacked array carries the class of each row.
  row_labels = stacked[:,-1]

  variances = []

  for label in np.unique(y):
    centered = stacked[row_labels==label, 0:-1] - mean_by_class[label]

    n_features = len(centered[0])
    n_rows = len(centered)

    variances.append([label, LA.norm(centered)**2/(n_features*n_rows)])

  return variances

def sigma_matrix(X,y):
  """Build one diagonal covariance matrix per class.

  Every matrix is d x d, where d is the number of columns of X, with the
  class's scalar variance (as returned by sigma_square) on the diagonal and
  zeros elsewhere.

  Returns an array of shape (n_classes, d, d), in the class order produced by
  sigma_square.
  """

  # Derive the dimensionality from the data instead of assuming 8 features.
  d = np.shape(X)[1]

  sigma_matrix_val = []

  for _, variance in sigma_square(X,y):
    diag = np.zeros((d, d), float)
    np.fill_diagonal(diag, variance)
    sigma_matrix_val.append(diag)

  return np.array(sigma_matrix_val)

def probability_posteriori(class_name,classes,x,mean,sigma_matrix_values):
  """Evaluate the multivariate Gaussian density of `x` for one class.

  class_name          -- label of the class to evaluate.
  classes             -- list of labels; the position of class_name in it
                         selects the covariance matrix (must support .index).
  x                   -- feature vector to score.
  mean                -- mean vector of the class.
  sigma_matrix_values -- array of covariance matrices, one per class.

  Returns the value of the Gaussian density with the given mean and the
  selected covariance, evaluated at x.
  """

  d = sigma_matrix_values.shape[1]

  covariance = sigma_matrix_values[classes.index(class_name)]
  deviation = x - mean

  # Normalising constant 1 / ((2*pi)^(d/2) * sqrt(|Sigma|)).
  normaliser = (1/(((2*np.pi)**(d/2))*np.sqrt(np.linalg.det(covariance))))

  # Quadratic form (x - mean)^T Sigma^-1 (x - mean).
  quadratic_form = np.matmul(np.matmul((deviation.transpose()),(np.linalg.inv(covariance))),deviation)

  return normaliser*np.exp(-0.5*(quadratic_form))

def baysean_classifier(x_i,X,y):
  """Classify samples with a Gaussian Bayes rule fitted on (X, y).

  x_i -- iterable of feature vectors to classify.
  X   -- training feature matrix (one row per sample).
  y   -- training labels, aligned with the rows of X.

  For every sample the class-conditional Gaussian density of each class is
  weighted by the class prior and normalised by the sum over all classes;
  the class with the largest resulting value is chosen.

  Returns a list with one predicted label per element of x_i.
  """

  pp = probability_priori(y)
  mean_vector,concat = mi(X,y)
  matrix = sigma_matrix(X,y)
  classes = [label for label,_ in mean_vector]
  mean_vector = dict(mean_vector)
  # NOTE(review): `matrix` is indexed by position in `classes`; this assumes
  # sigma_matrix and mi enumerate the classes in the same order -- confirm.

  classification_bg = []

  for x in x_i:

    # prior * likelihood for every class, evaluated once per sample.
    joint = [pp[label]*probability_posteriori(label,classes,x,mean_vector[label],matrix)
             for label in classes]

    # The evidence is identical for all classes, so compute it a single time
    # instead of once per class.
    den = sum(joint)

    probs = [num/den for num in joint]

    classification_bg.append(classes[probs.index(max(probs))])

  return classification_bg

# Parzen Window Classifier

In [6]:
def p_estimate_parzen_window(x,h,X,y):
  """Parzen-window density estimate of `x` for every class.

  x -- single feature vector (length d).
  h -- window width (bandwidth) applied to every feature.
  X -- training feature matrix, one row per sample and d columns.
  y -- training labels, aligned with the rows of X.

  A product of one-dimensional standard Gaussian kernels is evaluated between
  x and each training row of a class; the kernel values are summed and divided
  by (n_i * h**d), where n_i is the number of rows of that class.

  Returns a list of [label, density] pairs in np.unique(y) order.
  """

  features = np.asarray(X, dtype=float)
  labels = np.asarray(y)
  point = np.asarray(x, dtype=float)

  # Standard Gaussian kernel constant: 1 / sqrt(2*pi).
  cte = 1/np.sqrt(2*np.pi)

  p_list = []

  for i in np.unique(labels):

    input_values_class = features[labels==i]

    sub = (point - input_values_class)/h

    # Product over features gives the multivariate kernel value of each row.
    K = np.prod(cte*np.exp(-sub**2/2),axis=1)

    p = sum(K)

    n_i, d = sub.shape

    p_list.append([i,p/(n_i*(h**d))])

  return p_list

class Parzen:
    """Bayes classifier that uses Parzen-window class-conditional densities.

    h is the window width forwarded to p_estimate_parzen_window.
    """

    def __init__(self, h=1):
        self.h = h

    def predict(self,x,X,y):
        """Predict a label for every sample in `x` using training data (X, y).

        For each sample, the per-class density from p_estimate_parzen_window is
        weighted by the class prior and normalised by the sum over classes; the
        class with the largest value is returned.

        Returns a list with one predicted label per element of x.
        """

        pp = probability_priori(y)

        predict_parzen = []

        for x_i in x:

            parzen = p_estimate_parzen_window(x_i,self.h,X,y)

            # prior * density for every class.
            joint = [pp[label]*density for label, density in parzen]

            # The normalising sum does not depend on the class being scored,
            # so it is computed once per sample rather than once per class.
            den = sum(joint)

            parzen_probs = [num/den for num in joint]

            best = parzen_probs.index(max(parzen_probs))
            predict_parzen.append(parzen[best][0])

        return predict_parzen

# Ensemble

In [7]:
def ensemble(x,X,y,n_neighbors=18,h=0.0744):
  """Majority-vote ensemble of four classifiers trained on (X, y).

  x           -- samples to classify (one row each).
  X           -- training feature matrix.
  y           -- training labels, aligned with the rows of X.
  n_neighbors -- number of neighbours for the k-NN voter (default 18).
  h           -- window width for the Parzen voter (default 0.0744).

  The voters are: the Gaussian Bayes classifier (baysean_classifier), k-NN,
  the Parzen-window classifier and a one-vs-rest logistic regression.

  Returns a list with the most voted label for every sample. When several
  labels tie for the highest vote count, the one that sorts first wins.
  """

  voto1 = np.array(baysean_classifier(x,X,y))

  neigh = KNeighborsClassifier(n_neighbors=n_neighbors)
  neigh.fit(X, y)
  voto2 = neigh.predict(x)

  parzen = Parzen(h=h)
  voto3 = np.array(parzen.predict(x,X,y))

  # NOTE(review): recent scikit-learn releases deprecate the `multi_class`
  # argument -- confirm against the installed version before upgrading.
  lr = LogisticRegression(random_state=0,multi_class="ovr").fit(X, y)
  voto4 = lr.predict(x)

  ensemble_choose = []

  for sample_votes in zip(voto1, voto2, voto3, voto4):

    votes = list(sample_votes)

    # Candidates are visited in sorted order and max keeps the first maximum,
    # so ties are resolved in favour of the label that sorts first.
    ensemble_choose.append(max(sorted(set(votes)), key = votes.count))

  return ensemble_choose

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import StratifiedKFold

# Per-fold scores collected during cross-validation.
accuracy, precision, recall, f_measure = [], [], [], []

# 5-fold stratified split of the full data set.
skf = StratifiedKFold(n_splits=5)

# Keyword arguments shared by the three macro-averaged metrics.
macro_kwargs = {"average": 'macro', "zero_division": 0}

for train_index, test_index in skf.split(X,y):

  X_train, y_train = X[train_index], y[train_index]
  X_test, y_test = X[test_index], y[test_index]

  # Fit the ensemble on the training fold and predict the held-out fold.
  ensemble_predict = ensemble(X_test,X_train,y_train)

  fold_accuracy = accuracy_score(y_test, ensemble_predict)

  accuracy.append(fold_accuracy)
  precision.append(precision_score(y_test, ensemble_predict, **macro_kwargs))
  recall.append(recall_score(y_test, ensemble_predict, **macro_kwargs))
  f_measure.append(f1_score(y_test, ensemble_predict, **macro_kwargs))

  print("accuracy: ",fold_accuracy)

# Report the mean and standard deviation of every metric across the folds.
for heading, scores in (
    ("\nMean accuracy: ", accuracy),
    ("Mean precision: ", precision),
    ("Mean recall: ", recall),
    ("Mean f_measure: ", f_measure),
):
  print(heading,sum(scores)/len(scores), "Std: ",np.std(scores))

accuracy:  0.5791245791245792
accuracy:  0.6161616161616161
accuracy:  0.6026936026936027
accuracy:  0.5286195286195287
accuracy:  0.5878378378378378

Mean accuracy:  0.5828874328874328 Std:  0.029943702664162308
Mean precision:  0.6043200770158108 Std:  0.04521461650379122
Mean recall:  0.537756080925808 Std:  0.040907993008174104
Mean f_measure:  0.5492510618555888 Std:  0.0388265214920135
