<a href="https://colab.research.google.com/github/kampuzzle/IA-T1/blob/main/PaulaBremenkampT1AI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

* Aluna: Paula Um Porto de Azeredo Bremenkamp
* Matrícula: 2020100794

In [None]:
# utils 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import random 
from collections import Counter

# sklearn utils
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator
from sklearn.utils.validation import check_X_y
from sklearn.naive_bayes import GaussianNB
from sklearn.utils import resample

# sklearn classifiers
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import BaggingClassifier 
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier


In [None]:
# Load the dataset (semicolon-separated CSV) and use the `id` column as the row index.
df = pd.read_csv(
    'https://raw.githubusercontent.com/VitorBonella/PL-Dataset/main/dataset.csv',
    sep=";",
).set_index('id')

# The class label is split across two columns (lamp type and power);
# join them, without spaces in the lamp type, into a single `classe` column.
df['classe'] = (
    df['tipo_lampada'].str.replace(" ", "")
    + df['potencia'].astype(str)
)


## Seleção dos dados de acordo com a matrícula

In [None]:
# Haralick texture descriptors used as features.
HARALICK = ['probmax', 'energia', 'entropia', 'contraste', 'homogeneidade', 'correlacao']

# The CSV stores these columns as text with a decimal comma. Convert them
# column by column; going through `astype(str)` keeps the cell re-runnable,
# because `.str` would raise on columns that are already float.
df[HARALICK] = df[HARALICK].apply(
    lambda col: col.astype(str).str.replace(',', '.', regex=False).astype(float)
)
print(list(df[HARALICK].dtypes))

# Slice only after the conversion so that `data` holds numeric features
# instead of the raw decimal-comma strings.
data = df[HARALICK + ['classe']]

data.head(5)

[dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64')]


Unnamed: 0_level_0,probmax,energia,entropia,contraste,homogeneidade,correlacao,classe
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
355,890374128851521,317034178175013,65716176171224,-307129899022437,376304934586401,30352446061056,metalica400
356,874335002692948,305605231787486,756143396285353,-371051952158663,372262223245045,268692571829909,metalica400
357,88498796746712,313069373707728,685342760134283,-279068889939748,374882866962454,232954297907698,metalica400
358,7646646581833,232154862004449,138784307675733,-109018789473973,342956382241494,166201729869237,metalica400
359,897764519281096,322179527617412,621806574418637,-548235378125258,377587702004172,259598389119525,metalica400


In [None]:
# Split the table into class labels and Haralick features; any remaining
# decimal commas are replaced before the features are cast to float.
y = data['classe']
X = data[HARALICK].replace(',', '.', regex=True).astype(float)

# z-score normaliser, reused as the first pipeline step further down
scalar = StandardScaler()

## Funções de uso geral

In [None]:
def estats(scores, metodo):
  """Summarise a list of scores for one method.

  Returns a dict with the method name, the mean, the (population) standard
  deviation and the bounds of the 95% normal confidence interval of the mean.
  """
  media = np.mean(scores)
  desvio_padrao = np.std(scores)

  # Standard error of the mean is the scale of the interval.
  erro_padrao = desvio_padrao / np.sqrt(len(scores))
  limites = stats.norm.interval(0.95, loc=media, scale=erro_padrao)

  resumo = {}
  resumo['Método'] = metodo
  resumo['Média'] = media
  resumo['Desvio Padrão'] = desvio_padrao
  resumo['Limite Inferior'] = limites[0]
  resumo['Limite Superior'] = limites[1]
  return resumo


## ZeroR (ZR)

In [None]:
# ZeroR baseline: dummy classifier scored with 3 repetitions of
# stratified 10-fold cross-validation.
rkf = RepeatedStratifiedKFold(n_repeats=3, n_splits=10, random_state=36851234)
ZR = DummyClassifier()
scoresZR = cross_val_score(ZR, X, y, cv=rkf, scoring='accuracy')

print(scoresZR)

[0.16666667 0.13333333 0.16666667 0.16666667 0.16666667 0.16666667
 0.16666667 0.17241379 0.17241379 0.17241379 0.16666667 0.13333333
 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.17241379
 0.17241379 0.17241379 0.16666667 0.13333333 0.16666667 0.16666667
 0.16666667 0.16666667 0.16666667 0.17241379 0.17241379 0.17241379]


In [None]:
# One summary dict per method; the list starts with the ZeroR baseline.
resultados = [estats(scoresZR, 'ZR')]

## Bagging (BA)

In [None]:
# Outer evaluation loop: 3 repetitions of stratified 10-fold cross-validation.
rkf = RepeatedStratifiedKFold(n_repeats=3, n_splits=10, random_state=36851234)

# Bagging of Gaussian naive Bayes models, applied after z-score scaling.
BA = BaggingClassifier(estimator=GaussianNB(), random_state=11)
pipe = Pipeline([('scaler', StandardScaler()), ('bagging', BA)])

# Inner loop: 4-fold grid search over the number of estimators.
parameters = {'bagging__n_estimators': [3, 9, 15, 21]}
clf = GridSearchCV(estimator=pipe, param_grid=parameters, cv=4)

scoresBA = cross_val_score(clf, X, y, cv=rkf)

print(scoresBA)

[0.63333333 0.4        0.5        0.4        0.4        0.4
 0.33333333 0.55172414 0.4137931  0.48275862 0.4        0.4
 0.43333333 0.6        0.4        0.5        0.4        0.27586207
 0.4137931  0.4137931  0.46666667 0.43333333 0.4        0.46666667
 0.3        0.4        0.5        0.44827586 0.44827586 0.37931034]


In [None]:
# Add the Bagging summary to the results list.
resultados += [estats(scoresBA, 'BA')]

## AdaBoost (AB)

In [None]:
# Outer evaluation loop: 3 repetitions of stratified 10-fold cross-validation.
rkf = RepeatedStratifiedKFold(n_repeats=3, n_splits=10, random_state=36851234)

# AdaBoost over Gaussian naive Bayes models, applied after z-score scaling.
AB = AdaBoostClassifier(estimator=GaussianNB(), random_state=11)
pipe = Pipeline([('scaler', StandardScaler()), ('boosting', AB)])

# Inner loop: 4-fold grid search over the number of estimators.
parameters = {'boosting__n_estimators': [3, 9, 15, 21]}
clf = GridSearchCV(estimator=pipe, param_grid=parameters, cv=4)

scoresAB = cross_val_score(clf, X, y, cv=rkf)

scoresAB

array([0.2       , 0.33333333, 0.4       , 0.23333333, 0.5       ,
       0.26666667, 0.33333333, 0.4137931 , 0.31034483, 0.4137931 ,
       0.36666667, 0.4       , 0.33333333, 0.3       , 0.36666667,
       0.2       , 0.4       , 0.24137931, 0.37931034, 0.37931034,
       0.5       , 0.43333333, 0.5       , 0.5       , 0.4       ,
       0.4       , 0.36666667, 0.37931034, 0.4137931 , 0.27586207])

In [None]:
# Add the AdaBoost summary to the results list.
resultados += [estats(scoresAB, 'AB')]

## RandomForest (RF)

In [None]:
# Outer evaluation loop: 3 repetitions of stratified 10-fold cross-validation.
rkf = RepeatedStratifiedKFold(n_repeats=3, n_splits=10, random_state=36851234)

# Random forest applied after the shared z-score scaler.
rF = RandomForestClassifier(random_state=11)
pipeline = Pipeline([('transformer', scalar), ('estimator', rF)])

# Inner loop: 4-fold grid search over the number of trees.
grade = {'estimator__n_estimators': [3, 9, 15, 21]}
gs = GridSearchCV(pipeline, grade, scoring='accuracy', cv=4)

scoresRF = cross_val_score(gs, X, y, cv=rkf, scoring='accuracy')

scoresRF

array([0.76666667, 0.3       , 0.66666667, 0.53333333, 0.6       ,
       0.66666667, 0.66666667, 0.62068966, 0.51724138, 0.68965517,
       0.4       , 0.46666667, 0.73333333, 0.63333333, 0.53333333,
       0.6       , 0.63333333, 0.65517241, 0.5862069 , 0.62068966,
       0.6       , 0.56666667, 0.73333333, 0.53333333, 0.6       ,
       0.46666667, 0.56666667, 0.48275862, 0.5862069 , 0.5862069 ])

In [None]:
# Add the random forest summary to the results list.
resultados += [estats(scoresRF, 'RF')]

In [None]:
# Comparison table: one row per method, indexed by the method name.
tabela = pd.DataFrame(resultados).set_index('Método')

tabela

Unnamed: 0_level_0,Média,Desvio Padrão,Limite Inferior,Limite Superior
Método,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ZR,0.165057,0.010883,0.161163,0.168952
BA,0.433142,0.07483,0.406365,0.459919
AB,0.364674,0.082938,0.334996,0.394353
RF,0.58705,0.097992,0.551985,0.622115


## Heterogeneous Pooling (HP)

In [None]:
# retorna a ordenacao das classes de acordo com a ocorrencia nos exemplos na base de treino
def get_ordenacao(y_train):
  """Return the distinct labels of `y_train`, most frequent first.

  Labels with the same count keep the order in which they first appear.
  """
  # most_common() sorts by count, descending, and is stable for equal counts
  return [label for label, _ in Counter(y_train).most_common()]

In [None]:
def train_classificador(X_train, y_train, classifier_name):
  """Build the classifier named by `classifier_name` and fit it.

  Parameters:
    X_train: training features, passed straight to the classifier's `fit`.
    y_train: training labels, passed straight to the classifier's `fit`.
    classifier_name: 'KNN' (k-nearest neighbours), 'NB' (Gaussian naive
      Bayes) or 'DT' (decision tree). All use their default settings.

  Returns:
    The fitted classifier.

  Raises:
    ValueError: if `classifier_name` is not one of the supported names.
  """
  if classifier_name == 'KNN':
    classifier = KNeighborsClassifier()
  elif classifier_name == 'NB':
    classifier = GaussianNB()
  elif classifier_name == 'DT':
    classifier = DecisionTreeClassifier()
  else:
    # Without this branch an unknown name left `classifier` unbound and the
    # call failed with an unrelated-looking UnboundLocalError.
    raise ValueError(
      f"Unknown classifier name {classifier_name!r}; expected 'KNN', 'NB' or 'DT'"
    )

  classifier.fit(X_train, y_train)
  return classifier


In [None]:
class HeterogeneousPolling(BaseEstimator):
  """Voting ensemble of KNN, Gaussian naive Bayes and decision-tree models.

  `fit` trains one classifier of each kind on the original training set and
  on three bootstrap resamples of it (12 classifiers in total). `predict`
  returns, for every sample, the label that received the most votes; ties are
  broken in favour of the label that is most frequent in the training set.

  NOTE(review): `bias` is stored but never read, so a grid search over it
  refits the same model for every candidate value. Presumably it was meant to
  control the number of training rounds - confirm before wiring it in.
  """

  def __init__(self, bias=None):
    super().__init__()
    self.bias = bias
    self.rl = []
    self.nclass = 0
    self.combined_classifiers = []
    # Training labels ordered by frequency; filled in by fit.
    self.class_order = []

  def fit(self, X, y):
    """Train the base classifiers on `X`, `y` and return `self`."""
    n_samples = [1, 3, 5, 7]
    classifier_names = ['KNN', 'NB', 'DT']
    random_state = 0

    # Kept on the instance because predict needs it to break voting ties.
    self.class_order = get_ordenacao(y)
    # Start from scratch so that refitting does not pile up stale models.
    self.combined_classifiers = []

    for n in n_samples:
      if n == 1:
        # First round uses the training set as given.
        X_curr_train, y_curr_train = X, y
      else:
        random_state += 1  # a different seed for each bootstrap sample
        X_curr_train, y_curr_train = resample(X, y, random_state=random_state, replace=True)

      for classifier_name in classifier_names:
        classifier = train_classificador(X_curr_train, y_curr_train, classifier_name)
        self.combined_classifiers.append(classifier)

    return self

  def predict(self, X):
    """Return the majority-vote label for every row of `X`.

    Raises:
      RuntimeError: if called before `fit`.
    """
    if not self.combined_classifiers:
      raise RuntimeError("HeterogeneousPolling must be fitted before calling predict")

    # One row of votes per classifier, one column per sample.
    votes = np.array([classifier.predict(X) for classifier in self.combined_classifiers])
    # Lower rank means more frequent in the training set.
    rank = {label: position for position, label in enumerate(self.class_order)}

    final_predictions = []
    for sample_votes in votes.T:
      counts = Counter(sample_votes)
      top = max(counts.values())
      tied = [label for label, count in counts.items() if count == top]
      # Among the labels tied for the most votes, prefer the most frequent one.
      final_predictions.append(min(tied, key=lambda label: rank[label]))

    return np.array(final_predictions)

In [None]:
# Outer evaluation loop: 3 repetitions of stratified 10-fold cross-validation.
rkf = RepeatedStratifiedKFold(n_repeats=3, n_splits=10, random_state=36851234)

# Heterogeneous ensemble applied after the shared z-score scaler.
HP = HeterogeneousPolling()
pipeline = Pipeline([('transformer', scalar), ('estimator', HP)])

# Inner loop: 4-fold grid search over the ensemble's `bias` parameter.
grade = {'estimator__bias': [1, 3, 5, 7]}
gs = GridSearchCV(pipeline, grade, scoring='accuracy', cv=4)

scoresHP = cross_val_score(gs, X, y, cv=rkf, scoring='accuracy')

print(scoresHP)

[1;30;43mA saída de streaming foi truncada nas últimas 5000 linhas.[0m

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 276, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 73, in _cached_call
    return getattr(estimator, method)(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py", line 481, in predict
    return self.steps[-1][1].predict(Xt, **predict_params)
  File "<ipython-input-33-42b7c6ae5202>", line 45, in predict
    final_predictions.append(self.class_order[0])
AttributeError: 'HeterogeneousPolling' objec

[nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan]


Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 276, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 73, in _cached_call
    return getattr(estimator, method)(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py", line 481, in predict
    return self.steps[-1][1].predict(Xt, **predict_params)
  File "<ipython-input-33-42b7c6ae5202>", line 45, in predict
    final_predictions.append(self.class_order[0])
AttributeError: 'HeterogeneousPolling' object has no attribute 'class_order'

Traceback (most recent call last):
  Fil