#Gradient Boosting Classifier

##Introdução
  Gradient boost é um algoritimo que serve para prever valores continuos. Neste tipo de algoritimo as arvores posteriores se baseiam nos erros das anteriores para melhorar a classificação. As árvores também recebem avaliações, que devem ser números entre 0 e 1, uma vez que trabalhar apenas com os dados de treinamento da tabela traria previsões piores do que as do modelo com avaliações. De maneira geral são utilizadas de 8 a 32 árvores para ter boas predições.

##Execução
  


In [2]:
#importação das bibliotecas
import pandas as pd
import numpy as np
import sklearn
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn import utils



In [3]:
#criação da tabela
clima = pd.read_csv("https://raw.githubusercontent.com/profmoisesomena/escience_and_tools/master/data/estacoes_es_summary.csv")
clima.head()

Unnamed: 0,Municipios,Altitude,X_coord,Y_coord,CHUVA,ETP,ETR,DEF,EXC,ER/ETP,TEMP_MÉD_ANUAL
0,EstacaoMeteorologicaES1,66,-39.96,-18.3,1036.98,1382.294237,1036.98,345.314237,0.0,0.750188,24.84955
1,EstacaoMeteorologicaES2,3,-39.75,-18.56,1164.716667,1413.227948,1164.716667,248.511281,0.0,0.824153,25.061532
2,EstacaoMeteorologicaES3,6,-39.76,-18.95,1394.9,1397.763967,1298.498162,99.265805,96.401838,0.928982,24.959973
3,EstacaoMeteorologicaES4,180,-40.75,-18.99,1250.843333,1341.710579,1151.705054,190.005525,99.138279,0.858386,24.529823
4,EstacaoMeteorologicaES5,90,-40.09,-18.49,1127.65,1367.365007,1127.65,239.715007,0.0,0.824688,24.743049


In [4]:
#verificando se há valores nulos
clima.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110 entries, 0 to 109
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Municipios      110 non-null    object 
 1   Altitude        110 non-null    int64  
 2   X_coord         110 non-null    float64
 3   Y_coord         110 non-null    float64
 4   CHUVA           110 non-null    float64
 5   ETP             110 non-null    float64
 6   ETR             110 non-null    float64
 7   DEF             110 non-null    float64
 8   EXC             110 non-null    float64
 9   ER/ETP          110 non-null    float64
 10  TEMP_MÉD_ANUAL  110 non-null    float64
dtypes: float64(9), int64(1), object(1)
memory usage: 9.6+ KB


In [5]:
#Deletando dados desnecessarios para o treinamento
clima = clima.drop(columns=['Municipios'])
clima.head()

Unnamed: 0,Altitude,X_coord,Y_coord,CHUVA,ETP,ETR,DEF,EXC,ER/ETP,TEMP_MÉD_ANUAL
0,66,-39.96,-18.3,1036.98,1382.294237,1036.98,345.314237,0.0,0.750188,24.84955
1,3,-39.75,-18.56,1164.716667,1413.227948,1164.716667,248.511281,0.0,0.824153,25.061532
2,6,-39.76,-18.95,1394.9,1397.763967,1298.498162,99.265805,96.401838,0.928982,24.959973
3,180,-40.75,-18.99,1250.843333,1341.710579,1151.705054,190.005525,99.138279,0.858386,24.529823
4,90,-40.09,-18.49,1127.65,1367.365007,1127.65,239.715007,0.0,0.824688,24.743049


In [6]:
#Tansformando floats em ints
clima = clima.rename(columns={'ER/ETP': 'ERETP'})
clima['X_coord'] = clima.X_coord.astype('int64')
clima['Y_coord'] = clima.Y_coord.astype('int64')
clima['CHUVA'] = clima.CHUVA.astype('int64')
clima['ETP'] = clima.ETP.astype('int64')
clima['ETR'] = clima.ETR.astype('int64')
clima['DEF'] = clima.DEF.astype('int64')
clima['EXC'] = clima.EXC.astype('int64')
clima['ERETP'] = clima.ERETP.astype('int64')
clima['TEMP_MÉD_ANUAL'] = clima.TEMP_MÉD_ANUAL.astype('int64')
clima.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110 entries, 0 to 109
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   Altitude        110 non-null    int64
 1   X_coord         110 non-null    int64
 2   Y_coord         110 non-null    int64
 3   CHUVA           110 non-null    int64
 4   ETP             110 non-null    int64
 5   ETR             110 non-null    int64
 6   DEF             110 non-null    int64
 7   EXC             110 non-null    int64
 8   ERETP           110 non-null    int64
 9   TEMP_MÉD_ANUAL  110 non-null    int64
dtypes: int64(10)
memory usage: 8.7 KB


In [7]:
#Separando as partes de teste e o a coluna alvo em x e y
x = clima[["Altitude", "X_coord", "Y_coord", "CHUVA", "ETP", "ETR", "DEF", "EXC", "ERETP"]]
y = clima[["TEMP_MÉD_ANUAL"]]

In [14]:
#Treinando com Gradient Boost
x_treino, x_teste, y_treino, y_teste = train_test_split(x, y, test_size=0.25, random_state=0, shuffle=True)
modelo = GradientBoostingClassifier()
modelo.fit(x_treino, y_treino)

predictions_train = modelo.predict(x_treino)
predictions_validation = modelo.predict(x_teste)
print(sklearn.metrics.accuracy_score(y_treino, predictions_train))
print(sklearn.metrics.accuracy_score(y_teste, predictions_validation))

  y = column_or_1d(y, warn=True)


1.0
0.8214285714285714


In [None]:
previsoes = modelo.predict(x_teste)
print(sklearn.metrics.accuracy_score(y_teste, previsoes))

0.8928571428571429
