# Machine Learning - Random Forest (Floresta Aleatória)

#### Importação das bibliotecas necessárias

In [1]:
from sklearn.tree import export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
#from sklearn.cross_validation import cross_val_predict
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#### Carregando a base de dados.

In [2]:
df_edu = pd.read_csv('xAPI-Edu-Data.csv')

In [4]:
df_edu.head()

Unnamed: 0,gender,NationalITy,PlaceofBirth,StageID,GradeID,SectionID,Topic,Semester,Relation,raisedhands,VisITedResources,AnnouncementsView,Discussion,ParentAnsweringSurvey,ParentschoolSatisfaction,StudentAbsenceDays,Class
0,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,15,16,2,20,Yes,Good,Under-7,M
1,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,20,20,3,25,Yes,Good,Under-7,M
2,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,10,7,0,30,No,Bad,Above-7,L
3,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,30,25,5,35,No,Bad,Above-7,L
4,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,40,50,12,50,No,Bad,Above-7,M


#### Verificando as distribuições de classes.

In [5]:
df_edu['Class'].value_counts()

M    211
H    142
L    127
Name: Class, dtype: int64

#### Verificando os registros nulos

In [6]:
df_edu.isnull().sum()

gender                      0
NationalITy                 0
PlaceofBirth                0
StageID                     0
GradeID                     0
SectionID                   0
Topic                       0
Semester                    0
Relation                    0
raisedhands                 0
VisITedResources            0
AnnouncementsView           0
Discussion                  0
ParentAnsweringSurvey       0
ParentschoolSatisfaction    0
StudentAbsenceDays          0
Class                       0
dtype: int64

#### Codificando os atributos numéricos.

In [3]:
Features = df_edu
Cat_Colums = Features.dtypes.pipe(lambda Features: Features[Features=='object']).index
for col in Cat_Colums:
    label = LabelEncoder()
    Features[col] = label.fit_transform(Features[col])

In [4]:
Features.head()

Unnamed: 0,gender,NationalITy,PlaceofBirth,StageID,GradeID,SectionID,Topic,Semester,Relation,raisedhands,VisITedResources,AnnouncementsView,Discussion,ParentAnsweringSurvey,ParentschoolSatisfaction,StudentAbsenceDays,Class
0,1,4,4,2,1,0,7,0,0,15,16,2,20,1,1,1,2
1,1,4,4,2,1,0,7,0,0,20,20,3,25,1,1,1,2
2,1,4,4,2,1,0,7,0,0,10,7,0,30,0,0,0,1
3,1,4,4,2,1,0,7,0,0,30,25,5,35,0,0,0,1
4,1,4,4,2,1,0,7,0,0,40,50,12,50,0,0,0,2


#### Separando os dados e classes

In [5]:
dataset = df_edu.drop('Class',axis=1)

In [6]:
classes = df_edu['Class']

# Random Forest vs Árvore de Decisão

#### Resultados Random Forest

In [7]:
random_clf = RandomForestClassifier(random_state=1,n_estimators=100)#n_estimator:número de arvores na floresta aleatória.

In [8]:
resultados_random = cross_val_predict(random_clf, dataset, classes, cv=5)

In [9]:
print(classification_report(classes,resultados_random))

              precision    recall  f1-score   support

           0       0.65      0.64      0.65       142
           1       0.77      0.78      0.77       127
           2       0.63      0.63      0.63       211

    accuracy                           0.67       480
   macro avg       0.68      0.68      0.68       480
weighted avg       0.67      0.67      0.67       480



#### Resultados Decision Tree

In [10]:
tree_clf = DecisionTreeClassifier(random_state=1)

In [11]:
resultados_tree = cross_val_predict(tree_clf,dataset,classes,cv=5)

In [12]:
print(classification_report(classes,resultados_tree))

              precision    recall  f1-score   support

           0       0.50      0.61      0.55       142
           1       0.74      0.68      0.70       127
           2       0.54      0.49      0.52       211

    accuracy                           0.57       480
   macro avg       0.59      0.59      0.59       480
weighted avg       0.58      0.57      0.58       480



#### Verificando Overfitting

In [13]:
X_train, X_test, y_train, y_test = train_test_split(df_edu.drop('Class',axis=1),df_edu['Class'],test_size=0.3,random_state=1)

In [14]:
def compara_modelos_random_forest(maxdepth):
    if maxdepth == 0:
        rf = RandomForestClassifier(n_estimators=100,random_state=1)
    else: 
        rf = RandomForestClassifier(n_estimators=100,random_state=1, max_depth=maxdepth)
    rf.fit(X_train, y_train)
    train_score = rf.score(X_train, y_train)
    test_score = rf.score(X_test, y_test)
    return train_score,test_score

In [15]:
print('{:10} {:20} {:20}'.format('depth', 'Training score','Testing score'))
print('{:10} {:20} {:20}'.format('-----', '--------------','-------------'))
print('{:1}         {} '.format(2,str(compara_modelos_random_forest(2))))
print('{:1}         {} '.format(3,str(compara_modelos_random_forest(3))))
print('{:1}         {} '.format(4,str(compara_modelos_random_forest(4))))
print('{:1}         {} '.format(10,str(compara_modelos_random_forest(10))))
print('{:1}         {} '.format(15,str(compara_modelos_random_forest(15))))
print('{:1}         {} '.format('Full',str(compara_modelos_random_forest(0))))

depth      Training score       Testing score       
-----      --------------       -------------       
2         (0.75, 0.6180555555555556) 
3         (0.8244047619047619, 0.6805555555555556) 
4         (0.8720238095238095, 0.7152777777777778) 
10         (1.0, 0.7569444444444444) 
15         (1.0, 0.7986111111111112) 
Full         (1.0, 0.7986111111111112) 


In [16]:
def compara_modelos_decision_tree(maxdepth):
    if maxdepth == 0:
        df = DecisionTreeClassifier(random_state=1)
    else: 
        df = DecisionTreeClassifier(random_state=1, max_depth=maxdepth)
    df.fit(X_train, y_train)
    train_score = df.score(X_train, y_train)
    test_score = df.score(X_test, y_test)
    return train_score,test_score

In [17]:
print('{:10} {:20} {:20}'.format('depth', 'Training score','Testing score'))
print('{:10} {:20} {:20}'.format('-----', '--------------','-------------'))
print('{:1}         {} '.format(2,str(compara_modelos_decision_tree(2))))
print('{:1}         {} '.format(3,str(compara_modelos_decision_tree(3))))
print('{:1}         {} '.format(4,str(compara_modelos_decision_tree(4))))
print('{:1}         {} '.format(10,str(compara_modelos_decision_tree(10))))
print('{:1}         {} '.format(15,str(compara_modelos_decision_tree(15))))
print('{:1}         {} '.format('Full',str(compara_modelos_decision_tree(0))))

depth      Training score       Testing score       
-----      --------------       -------------       
2         (0.6398809523809523, 0.6805555555555556) 
3         (0.7321428571428571, 0.7013888888888888) 
4         (0.7916666666666666, 0.7430555555555556) 
10         (0.9910714285714286, 0.6875) 
15         (1.0, 0.6944444444444444) 
Full         (1.0, 0.6944444444444444) 


# Tunning do Modelo para Garantir o Melhor Desempenho

#### Como encontrar os melhores valores para os parametros do modelo?

RandomForestClassifier(
n_estimators=?,
criterion='gini' ou 'entropy',
max_depth=?,
min_samples_split=?,
min_samples_leaf=?
) ...

#### GridSearchCV para testes de Hyperparametros

In [18]:
from sklearn.model_selection import GridSearchCV

#### Lista de possíveis valores de estimators ou quantidade de árvores da floresta.

In [19]:
valores_estimators = [10, 20, 50, 100, 150]

#### Lista de possíveis valores para o critério de divisão.

In [20]:
valores_criterion = ['gini','entropy']

#### Lista de possíveis valores para a profundidade máxima de cada árvore

In [21]:
valores_max_depth = [10, 20, 50, 100]

#### Lista de possíveis valores para os parametros min_samples_split e min_samples_leaf.

In [22]:
valores_min_samples_split = [2, 5, 10,15]
valores_min_samples_leaf = [1, 5, 10,15]

#### Define um dicionário que recebe as listas de parâmetros e valores.

In [23]:
parametros_grid = dict(n_estimators=valores_estimators,
                       criterion=valores_criterion,
                       max_depth=valores_max_depth,
                       min_samples_split=valores_min_samples_split,
                       min_samples_leaf=valores_min_samples_leaf 
                      )

#### Dicionário com os parametros que serão utilizados no grid.

In [24]:
parametros_grid

{'n_estimators': [10, 20, 50, 100, 150],
 'criterion': ['gini', 'entropy'],
 'max_depth': [10, 20, 50, 100],
 'min_samples_split': [2, 5, 10, 15],
 'min_samples_leaf': [1, 5, 10, 15]}

#### Instancia o GridSearch com o modelo a ser utilizado, parametros, número de folds e scoring.

In [25]:
rf = RandomForestClassifier()

In [26]:
grid = GridSearchCV(rf, parametros_grid, cv=5, scoring='accuracy')

#### Aplica o GridSearch passando as features e classes

In [27]:
grid.fit(df_edu.drop('Class',axis=1),df_edu['Class'])

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

#### Imprime os scores por combinações.

In [29]:
grid.cv_results_

{'mean_fit_time': array([0.03081055, 0.04817934, 0.11605587, 0.24731998, 0.33526058,
        0.02440133, 0.04496684, 0.11102719, 0.22304645, 0.3421803 ,
        0.02699885, 0.05681882, 0.11555057, 0.21620426, 0.37485228,
        0.03378143, 0.05844297, 0.1398211 , 0.23151264, 0.31903677,
        0.02379088, 0.0454092 , 0.10860653, 0.21161399, 0.34412022,
        0.02499018, 0.0460258 , 0.11194086, 0.21722608, 0.3187973 ,
        0.02622232, 0.04766197, 0.1163703 , 0.22876134, 0.32093883,
        0.02440109, 0.04617567, 0.10841804, 0.21478133, 0.33264995,
        0.02419229, 0.04393344, 0.10633588, 0.20697393, 0.31633329,
        0.02140741, 0.04577384, 0.10621305, 0.21283894, 0.3294806 ,
        0.02408609, 0.04397068, 0.10214338, 0.28807988, 0.41892323,
        0.02938318, 0.05168538, 0.12896266, 0.22235084, 0.32887716,
        0.02278762, 0.05559611, 0.12822738, 0.24736819, 0.31161013,
        0.02439127, 0.04417415, 0.10514064, 0.20375419, 0.39853144,
        0.02203979, 0.03497629,

#### Verificando os melhores parâmetros.

In [30]:
grid.best_params_

{'criterion': 'gini',
 'max_depth': 20,
 'min_samples_leaf': 15,
 'min_samples_split': 15,
 'n_estimators': 10}

#### Verificando o melhor score.

In [31]:
grid.best_score_

0.7416666666666666