In [1]:
import numpy as np
import pandas as pd
%matplotlib inline

import random
from pprint import pprint

from decision_tree_functions import decision_tree_algorithm, decision_tree_predictions
from helper_functions import train_test_split, calculate_accuracy

# Load and Prepare Data

#### Format of the data
- the last column of the data frame must contain the label and it must also be called "label"
- there should be no missing values in the data frame

In [44]:
df = pd.read_csv("D:\Code\DataScience\data\Iris.csv")
df

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...,...
145,146,6.7,3.0,5.2,2.3,Iris-virginica
146,147,6.3,2.5,5.0,1.9,Iris-virginica
147,148,6.5,3.0,5.2,2.0,Iris-virginica
148,149,6.2,3.4,5.4,2.3,Iris-virginica


In [45]:

df["label"] = df.Species
df = df.drop(columns=["Species", "Id"])

column_names = []
for column in df.columns:
    name = column.replace("_", "")
    column_names.append(name)
df.columns = column_names

df.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,label
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [46]:
def transform_label(value):
    if value == 'Iris-setosa':
        return 0
    elif value == 'Iris-versicolor':
        return 1
    else:
        return 2

df["label"] = df.label.apply(transform_label)

In [47]:
random.seed(50)
train_df, test_df = train_test_split(df, test_size=0.2)

# Random Forest 

### Mô tả thuật toán `random forest ` cho phân loại 
1. từ tập dữ liệu huấn luyện D, ta tạo dữ liệu ngẫu nhiên (mẫu boostrap)
2. Sử dụng các tập con dữ liệu lấy mẫu ngẫu nhiên D1,D2,.., Dk xây dựng nên các cây T1, T2,...,Tk
3. Kết hợp các cây : Sử dụng chiến lược bình chọn theo số đông với bài toán phân loại hoặc lấy giá trị trung bình các giá trị dự đoán từ các cây với bài toán hồi quy.

![](https://i.imgur.com/XlfZoCw.png)

In [48]:
def bootstrapping(train_df, n_bootstrap):
    bootstrap_indices = np.random.randint(low=0, high=len(train_df), size=n_bootstrap)
    df_bootstrapped = train_df.iloc[bootstrap_indices]
    
    return df_bootstrapped

def random_forest_algorithm(train_df, n_trees, n_bootstrap, n_features, dt_max_depth):
    forest = []
    for i in range(n_trees):
        df_bootstrapped = bootstrapping(train_df, n_bootstrap)
        tree = decision_tree_algorithm(df_bootstrapped, max_depth=dt_max_depth, random_subspace=n_features)
        forest.append(tree)
    
    return forest

def random_forest_predictions(test_df, forest):
    df_predictions = {}
    for i in range(len(forest)):
        column_name = "tree_{}".format(i)
        predictions = decision_tree_predictions(test_df, tree=forest[i])
        df_predictions[column_name] = predictions

    df_predictions = pd.DataFrame(df_predictions)
    random_forest_predictions = df_predictions.mode(axis=1)[0]
    
    return random_forest_predictions

In [49]:
train_df

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,label
1,4.9,3.0,1.4,0.2,0
64,5.6,2.9,3.6,1.3,1
113,5.7,2.5,5.0,2.0,2
149,5.9,3.0,5.1,1.8,2
43,5.0,3.5,1.6,0.6,0
...,...,...,...,...,...
58,6.6,2.9,4.6,1.3,1
61,5.9,3.0,4.2,1.5,1
111,6.4,2.7,5.3,1.9,2
63,6.1,2.9,4.7,1.4,1


In [50]:
test_df

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,label
49,5.0,3.3,1.4,0.2,0
29,4.7,3.2,1.6,0.2,0
99,5.7,2.8,4.1,1.3,1
34,4.9,3.1,1.5,0.1,0
91,6.1,3.0,4.6,1.4,1
45,4.8,3.0,1.4,0.3,0
116,6.5,3.0,5.5,1.8,2
33,5.5,4.2,1.4,0.2,0
0,5.1,3.5,1.4,0.2,0
144,6.7,3.3,5.7,2.5,2


In [51]:
forest = random_forest_algorithm(train_df, n_trees=4, n_bootstrap=800, n_features=2, dt_max_depth=4)
predictions = random_forest_predictions(test_df, forest)
accuracy = calculate_accuracy(predictions, test_df.label)

print("Accuracy = {}".format(accuracy))

Accuracy = 0.9


# Random Forest with Sklearn

In [52]:
from sklearn.model_selection import train_test_split
X = df.iloc[:, : -1]
Y = df.iloc[: , -1]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)

In [53]:
X_train

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
130,7.4,2.8,6.1,1.9
114,5.8,2.8,5.1,2.4
1,4.9,3.0,1.4,0.2
28,5.2,3.4,1.4,0.2
136,6.3,3.4,5.6,2.4
...,...,...,...,...
107,7.3,2.9,6.3,1.8
89,5.5,2.5,4.0,1.3
18,5.7,3.8,1.7,0.3
7,5.0,3.4,1.5,0.2


In [54]:
Y_train

130    2
114    2
1      0
28     0
136    2
      ..
107    2
89     1
18     0
7      0
66     1
Name: label, Length: 105, dtype: int64

In [55]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()

In [56]:
clf.fit(X_train, Y_train)
y_pred = clf.predict(X_test)
y_pred

array([2, 0, 0, 2, 0, 1, 0, 1, 1, 1, 0, 2, 1, 1, 1, 1, 1, 2, 0, 2, 1, 0,
       1, 1, 0, 1, 0, 2, 0, 1, 2, 0, 2, 0, 2, 1, 2, 0, 1, 0, 1, 1, 2, 0,
       0], dtype=int64)

In [57]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(confusion_matrix(Y_test, y_pred))
print(classification_report(Y_test, y_pred))
print(accuracy_score(Y_test, y_pred))

[[16  0  0]
 [ 0 16  0]
 [ 0  2 11]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       0.89      1.00      0.94        16
           2       1.00      0.85      0.92        13

    accuracy                           0.96        45
   macro avg       0.96      0.95      0.95        45
weighted avg       0.96      0.96      0.96        45

0.9555555555555556


### Random Forest with Pramaters

Có 2 tham số trong Random Forest

* n_estimators : số lượng cây trong Random Forest 
* criterion {“gini”, “entropy”, “log_loss”}, default=”gini” : 

In [58]:
clf = RandomForestClassifier(n_estimators=456)
clf.fit(X_train, Y_train)
y_pred = clf.predict(X_test)
y_pred

array([2, 0, 0, 2, 0, 1, 0, 1, 1, 1, 0, 2, 1, 1, 1, 1, 1, 2, 0, 2, 1, 0,
       1, 1, 0, 1, 0, 2, 0, 1, 2, 0, 2, 0, 2, 1, 1, 0, 1, 0, 1, 1, 2, 0,
       0], dtype=int64)

In [59]:
print(confusion_matrix(Y_test, y_pred))
print(classification_report(Y_test, y_pred))
print(accuracy_score(Y_test, y_pred))

[[16  0  0]
 [ 0 16  0]
 [ 0  3 10]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       0.84      1.00      0.91        16
           2       1.00      0.77      0.87        13

    accuracy                           0.93        45
   macro avg       0.95      0.92      0.93        45
weighted avg       0.94      0.93      0.93        45

0.9333333333333333


In [60]:
"""criterion"""
clf = RandomForestClassifier(criterion='entropy')
clf.fit(X_train, Y_train)
y_pred = clf.predict(X_test)
y_pred

array([2, 0, 0, 2, 0, 1, 0, 1, 1, 1, 0, 2, 1, 1, 1, 1, 1, 2, 0, 2, 1, 0,
       1, 1, 0, 1, 0, 2, 0, 1, 2, 0, 2, 0, 2, 1, 2, 0, 1, 0, 1, 1, 2, 0,
       0], dtype=int64)

In [61]:
print(confusion_matrix(Y_test, y_pred))
print(classification_report(Y_test, y_pred))
print(accuracy_score(Y_test, y_pred))

[[16  0  0]
 [ 0 16  0]
 [ 0  2 11]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       0.89      1.00      0.94        16
           2       1.00      0.85      0.92        13

    accuracy                           0.96        45
   macro avg       0.96      0.95      0.95        45
weighted avg       0.96      0.96      0.96        45

0.9555555555555556


In [62]:
"""criterion"""
clf = RandomForestClassifier(criterion='log_loss')
clf.fit(X_train, Y_train)
y_pred = clf.predict(X_test)
y_pred

array([2, 0, 0, 2, 0, 1, 0, 1, 1, 1, 0, 2, 1, 1, 1, 1, 1, 2, 0, 2, 1, 0,
       1, 1, 0, 1, 0, 2, 0, 1, 2, 0, 2, 0, 2, 1, 2, 0, 1, 0, 1, 1, 2, 0,
       0], dtype=int64)

In [63]:
print(confusion_matrix(Y_test, y_pred))
print(classification_report(Y_test, y_pred))
print(accuracy_score(Y_test, y_pred))

[[16  0  0]
 [ 0 16  0]
 [ 0  2 11]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       0.89      1.00      0.94        16
           2       1.00      0.85      0.92        13

    accuracy                           0.96        45
   macro avg       0.96      0.95      0.95        45
weighted avg       0.96      0.96      0.96        45

0.9555555555555556
