# Module 3 Assignment, Part 3

## Section 0: Setup & initialization

In [None]:
# importing required libraries
from sklearn.tree import DecisionTreeClassifier
from random import randrange
from sklearn.metrics import confusion_matrix
from sklearn.datasets import load_iris
import pandas as pd

## Section 1: Helper functions

The subsample_data() function performs sampling with replacement to create a subsample from the dataset. Each subsample will be used to train one decision tree in your Random Forest. 


In [None]:
def subsample_data(dataset, num_samples):
    """Draw a bootstrap sample of rows from a DataFrame.

    Row positions are picked uniformly at random *with replacement*, so the
    same row may appear several times in the result.

    Args:
        dataset: DataFrame to sample from.
        num_samples: number of rows to draw.

    Returns:
        DataFrame holding the drawn rows (all columns), in draw order.
    """
    picks = [randrange(len(dataset)) for _ in range(num_samples)]
    return dataset.iloc[picks, :]

The subsample_features() function selects a random subsample of the features in the dataset. This subsample of features will be used when building one of the Decision Trees in your Random Forest.

In [None]:
def subsample_features(dataset, ratio):
    """Pick a random subset of columns, without replacement.

    Args:
        dataset: DataFrame whose columns are the candidate features.
        ratio: fraction of the columns to keep; the count is
            round(number_of_columns * ratio).

    Returns:
        A tuple (positions, frame): the chosen column positions in draw
        order, and the DataFrame restricted to those columns.
    """
    n_columns = len(dataset.columns)
    target = round(n_columns * ratio)

    remaining = list(range(n_columns))
    chosen = []
    while len(chosen) < target:
        # popping the drawn position guarantees no column is chosen twice
        pick = randrange(len(remaining))
        chosen.append(remaining.pop(pick))

    return chosen, dataset.iloc[:, chosen]

Train-test split function written from scratch.

In [None]:
# Split dataset into train and test
def split_train_test(dataset, test_size):
    """Randomly partition the rows of a DataFrame into train and test sets.

    Args:
        dataset: DataFrame to split.
        test_size: fraction of the rows to put in the test set; the count is
            round(len(dataset) * test_size).

    Returns:
        A tuple (train_data, test_data). Test rows appear in the order they
        were drawn; train rows keep their original relative order.
    """
    total = len(dataset)
    held_out_count = round(total * test_size)

    remaining = list(range(total))
    held_out = []
    for _ in range(held_out_count):
        # move one randomly chosen position from the pool to the test set
        pick = randrange(len(remaining))
        held_out.append(remaining.pop(pick))

    # whatever was never drawn forms the training set
    return dataset.iloc[remaining, :], dataset.iloc[held_out, :]

## Section 2: Random Forest implementation

### 1\. Complete the code for training your Random Forest algorithm.

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Random Forest Algorithm
def random_forest_train(training_data, num_trees, max_tree_depth, sample_size_fr, n_features_ratio):
    """Train a forest of decision trees on bootstrap samples and feature subsets.

    Args:
        training_data: DataFrame whose last column holds the class label and
            whose other columns are the features.
        num_trees: number of trees to fit.
        max_tree_depth: passed to DecisionTreeClassifier as max_depth.
        sample_size_fr: size of each bootstrap sample, as a fraction of
            len(training_data).
        n_features_ratio: fraction of the feature columns given to each tree.

    Returns:
        A list with one [tree, cols] pair per tree, where cols are the
        feature column positions the tree was fitted on (needed again at
        prediction time to select the same columns).
    """
    n_rows = round(len(training_data) * sample_size_fr)
    trees = []

    for _ in range(num_trees):
        # bootstrap sample (rows drawn with replacement) for this tree
        data_sample = subsample_data(training_data, n_rows)

        # choose the feature subset from every column except the last one,
        # which holds the label
        cols, sample_features = subsample_features(data_sample.iloc[:, :-1], n_features_ratio)

        # Take the labels straight from the sampled rows so they are always
        # aligned with sample_features. Looking them up in training_data by
        # position using the sample's index labels is only correct when
        # training_data has a default 0..n-1 index.
        sample_labels = data_sample.iloc[:, -1]

        tree = DecisionTreeClassifier(max_depth=max_tree_depth)
        tree.fit(sample_features, sample_labels)

        # keep the columns alongside the tree
        trees.append([tree, cols])
    return trees

### 2\. Complete the code for the predict function for your Random Forest algorithm.

Return values:
  * predictions contains the final prediction for each test data point (row)
  * all_tree_preds are the individual predictions of all the trees for each test data point
  

In [None]:
import numpy as np

In [None]:
# `trees` is the list of [tree, cols] pairs returned by random_forest_train
def random_forest_predict(test_data, trees):
    """Predict a class for every row by majority vote over the trees.

    Args:
        test_data: DataFrame of feature columns only (no label column), laid
            out like the feature columns used for training.
        trees: list of [tree, cols] pairs; each tree is asked to predict from
            the columns at positions cols.

    Returns:
        A tuple (final_predictions, all_tree_preds):
          * final_predictions: one label per row, the most frequent vote;
            ties go to the smallest label.
          * all_tree_preds: one list per row holding every tree's vote, in
            the order of trees.

    Raises:
        ValueError: if test_data has rows but trees is empty.
    """
    if len(test_data) == 0:
        return [], []
    if not trees:
        raise ValueError("random_forest_predict needs at least one tree")

    # one predict() call per tree over all rows, instead of one per row
    per_tree = [tree.predict(test_data.iloc[:, cols]) for tree, cols in trees]

    # transpose from "per tree" to "per row"
    all_tree_preds = [list(votes) for votes in zip(*per_tree)]

    final_predictions = []
    for votes in all_tree_preds:
        # np.unique returns the labels sorted, so argmax (first maximum)
        # resolves ties towards the smallest label; unlike np.bincount this
        # also works for labels that are not non-negative integers
        labels, counts = np.unique(votes, return_counts=True)
        final_predictions.append(labels[np.argmax(counts)])

    return final_predictions, all_tree_preds

## Section 3: Experiment with your newly created Random Forest model

In [None]:
# load iris into a DataFrame: the feature columns first, the class label
# appended as the last column, "target"
dataset = load_iris()
df = pd.DataFrame(dataset.data).assign(target=dataset.target)
df.head()

Unnamed: 0,0,1,2,3,target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [None]:
# split the dataset: 30% of the rows, chosen at random, go to test and the rest to train
train, test = split_train_test(df, 0.3)

In [None]:
# replace the row labels carried over from df with a fresh 0..n-1 index, so that
# label-based and position-based lookups agree (drop=True discards the old labels
# instead of keeping them as a column)
train = train.reset_index(drop = True)
test = test.reset_index(drop = True)

In [None]:
# fit a forest of a single tree (depth at most 3) on a bootstrap sample 80% the
# size of train, using 75% of the feature columns
trees = random_forest_train(training_data = train, num_trees = 1, max_tree_depth = 3, sample_size_fr = 0.8, n_features_ratio = 0.75)

In [None]:
# slicing with 1:2 (instead of the scalar 1) keeps the result a one-row
# DataFrame, i.e. 2-dimensional, rather than collapsing it to a 1-D Series
test.iloc[1:2, [0,1,2]]

Unnamed: 0,0,1,2
1,5.6,3.0,4.1


In [None]:
# show the forest: each entry is [fitted tree, positions of the feature columns it was trained on]
trees

[[DecisionTreeClassifier(max_depth=3), [2, 3, 0]]]

In [None]:
# sanity check: have every tree classify the test row at position 3
for tree, cols in trees:
  # reshape(1, -1) gives a single-row 2-D array whatever the number of selected
  # columns; a hard-coded width of 3 only fits a forest trained on 3 features
  print( tree.predict(test.iloc[3, cols].values.reshape(1, -1)) )

[2]


In [None]:
# predict on the feature columns only (the last column, target, is sliced off)
predictions, all_preds = random_forest_predict(test.iloc[:,:-1], trees)

In [None]:
# per-row votes: one inner list per test row, with one entry per tree in the forest
all_preds

[[0],
 [1],
 [1],
 [2],
 [2],
 [2],
 [1],
 [1],
 [2],
 [2],
 [2],
 [0],
 [1],
 [1],
 [2],
 [0],
 [1],
 [2],
 [2],
 [0],
 [1],
 [2],
 [0],
 [1],
 [1],
 [2],
 [2],
 [2],
 [1],
 [1],
 [0],
 [2],
 [2],
 [2],
 [0],
 [0],
 [1],
 [0],
 [0],
 [2],
 [0],
 [0],
 [0],
 [0],
 [0]]

In [None]:
# final prediction for each test row (the most frequent vote among the trees)
predictions

[0,
 1,
 1,
 2,
 2,
 2,
 1,
 1,
 2,
 2,
 2,
 0,
 1,
 1,
 2,
 0,
 1,
 2,
 2,
 0,
 1,
 2,
 0,
 1,
 1,
 2,
 2,
 2,
 1,
 1,
 0,
 2,
 2,
 2,
 0,
 0,
 1,
 0,
 0,
 2,
 0,
 0,
 0,
 0,
 0]

In [None]:
# compare the true labels (last column of test) with the forest's predictions;
# by scikit-learn's convention rows are true classes and columns are predicted classes
confusion_matrix(test.iloc[:,-1], predictions)

array([[15,  0,  0],
       [ 0, 11,  1],
       [ 0,  2, 16]])