## Supplement 6: Decision Trees and Random Forest

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from scipy.stats import mode
from sklearn.tree import DecisionTreeClassifier
from scipy import stats



### 6.3 Programming Task: Song popularity prediction using Random Forest
The goal of this task is to train a random forest model that predicts song popularity using the datasets already provided in task 4.3.
 

In [2]:
# Read data

# Single source of truth for the columns used as model inputs and as the label,
# so the train and test matrices cannot drift apart.
FEATURE_COLUMNS = ['danceability', 'key', 'loudness', 'acousticness',
                   'instrumentalness', 'liveness', 'valence', 'tempo']
TARGET_COLUMN = 'popular'

train_data = pd.read_csv("train-songs.csv")
print(train_data.head())
train_X = train_data[FEATURE_COLUMNS].values
train_y = train_data[TARGET_COLUMN].values

test_data = pd.read_csv("test-songs.csv")
print(test_data.head())
test_X = test_data[FEATURE_COLUMNS].values
test_y = test_data[TARGET_COLUMN].values

   danceability  key  loudness  acousticness  instrumentalness  liveness  \
0         0.391    8    -9.532         0.478          0.000006    0.1160   
1         0.628    1   -13.834         0.156          0.010400    0.0836   
2         0.613    3   -22.789         0.864          0.000000    0.2690   
3         0.504    2    -5.931         0.414          0.000000    0.0845   
4         0.698    9    -3.840         0.101          0.000000    0.1070   

   valence    tempo  popular  
0    0.138  105.593      0.0  
1    0.761  102.974      0.0  
2    0.371   75.104      0.0  
3    0.163  135.927      1.0  
4    0.931  124.042      1.0  
   danceability  key  loudness  acousticness  instrumentalness  liveness  \
0         0.652    9    -7.319        0.7250          0.000002     0.189   
1         0.500   11    -7.996        0.0024          0.000000     0.133   
2         0.422   10    -7.215        0.1090          0.000000     0.722   
3         0.708    5    -5.426        0.0136         

   i\. Implement a function that draws a bootstrap sample of size N from the train dataset, where N can be specified by the user.




In [3]:
def generate_bootstrap(train_X,train_y,N):
    """Draw a bootstrap sample of ``N`` rows, sampled with replacement.

    Row indices are drawn uniformly from ``range(len(train_X))`` using NumPy's
    global random state, so ``np.random.seed`` makes the draw reproducible.
    The same indices are applied to features and labels, which keeps every
    sampled row paired with its own label.

    Args:
        train_X: Feature data whose first axis indexes samples. Anything
            ``np.asarray`` can convert is accepted (ndarray, DataFrame, list).
        train_y: Labels, with the same number of entries as ``train_X`` has rows.
        N: Number of rows to draw; may be larger than the dataset.

    Returns:
        Tuple ``(sample_X, sample_y)`` of NumPy arrays with ``N`` entries each.

    Raises:
        ValueError: If ``train_X`` and ``train_y`` differ in length.
    """
    train_X = np.asarray(train_X)
    train_y = np.asarray(train_y)

    # Only the sample axis matters here, so do not insist on a 2-D matrix.
    n_rows = train_X.shape[0]
    if train_y.shape[0] != n_rows:
        raise ValueError(
            "train_X and train_y must contain the same number of samples "
            "(got %d and %d)" % (n_rows, train_y.shape[0])
        )

    samples = np.random.choice(n_rows, size=N, replace=True)
    return train_X[samples], train_y[samples]





   ii\. Complete the implementation of the random forest algorithm. For this task you may use the DecisionTreeClassifier from the scikit-learn library. The other parts of the random forest algorithm must be implemented using only Scipy/Numpy.

In [4]:
class RandomForest:
    """Bagged ensemble of scikit-learn decision trees combined by majority vote.

    Every tree is fitted on its own bootstrap sample (drawn with
    ``generate_bootstrap``), and the forest predicts the class that most of
    its trees voted for.
    """

    def __init__(self, n_trees, max_features, max_samples, min_node_size, max_depth):
        """Store the hyper-parameters; no tree is built until ``train`` runs.

        Args:
            n_trees: Number of decision trees to fit.
            max_features: Passed to ``DecisionTreeClassifier(max_features=...)``.
            max_samples: Size of the bootstrap sample drawn for each tree.
            min_node_size: Minimum number of samples allowed in a leaf; passed
                to ``DecisionTreeClassifier(min_samples_leaf=...)``.
            max_depth: Passed to ``DecisionTreeClassifier(max_depth=...)``.
        """
        self.num_trees = n_trees
        self.max_depth = max_depth
        self.max_features = max_features
        self.max_samples = max_samples
        # Stored so that train() can hand it to every tree.
        self.min_node_size = min_node_size
        # Individually trained decision trees (the weak classifiers).
        self.decision_trees = []

    def train(self, train_X, train_y):
        """Fit ``n_trees`` decision trees, each on a fresh bootstrap sample.

        Trees from any earlier call are discarded first, so calling ``train``
        again refits the forest from scratch.

        Args:
            train_X: Training features as an ndarray or a pandas DataFrame.
            train_y: Training labels as an ndarray or a pandas Series.
        """
        # Work on plain arrays so bootstrap indexing is positional.
        if isinstance(train_X, pd.DataFrame):
            train_X = train_X.values
        if isinstance(train_y, pd.Series):
            train_y = train_y.values

        self.decision_trees = []
        while len(self.decision_trees) < self.num_trees:
            clf = DecisionTreeClassifier(
                max_depth=self.max_depth,
                max_features=self.max_features,
                min_samples_leaf=self.min_node_size,
            )
            # Each tree sees its own resampled view of the training data.
            sample_X, sample_y = generate_bootstrap(train_X, train_y, self.max_samples)
            clf.fit(sample_X, sample_y)
            self.decision_trees.append(clf)

    def predict(self, test_X):
        """Predict by majority vote over all trained trees.

        Args:
            test_X: Features to classify, as an ndarray or a pandas DataFrame.

        Returns:
            1-D array holding, for every sample, the most frequent class among
            the per-tree predictions (ties are resolved by ``scipy.stats.mode``).

        Raises:
            ValueError: If the forest holds no trees, i.e. ``train`` has not
                been called or was run with ``n_trees <= 0``.
        """
        if not self.decision_trees:
            raise ValueError(
                "RandomForest.predict() called before train(): the forest has no trees."
            )
        # train() fits on plain arrays, so predict on plain arrays as well.
        if isinstance(test_X, pd.DataFrame):
            test_X = test_X.values

        # votes[i, j] is the class that tree j predicts for sample i.
        votes = np.stack(
            [member.predict(test_X) for member in self.decision_trees], axis=1
        )
        predicted_classes = stats.mode(votes, axis=1, keepdims=True)[0].reshape(-1)
        return predicted_classes






   




iii\. Train the model for the dataset from train-songs.csv using the parameters given below.

| Parameter | Value |
|-----------|-------|
| Number of trees | 100 |
| Maximum features per tree | 2 |
| Bootstrap sample size | 20000 |
| Minimum node size | 1 |
| Maximum tree depth | 10 |


Note: The bootstrap sample size is the same as the training dataset size in this task.


In [5]:
# Note: Run this cell without any changes. The model will train if the implementation of subtask (ii) is correct.

# Build the forest with the hyper-parameter values listed in the table of subtask (iii).
random_forest_model = RandomForest(n_trees=100, max_samples=20000,max_depth=10,min_node_size=1, max_features=2 )

# Fit on the train_X / train_y arrays extracted from train-songs.csv in the "Read data" cell.
random_forest_model.train(train_X, train_y)




   iv\. Calculate the accuracy of the model using the test dataset and compare your results with the
RandomForestClassifier from the scikit-learn library using the following parameters.

In [6]:
# Run predict for the test data and calculate the accuracy

pred_class = random_forest_model.predict(test_X)

# Number of test samples
rows = test_y.shape[0]

# The predictions come from test_X, so this is the test accuracy (in percent).
print("Test Accuracy of the Model:", (np.count_nonzero(pred_class == test_y) * 100 / rows), "%")



Training Accuracy of the Model: 80.65 %


In [7]:
# Train and predict using the scikit-learn library
from sklearn import metrics

# Same n_estimators / max_samples / max_depth / max_features values as passed to random_forest_model
clf = RandomForestClassifier(n_estimators=100, max_samples=20000, max_depth=10, max_features=2)

# Train the model on the training set, then predict the test set
clf.fit(train_X, train_y)
y_pred = clf.predict(test_X)

# Model accuracy on the test set, first as a fraction and then as a percentage.
# The sample count is taken from test_y directly so this cell does not rely on
# a variable left behind by another cell.
print("Accuracy using scikit-learn:", metrics.accuracy_score(test_y, y_pred))
print("Test Accuracy using scikit-learn:", (np.count_nonzero(y_pred == test_y) * 100 / len(test_y)), "%")


Accuracy using scikit-learn: 0.805
Training Accuracy using scikit-learn: 80.5 %
