In [None]:
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline

from preprocess import get_preprocessed_data

import numpy as np
import pandas as pd

In [None]:
# Load the preprocessed dataset and record its dimensions.
# NOTE(review): get_preprocessed_data() is defined in the project-local
# `preprocess` module; it is assumed to return a pandas DataFrame because
# later cells call `.assign` and `.columns` on the result -- confirm.
df = get_preprocessed_data()
num_rows, num_cols = df.shape

In [None]:
# Discretize the 'Yards' column into integer bin labels.
# np.linspace(-99, 100, 10) produces 10 evenly spaced edges, i.e. 9 intervals
# of width 199/9 (~22.1). np.digitize (right=False by default) then returns
# 0 for values below -99, 1..9 for values in [-99, 100), and 10 for values
# >= 100.
# NOTE(review): 'Yards' is overwritten on `df`, so re-running this cell
# without reloading `df` would bin the already-binned labels.
# NOTE(review): the interval around zero spans roughly [-10.6, 11.6);
# presumably most plays land in that single bin -- verify the class balance.
bins = np.linspace(start=-99, stop=100, num=10)
indices = np.digitize(df['Yards'], bins=bins)
df = df.assign(Yards=indices)

In [None]:
# Build the list of feature columns: every column of `df` except the
# 'Yards' target (only the first occurrence of 'Yards' is excluded).
# NOTE(review): no train/test split is performed in this cell; `split` is
# defined but not read anywhere in the visible notebook, and the model cell
# relies on cross-validation instead.
split = 0.80
all_columns = df.columns.tolist()
target_position = all_columns.index('Yards')  # raises ValueError if 'Yards' is missing
input_column_list = all_columns[:target_position] + all_columns[target_position + 1:]

In [None]:
# Sweep the forest size over powers of two (1, 2, 4, ..., 128 trees).
# tree_count_scores[i] holds the mean cross-validation score for 2**i trees.
tree_count_scores = []
for exponent in range(8):
    n_trees = 2 ** exponent

    # Pipeline: standardize the features, reduce them with PCA (a float
    # n_components of 0.5 keeps enough components to explain 50% of the
    # variance), then classify with a random forest.
    forest = RandomForestClassifier(
        n_estimators=n_trees,
        random_state=0,  # fixed seed so repeated runs give the same forest
        max_leaf_nodes=10000,
        min_impurity_decrease=0.001,
    )
    pipeline = make_pipeline(preprocessing.StandardScaler(), PCA(0.5), forest)

    # 7-fold cross-validation; no `scoring` is passed, so the estimator's
    # default scorer is used.
    fold_scores = cross_val_score(pipeline, df[input_column_list], df['Yards'], cv=7)
    mean_score = fold_scores.mean()
    tree_count_scores.append(mean_score)
    print("Random Forest Accuracy (%d Trees): %0.2f (+/- %0.2f)" % (n_trees, mean_score, fold_scores.std() * 2))

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

x_axis = [2**x for x in range(0,8)]
plt.plot(x_axis, tree_count_scores)
plt.ylabel('Accuracy')
plt.xlabel('Num Trees')
plt.show()