In [17]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# Libraries for reading, cleaning and plotting the dataa
import numpy as np 
import pandas as pd 
import csv
import matplotlib.pyplot as plt


# Libraries for models 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

In [18]:
# Read in training data 
training_data = []
with open('data/train.csv', newline='') as csvfile:
    train_data = csv.reader(csvfile, delimiter=',', quotechar='|')
    for row in train_data:
        training_data.append(row)
            
# Convert to a numpy array of type int (except for the label row)
training_data = np.array(training_data[1:]).astype(int)   

# Read in test data
testing_data = []
with open('data/test.csv', newline='') as csvfile:
    test_data = csv.reader(csvfile, delimiter=',', quotechar='|')
    for row in test_data:
        testing_data.append(row)

# The testing file is huge so only read in max_test_data
max_test_data = 30001
test_data = np.array(testing_data[1:max_test_data]).astype(int)        

In [19]:
# Shuffle the input: create a random permutation of the integers between 0 and the number of data points and apply this
# permutation to X and Y.
# NOTE: Each time you run this cell, you'll re-shuffle the data, resulting in a different ordering.
shuffle = np.random.permutation(np.arange(training_data.shape[0]))
training_data = training_data[shuffle]

# Split training data (labeled) into 80% training and 20% dev) and skip over the id column (it doesn't add an information)
# Immediately cast train data as floats so we can normalize it later 
split_index = int(len(training_data) * 0.8)
train_data = training_data[:split_index, 1:-1].astype(np.float64)
train_labels = training_data[:split_index, -1]
dev_data = training_data[split_index:, 1:-1].astype(np.float64)
dev_labels = training_data[split_index:, -1]
test_data = test_data[:,1:]

# Retrieve the mean and standard deviation of each feature - axis=0 is for going along the columns, keepdims forces the dimensions to stay the same
# Only compute it for the first ten features (they're numeric - not one hot or categorical)
num_columns = 10
# To avoid dividing by 0, add 1e-10 to the standard deviation 
smoothing = 1e-10
# USe the mean and standard deviation of the training data
feature_mean = train_data[:,:num_columns].mean(axis=0, keepdims=True)
feature_std = train_data[:,:num_columns].std(axis=0, keepdims=True)
# Normalize all numeric features except wilderness type and soil type (one-hot) - first 10 columns
train_data[:,:num_columns] = train_data[:,:num_columns] - feature_mean
train_data[:,:num_columns] = np.divide(train_data[:,:num_columns], feature_std + smoothing)
# Normalize dev data as well (using training mean and standard deviation)
dev_data[:,:num_columns] = dev_data[:,:num_columns] - feature_mean
dev_data[:,:num_columns] = dev_data[:,:num_columns]/(feature_std + smoothing)

In [20]:
# Explore and confirm the shape of the data
print("Training data shape: {0} Training labels shape: {1}".format(train_data.shape, train_labels.shape))
print("Dev data shape: {0} Dev labels shape: {1}".format(train_data.shape, train_labels.shape))
print("Test data shape: ", test_data.shape)
print("First training example: ", train_data[0], train_labels [0])
print("First dev example: ", dev_data[0,:], dev_labels [0])
print("First test example: ", test_data[0])

Training data shape: (12096, 54) Training labels shape: (12096,)
Dev data shape: (12096, 54) Dev labels shape: (12096,)
Test data shape:  (30000, 54)
First training example:  [-1.0874239   1.39067033  0.65424367 -0.36479144  0.2456598  -0.58855921
 -1.9018828   0.04564075  1.48886239 -1.12134094  0.          0.
  0.          1.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          1.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.        ] 3
First dev example:  [-0.78171192 -0.57407384 -0.88760796 -0.79554381 -0.99323992 -1.24549038
  0.72960321  0.30928477 -0.30640172 -1.02432588  0.          0.
  1.          0.          0.          0.          0.          1

In [21]:
# Try a random forest - before any data cleaning 
def RandomForest(num_trees):
    model = RandomForestClassifier(num_trees,max_depth=8)
    model.fit(train_data, train_labels)
    predictions = model.predict(dev_data)
    score = model.score(dev_data, dev_labels)
    print("Random Forest Performance for {0} trees: {1}".format(num_trees,score))
    print("Random Forest Confusion Matrix:\n")
    print(confusion_matrix(predictions, dev_labels))
    
num_trees_list = [1,3,5,10,100]
for num_trees in num_trees_list:
    RandomForest(num_trees)

Random Forest Performance for 1 trees: 0.48214285714285715
Random Forest Confusion Matrix:

[[ 73  35   0   0  11   1  21]
 [167 184   1   0 129   2  82]
 [  0   2  94   7   2  63   0]
 [  1   1  30 358   2  35   0]
 [156 165 139   1 296 154  90]
 [  0   1 166  67   0 178   0]
 [ 34   1   0   0   0   0 275]]
Random Forest Performance for 3 trees: 0.6967592592592593
Random Forest Confusion Matrix:

[[240  95   0   0   5   0  27]
 [ 86 186   6   0  51  11   1]
 [  1   7 218  31   6  51   0]
 [  1   0  34 380   5  30   0]
 [ 30  83  36   0 343  38   3]
 [  0  10 136  22  30 303   0]
 [ 73   8   0   0   0   0 437]]
Random Forest Performance for 5 trees: 0.7013888888888888
Random Forest Confusion Matrix:

[[274 103   0   0   0   0  33]
 [ 55 156   2   0  22   2   6]
 [  0   1 250  65  11  99   0]
 [  0   0  18 340   0  23   0]
 [ 36 100  23   0 393  30   0]
 [  1  17 137  28  14 279   0]
 [ 65  12   0   0   0   0 429]]
Random Forest Performance for 10 trees: 0.7351190476190477
Random Forest