In [42]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# Libraries for reading, cleaning and plotting the dataa
import numpy as np 
import pandas as pd 
import csv
import matplotlib.pyplot as plt


# Libraries for models 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

In [43]:
# Read in training data 
training_data = []
with open('data/train.csv', newline='') as csvfile:
    train_data = csv.reader(csvfile, delimiter=',', quotechar='|')
    for row in train_data:
        training_data.append(row)
            
# Convert to a numpy array of type int (except for the label row)
training_data = np.array(training_data[1:]).astype(int)   

# Read in test data
testing_data = []
with open('data/test.csv', newline='') as csvfile:
    test_data = csv.reader(csvfile, delimiter=',', quotechar='|')
    for row in test_data:
        testing_data.append(row)

# The testing file is huge so only read in 30k 
test_data = np.array(testing_data[1:30001]).astype(int)        

In [44]:
# Shuffle the input: create a random permutation of the integers between 0 and the number of data points and apply this
# permutation to X and Y.
# NOTE: Each time you run this cell, you'll re-shuffle the data, resulting in a different ordering.
shuffle = np.random.permutation(np.arange(training_data.shape[0]))
training_data = training_data[shuffle]

# Split training data (labeled) into 80% training and 20% dev)
train_data = training_data[1:12096, :-1]
train_labels = training_data[1:12096, -1]
dev_data = training_data[12096:, :-1]
dev_labels = training_data[12096:, -1]

In [45]:
# Explore and confirm the shape of the data
print("Training data shape: {0} Training labels shape: {1}".format(train_data.shape, train_labels.shape))
print("Dev data shape: {0} Dev labels shape: {1}".format(train_data.shape, train_labels.shape))
print("Test data shape: ", test_data.shape)
print("First training example: ", train_data[0], train_labels [0])
print("First dev example: ", dev_data[0], dev_labels [0])
print("First test example: ", test_data[0])

Training data shape: (12095, 55) Training labels shape: (12095,)
Dev data shape: (12095, 55) Dev labels shape: (12095,)
Test data shape:  (30000, 55)
First training example:  [5340 2269  125   28   67   28  408  253  208   62 1879    0    0    0
    1    0    0    1    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0] 4
First dev example:  [1176 2807   62   22  485  119  497  233  189   79 2460    1    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    1    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0] 2
First test example:  [15121  2680   354    14     0     0  2684   196   214   156  6645     1
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
 

In [46]:
# Try a random forest - before any data cleaning 
def RandomForest(num_trees):
    model = RandomForestClassifier(num_trees,max_depth=8)
    model.fit(train_data, train_labels)
    predictions = model.predict(dev_data)
    score = model.score(dev_data, dev_labels)
    print("Random Forest Performance for {0} trees: {1}".format(num_trees,score))
    print("Random Forest Confusion Matrix:\n")
    print(confusion_matrix(predictions, dev_labels))
    
num_trees_list = [1,3,5,10,100]
for num_trees in num_trees_list:
    RandomForest(num_trees)

Random Forest Performance for 1 trees: 0.5545634920634921
Random Forest Confusion Matrix:

[[244 168   8   0  83   7  77]
 [ 66 115  15   0  59  16   6]
 [  0   9 221  51   6 165   0]
 [  0   1  65 369   0  46   0]
 [ 62  66  49   0 277  31  65]
 [  0   9  61  13  17 171   0]
 [ 70  54   0   0   2   0 280]]
Random Forest Performance for 3 trees: 0.6861772486772487
Random Forest Confusion Matrix:

[[245 121   1   0   3   2  44]
 [ 50 154   1   0  52   0   1]
 [  0   3 268  25  11  89   1]
 [  0   1  39 390   0  29   0]
 [ 41  92  16   0 354  31   0]
 [  2  13  93  18  12 282   0]
 [104  38   1   0  12   3 382]]
Random Forest Performance for 5 trees: 0.7314814814814815
Random Forest Confusion Matrix:

[[263  97   1   0   5   0  18]
 [ 87 209   5   0  52   7   7]
 [  1   9 286  18  23  91   0]
 [  0   0  44 409   0  31   1]
 [ 41  77   8   0 352  14   2]
 [  1  25  75   6  12 293   0]
 [ 49   5   0   0   0   0 400]]
Random Forest Performance for 10 trees: 0.7318121693121693
Random Forest 