In [28]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# Libraries for reading, cleaning and plotting the dataa
import numpy as np 
import pandas as pd 
import csv
import matplotlib.pyplot as plt


# Libraries for models 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

In [29]:
# Read in training data 
training_data = []
with open('data/train.csv', newline='') as csvfile:
    train_data = csv.reader(csvfile, delimiter=',', quotechar='|')
    for row in train_data:
        training_data.append(row)
            
# Convert to a numpy array of type int (except for the label row)
training_data = np.array(training_data[1:]).astype(int)   

# Read in test data
testing_data = []
with open('data/test.csv', newline='') as csvfile:
    test_data = csv.reader(csvfile, delimiter=',', quotechar='|')
    for row in test_data:
        testing_data.append(row)

# The testing file is huge so only read in 30k 
test_data = np.array(testing_data[1:30001]).astype(int)        

In [36]:
# Shuffle the input: create a random permutation of the integers between 0 and the number of data points and apply this
# permutation to X and Y.
# NOTE: Each time you run this cell, you'll re-shuffle the data, resulting in a different ordering.
shuffle = np.random.permutation(np.arange(training_data.shape[0]))
training_data = training_data[shuffle]

# Split training data (labeled) into 80% training and 20% dev)
train_data = training_data[1:12096, :-1]
train_labels = training_data[1:12096, -1]
dev_data = training_data[12096:, :-1]
dev_labels = training_data[12096:, -1]

In [37]:
# Explore and confirm the shape of the data
print("Training data shape: {0} Training labels shape: {1}".format(train_data.shape, train_labels.shape))
print("Dev data shape: {0} Dev labels shape: {1}".format(train_data.shape, train_labels.shape))
print("Test data shape: ", test_data.shape)
print("First training example: ", train_data[0], train_labels [0])
print("First dev example: ", dev_data[0], dev_labels [0])
print("First test example: ", test_data[0])

Training data shape: (12095, 55) Training labels shape: (12095,)
Dev data shape: (12095, 55) Dev labels shape: (12095,)
Test data shape:  (30000, 55)
First training example:  [4114 3100   96   31  162   32 2794  251  180   33 3697    1    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    1    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0] 1
First dev example:  [8771 3128  165   24  272   -9 1782  233  239  123 2110    0    0    1
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    1    0    0    0    0    0    0    0] 1
First test example:  [15121  2680   354    14     0     0  2684   196   214   156  6645     1
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
 

In [35]:
# Try a random forest - before any data cleaning 
def RandomForest():
    num_trees_list = [1,3,5,10,100]
    for num_trees in num_trees_list:
        model = RandomForestClassifier(num_trees,max_depth=3)
        model.fit(train_data, train_labels)
        predictions = model.predict(dev_data)
        score = model.score(dev_data, dev_labels)
        print("Random Forest Performance for {} trees: {}".format score)
        print("Random Forest Confusion Matrix:\n")
        print(confusion_matrix(predictions, dev_labels))

RandomForest()

Random Forest Performance:  0.2562830687830688
Random Forest Confusion Matrix:

[[  0   0   0   0   0   0   0]
 [  0   1   0   0   1   1   0]
 [  0   0   0   0   0   0   0]
 [374 423 432 446 385 422 172]
 [  2  11   1   0  48   1   0]
 [  0   0   0   0   0   6   0]
 [ 24   0   0   0   0   0 274]]
