In [95]:
import numpy as np
# import os
import pathlib
from datetime import datetime
from sklearn.linear_model import SGDClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier

In [96]:
def load_data(filename, train=True):
    """
    Load a whitespace-delimited integer data file whose first line is a header.

    The header line is discarded. Every remaining non-blank line is split on
    whitespace; when ``train`` is true the first token is taken as the label
    and the remaining tokens as features, otherwise every token is a feature.

    Inputs:
        filename: path of the file to read, given as a string.
        train: whether each row starts with a label column (default True).

    Outputs:
        X: numpy ndarray of ints holding the feature rows.
        y: numpy ndarray of ints holding the labels when ``train`` is true,
           otherwise an empty list.
    """
    X = []
    y = []
    with open(filename) as f:
        # the first line holds the column headers, not data
        next(f, None)
        for line in f:
            # split() with no argument also tolerates repeated spaces and the
            # trailing newline, so no separate strip() is needed
            tokens = line.split()
            # skip blank lines (e.g. a trailing empty line at end of file);
            # they would otherwise yield a ragged row and break the int cast
            if not tokens:
                continue
            if train:
                y.append(tokens[0])
                X.append(tokens[1:])
            else:
                X.append(tokens)

    # convert to np and cast the string tokens to int
    X = np.asarray(X).astype(int)
    if train:
        y = np.asarray(y).astype(int)

    return X, y

In [97]:
def make_predictions(clf, X, y, test):
    """
    Fit a classifier on the training data and predict on the test data.

    Inputs:
        clf: an object exposing ``fit(X, y)`` and ``predict(test)``.
        X: training samples passed to ``clf.fit``.
        y: training labels passed to ``clf.fit``.
        test: samples passed to ``clf.predict`` after fitting.

    Outputs:
        Whatever ``clf.predict(test)`` returns.
    """
    # fit first: predict is called on the same (now fitted) object
    clf.fit(X, y)
    return clf.predict(test)

In [98]:
def save_data(data, filename=None):
    """
    Write predictions to ``submissions/<filename>`` as a two-column CSV.

    The file starts with the header ``Id,Prediction`` followed by one line per
    element of ``data``, where Id counts up from 1.

    Inputs:
        data: iterable of predictions; each is written using ``str()``.
        filename: name of the file inside the ``submissions`` directory.
                  When omitted, a name of the form ``HHMMSS.txt`` is built
                  from the current time at the moment of the call.

    Outputs:
        None. The ``submissions`` directory is created if it does not exist.
    """
    if filename is None:
        # resolved per call: a default computed in the signature is evaluated
        # once at definition time, so every later call would reuse (and
        # overwrite) the same file name
        filename = "%s.txt" % datetime.now().strftime("%H%M%S")

    out_dir = pathlib.Path('submissions')
    out_dir.mkdir(parents=True, exist_ok=True)

    # join via pathlib instead of a hard-coded backslash so the file lands
    # inside the directory on every platform, not only on Windows
    with open(out_dir / filename, "w") as f:
        f.write("Id,Prediction\n")
        for Id, prediction in enumerate(data, 1):
            f.write(str(Id) + ',' + str(prediction) + '\n')
        

In [None]:
def eval_tree_based_model_max_depth(clf, max_depth, X_train, y_train, X_test, y_test):
    """
    Sweep the ``max_depth`` setting of a classifier and record its errors.

    For every value in ``max_depth`` the classifier's ``max_depth`` attribute
    is overwritten, the classifier is refit on the training data, and the
    training and test errors are computed with ``classification_err``.

    NOTE(review): ``classification_err`` is not defined in the visible part of
    this notebook; it must be available in the namespace before this is called.

    Inputs:
        clf: classifier object with a ``max_depth`` attribute and ``fit`` /
             ``predict`` methods (described as a decision tree or random forest)
        max_depth: a (T, ) sequence of the max_depth values to try
        X_train: (N, D) matrix of training samples.
        y_train: (N, ) vector of training labels.
        X_test: (N, D) matrix of test samples
        y_test: (N, ) vector of test labels
    Output:
        train_err: list of T classification errors on the training data
        test_err: list of T classification errors on the test data
    """
    train_err, test_err = [], []

    for depth in max_depth:
        # the same estimator object is reconfigured and refit on each pass
        clf.max_depth = depth
        clf.fit(X_train, y_train)
        train_err.append(classification_err(clf.predict(X_train), y_train))
        test_err.append(classification_err(clf.predict(X_test), y_test))

    return train_err, test_err

In [99]:
def main(random_state=42):
    """
    Train a random forest on the training file and write test predictions.

    Reads ``training_data.txt`` and ``test_data.txt`` from the working
    directory, fits a 5000-tree random forest, and saves the predictions for
    the test set through ``save_data``.

    Inputs:
        random_state: seed passed to the random forest so repeated runs
                      produce the same predictions (default 42).
    """
    # load the data; the test file has no label column
    X_train, y_train = load_data("training_data.txt")
    X_test, _ = load_data("test_data.txt", False)

    # seeding the forest makes the submission reproducible across runs
    clf = RandomForestClassifier(
        n_estimators=5000,
        criterion='gini',
        random_state=random_state,
    )
    predictions = make_predictions(clf, X_train, y_train, X_test)

    # save to a file
    save_data(predictions)


In [100]:
# Entry point: run the train / predict / save pipeline when this cell (or an
# exported script) is executed directly rather than imported.
if __name__ == "__main__":
    main()