In [14]:
import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix

from sklearn.metrics import confusion_matrix

from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import os
import sys
import time

In [15]:
def sparse_to_data_array(matrix, dtype=np.float32, maintain_size=True):
    """Convert a scipy sparse matrix into a dense (k, 3) array of COO triples.

    Each output row is ``[row_index, col_index, value]`` for one stored,
    non-zero entry of ``matrix``. The input matrix is not modified.

    Args:
        matrix: scipy sparse matrix (any format that supports ``tocoo()``).
        dtype: dtype of the returned array. Indices and values share this
            dtype, so with the default ``np.float32`` indices are only exact
            up to 2**24; pass ``np.float64`` for larger matrices.
        maintain_size: when True, one extra row ``[m - 1, n - 1, 0]`` is
            appended so the original ``(m, n)`` shape can be inferred from
            the largest indices when the array is turned back into a matrix.
            The padding row is skipped when either dimension is 0, since
            there is no valid index to record.

    Returns:
        numpy array of shape ``(nnz, 3)`` or ``(nnz + 1, 3)`` with exactly
        the requested ``dtype``.
    """
    # Convert once so that row, col and data are guaranteed to be aligned,
    # whatever the storage format of the input is.
    coo = matrix.tocoo()

    # Drop explicitly stored zeros with a mask instead of eliminate_zeros(),
    # which would mutate the caller's matrix in place.
    keep = coo.data != 0
    n_entries = int(np.count_nonzero(keep))

    n_rows, n_cols = matrix.shape
    pad = bool(maintain_size and n_rows > 0 and n_cols > 0)

    # Allocate the padding row up front: stacking a Python int list onto the
    # array afterwards would silently promote the result to float64.
    data = np.zeros((n_entries + int(pad), 3), dtype=dtype)
    data[:n_entries, 0] = coo.row[keep]
    data[:n_entries, 1] = coo.col[keep]
    data[:n_entries, 2] = coo.data[keep]

    if pad:
        # Zero-valued sentinel at the bottom-right corner to maintain the size.
        data[n_entries, 0] = n_rows - 1
        data[n_entries, 1] = n_cols - 1
    return data


def data_to_sparse(data, shape=None, csc=False):
    """Build a compressed sparse matrix from an (n, 3) array of COO triples.

    Args:
        data: array-like of shape ``(n, 3)`` whose columns are
            ``[row_index, col_index, value]``. Index columns may be stored
            as floats (e.g. when read with ``np.loadtxt``); they are cast to
            integers before the matrix is built.
        shape: optional ``(m, n)`` shape of the result. When None the shape
            is inferred by scipy from the largest row and column index.
        csc: return a CSC matrix when True, otherwise a CSR matrix.

    Returns:
        scipy CSR (default) or CSC matrix with explicitly stored zeros
        removed. Repeated ``(row, col)`` pairs are summed by the conversion.
    """
    data = np.asarray(data)

    # Sparse indices must be integers. Passing float index arrays makes
    # shape inference fail on recent scipy releases, so cast explicitly.
    rows = data[:, 0].astype(np.int64)
    cols = data[:, 1].astype(np.int64)

    coo = coo_matrix((data[:, 2], (rows, cols)), shape=shape)
    data_matrix = coo.tocsc() if csc else coo.tocsr()

    # Removes zero-valued entries, e.g. a size-maintaining padding triple.
    data_matrix.eliminate_zeros()
    return data_matrix

def load_txt(filename, delimiter=',', verbose=True):
    """Load a delimited numeric text file with ``np.loadtxt``.

    Args:
        filename: path (or other source accepted by ``np.loadtxt``) to read.
        delimiter: column separator passed to ``np.loadtxt``.
        verbose: when True, write the file name and the elapsed load time to
            standard output.

    Returns:
        The numpy array produced by ``np.loadtxt``.
    """
    # sys.stdout.write is used instead of the print statement so the function
    # runs unchanged on Python 2 and Python 3; the emitted text (including the
    # doubled spaces the original print statements produced) is the same.
    if verbose:
        sys.stdout.write('--> Loading  %s  with np.loadtxt was ' % (filename,))
    sys.stdout.flush()  # show the progress message before the slow load starts
    t = time.time()
    d = np.loadtxt(filename, delimiter=delimiter)
    if verbose:
        sys.stdout.write(' %.3f s\n' % (time.time() - t))
    return d

In [18]:
# Directory holding the train/validation/test CSV files. The default is the
# original checkout location; set the TW_NY_DATA_DIR environment variable to
# run the notebook against a copy stored elsewhere.
DATA_DIR = os.environ.get(
    "TW_NY_DATA_DIR",
    "/Users/kristiansuhartono/Documents/GitHub/CS4641-repo/Project3/tw_ny",
)

train_name = os.path.join(DATA_DIR, "train.csv")
val_name = os.path.join(DATA_DIR, "validation.csv")
test_name = os.path.join(DATA_DIR, "test.csv")

# Each file is read as a dense array whose first three columns are used by
# data_to_sparse as (row, col, value), and is converted straight to a sparse
# matrix so the dense copy can be released immediately.
train = data_to_sparse(load_txt(train_name, verbose=False))
val = data_to_sparse(load_txt(val_name, verbose=False))
test = data_to_sparse(load_txt(test_name, verbose=False))

In [32]:
# Show the stored (row, col) -> value entries of the test matrix. The
# parenthesised single-argument form prints the same on Python 2 and Python 3.
print(test)

  (2, 2)	156.0
  (4, 8896)	1.0
  (6, 6)	138.0
  (8, 1613)	1.0
  (8, 7144)	1.0
  (11, 327)	1.0
  (16, 10270)	1.0
  (19, 29)	1.0
  (19, 550)	1.0
  (27, 3583)	1.0
  (29, 27)	5.0
  (29, 2211)	9.0
  (29, 3279)	1.0
  (29, 9456)	1.0
  (30, 28)	37.0
  (35, 33)	6.0
  (36, 2078)	1.0
  (36, 4031)	1.0
  (37, 35)	152.0
  (37, 102)	3.0
  (37, 381)	10.0
  (38, 35)	1.0
  (38, 1096)	1.0
  (43, 35)	9.0
  (52, 4528)	1.0
  :	:
  (30316, 4956)	1.0
  (30316, 5275)	1.0
  (30316, 7060)	1.0
  (30316, 7377)	1.0
  (30317, 21)	1.0
  (30317, 29)	2.0
  (30317, 76)	1.0
  (30317, 668)	1.0
  (30317, 1267)	1.0
  (30317, 4502)	2.0
  (30318, 99)	1.0
  (30318, 895)	3.0
  (30318, 4689)	2.0
  (30318, 6508)	1.0
  (30319, 29)	1.0
  (30319, 152)	1.0
  (30319, 271)	1.0
  (30319, 2030)	1.0
  (30319, 2267)	1.0
  (30319, 2290)	1.0
  (30319, 3351)	1.0
  (30319, 4868)	1.0
  (30319, 8559)	1.0
  (30319, 9055)	1.0
  (30319, 10189)	1.0
