In [1]:
# Imports
from __future__ import absolute_import, division, print_function

import sys
from syspath import current_path
sys.path.append('../')

import numpy as np
import pandas as pd
import pickle
from scipy import stats
import scipy.sparse as sp
from sklearn.model_selection import train_test_split
from collections import defaultdict
import ml_metrics
import matplotlib.pyplot as plt
import torch
from torch.utils.data import DataLoader, Dataset
import itertools
%matplotlib inline

In [10]:
# Read the train / validation splits (sparse .npz files) into one dict keyed by split name
data = dict(
    train=sp.load_npz('../data/processed_bets/train_ex_val.npz'),
    val=sp.load_npz('../data/processed_bets/val_unmasked.npz'),
    val_masked=sp.load_npz('../data/processed_bets/val_masked.npz'),
)

In [13]:
# Show the repr of the training matrix (shape, dtype, number of stored elements, format)
data['train']

<148845x1439 sparse matrix of type '<class 'numpy.int64'>'
	with 3825898 stored elements in Compressed Sparse Row format>

In [11]:
def sparse_to_list(X):
    """Return, for every row of sparse ``X``, the list of its nonzero column indices.

    Parameters
    ----------
    X : scipy sparse matrix
        Any format that provides ``tocsr()`` (CSR, CSC, COO, ...). The
        caller's matrix is not modified.

    Returns
    -------
    list of list
        One inner list per row of ``X``, holding the column indices of the
        entries whose stored value is nonzero. Rows without such entries give
        an empty list. Indices appear in the order they are stored in the CSR
        representation.
    """
    # The indices/indptr slicing below is only meaningful for CSR: a CSC
    # matrix exposes the same attributes but they describe columns, which
    # would silently produce wrong output. tocsr() returns CSR input as-is.
    X = X.tocsr()

    # Explicitly stored zeros are not "nonzero"; drop them on a copy so the
    # caller's matrix keeps its stored structure.
    if (X.data == 0).any():
        X = X.copy()
        X.eliminate_zeros()

    # Splitting at every indptr offset yields an empty leading chunk
    # (before offset 0) and an empty trailing chunk (after the last offset);
    # [1:-1] keeps exactly one chunk per row.
    result = np.split(X.indices, X.indptr)[1:-1]
    return [list(r) for r in result]

In [12]:
# Per-row index lists of the masked validation matrix, then the length of each list
x = sparse_to_list(data['val_masked'])
y = list(map(len, x))
print(y)

[2, 1, 1, 8, 3, 0, 5, 0, 27, 2, 0, 0, 0, 1, 3, 2, 0, 53, 5, 3, 3, 0, 0, 15, 2, 7, 1, 1, 6, 0, 4, 1, 2, 1, 10, 0, 1, 1, 3, 15, 0, 1, 8, 0, 1, 2, 0, 2, 0, 2, 3, 3, 1, 1, 0, 12, 1, 13, 3, 1, 0, 2, 0, 2, 4, 1, 7, 23, 2, 0, 6, 0, 0, 0, 3, 6, 0, 7, 2, 6, 0, 0, 4, 4, 0, 0, 5, 8, 5, 1, 3, 2, 1, 0, 0, 4, 5, 5, 0, 1, 5, 0, 15, 2, 0, 4, 0, 1, 0, 1, 5, 1, 1, 11, 7, 0, 3, 0, 3, 0, 1, 30, 1, 4, 2, 3, 0, 2, 3, 15, 25, 0, 0, 0, 0, 3, 0, 0, 5, 15, 15, 2, 3, 0, 6, 6, 2, 0, 1, 41, 2, 3, 32, 2, 2, 1, 0, 0, 0, 8, 3, 1, 30, 32, 10, 45, 0, 1, 3, 6, 12, 5, 4, 4, 4, 1, 1, 18, 0, 8, 8, 1, 3, 1, 0, 0, 2, 1, 4, 0, 1, 31, 21, 4, 3, 0, 7, 3, 0, 6, 25, 1, 0, 1, 3, 4, 16, 2, 0, 3, 29, 1, 0, 5, 0, 2, 3, 2, 0, 19, 16, 2, 9, 0, 1, 0, 2, 7, 12, 0, 14, 0, 1, 0, 2, 0, 1, 0, 1, 18, 17, 1, 21, 0, 4, 1, 12, 1, 0, 9, 4, 8, 3, 17, 1, 1, 0, 6, 13, 7, 0, 12, 2, 2, 2, 0, 15, 21, 2, 2, 1, 1, 0, 1, 0, 0, 0, 2, 6, 2, 3, 1, 1, 1, 5, 8, 0, 0, 0, 1, 2, 5, 0, 1, 4, 1, 0, 0, 6, 5, 9, 4, 4, 14, 3, 5, 3, 2, 17, 2, 1, 0, 5, 3, 4, 2, 0, 0, 3,

In [9]:
# Display the values of y in ascending order
sorted(y)

[0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 3,
 3,
 3,
 3,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 5,
 5,
 5,
 5,
 5,
 6,
 6,
 6,
 7,
 7,
 7,
 7,
 8,
 8,
 8,
 8,
 9,
 9,
 9,
 11,
 11,
 11,
 11,
 11,
 12,
 12,
 12,
 12,
 12,
 13,
 13,
 13,
 13,
 13,
 14,
 14,
 15,
 15,
 15,
 15,
 16,
 16,
 17,
 17,
 17,
 18,
 18,
 18,
 19,
 19,
 19,
 20,
 20,
 21,
 22,
 22,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 24,
 24,
 24,
 24,
 25,
 25,
 25,
 26,
 26,
 26,
 26,
 27,
 27,
 27,
 27,
 28,
 28,
 28,
 29,
 29,
 29,
 29,
 29,
 30,
 30,
 30,
 31,
 31,
 31,
 31,
 32,
 33,
 33,
 33,
 33,
 34,
 34,
 35,
 35,
 35,
 35,
 35,
 36,
 36,
 36,
 36,
 36,
 36,
 37,
 37,
 37,
 37,
 37,
 37,
 37,
 37,
 37,
 37,
 38,
 38,
 38,
 38,
 38,
 39,
 39,
 39,
 39,
 39,
 40,
 40,
 41,
 41,
 41,
 41,
 41,
 41,
 41,
 42,
 43,
 43,
 43,
 43,
 43,
 44,
 44,
 45,
 45,
 45,
 46,
 46,
 46,
 46,
 46,
 47,
 47,
 47,
 47,
 48,
 48,
 48,
 48,
 48,
 49,
 49,
 49,
 50,
 50,
 

In [7]:
# Show the repr of the unmasked validation matrix (shape, dtype, number of stored elements, format)
data['val']

<643x26307 sparse matrix of type '<class 'numpy.int64'>'
	with 222436 stored elements in Compressed Sparse Row format>