In [99]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.sparse import bsr_matrix, csr_matrix, csc_matrix

def save_sparse_csr(filename, array):
    """Store the CSR components of ``array`` in an ``.npz`` archive.

    The archive holds four entries -- ``data``, ``indices``, ``indptr`` and
    ``shape`` -- which are the pieces ``load_sparse_csr`` reads back.
    """
    csr_parts = {
        'data': array.data,
        'indices': array.indices,
        'indptr': array.indptr,
        'shape': array.shape,
    }
    np.savez(filename, **csr_parts)

def load_sparse_csr(filename):
    """Rebuild a CSR matrix from an archive written by ``save_sparse_csr``.

    Parameters
    ----------
    filename : str or file-like
        Passed straight to ``np.load``. ``np.savez`` appends ``.npz`` to names
        that lack it, so the name given here must include that extension.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix assembled from the archive's ``data``, ``indices``, ``indptr``
        and ``shape`` entries.
    """
    # np.load keeps the archive open until it is closed; the with-block
    # releases the file handle once the four arrays have been read.
    with np.load(filename) as loader:
        # 'shape' comes back as an array; hand scipy a plain tuple of ints.
        shape = tuple(int(dim) for dim in loader['shape'])
        return csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                          shape=shape)

In [101]:
# Load train data and print the header

# Everything is read as text first because the first row holds the column names.
# The builtin str/int are what the np.str/np.int aliases pointed to; those aliases
# were deprecated in NumPy 1.20 and removed in 1.24.
trainData = np.loadtxt("train.csv", delimiter=",", dtype=str)

print(trainData[0])

# Drop the header row and convert the remaining fields to integers
trainData = trainData[1:].astype(int)

print(trainData.shape)

['genre_id' 'ts_listen' 'media_id' 'album_id' 'context_type' 'release_date'
 'platform_name' 'platform_family' 'media_duration' 'listen_type'
 'user_gender' 'user_id' 'artist_id' 'user_age' 'is_listened']
(7558834L, 15L)


In [100]:
# Load test data and print the header

# Everything is read as text first because the first row holds the column names.
# The builtin str/int are what the np.str/np.int aliases pointed to; those aliases
# were deprecated in NumPy 1.20 and removed in 1.24.
testData = np.loadtxt("test.csv", delimiter=",", dtype=str)

print(testData[0])

# Drop the header row and convert the remaining fields to integers
testData = testData[1:].astype(int)

print(testData.shape)

['sample_id' 'genre_id' 'ts_listen' 'media_id' 'album_id' 'context_type'
 'release_date' 'platform_name' 'platform_family' 'media_duration'
 'listen_type' 'user_gender' 'user_id' 'artist_id' 'user_age']
(19918L, 15L)


In [102]:
# Print some basic characteristics of the datasets, using python set operations

# Column positions follow the printed headers: the test file carries an extra leading
# 'sample_id' column, so its media_id / user_id sit one position further right.
trainSongSet = set(trainData[:,2])
trainUserSet = set(trainData[:,11])

testSongSet = set(testData[:,3])
testUserSet = set(testData[:,12])

allSongsSet = trainSongSet | testSongSet
sortedSongsIDs = np.asarray(sorted(allSongsSet))

print("There are a total of  %d  distinct songs" % len(allSongsSet))
# The overall user count has to cover both datasets, not only the train set
print("... and a total of  %d  distinct users\n" % len(trainUserSet | testUserSet))

print("Train:\n-------")
print("%d train data samples" % trainData.shape[0])
print("%d Distinct songs in train set" % len(trainSongSet))
print("%d Distinct users in train set" % len(trainUserSet))

print("\nTest:\n-------")
print("%d test data samples" % testData.shape[0])
print("%d Distinct songs in test set" % len(testSongSet))
# Count the distinct users, not the number of test rows
print("%d Distinct users in test set" % len(testUserSet))

There are a total of  453310  distinct songs
... and a total of  19918  distinct users

Train:
-------
7558834 train data samples
452975 Distinct songs in train set
19918 Distinct users in train set

Test:
-------
19918 test data samples
9732 Distinct songs in test set
19918 Distinct users in test set


In [103]:
# While UserIDs range from 0 to 19918, mediaIDs seem to be random. We sort them and assign them
# an identifier in the range 0-453310. By doing so we can later use this ID to index a matrix

# sortedSongsIDs is sorted and was built from a set, so every song appears exactly once and
# its position in the array is its identifier. A single pass with enumerate gives the same
# mapping as searching the whole array once per song, without the quadratic cost.
IDtoIndex = {song: index for index, song in enumerate(sortedSongsIDs)}

In [104]:
# Replace MediaIDs with our computed identifiers

# sortedSongsIDs is sorted and duplicate-free, so np.searchsorted returns the position of
# each media_id in it -- the same value IDtoIndex holds -- in one vectorised call per dataset
# instead of one Python-level lookup per row.
# NOTE: the media_id column is overwritten in place, so this cell must run only once per
# load of the data.
for dataset, column in ((trainData, 2), (testData, 3)):
    media_ids = dataset[:, column]
    positions = np.searchsorted(sortedSongsIDs, media_ids)

    # searchsorted returns an insertion point even for values that are absent, so verify
    # every id was really found and fail loudly otherwise (as a dict lookup would).
    checked = np.minimum(positions, len(sortedSongsIDs) - 1)
    if not np.array_equal(sortedSongsIDs[checked], media_ids):
        raise KeyError("media_id not present in sortedSongsIDs")

    dataset[:, column] = positions

In [108]:
# The user-item matrix is kept as a CSR sparse matrix, so the zeros are never stored:
#    1 -> the user listened to more than 30s of the song
#   -1 -> the user did not listen to at least 30s of the song
#    0 -> the user never heard the song at all (or no info is available)

# One row per distinct training user, one column per distinct song (train and test)
user_song = csr_matrix((len(trainUserSet), len(allSongsSet)))

print(user_song.shape)

In [None]:
# Populate the user-item matrix with the data from the training set.
#
# Writing into a CSR matrix one element at a time is extremely slow (it took up to 2 days
# on an i7 processor), so the matrix is assembled with a single constructor call.
# Every training row is used, starting from row 0, so the cell does not depend on a
# partially filled matrix left over from an earlier session.

rows = trainData[:, 11]    # user_id, used directly as the row index
cols = trainData[:, 2]     # song index (the media_id column after replacement)
values = np.where(trainData[:, 14] == 1, 1., -1.)   # is_listened == 1 -> 1, otherwise -1

n_users, n_songs = user_song.shape

# The (values, (rows, cols)) constructor sums entries that share a (row, col) pair, while
# repeated assignment keeps the last value written. Keep only the last training row of each
# pair: np.unique reports first occurrences, so the keys are scanned in reverse order.
# int64 is required because n_users * n_songs does not fit in 32 bits.
pair_keys = rows.astype(np.int64) * n_songs + cols
first_seen_reversed = np.unique(pair_keys[::-1], return_index=True)[1]
last_seen = len(pair_keys) - 1 - first_seen_reversed

user_song = csr_matrix((values[last_seen], (rows[last_seen], cols[last_seen])),
                       shape=(n_users, n_songs))

In [137]:
# The matrix took ages to compute, so persist it to disk
# (np.savez appends the extension: the file written is "user_song2.npz")

save_sparse_csr(filename="user_song2", array=user_song)

In [None]:
""" Perform Collaborative Filtering

    for each test sample:
        1. retrieve the users who listened or disliked the song recommended to the test user (i.e., 1 or -1)
        2. train and use a kNN classifier to determine the probability that the test user listens to the recommended song
"""

from sklearn.neighbors import KNeighborsClassifier

# predictions maps the row number of a test sample to the estimated probability that
# the user listens to the recommended song
predictions = {}

# Iterate over the test rows actually loaded rather than a hard-coded sample count
for i in range(testData.shape[0]):

    recommended = testData[i, 3]   # song index (media_id column of the test set)
    user = testData[i, 12]         # user_id column of the test set

    # Rows of the users holding an explicit 1 / -1 for the recommended song
    idx = (user_song[:, recommended] != 0).nonzero()[0]

    # With fewer than two such users there is nothing to learn from: answer 0.5
    if len(idx) < 2:
        predictions[i] = 0.5
        continue

    relevantUsers = user_song[idx, :]
    # scikit-learn expects a 1-D label array; .todense() would yield an (n, 1) np.matrix
    y_train = user_song[idx, recommended].toarray().ravel()

    knn = KNeighborsClassifier(algorithm="brute", metric="cosine", n_neighbors=min(20, len(idx)), weights="distance").fit(relevantUsers, y_train)
    pred_prob = knn.predict_proba(user_song[user,:])[0]

    # predict_proba returns one column per entry of knn.classes_. Looking class 1 up there
    # covers both the two-class case and the case where every neighbour shares one label
    # (probability 1 if that label is 1, otherwise 0) without a second neighbour search.
    classes = list(knn.classes_)
    if 1 in classes:
        predictions[i] = pred_prob[classes.index(1)]
    else:
        predictions[i] = 0.

In [144]:
# Generate submission file

# The with-block closes the file even if one of the writes fails
with open("submission.csv", "w") as f:

    f.write("sample_id,is_listened\n")

    # One line per test row. The id written is the test file's own 'sample_id'
    # (column 0) and the line carries no leading blank, so it parses as plain CSV.
    for i in range(testData.shape[0]):
        f.write("%d,%.2f\n"%(testData[i, 0], predictions[i]))