# Train test split

Here we try a very simple idea of train-test split. We just split r

# Setup

In [1]:
import sys
sys.path.insert(0, '../src')
import preprocess
import utils
import random
import matplotlib.pyplot as plt
from collections import defaultdict
%matplotlib inline
from matplotlib import rc
rc('figure', figsize=(16, 8), max_open_warning=False)
from surprise import SVD, SVDpp, NMF
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise import accuracy
from surprise.model_selection import train_test_split
import pandas as pd
import numpy as np
import implicit
import codecs
import json

In [2]:
from sklearn.model_selection import train_test_split, KFold

def aminer_implicit_train_test_split(filename):
    # Now we create train test split
    # this can also be used as train-val-test
    df = pd.read_table(filename,
                             usecols=[0, 1, 2],
                             names=['user', 'ref', 'score'],
                             na_filter=False)
    train, test = train_test_split(df, test_size=0.1)
    train.to_csv(filename[:-4]+'_train.tsv',sep='\t', header=False, index=False)
    test.to_csv(filename[:-4]+'_test.tsv',sep='\t', header=False, index=False) # This should have some false entries as well???

# Implicit

In [8]:
import argparse
import codecs
import logging
import time

import numpy
import pandas
from scipy.sparse import coo_matrix

from implicit.als import AlternatingLeastSquares
from implicit.approximate_als import (AnnoyAlternatingLeastSquares, FaissAlternatingLeastSquares,
                                      NMSLibAlternatingLeastSquares)
from implicit.bpr import BayesianPersonalizedRanking
from implicit.nearest_neighbours import (BM25Recommender, CosineRecommender,
                                         TFIDFRecommender, bm25_weight)

# maps command line model argument to class name
MODELS = {"als":  AlternatingLeastSquares,
          "nmslib_als": NMSLibAlternatingLeastSquares,
          "annoy_als": AnnoyAlternatingLeastSquares,
          "faiss_als": FaissAlternatingLeastSquares,
          "tfidf": TFIDFRecommender,
          "cosine": CosineRecommender,
          "bpr": BayesianPersonalizedRanking,
          "bm25": BM25Recommender}


def get_model(model_name,k,lam):
    model_class = MODELS.get(model_name)
    if not model_class:
        raise ValueError("Unknown Model '%s'" % model_name)

    # some default params
    if model_name == "faiss_als":
        params = {'factors': k, 'dtype': numpy.float32, 'use_gpu': True, #training substantially slower if I don't use use_gpu
                  #'gpu': True,  # keyerror -1
                  "calculate_training_loss": True, "regularization": lam}
    elif issubclass(model_class, AlternatingLeastSquares):
        params = {'factors': k, 'dtype': numpy.float32, 'use_gpu': True, #training substantially slower if I don't use use_gpu
                  #'gpu': True,  # keyerror -1
                  "calculate_training_loss": True, "regularization": lam}
    elif model_name == "bm25":
        params = {'K1': 100, 'B': 0.5}
    elif model_name == "bpr":
        params = {'factors': 63, 'use_gpu': True}
    else:
        params = {}

    return model_class(**params)


def read_data(filename):
    """ Reads in the AMINER dataset, and returns a tuple of a pandas dataframe
    and a sparse matrix of artist/user/playcount """
    # read in triples of user/artist/playcount from the input dataset
    # get a model based off the input params
    start = time.time()
    print("reading data from %s"% (filename))
    data = pandas.read_table(filename,
                             usecols=[0, 1, 2],
                             names=['user', 'ref', 'cites'],
                             na_filter=False)

    # map each artist and user to a unique numeric value
    data['user'] = data['user'].astype("category")
    data['ref'] = data['ref'].astype("category")

    # create a sparse matrix of all the users/plays
    cites = coo_matrix((data['cites'].astype(numpy.float32),
                       (data['ref'].cat.codes.copy(),
                        data['user'].cat.codes.copy())))

    print("read data file in {}".format( time.time() - start))
    return data, cites

'''
def calculate_similar_artists(input_filename, output_filename, model_name="als"):
    """ generates a list of similar artists in lastfm by utiliizing the 'similar_items'
    api of the models """
    df, plays = read_data(input_filename)

    # create a model from the input data
    model = get_model(model_name)

    # if we're training an ALS based model, weight input for last.fm
    # by bm25
    if issubclass(model.__class__, AlternatingLeastSquares):
        # lets weight these models by bm25weight.
        print("weighting matrix by bm25_weight")
        plays = bm25_weight(plays, K1=100, B=0.8)

        # also disable building approximate recommend index
        model.approximate_recommend = False

    # this is actually disturbingly expensive:
    plays = plays.tocsr()

    print("training model {}".format( model_name))
    start = time.time()
    model.fit(plays)
    print("trained model '%s' in %0.2f"%( model_name, time.time() - start))

    # write out similar artists by popularity
    artists = dict(enumerate(df['artist'].cat.categories))
    start = time.time()
    print("calculating top artists")
    user_count = df.groupby('artist').size()
    to_generate = sorted(list(artists), key=lambda x: -user_count[x])

    # write out as a TSV of artistid, otherartistid, score
    with codecs.open(output_filename, "w", "utf8") as o:
        for artistid in to_generate:
            artist = artists[artistid]
            for other, score in model.similar_items(artistid, 11):
                o.write("%s\t%s\t%s\n" % (artist, artists[other], score))

    print("generated similar artists in %0.2f"%(time.time() - start))
'''

def calculate_recommendations(k, lam, input_filename, output_filename, model_name="als"):
    """ Generates artist recommendations for each user in the dataset """
    # train the model based off input params
    df, cites = read_data(input_filename)

    # create a model from the input data
    model = get_model(model_name,k,lam) 

    # if we're training an ALS based model, weight input for last.fm
    # by bm25
    if issubclass(model.__class__, AlternatingLeastSquares):
        # lets weight these models by bm25weight.
        print("weighting matrix by bm25_weight")
        cites = bm25_weight(cites, K1=100, B=0.8)

        # also disable building approximate recommend index
        model.approximate_similar_items = False

    # this is actually disturbingly expensive:
    cites = cites.tocsr()

    print("training model {}".format(model_name))
    start = time.time()
    model.fit(cites) 
    print("trained model '%s' in %0.2f"%( model_name, time.time() - start))

    # generate recommendations for each user and write out to a file

    artists = dict(enumerate(df['ref'].cat.categories))
    start = time.time()
    user_cites = cites.T.tocsr()
    with codecs.open(output_filename, "w", "utf8") as o:
        for userid, username in enumerate(df['user'].cat.categories):
            for artistid, score in model.recommend(userid, user_cites):
                o.write("%s\t%s\t%s\n" % (username, artists[artistid], score))
    print("generated recommendations in %0.2f"%(  time.time() - start))


def start(k, lam, inputfile='../dblp-ref/aminer-debug.tsv',outputfile='../output-debug.tsv', model='faiss_als'):

    #if recommend:
    calculate_recommendations(k, lam, inputfile, outputfile, model_name=model)
    #else:
        #calculate_similar_artists(inputfile, outputfile, model_name=model)
       

# Run it on debug data (proper train-test split, but negative or nan loss error!)

In [4]:
logging.getLogger("implicit").addHandler(logging.FileHandler('output.log', 'a'))

In [5]:
logging.getLogger("implicit").setLevel(logging.DEBUG)

In [7]:
!export OPENBLAS_NUM_THREADS=1
!export MKL_NUM_THREADS=1

#logging.basicConfig(filename='example.log',level=logging.DEBUG)
#This file should be used to draw plots; but it is not closing.

#k = 100
lam = 0.1
for k in [10,25,50,75,100,200]:
    print(k,lam)
    start(k, lam, '../dblp-ref/merged_train.tsv', "../faiss_rank_{}_lam_0.1_output_debug_train.tsv".format(k))

10 0.1
reading data from ../dblp-ref/merged_train.tsv


DEBUG:implicit:Calculated transpose in 0.006s


read data file in 1.5308563709259033
weighting matrix by bm25_weight
training model faiss_als


DEBUG:implicit:initialize factors in 0.052545785903930664
DEBUG:implicit:finished iteration 0 in 0.029s
DEBUG:implicit:loss at iteration 0 is -0.0035676476545631886
DEBUG:implicit:finished iteration 1 in 0.031s
DEBUG:implicit:loss at iteration 1 is -0.0035522643011063337
DEBUG:implicit:finished iteration 2 in 0.032s
DEBUG:implicit:loss at iteration 2 is -0.0035453555174171925
DEBUG:implicit:finished iteration 3 in 0.028s
DEBUG:implicit:loss at iteration 3 is -0.003543371334671974
DEBUG:implicit:finished iteration 4 in 0.029s
DEBUG:implicit:loss at iteration 4 is -0.0035426183603703976
DEBUG:implicit:finished iteration 5 in 0.028s
DEBUG:implicit:loss at iteration 5 is -0.003542246762663126
DEBUG:implicit:finished iteration 6 in 0.029s
DEBUG:implicit:loss at iteration 6 is -0.0035420204512774944
DEBUG:implicit:finished iteration 7 in 0.028s
DEBUG:implicit:loss at iteration 7 is -0.00354186724871397
DEBUG:implicit:finished iteration 8 in 0.026s
DEBUG:implicit:loss at iteration 8 is -0.003

trained model 'faiss_als' in 1.13
25 0.1
reading data from ../dblp-ref/merged_train.tsv




read data file in 1.4623689651489258
weighting matrix by bm25_weight
training model faiss_als


DEBUG:implicit:Calculated transpose in 0.010s
DEBUG:implicit:initialize factors in 0.13492965698242188
DEBUG:implicit:finished iteration 0 in 0.029s
DEBUG:implicit:loss at iteration 0 is -0.0035481788218021393
DEBUG:implicit:finished iteration 1 in 0.032s
DEBUG:implicit:loss at iteration 1 is -0.0035198030527681112
DEBUG:implicit:finished iteration 2 in 0.034s
DEBUG:implicit:loss at iteration 2 is -0.0035102202091366053
DEBUG:implicit:finished iteration 3 in 0.034s
DEBUG:implicit:loss at iteration 3 is -0.003507304936647415
DEBUG:implicit:finished iteration 4 in 0.035s
DEBUG:implicit:loss at iteration 4 is -0.0035060355439782143
DEBUG:implicit:finished iteration 5 in 0.033s
DEBUG:implicit:loss at iteration 5 is -0.0035053223837167025
DEBUG:implicit:finished iteration 6 in 0.033s
DEBUG:implicit:loss at iteration 6 is -0.003504849271848798
DEBUG:implicit:finished iteration 7 in 0.034s
DEBUG:implicit:loss at iteration 7 is -0.0035045037511736155
DEBUG:implicit:finished iteration 8 in 0.03

trained model 'faiss_als' in 1.49
50 0.1
reading data from ../dblp-ref/merged_train.tsv


DEBUG:implicit:Calculated transpose in 0.011s


read data file in 1.3290276527404785
weighting matrix by bm25_weight
training model faiss_als


DEBUG:implicit:initialize factors in 0.3344898223876953
DEBUG:implicit:finished iteration 0 in 0.067s
DEBUG:implicit:loss at iteration 0 is -0.0036145022604614496
DEBUG:implicit:finished iteration 1 in 0.126s
DEBUG:implicit:loss at iteration 1 is nan
DEBUG:implicit:finished iteration 2 in 0.045s
DEBUG:implicit:loss at iteration 2 is -0.0035811373963952065
DEBUG:implicit:finished iteration 3 in 0.140s
DEBUG:implicit:loss at iteration 3 is nan
DEBUG:implicit:finished iteration 4 in 0.045s
DEBUG:implicit:loss at iteration 4 is -0.003581139026209712
DEBUG:implicit:finished iteration 5 in 0.138s
DEBUG:implicit:loss at iteration 5 is nan
DEBUG:implicit:finished iteration 6 in 0.045s
DEBUG:implicit:loss at iteration 6 is -0.003581139724701643
DEBUG:implicit:finished iteration 7 in 0.050s
DEBUG:implicit:loss at iteration 7 is nan
DEBUG:implicit:finished iteration 8 in 1.077s
DEBUG:implicit:loss at iteration 8 is -0.003581139026209712
DEBUG:implicit:finished iteration 9 in 0.044s
DEBUG:implicit

trained model 'faiss_als' in 5.59
75 0.1
reading data from ../dblp-ref/merged_train.tsv


DEBUG:implicit:Calculated transpose in 0.010s


read data file in 1.3236982822418213
weighting matrix by bm25_weight
training model faiss_als


DEBUG:implicit:initialize factors in 0.5168826580047607
DEBUG:implicit:finished iteration 0 in 0.124s
DEBUG:implicit:loss at iteration 0 is -0.003587933024391532
DEBUG:implicit:finished iteration 1 in 0.187s
DEBUG:implicit:loss at iteration 1 is nan
DEBUG:implicit:finished iteration 2 in 0.077s
DEBUG:implicit:loss at iteration 2 is -0.0035811320412904024
DEBUG:implicit:finished iteration 3 in 0.182s
DEBUG:implicit:loss at iteration 3 is nan
DEBUG:implicit:finished iteration 4 in 0.080s
DEBUG:implicit:loss at iteration 4 is -0.003581129014492035
DEBUG:implicit:finished iteration 5 in 0.184s
DEBUG:implicit:loss at iteration 5 is nan
DEBUG:implicit:finished iteration 6 in 0.073s
DEBUG:implicit:loss at iteration 6 is -0.003581134369596839
DEBUG:implicit:finished iteration 7 in 0.185s
DEBUG:implicit:loss at iteration 7 is nan
DEBUG:implicit:finished iteration 8 in 0.077s
DEBUG:implicit:loss at iteration 8 is -0.0035811339039355516
DEBUG:implicit:finished iteration 9 in 0.172s
DEBUG:implicit

trained model 'faiss_als' in 5.44
100 0.1
reading data from ../dblp-ref/merged_train.tsv


DEBUG:implicit:Calculated transpose in 0.006s


read data file in 1.3758156299591064
weighting matrix by bm25_weight
training model faiss_als


DEBUG:implicit:initialize factors in 0.5780715942382812
DEBUG:implicit:finished iteration 0 in 0.213s
DEBUG:implicit:loss at iteration 0 is -0.0037069013342261314
DEBUG:implicit:finished iteration 1 in 0.197s
DEBUG:implicit:loss at iteration 1 is -0.004564741626381874
DEBUG:implicit:finished iteration 2 in 0.203s
DEBUG:implicit:loss at iteration 2 is -1.161307692527771
DEBUG:implicit:finished iteration 3 in 0.193s
DEBUG:implicit:loss at iteration 3 is -7.569947242736816
DEBUG:implicit:finished iteration 4 in 0.200s
DEBUG:implicit:loss at iteration 4 is -0.0979917049407959
DEBUG:implicit:finished iteration 5 in 0.224s
DEBUG:implicit:loss at iteration 5 is 0.009082394652068615
DEBUG:implicit:finished iteration 6 in 0.208s
DEBUG:implicit:loss at iteration 6 is 0.160442054271698
DEBUG:implicit:finished iteration 7 in 0.202s
DEBUG:implicit:loss at iteration 7 is -0.01064402237534523
DEBUG:implicit:finished iteration 8 in 0.201s
DEBUG:implicit:loss at iteration 8 is -0.008704557083547115
DEB

trained model 'faiss_als' in 6.72
200 0.1
reading data from ../dblp-ref/merged_train.tsv


DEBUG:implicit:Calculated transpose in 0.011s


read data file in 1.3966021537780762
weighting matrix by bm25_weight
training model faiss_als


DEBUG:implicit:initialize factors in 1.2033517360687256
DEBUG:implicit:finished iteration 0 in 0.665s
DEBUG:implicit:loss at iteration 0 is -0.003416931489482522
DEBUG:implicit:finished iteration 1 in 0.647s
DEBUG:implicit:loss at iteration 1 is -0.0033125898335129023
DEBUG:implicit:finished iteration 2 in 0.649s
DEBUG:implicit:loss at iteration 2 is -0.0032741837203502655
DEBUG:implicit:finished iteration 3 in 0.651s
DEBUG:implicit:loss at iteration 3 is nan
DEBUG:implicit:finished iteration 4 in 0.915s
DEBUG:implicit:loss at iteration 4 is -0.003581139724701643
DEBUG:implicit:finished iteration 5 in 0.405s
DEBUG:implicit:loss at iteration 5 is -0.003580586751922965
DEBUG:implicit:finished iteration 6 in 0.657s
DEBUG:implicit:loss at iteration 6 is -3678.474365234375
DEBUG:implicit:finished iteration 7 in 0.670s
DEBUG:implicit:loss at iteration 7 is -209.8435821533203
DEBUG:implicit:finished iteration 8 in 0.665s
DEBUG:implicit:loss at iteration 8 is -145.05694580078125
DEBUG:implicit

trained model 'faiss_als' in 16.85


In [9]:
k=75
reg=0.1
start(k, reg, '../dblp-ref/merged_train.tsv', "../faiss_rank_{}_lam_{}_output_merged_train.tsv".format(k,reg))

reading data from ../dblp-ref/merged_train.tsv


DEBUG:implicit:Calculated transpose in 0.006s


read data file in 1.49714994430542
weighting matrix by bm25_weight
training model faiss_als


DEBUG:implicit:initialize factors in 0.4359250068664551
DEBUG:implicit:finished iteration 0 in 0.137s
DEBUG:implicit:loss at iteration 0 is -0.003520719241350889
DEBUG:implicit:finished iteration 1 in 0.151s
DEBUG:implicit:loss at iteration 1 is -1465.019775390625
DEBUG:implicit:finished iteration 2 in 0.186s
DEBUG:implicit:loss at iteration 2 is nan
DEBUG:implicit:finished iteration 3 in 0.556s
DEBUG:implicit:loss at iteration 3 is -0.0035811392590403557
DEBUG:implicit:finished iteration 4 in 0.180s
DEBUG:implicit:loss at iteration 4 is -0.00358113506808877
DEBUG:implicit:finished iteration 5 in 0.217s
DEBUG:implicit:loss at iteration 5 is nan
DEBUG:implicit:finished iteration 6 in 0.111s
DEBUG:implicit:loss at iteration 6 is -0.003581071738153696
DEBUG:implicit:finished iteration 7 in 0.203s
DEBUG:implicit:loss at iteration 7 is nan
DEBUG:implicit:finished iteration 8 in 0.106s
DEBUG:implicit:loss at iteration 8 is -0.003581122262403369
DEBUG:implicit:finished iteration 9 in 0.199s
D

trained model 'faiss_als' in 5.61
generated recommendations in 84.49


In [16]:
k=68
reg=0.01
model="faiss_als"
start(k, reg, '../dblp-ref/aminer-debug_train.tsv', "../{}_rank_{}_lam_{}_output_merged_train.tsv".format(model,k,reg), model=model)

reading data from ../dblp-ref/aminer-debug_train.tsv




read data file in 1.5024960041046143
weighting matrix by bm25_weight
training model faiss_als


DEBUG:implicit:Calculated transpose in 0.012s
DEBUG:implicit:initialize factors in 0.35889697074890137
DEBUG:implicit:finished iteration 0 in 0.107s
DEBUG:implicit:loss at iteration 0 is 0.0027605979703366756
DEBUG:implicit:finished iteration 1 in 0.118s
DEBUG:implicit:loss at iteration 1 is 29.408891677856445
DEBUG:implicit:finished iteration 2 in 0.118s
DEBUG:implicit:loss at iteration 2 is 22.787931442260742
DEBUG:implicit:finished iteration 3 in 0.118s
DEBUG:implicit:loss at iteration 3 is 22.739137649536133
DEBUG:implicit:finished iteration 4 in 0.117s
DEBUG:implicit:loss at iteration 4 is 0.9866578578948975
DEBUG:implicit:finished iteration 5 in 0.115s
DEBUG:implicit:loss at iteration 5 is 0.031559694558382034
DEBUG:implicit:finished iteration 6 in 0.123s
DEBUG:implicit:loss at iteration 6 is 0.029237857088446617
DEBUG:implicit:finished iteration 7 in 0.119s
DEBUG:implicit:loss at iteration 7 is 0.019538160413503647
DEBUG:implicit:finished iteration 8 in 0.119s
DEBUG:implicit:los

trained model 'faiss_als' in 4.59


KeyboardInterrupt: 

In [18]:
k=68
reg=0.01
model="faiss_als"
start(k, reg, '../dblp-ref/test.tsv', "../{}_rank_{}_lam_{}_output_merged_train.tsv".format(model,k,reg), model=model)

reading data from ../dblp-ref/test.tsv


DEBUG:implicit:Calculated transpose in 0.003s


read data file in 0.5276312828063965
weighting matrix by bm25_weight
training model faiss_als


DEBUG:implicit:initialize factors in 0.12929058074951172
DEBUG:implicit:finished iteration 0 in 0.042s
DEBUG:implicit:loss at iteration 0 is -0.0031484595965594053
DEBUG:implicit:finished iteration 1 in 0.049s
DEBUG:implicit:loss at iteration 1 is -171920.921875
DEBUG:implicit:finished iteration 2 in 0.053s
DEBUG:implicit:loss at iteration 2 is -1139844.875
DEBUG:implicit:finished iteration 3 in 0.051s
DEBUG:implicit:loss at iteration 3 is -11634379.0
DEBUG:implicit:finished iteration 4 in 0.063s
DEBUG:implicit:loss at iteration 4 is -6728624640.0
DEBUG:implicit:finished iteration 5 in 0.049s
DEBUG:implicit:loss at iteration 5 is nan
DEBUG:implicit:finished iteration 6 in 0.157s
DEBUG:implicit:loss at iteration 6 is -0.0032769700046628714
DEBUG:implicit:finished iteration 7 in 0.034s
DEBUG:implicit:loss at iteration 7 is -0.003288082778453827
DEBUG:implicit:finished iteration 8 in 0.104s
DEBUG:implicit:loss at iteration 8 is nan
DEBUG:implicit:finished iteration 9 in 0.025s
DEBUG:impli

trained model 'faiss_als' in 2.52


KeyError: -1

# Result