# Train test split

Here we try a very simple idea of train-test split. We just split the rows of the data file randomly into a train set and a test set.

# Setup

In [1]:
import sys
sys.path.insert(0, '../src')
import preprocess
import utils
import random
import matplotlib.pyplot as plt
from collections import defaultdict
%matplotlib inline
from matplotlib import rc
rc('figure', figsize=(16, 8), max_open_warning=False)
from surprise import SVD, SVDpp, NMF
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise import accuracy
from surprise.model_selection import train_test_split
import pandas as pd
import numpy as np
import implicit
import codecs
import json

In [2]:
from sklearn.model_selection import train_test_split, KFold

def aminer_implicit_train_test_split(filename, test_size=0.1, random_state=None):
    """Randomly split a (user, ref, score) TSV file into train and test files.

    The first three columns of ``filename`` are read and the rows are split
    with ``train_test_split``.  The two parts are written next to the input
    as ``<stem>_train.tsv`` and ``<stem>_test.tsv`` (tab separated, no header,
    no index), where ``<stem>`` is ``filename`` without its extension.  The
    same function can be applied again to the train file to obtain a
    train/validation/test split.

    Args:
        filename: path of the tab-separated input file.
        test_size: forwarded to ``train_test_split``; defaults to the
            previously hard-coded 0.1.
        random_state: forwarded to ``train_test_split``; pass a value to make
            the split reproducible.  ``None`` keeps the old behaviour.
    """
    import os

    df = pd.read_table(filename,
                       usecols=[0, 1, 2],
                       names=['user', 'ref', 'score'],
                       na_filter=False)
    train, test = train_test_split(df, test_size=test_size,
                                   random_state=random_state)

    # Strip the real extension instead of blindly cutting four characters,
    # so names such as "data.tsv.gz"-less stems, ".csv", ".txt" or
    # extension-less paths are not mangled.
    stem, _ = os.path.splitext(filename)
    train.to_csv(stem + '_train.tsv', sep='\t', header=False, index=False)
    # NOTE(original author): should the test file contain some false
    # (negative) entries as well?
    test.to_csv(stem + '_test.tsv', sep='\t', header=False, index=False)

# Implicit

In [6]:
import argparse
import codecs
import logging
import time

import numpy
import pandas
from scipy.sparse import coo_matrix

from implicit.als import AlternatingLeastSquares
from implicit.approximate_als import (AnnoyAlternatingLeastSquares, FaissAlternatingLeastSquares,
                                      NMSLibAlternatingLeastSquares)
from implicit.bpr import BayesianPersonalizedRanking
from implicit.nearest_neighbours import (BM25Recommender, CosineRecommender,
                                         TFIDFRecommender, bm25_weight)

# Registry of the model names understood by get_model(): each key is the
# string passed as `model_name`, each value the class that gets instantiated.
MODELS = {
    "als": AlternatingLeastSquares,
    "nmslib_als": NMSLibAlternatingLeastSquares,
    "annoy_als": AnnoyAlternatingLeastSquares,
    "faiss_als": FaissAlternatingLeastSquares,
    "tfidf": TFIDFRecommender,
    "cosine": CosineRecommender,
    "bpr": BayesianPersonalizedRanking,
    "bm25": BM25Recommender,
}


def get_model(model_name, k, lam):
    """Instantiate the model class registered under ``model_name`` in ``MODELS``.

    ``k`` is passed as ``factors`` and ``lam`` as ``regularization``, but only
    to "faiss_als" and to classes derived from ``AlternatingLeastSquares``;
    every other model ignores both and uses the fixed settings below.

    Raises:
        ValueError: if ``model_name`` is not a key of ``MODELS``.
    """
    model_class = MODELS.get(model_name)
    if model_class is None:
        raise ValueError("Unknown Model '%s'" % model_name)

    # ALS family (the explicit "faiss_als" check is kept from the original).
    if model_name == "faiss_als" or issubclass(model_class, AlternatingLeastSquares):
        return model_class(
            factors=k,
            dtype=numpy.float32,
            # Author's note: training is substantially slower without use_gpu.
            # (Passing 'gpu': True instead produced "keyerror -1".)
            use_gpu=True,
            calculate_training_loss=True,
            regularization=lam,
        )

    if model_name == "bm25":
        return model_class(K1=100, B=0.5)

    if model_name == "bpr":
        # Fixed number of factors; `k` is not used for this model.
        return model_class(factors=63, use_gpu=True)

    # Remaining models are built with their own defaults.
    return model_class()


def read_data(filename):
    """Load a tab-separated (user, ref, cites) file.

    Only the first three columns of ``filename`` are read.  Returns a tuple
    ``(data, cites)`` where ``data`` is the pandas DataFrame with ``user`` and
    ``ref`` converted to categoricals, and ``cites`` is a
    ``scipy.sparse.coo_matrix`` of float32 ``cites`` values whose rows are
    indexed by the ``ref`` category codes and whose columns are indexed by the
    ``user`` category codes.
    """
    began = time.time()
    print("reading data from %s"% (filename))
    data = pandas.read_table(
        filename,
        usecols=[0, 1, 2],
        names=['user', 'ref', 'cites'],
        na_filter=False,
    )

    # Categoricals give every distinct user / ref a dense integer code.
    for column in ('user', 'ref'):
        data[column] = data[column].astype("category")

    values = data['cites'].astype(numpy.float32)
    row_codes = data['ref'].cat.codes.copy()
    col_codes = data['user'].cat.codes.copy()
    cites = coo_matrix((values, (row_codes, col_codes)))

    print("read data file in {}".format( time.time() - began))
    return data, cites

'''
def calculate_similar_artists(input_filename, output_filename, model_name="als"):
    """ generates a list of similar artists in lastfm by utiliizing the 'similar_items'
    api of the models """
    df, plays = read_data(input_filename)

    # create a model from the input data
    model = get_model(model_name)

    # if we're training an ALS based model, weight input for last.fm
    # by bm25
    if issubclass(model.__class__, AlternatingLeastSquares):
        # lets weight these models by bm25weight.
        print("weighting matrix by bm25_weight")
        plays = bm25_weight(plays, K1=100, B=0.8)

        # also disable building approximate recommend index
        model.approximate_recommend = False

    # this is actually disturbingly expensive:
    plays = plays.tocsr()

    print("training model {}".format( model_name))
    start = time.time()
    model.fit(plays)
    print("trained model '%s' in %0.2f"%( model_name, time.time() - start))

    # write out similar artists by popularity
    artists = dict(enumerate(df['artist'].cat.categories))
    start = time.time()
    print("calculating top artists")
    user_count = df.groupby('artist').size()
    to_generate = sorted(list(artists), key=lambda x: -user_count[x])

    # write out as a TSV of artistid, otherartistid, score
    with codecs.open(output_filename, "w", "utf8") as o:
        for artistid in to_generate:
            artist = artists[artistid]
            for other, score in model.similar_items(artistid, 11):
                o.write("%s\t%s\t%s\n" % (artist, artists[other], score))

    print("generated similar artists in %0.2f"%(time.time() - start))
'''

def calculate_recommendations(k, lam, input_filename, output_filename, model_name="als"):
    """Train the model named ``model_name`` on the data in ``input_filename``.

    ``k`` and ``lam`` are forwarded to ``get_model``.  The step that wrote
    per-user recommendations to ``output_filename`` is disabled (kept as
    comments at the end of the body), so ``output_filename`` is currently
    unused and the function returns nothing.
    """
    _frame, cites = read_data(input_filename)
    model = get_model(model_name, k, lam)

    if isinstance(model, AlternatingLeastSquares):
        # ALS-based models are trained on a BM25-weighted matrix.
        print("weighting matrix by bm25_weight")
        cites = bm25_weight(cites, K1=100, B=0.8)

        # NOTE(review): the original comment here read "disable building
        # approximate recommend index", but the attribute being switched off
        # is approximate_similar_items - confirm which one is intended.
        model.approximate_similar_items = False

    # Author's note: this conversion is disturbingly expensive.
    cites = cites.tocsr()

    print("training model {}".format(model_name))
    fit_started = time.time()
    model.fit(cites)
    print("trained model '%s' in %0.2f"%( model_name, time.time() - fit_started))

    # Disabled: generate recommendations for each user and write them out.
    #
    # artists = dict(enumerate(df['ref'].cat.categories))
    # start = time.time()
    # user_cites = cites.T.tocsr()
    # with codecs.open(output_filename, "w", "utf8") as o:
    #     for userid, username in enumerate(df['user'].cat.categories):
    #         for artistid, score in model.recommend(userid, user_cites):
    #             o.write("%s\t%s\t%s\n" % (username, artists[artistid], score))
    # print("generated recommendations in %0.2f"%(  time.time() - start))

def start(k, lam, inputfile='../dblp-ref/aminer-debug.tsv',outputfile='../output-debug.tsv', model='faiss_als'):
    """Run ``calculate_recommendations`` for ``model`` with ``k`` and ``lam``.

    Thin wrapper: every argument is forwarded unchanged.  The alternative
    similar-items path (calculate_similar_artists) is disabled, so the
    recommendation path is always taken.
    """
    calculate_recommendations(
        k=k,
        lam=lam,
        input_filename=inputfile,
        output_filename=outputfile,
        model_name=model,
    )
       

# Run it on debug data (wrong, arbitrary train-test split)

In [4]:
# Append everything the "implicit" logger emits to output.log.
logging.getLogger("implicit").addHandler(
    logging.FileHandler(filename='output.log', mode='a')
)

In [5]:
# Let the "implicit" logger pass DEBUG records (per-iteration timing / loss).
logging.getLogger(name="implicit").setLevel(level=logging.DEBUG)

In [7]:
!export OPENBLAS_NUM_THREADS=1
!export MKL_NUM_THREADS=1

#logging.basicConfig(filename='example.log',level=logging.DEBUG)
#This file should be used to draw plots; but it is not closing.

#k = 100
lam = 0.01
for k in [10,25,50,75,100,200]:
    print(k,lam)
    start(k, lam, '../dblp-ref/aminer-debug_train.tsv', "../faiss_rank_{}_output_debug_train.tsv".format(k))

10 0.01
reading data from ../dblp-ref/aminer-debug_train.tsv




read data file in 1.5096197128295898
weighting matrix by bm25_weight


DEBUG:implicit:Calculated transpose in 0.007s
DEBUG:implicit:initialize factors in 0.05744028091430664


training model faiss_als


DEBUG:implicit:finished iteration 0 in 0.027s
DEBUG:implicit:loss at iteration 0 is 0.0028188310097903013
DEBUG:implicit:finished iteration 1 in 0.028s
DEBUG:implicit:loss at iteration 1 is 0.002806367352604866
DEBUG:implicit:finished iteration 2 in 0.029s
DEBUG:implicit:loss at iteration 2 is 0.002800916787236929
DEBUG:implicit:finished iteration 3 in 0.032s
DEBUG:implicit:loss at iteration 3 is 0.002799402689561248
DEBUG:implicit:finished iteration 4 in 0.034s
DEBUG:implicit:loss at iteration 4 is 0.0027988534420728683
DEBUG:implicit:finished iteration 5 in 0.030s
DEBUG:implicit:loss at iteration 5 is 0.0027986024506390095
DEBUG:implicit:finished iteration 6 in 0.029s
DEBUG:implicit:loss at iteration 6 is 0.002798462985083461
DEBUG:implicit:finished iteration 7 in 0.030s
DEBUG:implicit:loss at iteration 7 is 0.002798376139253378
DEBUG:implicit:finished iteration 8 in 0.028s
DEBUG:implicit:loss at iteration 8 is 0.0027983197942376137
DEBUG:implicit:finished iteration 9 in 0.027s
DEBUG

trained model 'faiss_als' in 2.22
25 0.01
reading data from ../dblp-ref/aminer-debug_train.tsv




read data file in 1.4836468696594238
weighting matrix by bm25_weight


DEBUG:implicit:Calculated transpose in 0.009s
DEBUG:implicit:initialize factors in 0.13600420951843262


training model faiss_als


DEBUG:implicit:finished iteration 0 in 0.028s
DEBUG:implicit:loss at iteration 0 is 0.0028011836111545563
DEBUG:implicit:finished iteration 1 in 0.031s
DEBUG:implicit:loss at iteration 1 is 0.0027780537493526936
DEBUG:implicit:finished iteration 2 in 0.033s
DEBUG:implicit:loss at iteration 2 is 0.0027706236578524113
DEBUG:implicit:finished iteration 3 in 0.033s
DEBUG:implicit:loss at iteration 3 is 0.0027684480883181095
DEBUG:implicit:finished iteration 4 in 0.033s
DEBUG:implicit:loss at iteration 4 is 0.0027675526216626167
DEBUG:implicit:finished iteration 5 in 0.035s
DEBUG:implicit:loss at iteration 5 is 0.0027670892886817455
DEBUG:implicit:finished iteration 6 in 0.032s
DEBUG:implicit:loss at iteration 6 is 0.0027668101247400045
DEBUG:implicit:finished iteration 7 in 0.034s
DEBUG:implicit:loss at iteration 7 is 0.0027666129171848297
DEBUG:implicit:finished iteration 8 in 0.035s
DEBUG:implicit:loss at iteration 8 is 0.002766469493508339
DEBUG:implicit:finished iteration 9 in 0.031s
D

trained model 'faiss_als' in 1.99
50 0.01
reading data from ../dblp-ref/aminer-debug_train.tsv




read data file in 1.4491844177246094
weighting matrix by bm25_weight
training model faiss_als


DEBUG:implicit:Calculated transpose in 0.007s
DEBUG:implicit:initialize factors in 0.26712965965270996
DEBUG:implicit:finished iteration 0 in 0.065s
DEBUG:implicit:loss at iteration 0 is 0.0028647612780332565
DEBUG:implicit:finished iteration 1 in 0.135s
DEBUG:implicit:loss at iteration 1 is nan
DEBUG:implicit:finished iteration 2 in 0.047s
DEBUG:implicit:loss at iteration 2 is 0.0028311999049037695
DEBUG:implicit:finished iteration 3 in 0.133s
DEBUG:implicit:loss at iteration 3 is nan
DEBUG:implicit:finished iteration 4 in 0.044s
DEBUG:implicit:loss at iteration 4 is 0.0028312443755567074
DEBUG:implicit:finished iteration 5 in 0.141s
DEBUG:implicit:loss at iteration 5 is nan
DEBUG:implicit:finished iteration 6 in 0.039s
DEBUG:implicit:loss at iteration 6 is 0.002831238554790616
DEBUG:implicit:finished iteration 7 in 0.147s
DEBUG:implicit:loss at iteration 7 is nan
DEBUG:implicit:finished iteration 8 in 0.041s
DEBUG:implicit:loss at iteration 8 is 0.0028312434442341328
DEBUG:implicit:f

trained model 'faiss_als' in 5.21
75 0.01
reading data from ../dblp-ref/aminer-debug_train.tsv




read data file in 1.6788642406463623
weighting matrix by bm25_weight


DEBUG:implicit:Calculated transpose in 0.013s


training model faiss_als


DEBUG:implicit:initialize factors in 0.4050014019012451
DEBUG:implicit:finished iteration 0 in 0.121s
DEBUG:implicit:loss at iteration 0 is 0.002768352860584855
DEBUG:implicit:finished iteration 1 in 0.144s
DEBUG:implicit:loss at iteration 1 is 5.920126438140869
DEBUG:implicit:finished iteration 2 in 0.141s
DEBUG:implicit:loss at iteration 2 is 2.0068018436431885
DEBUG:implicit:finished iteration 3 in 0.137s
DEBUG:implicit:loss at iteration 3 is 0.31495827436447144
DEBUG:implicit:finished iteration 4 in 0.144s
DEBUG:implicit:loss at iteration 4 is 0.3155200481414795
DEBUG:implicit:finished iteration 5 in 0.140s
DEBUG:implicit:loss at iteration 5 is 0.2117840051651001
DEBUG:implicit:finished iteration 6 in 0.128s
DEBUG:implicit:loss at iteration 6 is 0.23009587824344635
DEBUG:implicit:finished iteration 7 in 0.129s
DEBUG:implicit:loss at iteration 7 is -0.012185148894786835
DEBUG:implicit:finished iteration 8 in 0.125s
DEBUG:implicit:loss at iteration 8 is 0.16417063772678375
DEBUG:impl

trained model 'faiss_als' in 5.37
100 0.01
reading data from ../dblp-ref/aminer-debug_train.tsv




read data file in 1.571303367614746
weighting matrix by bm25_weight
training model faiss_als


DEBUG:implicit:Calculated transpose in 0.007s
DEBUG:implicit:initialize factors in 0.5423779487609863
DEBUG:implicit:finished iteration 0 in 0.203s
DEBUG:implicit:loss at iteration 0 is 0.0027237783651798964
DEBUG:implicit:finished iteration 1 in 0.193s
DEBUG:implicit:loss at iteration 1 is 1980.674072265625
DEBUG:implicit:finished iteration 2 in 0.194s
DEBUG:implicit:loss at iteration 2 is 1187.1552734375
DEBUG:implicit:finished iteration 3 in 0.247s
DEBUG:implicit:loss at iteration 3 is nan
DEBUG:implicit:finished iteration 4 in 0.107s
DEBUG:implicit:loss at iteration 4 is 0.002831244608387351
DEBUG:implicit:finished iteration 5 in 0.240s
DEBUG:implicit:loss at iteration 5 is nan
DEBUG:implicit:finished iteration 6 in 0.104s
DEBUG:implicit:loss at iteration 6 is 0.0028312443755567074
DEBUG:implicit:finished iteration 7 in 0.258s
DEBUG:implicit:loss at iteration 7 is nan
DEBUG:implicit:finished iteration 8 in 0.107s
DEBUG:implicit:loss at iteration 8 is 0.0028312434442341328
DEBUG:imp

trained model 'faiss_als' in 6.43
200 0.01
reading data from ../dblp-ref/aminer-debug_train.tsv




read data file in 1.4590554237365723
weighting matrix by bm25_weight


DEBUG:implicit:Calculated transpose in 0.009s


training model faiss_als


DEBUG:implicit:initialize factors in 1.0761346817016602
DEBUG:implicit:finished iteration 0 in 0.582s
DEBUG:implicit:loss at iteration 0 is 0.002694434253498912
DEBUG:implicit:finished iteration 1 in 0.590s
DEBUG:implicit:loss at iteration 1 is 0.00262020924128592
DEBUG:implicit:finished iteration 2 in 0.594s
DEBUG:implicit:loss at iteration 2 is 0.0026027236599475145
DEBUG:implicit:finished iteration 3 in 0.594s
DEBUG:implicit:loss at iteration 3 is 0.0025661822874099016
DEBUG:implicit:finished iteration 4 in 0.591s
DEBUG:implicit:loss at iteration 4 is 0.0025594253093004227
DEBUG:implicit:finished iteration 5 in 0.590s
DEBUG:implicit:loss at iteration 5 is 0.0025530599523335695
DEBUG:implicit:finished iteration 6 in 0.595s
DEBUG:implicit:loss at iteration 6 is 0.0025472294073551893
DEBUG:implicit:finished iteration 7 in 0.595s
DEBUG:implicit:loss at iteration 7 is 0.0025435981806367636
DEBUG:implicit:finished iteration 8 in 0.587s
DEBUG:implicit:loss at iteration 8 is 0.0025410591624

trained model 'faiss_als' in 16.15
