In [1]:
# imports
import tabulate
import random as rd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from collections import defaultdict
from scipy import stats
from surprise import AlgoBase
from surprise import BaselineOnly
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import NMF
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import train_test_split

In [2]:
# Notebook-wide configuration and RNG seeding.
item_threshold = 1  # minimum per-(user, item) event count to keep; 1 disables filtering
my_seed = 0  # seed shared by `random`, NumPy and the train/test split
predict_col = 'artist'  # event column that plays the role of the item
top_fraction = 0.2  # NOTE(review): not referenced in the cells visible here

# Display names of the evaluated algorithms; order matters because
# fitmodels() pairs them with its model list by position.
algo_names = [
    'Random',
    'MostPopular',
    'UserItemAvg',
    'UserKNN',
    'UserKNNAvg',
    'NMF',
]

# Seed both global generators so a fresh kernel run is repeatable.
rd.seed(my_seed)
np.random.seed(my_seed)


In [3]:
def readdata(data_src):
    """Load one dataset variant and turn it into a Surprise train/test split.

    Reads the three user-group files and the raw user events from
    ``data/<data_src>/``, collapses the events to one row per
    (user, ``predict_col``) pair whose ``count`` is the number of events,
    drops rows with a count below ``item_threshold``, rescales every user's
    counts to the range [1, 1000] and splits the result 80/20.

    Args:
        data_src: Name of the sub-directory of ``data/`` to read from.

    Returns:
        ``[low_users, medium_users, high_users, trainset, testset]``: the
        three group DataFrames indexed by ``user_id``, followed by the two
        halves returned by Surprise's ``train_test_split``.

    Raises:
        ValueError: If no events remain after filtering.
    """
    user_events_file = ("data/%s/user_events.txt" % data_src)
    low_user_file = ("data/%s/low_main_users.txt" % data_src)
    medium_user_file = ("data/%s/medium_main_users.txt" % data_src)
    high_user_file = ("data/%s/high_main_users.txt" % data_src)
    low_users = pd.read_csv(low_user_file, sep=',').set_index('user_id')
    medium_users = pd.read_csv(medium_user_file, sep=',').set_index('user_id')
    high_users = pd.read_csv(high_user_file, sep=',').set_index('user_id')

    # Read the tab-separated user events and count events per (user, item).
    cols = ['user', 'artist', 'album', 'track', 'timestamp']
    df_events = pd.read_csv(user_events_file, sep='\t', names=cols)
    df_events = df_events.groupby(['user', predict_col]).size().reset_index(name='count')
    df_events = df_events[df_events['count'] >= item_threshold]
    print('No. filtered user events: ' + str(len(df_events)))
    print('No. filtered items: ' + str(len(df_events[predict_col].unique())))
    if df_events.empty:
        raise ValueError('No user events left after filtering in %s' % user_events_file)

    # Rescale counts per user so every user's ratings span the same range.
    # Frames are collected and concatenated once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the whole frame on every call.
    scaled_groups = []
    for _, group in df_events.groupby('user'):
        scaler = MinMaxScaler(feature_range=(1, 1000))
        counts = group['count'].values.reshape(-1, 1).astype(float)
        scaled = group.copy()
        # The scaler works on a 2-D column; flatten it back for assignment.
        scaled['count'] = scaler.fit_transform(counts).ravel()
        scaled_groups.append(scaled)
    df_events = pd.concat(scaled_groups)

    min_count = df_events['count'].min()
    max_count = df_events['count'].max()
    print('Min rating: ' + str(min_count))
    print('Max rating: ' + str(max_count))
    reader = Reader(rating_scale=(min_count, max_count))
    data = Dataset.load_from_df(df_events, reader)
    trainset, testset = train_test_split(data, test_size = 0.2, random_state = my_seed)
    return [low_users, medium_users, high_users, trainset, testset]

In [4]:
def get_pred_of_groups(lu, mu, hu, predictions):
    """Print the overall MAE and split per-prediction errors by user group.

    Args:
        lu: DataFrame whose index holds the ids of the low group.
        mu: DataFrame whose index holds the ids of the medium group.
        hu: DataFrame for the high group (accepted but never consulted).
        predictions: Iterable of ``(uid, iid, true_r, est, details)`` tuples.

    Returns:
        ``[low, med, high]``: three lists holding, for each prediction of a
        user in that group, the MAE computed over that single prediction.
    """
    print('All: ')
    accuracy.mae(predictions)  # called for its printed output; result unused

    errors_low, errors_med, errors_high = [], [], []
    for uid, iid, true_r, est, details in predictions:
        if uid in lu.index:
            bucket = errors_low
        elif uid in mu.index:
            bucket = errors_med
        else:
            # NOTE(review): every user outside lu/mu lands here, including
            # any user that is not listed in hu either; confirm intended.
            bucket = errors_high
        single = [(uid, iid, true_r, est, details)]
        bucket.append(accuracy.mae(single, verbose=False))
    return [errors_low, errors_med, errors_high]

In [5]:
def fitmodels(trainset, testset):
    """Fit every personalized recommender and collect its test predictions.

    The model list is aligned by position with the notebook-level
    ``algo_names``; 'Random' and 'MostPopular' have no model here and are
    skipped.

    Args:
        trainset: Training data handed to each model's ``fit``.
        testset: Test data handed to each model's ``test``.

    Returns:
        One prediction list per fitted model, in ``algo_names`` order.
    """
    # Cosine similarity between users, shared by both KNN variants.
    user_cosine = {'name': 'cosine', 'user_based': True}
    models = [
        None,  # Random: placeholder, never fitted
        None,  # MostPopular: placeholder, never fitted
        BaselineOnly(),
        KNNBasic(sim_options=user_cosine, k=40),
        KNNWithMeans(sim_options=user_cosine, k=40),
        NMF(n_factors=15),
    ]

    collected = []
    for slot, name in enumerate(algo_names):
        if name == 'Random' or name == 'MostPopular':
            continue
        print(name)
        model = models[slot]
        model.fit(trainset)
        collected.append(model.test(testset))
    return collected

In [6]:
# Load the "original" dataset variant, then fit all personalized models on it.
(low_users,
 medium_users,
 high_users,
 trainset,
 testset) = readdata("original")
restable = fitmodels(trainset=trainset, testset=testset)


In [7]:
# Build one result row per fitted algorithm: mean absolute error for each
# user group and overall, plus p-values of pairwise independent t-tests
# (scipy defaults) between the groups' per-prediction errors.
ptable = []
for predictions in restable:
    low_predictions, med_predictions, high_predictions = get_pred_of_groups(
        low_users, medium_users, high_users, predictions)
    lowMS = np.mean(low_predictions)
    medMS = np.mean(med_predictions)
    highMS = np.mean(high_predictions)
    # get_pred_of_groups already printed the overall MAE, so compute it
    # quietly here; the result is a scalar and needs no further averaging.
    allMS = accuracy.mae(predictions, verbose=False)
    ttestLH = stats.ttest_ind(low_predictions, high_predictions)
    ttestLM = stats.ttest_ind(low_predictions, med_predictions)
    ttestMH = stats.ttest_ind(med_predictions, high_predictions)
    ptable.append([lowMS, medMS, highMS, allMS, ttestLH.pvalue, ttestLM.pvalue, ttestMH.pvalue])

# Column headers are the algorithms fitmodels() actually evaluates: the same
# name filter it applies, rather than hard-coded positions in algo_names.
evaluated_algo_names = [name for name in algo_names if name not in ('Random', 'MostPopular')]
row_labels = ["LowMS", "MedMS", "HighMS", "All", "p-value Low-High", "p-value Low-Med", "p-value Med-High"]

# Transpose so that metrics are rows and algorithms are columns.
table = tabulate.tabulate(
    np.array(ptable).transpose(),
    headers=evaluated_algo_names,
    showindex=row_labels,
    tablefmt='latex_raw',
)
print(table)
