# Prepare Last.FM 1K Data Set

This script will prepare the Last.FM 1K data set for the experiment.

This data set contains individual play records, but our experiment is set up for artist play counts. So we will proceed in 2 steps:

1. Aggregate by artist to produce play counts.
2. Crossfold for recommender evaluation.

## Setup

In [1]:
library(readr)
library(dplyr)


Caricamento pacchetto: 'dplyr'


I seguenti oggetti sono mascherati da 'package:stats':

    filter, lag


I seguenti oggetti sono mascherati da 'package:base':

    intersect, setdiff, setequal, union




In [2]:
options(repr.matrix.max.rows=10)

# Read Data

In [4]:
message("reading users")
users = read_tsv("data/lastfm-dataset-1K/userid-profile.tsv") %>%
    rename(key=`#id`) %>%
    mutate(id=1:n())
users

reading users

[1m[1mRows: [1m[22m[34m[34m992[34m[39m [1m[1mColumns: [1m[22m[34m[34m5[34m[39m

[36m--[39m [1m[1mColumn specification[1m[22m [36m------------------------------------------------------------------------------------------------[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (4): #id, gender, country, registered
[32mdbl[39m (1): age


[36mi[39m Use [30m[47m[30m[47m`spec()`[47m[30m[49m[39m to retrieve the full column specification for this data.
[36mi[39m Specify the column types or set [30m[47m[30m[47m`show_col_types = FALSE`[47m[30m[49m[39m to quiet this message.



key,gender,age,country,registered,id
<chr>,<chr>,<dbl>,<chr>,<chr>,<int>
user_000001,m,,Japan,"Aug 13, 2006",1
user_000002,f,,Peru,"Feb 24, 2006",2
user_000003,m,22,United States,"Oct 30, 2005",3
user_000004,f,,,"Apr 26, 2006",4
user_000005,m,,Bulgaria,"Jun 29, 2006",5
...,...,...,...,...,...
user_000996,f,,United States,"Jul 17, 2006",988
user_000997,m,,United States,"Jan 5, 2007",989
user_000998,m,,United Kingdom,"Sep 28, 2005",990
user_000999,f,,Poland,"Jul 24, 2007",991


In [5]:
message("reading plays")
plays = read_delim("data/lastfm-dataset-1K/userid-timestamp-artid-artname-traid-traname.tsv",
                   col_names=c("user", "time", "artistId", "artist", "trackId", "track"),
                   delim="\t", quote="")
head(plays)

reading plays

[1m[1mRows: [1m[22m[34m[34m19150868[34m[39m [1m[1mColumns: [1m[22m[34m[34m6[34m[39m

[36m--[39m [1m[1mColumn specification[1m[22m [36m------------------------------------------------------------------------------------------------[39m
[1mDelimiter:[22m "\t"
[31mchr[39m  (5): user, artistId, artist, trackId, track
[34mdttm[39m (1): time


[36mi[39m Use [30m[47m[30m[47m`spec()`[47m[30m[49m[39m to retrieve the full column specification for this data.
[36mi[39m Specify the column types or set [30m[47m[30m[47m`show_col_types = FALSE`[47m[30m[49m[39m to quiet this message.



user,time,artistId,artist,trackId,track
<chr>,<dttm>,<chr>,<chr>,<chr>,<chr>
user_000001,2009-05-04 23:08:57,f1b1cf71-bd35-4e99-8624-24a6e15f133a,Deep Dish,,Fuck Me Im Famous (Pacha Ibiza)-09-28-2007
user_000001,2009-05-04 13:54:10,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,<U+5742><U+672C><U+9F8D><U+4E00>,,Composition 0919 (Live_2009_4_15)
user_000001,2009-05-04 13:52:04,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,<U+5742><U+672C><U+9F8D><U+4E00>,,Mc2 (Live_2009_4_15)
user_000001,2009-05-04 13:42:52,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,<U+5742><U+672C><U+9F8D><U+4E00>,,Hibari (Live_2009_4_15)
user_000001,2009-05-04 13:42:11,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,<U+5742><U+672C><U+9F8D><U+4E00>,,Mc1 (Live_2009_4_15)
user_000001,2009-05-04 13:38:31,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,<U+5742><U+672C><U+9F8D><U+4E00>,,To Stanford (Live_2009_4_15)


In [6]:
plays %>%
    filter(is.na(artistId)) %>%
    head()

user,time,artistId,artist,trackId,track
<chr>,<dttm>,<chr>,<chr>,<chr>,<chr>
user_000001,2009-05-02 14:30:56,,Rocket Empire,,Simmer Down Jammie
user_000001,2009-05-01 11:58:15,,Rocket Empire,,Simmer Down Jammie
user_000001,2009-05-01 09:36:10,,Rocket Empire,,Simmer Down Jammie
user_000001,2009-05-01 09:12:21,,Nicken,,Lifeb
user_000001,2009-05-01 08:58:19,,Ivan Enot,,Grooves Of The Heart
user_000001,2009-05-01 08:18:35,,Rocket Empire,,Simmer Down Jammie


OK, we have some NAs. Grab the artists!

In [7]:
artists = plays %>%
    select(artistId, artist) %>%
    distinct() %>%
    mutate(id=1:n())
head(artists)

artistId,artist,id
<chr>,<chr>,<int>
f1b1cf71-bd35-4e99-8624-24a6e15f133a,Deep Dish,1
a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,<U+5742><U+672C><U+9F8D><U+4E00>,2
ba2f4f3b-0293-4bc8-bb94-2f73b5207343,Underworld,3
a16e47f5-aa54-47fe-87e4-bb8af91a9fdd,Ennio Morricone,4
463a94f1-2713-40b1-9c88-dcc9c0170cae,Minus 8,5
ad0811ea-e213-451d-b22f-fa1a7f9e0226,Beanfield,6


Now we will group into artist play counts.

In [8]:
artist_play_counts = plays %>%
    inner_join(artists) %>%
    select(key=user, artist=id) %>%
    inner_join(users) %>%
    select(user=id, artist) %>%
    group_by(user, artist) %>%
    summarize(count=n())
head(artist_play_counts)

Joining, by = c("artistId", "artist")

Joining, by = "key"

`summarise()` has grouped output by 'user'. You can override using the `.groups` argument.



user,artist,count
<int>,<int>,<int>
1,1,1
1,2,855
1,3,779
1,4,12
1,5,18
1,6,8


In [9]:
user_stats = artist_play_counts %>%
    group_by(user) %>%
    summarize(nartists=n(), medPlays=median(count))
user_stats %>% 
    select(-user) %>%
    summary()

    nartists         medPlays     
 Min.   :   2.0   Min.   : 1.000  
 1st Qu.: 311.8   1st Qu.: 2.000  
 Median : 604.5   Median : 3.000  
 Mean   : 911.9   Mean   : 4.387  
 3rd Qu.:1184.8   3rd Qu.: 5.000  
 Max.   :8453.0   Max.   :46.000  

In [10]:
nrow(user_stats %>% filter(nartists >= 10))

## Write Data

Now we want to write data!

In [11]:
write_csv(users %>% select(id, gender, age),
          "build/lfm1k-users.csv", na="")

In [12]:
write_csv(artist_play_counts, "build/lfm1k-play-counts.csv", na="")

In [13]:
user_parts = user_stats %>%
    filter(nartists >= 10) %>%
    mutate(partition=sample(rep(1:5, ceiling(n() / 5))[1:n()])) %>%
    select(user, partition)
head(user_parts)

user,partition
<int>,<int>
1,3
2,4
3,5
4,4
5,4
6,1


Let's quick check partition distributions

In [None]:
user_parts %>% group_by(partition) %>% summarize(nusers=n())

In [None]:
dir.create("build/lfm1k-splits")
for (i in 1:5) {
    message(sprintf("writing partition %d", i))
    test_ratings = user_parts %>%
        filter(partition == i) %>%
        inner_join(artist_play_counts) %>%
        group_by(user) %>%
        mutate(pos=sample(1:n())) %>%
        filter(pos <= 5) %>%
        select(user, item=artist, count)
    train_ratings = artist_play_counts %>%
        rename(item=artist) %>%
        anti_join(test_ratings)
    write_csv(test_ratings, sprintf("build/lfm1k-splits/u.part%d.test.csv", i), na="")
    write_csv(train_ratings, sprintf("build/lfm1k-splits/u.part%d.train.csv", i), na="")
}