### User Gender Balance for ml-1m and last.FM datasets

In [13]:
library(dplyr)
library(readr)

In [14]:
# Notebook display settings: plot height, and how many rows/columns of a
# table are shown before the output is truncated.
options(
    repr.plot.height = 5,
    repr.matrix.max.rows = 10,
    repr.matrix.max.columns = 10
)

### Load MovieLens Data

In [15]:
# Read the ml-1m ratings file into a table with four integer columns.
# NOTE(review): presumably the raw file separates fields with "::", so splitting
# on a single ":" leaves an empty field between values, which the "_" (skip)
# entries in col_types discard - confirm against the data file.
ml_ratings <- read_delim(
    file = "data/ml-1m/ratings.dat",
    delim = ":",
    col_names = c("user", "item", "rating", "timestamp"),
    col_types = "i_i_i_i"
)
# Show the loaded ratings
ml_ratings

user,item,rating,timestamp
1,1193,5,978300760
1,661,3,978302109
1,914,3,978301968
1,3408,4,978300275
1,2355,5,978824291
⋮,⋮,⋮,⋮
6040,1091,1,956716541
6040,1094,5,956704887
6040,562,5,956704746
6040,1096,4,956715648


Load user info

In [16]:
# Read the ml-1m user file: an integer user id plus four text columns.
ml_users <- read_delim(
    file = "data/ml-1m/users.dat",
    delim = ":",
    col_names = c("user", "gender", "age", "occupation", "zip"),
    col_types = "i_c_c_c_c"
)
# Convert gender, age and occupation to factors; zip is left as text.
ml_users <- mutate(
    ml_users,
    gender = as.factor(gender),
    age = as.factor(age),
    occupation = as.factor(occupation)
)
# Show the user table
ml_users

user,gender,age,occupation,zip
1,F,1,10,48067
2,M,56,16,70072
3,M,25,15,55117
4,M,45,7,02460
5,M,25,20,55455
⋮,⋮,⋮,⋮,⋮
6036,F,25,15,32603
6037,F,45,1,76006
6038,F,56,1,14706
6039,F,45,0,01060


Join gender info with user ratings

In [17]:
# Attach each rating's user gender; the join key is the shared "user" column
# (the only column the two inputs have in common).
ml_ratings_gender <- inner_join(
    ml_ratings,
    select(ml_users, user, gender)
)
# Show the joined table
ml_ratings_gender

Joining, by = "user"


user,item,rating,timestamp,gender
1,1193,5,978300760,F
1,661,3,978302109,F
1,914,3,978301968,F
1,3408,4,978300275,F
1,2355,5,978824291,F
⋮,⋮,⋮,⋮,⋮
6040,1091,1,956716541,M
6040,1094,5,956704887,M
6040,562,5,956704746,M
6040,1096,4,956715648,M


Split the ratings into male and female groups so that each group can be sampled separately

In [18]:
# Ratings given by male users
ml_ratings_M <- filter(ml_ratings_gender, gender == "M")
ml_ratings_M
# Ratings given by female users
ml_ratings_F <- filter(ml_ratings_gender, gender == "F")
ml_ratings_F

user,item,rating,timestamp,gender
2,1357,5,978298709,M
2,3068,4,978299000,M
2,1537,4,978299620,M
2,647,3,978299351,M
2,2194,4,978299297,M
⋮,⋮,⋮,⋮,⋮
6040,1091,1,956716541,M
6040,1094,5,956704887,M
6040,562,5,956704746,M
6040,1096,4,956715648,M


user,item,rating,timestamp,gender
1,1193,5,978300760,F
1,661,3,978302109,F
1,914,3,978301968,F
1,3408,4,978300275,F
1,2355,5,978824291,F
⋮,⋮,⋮,⋮,⋮
6039,1081,4,956705989,F
6039,1083,3,956706051,F
6039,1086,4,956706182,F
6039,1088,4,956706019,F


Filter distinct female and male users

In [19]:
# One row per female user who has at least one rating
ml_distinct_F <- distinct(select(ml_ratings_F, user))
# Number of distinct female users
nrow(ml_distinct_F)

In [20]:
# One row per male user who has at least one rating
ml_distinct_M <- distinct(select(ml_ratings_M, user))
# Number of distinct male users
nrow(ml_distinct_M)

Sample the same number of users from each group.
The sample size is meant to match the total number of female users (the code below uses a fixed size of 1500).

In [21]:
# Draw an equally sized random sample of distinct users from each gender group
# so that the sampled rating data is gender-balanced.
# The seed is fixed so that re-running the notebook selects the same users and
# regenerates the same output file; without it every run yields a different
# sample.
set.seed(20180419)
# Number of users drawn per group. sample_n() without replacement raises an
# error if a group contains fewer users than this.
size <- 1500
ml_sample_M <- ml_distinct_M %>%
    sample_n(size = size, replace = FALSE)
ml_sample_F <- ml_distinct_F %>%
    sample_n(size = size, replace = FALSE)

In [22]:
# Confirm both samples contain the requested number of users
nrow(ml_sample_M)
nrow(ml_sample_F)

Combine sample users with rating data

In [23]:
# Keep only the ratings that belong to the sampled users of each gender;
# the join key is the shared "user" column.
ml_ratings_sample_M <- inner_join(ml_ratings_M, ml_sample_M)
ml_ratings_sample_F <- inner_join(ml_ratings_F, ml_sample_F)

Joining, by = "user"
Joining, by = "user"


In [24]:
# Display the sampled male ratings, then the sampled female ratings
ml_ratings_sample_M
ml_ratings_sample_F

user,item,rating,timestamp,gender
7,648,4,978234737,M
7,861,4,978234874,M
7,2916,5,978234842,M
7,3578,3,978234737,M
7,3793,3,978234737,M
⋮,⋮,⋮,⋮,⋮
6034,1345,4,956711771,M
6034,3334,3,956712333,M
6034,2726,5,956712258,M
6034,527,4,956711771,M


user,item,rating,timestamp,gender
1,1193,5,978300760,F
1,661,3,978302109,F
1,914,3,978301968,F
1,3408,4,978300275,F
1,2355,5,978824291,F
⋮,⋮,⋮,⋮,⋮
6039,1081,4,956705989,F
6039,1083,3,956706051,F
6039,1086,4,956706182,F
6039,1088,4,956706019,F


Bind the two data frames into one

In [25]:
# Stack the female and male samples (female rows first), keeping only the
# rating columns and dropping gender.
ml_ratings_sample <- bind_rows(
    select(ml_ratings_sample_F, user, item, rating, timestamp),
    select(ml_ratings_sample_M, user, item, rating, timestamp)
)
# Show the combined sample
ml_ratings_sample

user,item,rating,timestamp
1,1193,5,978300760
1,661,3,978302109
1,914,3,978301968
1,3408,4,978300275
1,2355,5,978824291
⋮,⋮,⋮,⋮
6034,1345,4,956711771
6034,3334,3,956712333
6034,2726,5,956712258
6034,527,4,956711771


Save the sampled ML ratings to a CSV file

In [26]:
# Write the balanced sample to disk; missing values are written as empty fields
write_csv(
    ml_ratings_sample,
    "data/ml-1m/ml-ratings-sample.csv",
    na = ""
)

## Load Last.FM data
See the Python notebook.

In [27]:
# Load the Last.FM play counts (column types are guessed by read_csv)
lfm_play_counts <- read_csv(file = "build/lastfm-play-counts.csv")
# Show the loaded table
lfm_play_counts

Parsed with column specification:
cols(
  user = col_integer(),
  item = col_integer(),
  count = col_integer()
)


user,item,count
1,1,2137
1,2,1099
1,3,897
1,4,717
1,5,706
⋮,⋮,⋮
358717,800,1517
358717,862,1494
358717,1516,1128
358717,5547,1126


In [28]:
# Load the raw Last.FM user table (column types are guessed by read_csv)
lfm_users_raw <- read_csv(file = "build/lastfm-users.csv")
# Preview the first rows
head(lfm_users_raw)

Parsed with column specification:
cols(
  id = col_integer(),
  key = col_character(),
  gender = col_character(),
  age = col_integer()
)


id,key,gender,age
1,00000c289a1829a808ac09c00daf10bc3c4e223b,f,22.0
2,00001411dc427966b17297bf4d69e7e193135d89,f,
3,00004d2ac9316e22dc007ab2243d6fcb239e707d,,
4,000063d3fe1cf2ba248b9e3c3f0334845a27a6bf,m,19.0
5,00007a47085b9aab8af55f52ec8846ac479ac4fe,m,28.0
6,0000c176103e538d5c9828e695fed4f7ae42dd01,m,20.0


In [29]:
# Reduce the user table to the id (renamed to "user") and an upper-cased
# gender factor, so the values are compared as "M"/"F" later on.
lfm_users <- transmute(
    lfm_users_raw,
    user = id,
    gender = as.factor(toupper(gender))
)
# Show the cleaned user table
lfm_users

user,gender
1,F
2,F
3,
4,M
5,M
⋮,⋮
359343,M
359344,M
359345,M
359346,M


Join user info with ratings

In [30]:
# Attach each play count's user gender; the join key is the shared "user"
# column.
lfm_ratings_gender <- inner_join(lfm_play_counts, lfm_users)
# Show the joined table
lfm_ratings_gender

Joining, by = "user"


user,item,count,gender
1,1,2137,F
1,2,1099,F
1,3,897,F
1,4,717,F
1,5,706,F
⋮,⋮,⋮,⋮
358717,800,1517,M
358717,862,1494,M
358717,1516,1128,M
358717,5547,1126,M


Split the data into two groups based on gender

In [31]:
# Play counts of male users
lfm_ratings_M <- filter(lfm_ratings_gender, gender == "M")
lfm_ratings_M
# Play counts of female users
lfm_ratings_F <- filter(lfm_ratings_gender, gender == "F")
lfm_ratings_F

user,item,count,gender
4,147,106,M
4,148,48,M
4,149,31,M
4,150,29,M
4,151,26,M
⋮,⋮,⋮,⋮
358717,800,1517,M
358717,862,1494,M
358717,1516,1128,M
358717,5547,1126,M


user,item,count,gender
1,1,2137,F
1,2,1099,F
1,3,897,F
1,4,717,F
1,5,706,F
⋮,⋮,⋮,⋮
358713,2188,36,F
358713,827,35,F
358713,4074,34,F
358713,266,34,F


Count the distinct users in each group in order to choose a sample size

In [32]:
# One row per male user who has at least one play count
lfm_distinct_M <- distinct(select(lfm_ratings_M, user))
# Number of distinct male users
nrow(lfm_distinct_M)

In [33]:
# One row per female user who has at least one play count
lfm_distinct_F <- distinct(select(lfm_ratings_F, user))
# Number of distinct female users
nrow(lfm_distinct_F)

Sample the same number of users from each group

In [34]:
# Draw an equally sized random sample of distinct users from each gender group
# so that the sampled play-count data is gender-balanced.
# The seed is fixed so that re-running the notebook selects the same users and
# regenerates the same output file; without it every run yields a different
# sample.
set.seed(20180420)
# Number of users drawn per group. sample_n() without replacement raises an
# error if a group contains fewer users than this.
size <- 33000
lfm_sample_M <- lfm_distinct_M %>%
    sample_n(size = size, replace = FALSE)
lfm_sample_F <- lfm_distinct_F %>%
    sample_n(size = size, replace = FALSE)

In [35]:
# Confirm both samples contain the requested number of users
nrow(lfm_sample_M)
nrow(lfm_sample_F)

Join the sampled user IDs with the ratings

In [40]:
# Keep only the play counts that belong to the sampled users of each gender;
# the join key is the shared "user" column.
lfm_ratings_sample_M <- inner_join(lfm_ratings_M, lfm_sample_M)
lfm_ratings_sample_F <- inner_join(lfm_ratings_F, lfm_sample_F)

Joining, by = "user"
Joining, by = "user"


In [41]:
# Display the sampled male play counts, then the sampled female play counts
lfm_ratings_sample_M
lfm_ratings_sample_F

user,item,count,gender
4,147,106,M
4,148,48,M
4,149,31,M
4,150,29,M
4,151,26,M
⋮,⋮,⋮,⋮
358717,800,1517,M
358717,862,1494,M
358717,1516,1128,M
358717,5547,1126,M


user,item,count,gender
1,1,2137,F
1,2,1099,F
1,3,897,F
1,4,717,F
1,5,706,F
⋮,⋮,⋮,⋮
358713,2188,36,F
358713,827,35,F
358713,4074,34,F
358713,266,34,F


In [43]:
# Stack the male and female samples (male rows first), keeping only the
# play-count columns and dropping gender.
lfm_ratings_sample <- bind_rows(
    select(lfm_ratings_sample_M, user, item, count),
    select(lfm_ratings_sample_F, user, item, count)
)
# Show the combined sample
lfm_ratings_sample

user,item,count
4,147,106
4,148,48
4,149,31
4,150,29
4,151,26
⋮,⋮,⋮
358713,2188,36
358713,827,35
358713,4074,34
358713,266,34


In [44]:
# Write the balanced sample to disk; missing values are written as empty fields
write_csv(
    lfm_ratings_sample,
    "data/lastfm-dataset-360K/lfm-ratings-sample.csv",
    na = ""
)