### User Gender Balance for ml-1m and last.FM datasets

In [None]:
library(dplyr)
library(readr)
library(ggplot2)

In [None]:
# Notebook display settings: plot height and the maximum number of
# rows/columns shown when a table is rendered.
options(
    repr.plot.height = 5,
    repr.matrix.max.rows = 10,
    repr.matrix.max.columns = 10
)

### Load MovieLens Data

In [None]:
# Read the ratings file with integer user, item, rating and timestamp columns.
# The file is split on a single ":" and every second resulting column is
# skipped via "_" in col_types, which is consistent with fields separated by
# "::" in the raw file.
ml_ratings <- read_delim(
    "data/ml-1m/ratings.dat",
    delim = ":",
    col_types = "i_i_i_i",
    col_names = c("user", "item", "rating", "timestamp")
)
# Display the parsed ratings.
ml_ratings

Load user info

In [None]:
# Read the user table (user as integer, every other kept field as text), then
# turn gender, age and occupation into factors. zip is left as character.
# As with the ratings file, the "_" entries skip the empty columns produced by
# splitting on a single ":".
ml_users <- mutate(
    read_delim(
        "data/ml-1m/users.dat",
        delim = ":",
        col_types = "i_c_c_c_c",
        col_names = c("user", "gender", "age", "occupation", "zip")
    ),
    gender = as.factor(gender),
    age = as.factor(age),
    occupation = as.factor(occupation)
)
# Display the parsed user table.
ml_users

Join gender info with user ratings

In [None]:
# Attach each rater's gender to every rating row.
# Only the user and gender columns of ml_users are brought in, and the join key
# is stated explicitly: without `by`, dplyr guesses the key from the shared
# column names and prints a "Joining, by" message, and the match would silently
# change if the two tables ever shared another column.
ml_ratings_gender <- ml_ratings %>%
    inner_join(select(ml_users, user, gender), by = "user")
# Display the gender-annotated ratings.
ml_ratings_gender

Split the ratings into male and female subsets so that each group can be sampled separately

In [None]:
# Partition the gender-annotated ratings into one table per gender code,
# displaying each table right after it is built.
ml_ratings_M <- filter(ml_ratings_gender, gender == "M")
ml_ratings_M
ml_ratings_F <- filter(ml_ratings_gender, gender == "F")
ml_ratings_F

Filter distinct female and male users

In [None]:
# One row per female user who has at least one rating (user column only).
ml_distinct_F <- distinct(ml_ratings_F, user)
# Number of distinct female users.
nrow(ml_distinct_F)

In [None]:
# One row per male user who has at least one rating (user column only).
ml_distinct_M <- distinct(ml_ratings_M, user)
# Number of distinct male users.
nrow(ml_distinct_M)

Sample the same number of users from each of the two groups.
The sample size is equal to the total number of female users.

In [None]:
# Number of users drawn from each gender group.
# NOTE(review): hard-coded; confirm this is the intended per-group sample size.
size <- 1500

# Draw `size` distinct users per group without replacement (male group first).
# NOTE(review): no set.seed() call is visible in this file, so the drawn users
# presumably differ between runs - confirm whether that is acceptable.
ml_sample_M <- sample_n(ml_distinct_M, size = size, replace = FALSE)
ml_sample_F <- sample_n(ml_distinct_F, size = size, replace = FALSE)

In [None]:
# Check the number of sampled users in each group (male, then female).
nrow(ml_sample_M)
nrow(ml_sample_F)

Combine sample users with rating data

In [None]:
# Keep only the ratings made by the sampled users of each gender.
# The join key is stated explicitly: without `by`, dplyr guesses the key from
# the shared column names and prints a "Joining, by" message, and the match
# would silently change if the sampled-user tables ever gained another column
# shared with the ratings.
ml_ratings_sample_M <- ml_ratings_M %>%
    inner_join(ml_sample_M, by = "user")
ml_ratings_sample_F <- ml_ratings_F %>%
    inner_join(ml_sample_F, by = "user")

In [None]:
# Display the sampled ratings for each gender group (male, then female).
ml_ratings_sample_M
ml_ratings_sample_F

Bind the two data frames

In [None]:
# Stack the female and male samples (female rows first) into one ratings table,
# keeping only the four rating columns so the gender column is dropped.
ml_ratings_sample <- bind_rows(
    select(ml_ratings_sample_F, user, item, rating, timestamp),
    select(ml_ratings_sample_M, user, item, rating, timestamp)
)
# Display the combined, gender-balanced sample.
ml_ratings_sample

Save the sampled ML ratings to a CSV file

In [None]:
# Write the gender-balanced rating sample to the build directory as CSV.
write_csv(ml_ratings_sample, "build/ml-ratings-sample-genderbalanced.csv")

## Load Last.FM data
See the Python notebook.