## Data preprocessing
This notebook preprocesses the MovieLens 1M data (ratings, users and movies) so that it can be used in Wan et al.'s framework.

In [1]:
import os
from collections import defaultdict

import h5py
import numpy as np
import pandas as pd

In [2]:
# Folder holding the raw MovieLens files read below (ratings.dat, users.dat, movies.dat).
DATA_DIR = './data/ml-1m'
# Folder that receives the final preprocessed ratings CSV.
OUT_DIR = './data'

In [3]:
# The '::' separator is longer than one character, hence the python parsing engine.
ratings_path = os.path.join(DATA_DIR, 'ratings.dat')
ratings = pd.read_csv(
    ratings_path,
    sep='::',
    engine='python',
    header=None,
    names=['user_id', 'item_id', 'rating', 'timestamp'],
)
ratings.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [4]:
# The '::' separator is longer than one character, hence the python parsing engine.
users_path = os.path.join(DATA_DIR, 'users.dat')
users = pd.read_csv(
    users_path,
    sep='::',
    engine='python',
    header=None,
    names=['user_id', 'gender', 'age', 'occupation', 'zip_code'],
)
users.head()

Unnamed: 0,user_id,gender,age,occupation,zip_code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [5]:
# Human-readable labels for the coded `age` and `occupation` columns of the users table.
# TODO: neither mapping is used by the cells shown in this notebook -- remove?

# Keyed by the age code stored in the data (the lower bound of each age bracket).
age_mapping = {
    1: "Under 18",
    18: "18-24",
    25: "25-34",
    35: "35-44",
    45: "45-49",
    50: "50-55",
    56: "56+",
}

# Occupation codes are the consecutive integers 0..20, so the position in the
# list below is the code.
occupation_mapping = dict(enumerate([
    "other",
    "academic/educator",
    "artist",
    "clerical/admin",
    "college/grad student",
    "customer service",
    "doctor/health care",
    "executive/managerial",
    "farmer",
    "homemaker",
    "K-12 student",
    "lawyer",
    "programmer",
    "retired",
    "sales/marketing",
    "scientist",
    "self-employed",
    "technician/engineer",
    "tradesman/craftsman",
    "unemployed",
    "writer",
]))

In [6]:
# The MovieLens 1M movies.dat is distributed as Latin-1 (ISO-8859-1), not UTF-8:
# some titles contain accented characters. Without an explicit encoding, current
# pandas raises UnicodeDecodeError and older versions silently replaced those
# characters, so the encoding is stated explicitly.
# The '::' separator is longer than one character, hence the python parsing engine.
movies = pd.read_csv(
    os.path.join(DATA_DIR, 'movies.dat'),
    sep='::',
    engine='python',
    header=None,
    names=['movie_id', 'title', 'genres'],
    encoding='latin-1',
)
movies.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
# Distinct '|'-separated genre combinations, in order of first appearance.
movies['genres'].drop_duplicates().tolist()

["Animation|Children's|Comedy",
 "Adventure|Children's|Fantasy",
 'Comedy|Romance',
 'Comedy|Drama',
 'Comedy',
 'Action|Crime|Thriller',
 "Adventure|Children's",
 'Action',
 'Action|Adventure|Thriller',
 'Comedy|Drama|Romance',
 'Comedy|Horror',
 "Animation|Children's",
 'Drama',
 'Action|Adventure|Romance',
 'Drama|Thriller',
 'Drama|Romance',
 'Thriller',
 'Action|Comedy|Drama',
 'Crime|Drama|Thriller',
 'Drama|Sci-Fi',
 'Romance',
 'Adventure|Sci-Fi',
 'Adventure|Romance',
 "Children's|Comedy|Drama",
 'Documentary',
 'Drama|War',
 'Action|Crime|Drama',
 'Action|Adventure',
 'Crime|Thriller',
 "Animation|Children's|Musical|Romance",
 'Action|Drama|Thriller',
 "Children's|Comedy",
 'Drama|Mystery',
 'Sci-Fi|Thriller',
 'Action|Comedy|Crime|Horror|Thriller',
 'Drama|Musical',
 'Crime|Drama|Romance',
 'Adventure|Drama',
 'Action|Thriller',
 "Adventure|Children's|Comedy|Musical",
 'Action|Drama|War',
 'Action|Adventure|Crime',
 'Crime',
 'Drama|Mystery|Romance',
 'Action|Drama',
 'Drama

In [8]:
# Maps each MovieLens genre to a gender-stereotype label: 'M', 'F' or 'NA'.
# Genres labelled 'NA' are skipped when a movie's genres are counted in the loop below.
# assignment based on https://www.statista.com/statistics/254115/favorite-movie-genres-in-the-us/#:~:text=The%20study%20revealed%20that%20comedy,and%20action%20were%20similarly%20popular.
# NOTE(review): the trailing `# F` / `# M` comments presumably record which way an
# 'NA' genre leaned in that source -- confirm with the author.
# TODO try to come up with better way
movie_stereotype_map = {
    "Action": 'M',
    "Adventure": 'M',
    "Animation": 'F',
    "Children's": 'NA',  # F
    "Comedy": 'NA',  # F
    "Crime": 'M',
    "Documentary": 'NA',  # F
    "Drama": 'F',
    "Fantasy": 'NA',  # M
    "Film-Noir": 'M',
    "Horror": 'M',
    "Musical": 'F',
    "Mystery": 'NA',  # M
    "Romance": 'F',
    "Sci-Fi": 'M',
    "Thriller": 'NA',  # M
    "War": 'M',
    "Western": 'M'
}

In [9]:
# Label every movie 'M', 'F' or 'MF' by a majority vote over the stereotype
# labels of its genres; genres labelled 'NA' do not vote.
movie_gender_stereotype = []
for movie in movies.itertuples():
    votes = [movie_stereotype_map[genre] for genre in movie.genres.split('|')]
    male_votes = votes.count('M')
    # Every label that is neither 'NA' nor 'M' counts as a female vote.
    female_votes = len(votes) - votes.count('NA') - male_votes
    if male_votes > female_votes:
        label = 'M'
    elif female_votes > male_votes:
        label = 'F'
    else:
        # Ties -- including movies whose genres are all 'NA' -- are labelled 'MF'.
        label = 'MF'
    movie_gender_stereotype.append(label)

In [10]:
# Store the per-movie stereotype label as the `model_attr` column.
movies = movies.assign(model_attr=movie_gender_stereotype)

In [11]:
# Preview the first rows of the movies table.
movies.head()

Unnamed: 0,movie_id,title,genres,model_attr
0,1,Toy Story (1995),Animation|Children's|Comedy,F
1,2,Jumanji (1995),Adventure|Children's|Fantasy,M
2,3,Grumpier Old Men (1995),Comedy|Romance,F
3,4,Waiting to Exhale (1995),Comedy|Drama,F
4,5,Father of the Bride Part II (1995),Comedy,MF


In [12]:
# Write the movies table to DATA_DIR, without the index column.
movies_attr_path = os.path.join(DATA_DIR, 'movies.attr.csv')
movies.to_csv(movies_attr_path, index=False)

In [13]:
# Attach the movie columns and the user's gender to every rating.
user_gender = users[['user_id', 'gender']]
ratings = (
    ratings
    .merge(movies, left_on='item_id', right_on='movie_id')
    .merge(user_gender, left_on='user_id', right_on='user_id')
)
ratings.head()

Unnamed: 0,user_id,item_id,rating,timestamp,movie_id,title,genres,model_attr,gender
0,1,1193,5,978300760,1193,One Flew Over the Cuckoo's Nest (1975),Drama,F,F
1,1,661,3,978302109,661,James and the Giant Peach (1996),Animation|Children's|Musical,F,F
2,1,914,3,978301968,914,My Fair Lady (1964),Musical|Romance,F,F
3,1,3408,4,978300275,3408,Erin Brockovich (2000),Drama,F,F
4,1,2355,5,978824291,2355,"Bug's Life, A (1998)",Animation|Children's|Comedy,F,F


In [14]:
# Rename the user's gender column to `user_attr`.
ratings = ratings.rename(columns={'gender': 'user_attr'})
ratings.head()

Unnamed: 0,user_id,item_id,rating,timestamp,movie_id,title,genres,model_attr,user_attr
0,1,1193,5,978300760,1193,One Flew Over the Cuckoo's Nest (1975),Drama,F,F
1,1,661,3,978302109,661,James and the Giant Peach (1996),Animation|Children's|Musical,F,F
2,1,914,3,978301968,914,My Fair Lady (1964),Musical|Romance,F,F
3,1,3408,4,978300275,3408,Erin Brockovich (2000),Drama,F,F
4,1,2355,5,978824291,2355,"Bug's Life, A (1998)",Animation|Children's|Comedy,F,F


In [15]:
def get_split(df, is_datetime=False, test_only=165, train_frac=0.8):
    """Assign every rating to a chronological, per-user split.

    Rows are ordered by ``timestamp`` and then grouped by ``user_id``. Within each
    user, the earliest ``int(n * train_frac)`` ratings get label 0 (train). What
    remains is labelled 2 (test) for users with fewer than ``test_only`` ratings;
    for all other users the remainder is halved into label 1 (presumably the
    validation set) followed by label 2, the latter receiving the extra rating
    when the remainder is odd.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``user_id`` and ``timestamp`` columns. The frame is never modified.
    is_datetime : bool, default False
        If True, ``timestamp`` is converted with ``pd.to_datetime`` before sorting.
    test_only : int, default 165
        Users with fewer ratings than this receive no label-1 ratings.
    train_frac : float, default 0.8
        Fraction of each user's ratings labelled 0 (rounded down).

    Returns
    -------
    splits : numpy.ndarray
        Split label (0, 1 or 2) for each rating, users in groupby order.
    indices : numpy.ndarray
        Index labels of ``df`` in the same order as ``splits``, i.e.
        ``splits[i]`` belongs to the row labelled ``indices[i]``.
    user_ratings : numpy.ndarray
        Number of ratings per user, in groupby order.

    Raises
    ------
    ValueError
        If ``train_frac`` is outside ``[0, 1]``.
    """
    if not 0 <= train_frac <= 1:
        raise ValueError('train_frac must be between 0 and 1, got %r' % (train_frac,))

    if is_datetime:
        # assign() builds a new frame, so the caller's `timestamp` column is not
        # overwritten as a side effect of computing the split.
        df = df.assign(timestamp=pd.to_datetime(df['timestamp']))
    df = df.sort_values(by='timestamp')

    user_ratings = []
    splits = []
    indices = []

    for _, group in df.groupby(by='user_id'):
        n_ratings = len(group)
        n_train = int(n_ratings * train_frac)
        n_rest = n_ratings - n_train
        # Users below the threshold keep their whole remainder for label 2.
        n_valid = 0 if n_ratings < test_only else n_rest // 2

        indices.extend(group.index)
        user_ratings.append(n_ratings)
        splits.extend([0] * n_train)
        splits.extend([1] * n_valid)
        splits.extend([2] * (n_rest - n_valid))

    # Explicit integer dtypes keep the label/count arrays integral for empty input.
    return (
        np.array(splits, dtype=int),
        np.array(indices),
        np.array(user_ratings, dtype=int),
    )

In [16]:
# is_datetime=False: the timestamp column is sorted as loaded, without datetime conversion.
splits, indices, user_rcount = get_split(ratings, is_datetime=False)

In [17]:
# Mean, minimum and maximum number of ratings per user.
user_rcount.mean(), user_rcount.min(), user_rcount.max()

(165.5975165562914, 20, 2314)

In [18]:
# Share of all ratings carrying each split label (0, 1, 2).
tuple(np.count_nonzero(splits == label) / len(ratings) for label in (0, 1, 2))

(0.7975913034175858, 0.0712611064287564, 0.13114759015365787)

In [19]:
# `.ix` was deprecated in pandas 0.20 and removed in pandas 1.0. `indices` holds
# index labels of `ratings` (collected from `group.index` in get_split), so the
# label-based `.loc` selects exactly the same rows, in the per-user
# chronological order that `splits` follows.
sorted_ratings = ratings.loc[indices]
sorted_ratings.head()

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


Unnamed: 0,user_id,item_id,rating,timestamp,movie_id,title,genres,model_attr,user_attr
31,1,3186,4,978300019,3186,"Girl, Interrupted (1999)",Drama,F,F
27,1,1721,4,978300055,1721,Titanic (1997),Drama|Romance,F,F
37,1,1022,5,978300055,1022,Cinderella (1950),Animation|Children's|Musical,F,F
22,1,1270,5,978300055,1270,Back to the Future (1985),Comedy|Sci-Fi,M,F
24,1,2340,3,978300103,2340,Meet Joe Black (1998),Romance,F,F


In [20]:
# `splits` is assigned positionally, so it must be in the same row order as `sorted_ratings`.
sorted_ratings = sorted_ratings.assign(split=splits)

In [21]:
# Preview the first rows of the re-ordered ratings table.
sorted_ratings.head()

Unnamed: 0,user_id,item_id,rating,timestamp,movie_id,title,genres,model_attr,user_attr,split
31,1,3186,4,978300019,3186,"Girl, Interrupted (1999)",Drama,F,F,0
27,1,1721,4,978300055,1721,Titanic (1997),Drama|Romance,F,F,0
37,1,1022,5,978300055,1022,Cinderella (1950),Animation|Children's|Musical,F,F,0
22,1,1270,5,978300055,1270,Back to the Future (1985),Comedy|Sci-Fi,M,F,0
24,1,2340,3,978300103,2340,Meet Joe Black (1998),Romance,F,F,0


In [22]:
# Write the final ratings table to OUT_DIR, without the index column.
movielens_out_path = os.path.join(OUT_DIR, 'df_movielens_1m.csv')
sorted_ratings.to_csv(movielens_out_path, index=False)