## Data preprocessing
Code used to preprocess the MovieLens data so that we can use it in Wan et al.'s framework.

In [1]:
import os
from collections import defaultdict

import h5py
import numpy as np 
import pandas as pd

In [2]:
DATA_DIR = './data/ml-1m'
OUT_DIR = './data'

In [3]:
ratings = pd.read_csv(
    os.path.join(DATA_DIR, 'ratings.dat'), sep='::', engine='python', header=None)
ratings.columns = ['user_id', 'item_id', 'rating', 'timestamp']
ratings.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [4]:
users = pd.read_csv(
    os.path.join(DATA_DIR, 'users.dat'), sep='::', engine='python', header=None)
users.columns = ['user_id', 'gender', 'age', 'occupation', 'zip_code']
users.head()

Unnamed: 0,user_id,gender,age,occupation,zip_code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [5]:
movies = pd.read_csv(
    os.path.join(DATA_DIR, 'movies.dat'), sep='::', engine='python', header=None)
movies.columns = ['movie_id', 'title', 'genres']
movies.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


Assign movies to their stereotypical gender by genre and some manual annotation

In [7]:
genre_movielist = defaultdict(list)
genre_movieids = defaultdict(list)
horror_movieids = set()
for movie in movies.itertuples():
    genre_movielist[movie.genres].append(movie.title)
    genre_movieids[movie.genres].append(movie.movie_id)
    if 'Horror' in movie.genres:
        horror_movieids.add(movie.movie_id)

In [37]:
genre_movielist['Drama']

['Nixon (1995)',
 'Othello (1995)',
 'Now and Then (1995)',
 'Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)',
 'Dangerous Minds (1995)',
 'Dead Man Walking (1995)',
 'Cry, the Beloved Country (1995)',
 'Restoration (1995)',
 'Lamerica (1994)',
 'Georgia (1995)',
 'Home for the Holidays (1995)',
 "Mr. Holland's Opus (1995)",
 'Two Bits (1995)',
 'Crossing Guard, The (1995)',
 'White Balloon, The (Badkonake Sefid ) (1995)',
 "Antonia's Line (Antonia) (1995)",
 'Once Upon a Time... When We Were Colored (1995)',
 'Journey of August King, The (1995)',
 'Beautiful Girls (1996)',
 'Hate (Haine, La) (1995)',
 "Margaret's Museum (1995)",
 'Race the Sun (1996)',
 'Boys of St. Vincent, The (1993)',
 "Star Maker, The (Uomo delle stelle, L') (1995)",
 'Silence of the Palace, The (Saimt el Qusur) (1994)',
 'Angela (1995)',
 'Neon Bible, The (1995)',
 'Shadows (Cienie) (1988)',
 'Gospa (1995)',
 'Basketball Diaries, The (1995)',
 'Awfully Big Adventure, An (1995)',
 'Apollo 13 (1995)',
 'Belle

In [8]:
for genre, mlist in genre_movielist.items():
    if len(mlist) > 20:
        print(genre, len(mlist))

Comedy|Romance 142
Comedy|Drama 162
Comedy 521
Adventure|Children's 30
Action 65
Comedy|Drama|Romance 34
Comedy|Horror 31
Animation|Children's 35
Drama 843
Drama|Thriller 63
Drama|Romance 134
Thriller 101
Romance 40
Documentary 116
Drama|War 43
Action|Adventure 25
Crime|Thriller 21
Children's|Comedy 47
Action|Thriller 48
Crime 26
Action|Drama 39
Horror 178
Action|Adventure|Sci-Fi 21
Horror|Sci-Fi 33
Western 33
Crime|Drama 44
Horror|Thriller 32
Action|Comedy 22
Action|Sci-Fi 28
Sci-Fi 27


In [10]:
# manual annotation, based on movie synopsis and prior knowledge
MANUAL_ANNOT = {
    'Titanic (1953)': 'F',
    'Thelma & Louise (1991)': 'F',
    'Phantom of the Opera, The (1943)': 'F',
    'Waiting to Exhale (1995)': 'F',
    'Last Summer in the Hamptons (1995)': 'F',
    'Grace of My Heart (1996)': 'F',
    'Beautiful (2000)': 'F',
    'Story of Us, The (1999)': 'F',
    'Molly (1999)': 'F',
    'Working Girl (1988)': 'F',
    'Terms of Endearment (1983)': 'F',
    'Breakfast Club, The (1985)': 'F',
    'Adventures of Priscilla, Queen of the Desert, The (1994)': 'F',
    'Dear Diary (Caro Diario) (1994)': 'F',
    'Hairspray (1988)': 'F',
    'Virgin Suicides, The (1999)': 'F',
    'Rainmaker, The (1997)': 'M',
    'Field of Dreams (1989)': 'M',
    'Dead Poets Society (1989)': 'M',
    'Entertaining Angels: The Dorothy Day Story (1996)': 'F',
    "Sophie's Choice (1982)": 'F',
    'Trainspotting (1996)': 'M',
    'Time to Kill, A (1996)': 'M',
    'Boy Called Hate, A (1995)': 'M',
    'Somebody to Love (1994)': 'F',
    'Last of the High Kings, The (a.k.a. Summer Fling) (1996)': 'M',
    'Sunset Park (1996)': 'M',
    'I Shot Andy Warhol (1996)': 'F',
    'Naked (1993)': 'M',
    'To Cross the Rubicon (1991)': 'F',
    "What's Love Got to Do with It? (1993)": 'F',
    'M. Butterfly (1993)': 'F',
    'Man Without a Face, The (1993)': 'M',
    'Fearless (1993)': 'M',
    'Calendar Girl (1993)': 'M',
    'Dangerous Game (1993)': 'M',
    'Bronx Tale, A (1993)': 'M',
    'S.F.W. (1994)': 'M',
    'Angela (1995)': 'F',
    'Nixon (1995)': 'M',
    'Othello (1995)': 'F',
    'Beautiful Girls (1996)': 'F',
    "Margaret's Museum (1995)": 'F',
    'Basketball Diaries, The (1995)': 'M',
    'Apollo 13 (1995)': 'M',
    'Little Women (1994)': 'F',
    'Shawshank Redemption, The (1994)': 'M',
    'When a Man Loves a Woman (1994)': 'F',
    'Age of Innocence, The (1993)': 'F',
    "It's a Wonderful Life (1946)": 'F',
    'Robin Hood: Prince of Thieves (1991)': 'M',
    'Streetcar Named Desire, A (1951)': 'F',
    'Hamlet (1996)': 'F',
    'Portrait of a Lady, The (1996)': 'F',
    'Boys Life 2 (1997)': 'M',
    'Career Girls (1997)': 'F',
    'Joy Luck Club, The (1993)': 'F',
    'Good Will Hunting (1997)': 'M',
    'Mis�rables, Les (1998)': 'F',
    'Hamlet (1948)': 'F', 
    'Kramer Vs. Kramer (1979)': 'F',
    'Outsiders, The (1983)': 'M',
    'Karate Kid, The (1984)': 'M',
    'Boys, The (1997)': 'M',
    'Eyes Wide Shut (1999)': 'M',
    'Spartacus (1960)': 'M',
    'Lolita (1962)': 'F',
    'Fight Club (1999)': 'M',
    'Mansfield Park (1999)': 'F',
    'Cider House Rules, The (1999)': 'F',
    'Girl, Interrupted (1999)': 'F',
    'Star Is Born, A (1937)': 'F',
    'Hoosiers (1986)': 'M',
    'Erin Brockovich (2000)': 'F',
    'Waking the Dead (1999)': 'M',
    'Footloose (1984)': 'F',
    'Remember the Titans (2000)': 'M',
    'Requiem for a Dream (2000)': 'M'   
}

In [12]:
GENRE_ANNOT = {
    "Animation|Children's|Comedy": 'F',
    "Adventure|Children's|Fantasy": 'F',
    'Comedy|Romance': 'F',
    'Comedy|Drama': 'M',
    'Comedy': 'M',
    'Action|Crime|Thriller': 'M',
    "Adventure|Children's": 'F',
    'Action': 'M',
    'Action|Adventure|Thriller': 'M',
    'Comedy|Drama|Romance': 'F',
    'Comedy|Horror': 'M',
    "Animation|Children's": 'MF',
    'Drama': 'MF',
    'Action|Adventure|Romance': 'M',
    'Drama|Thriller': 'M',
    'Drama|Romance': 'M',
    'Thriller': 'M',
    'Action|Comedy|Drama': 'M',
    'Crime|Drama|Thriller': 'M',
    'Drama|Sci-Fi': 'M',
    'Romance': 'F',
    'Adventure|Sci-Fi': 'M',
    'Adventure|Romance': 'M',
    "Children's|Comedy|Drama": 'F',
    'Documentary': 'F',
    'Drama|War': 'M',
    'Action|Crime|Drama': 'M',
    'Action|Adventure': 'M',
    'Crime|Thriller': 'M',
    "Animation|Children's|Musical|Romance": 'F',
    'Action|Drama|Thriller': 'M',
    "Children's|Comedy": 'F',
    'Drama|Mystery': 'M',
    'Sci-Fi|Thriller': 'M',
    'Action|Comedy|Crime|Horror|Thriller': 'M',
    'Drama|Musical': 'F',
    'Crime|Drama|Romance': 'M',
    'Adventure|Drama': 'M',
    'Action|Thriller': 'M',
    "Adventure|Children's|Comedy|Musical": 'F', 
    'Action|Drama|War': 'M',
    'Action|Adventure|Crime': 'M',
    'Crime': 'M',
    'Drama|Mystery|Romance': 'F',
    'Action|Drama': 'M',
    'Drama|Romance|War': 'M',
    'Horror': 'M',
    'Action|Adventure|Comedy|Crime': 'M',
    'Comedy|War': 'M',
    'Action|Adventure|Mystery|Sci-Fi': 'M',
    'Drama|Thriller|War': 'M',
    'Action|Romance|Thriller': 'M',
    'Crime|Film-Noir|Mystery|Thriller': 'M',
    'Action|Adventure|Drama|Romance': 'M',
    "Adventure|Children's|Drama": 'F',
    'Action|Sci-Fi|Thriller': 'M',
    'Action|Adventure|Sci-Fi': 'M',
    "Action|Children's": 'M',
    'Horror|Sci-Fi': 'M',
    'Action|Crime|Sci-Fi': 'M',
    'Western': 'M',
    "Animation|Children's|Comedy|Romance": 'F',
    "Children's|Drama": 'F',
    'Crime|Drama': 'M',
    'Drama|Fantasy|Romance|Thriller': 'MF',
    'Drama|Horror': 'M',
    'Comedy|Sci-Fi': 'M',
    'Mystery|Thriller': 'M',
    "Adventure|Children's|Comedy|Fantasy|Romance": 'F',
    'Action|Adventure|Fantasy|Sci-Fi': 'M',
    'Drama|Romance|War|Western': 'M',
    'Action|Crime': 'M',
    'Crime|Drama|Romance|Thriller': 'M',
    'Action|Adventure|Western': 'M',
    'Horror|Thriller': 'M',
    "Children's|Comedy|Fantasy": 'F',
    'Film-Noir|Thriller': 'M',
    'Action|Comedy|Musical|Sci-Fi': 'F',
    "Children's": 'F',
    'Drama|Mystery|Thriller': 'M',
    'Comedy|Romance|War': 'MF',
    'Action|Comedy': 'M',
    "Adventure|Children's|Romance": 'F',
    "Animation|Children's|Musical": 'F',
    'Comedy|Crime|Fantasy': 'M',
    'Action|Comedy|Western': 'M',
    'Action|Sci-Fi': 'M',
    'Action|Adventure|Comedy|Romance': 'M',
    'Comedy|Crime|Drama': 'M',
    'Comedy|Thriller': 'M',
    'Horror|Sci-Fi|Thriller': 'M',
    'Mystery|Romance|Thriller': 'M',
    'Comedy|Western': 'M',
    'Drama|Western': 'M',
    'Action|Adventure|Crime|Thriller': 'M',
    'Action|Comedy|War': 'M',
    'Comedy|Mystery': 'M',
    'Comedy|Mystery|Romance': 'F',
    'Comedy|Drama|War': 'M',
    'Action|Drama|Mystery': 'M',
    'Comedy|Crime|Horror': 'M',
    'Film-Noir|Sci-Fi': 'M',
    'Comedy|Romance|Thriller': 'M',
    "Action|Adventure|Children's|Sci-Fi": 'F',
    "Children's|Comedy|Musical": 'F',
    'Action|Adventure|Comedy': 'M',
    'Action|Crime|Romance': 'MF',
    "Action|Adventure|Animation|Children's|Fantasy": 'M',
    "Animation|Children's|Comedy|Musical": 'F',
    'Adventure|Drama|Western': 'M',
    'Action|Adventure|Crime|Drama': 'M',
    'Action|Adventure|Animation|Horror|Sci-Fi': 'M',
    'Action|Horror|Sci-Fi': 'M',
    'War': 'M',
    'Action|Adventure|Mystery': 'M',
    'Mystery': 'M',
    'Action|Adventure|Fantasy': 'M',
    "Adventure|Animation|Children's|Comedy|Fantasy": 'M',
    'Sci-Fi': 'M',
    'Documentary|Drama': 'F',
    'Action|Adventure|Comedy|War': 'M',
    'Crime|Film-Noir|Thriller': 'M',
    'Animation': 'F',
    'Action|Adventure|Romance|Thriller': 'M',
    'Animation|Sci-Fi': 'M',
    'Animation|Comedy|Thriller': 'M',
    'Film-Noir': 'M',
    'Sci-Fi|War': 'M',
    'Adventure': 'M',
    'Comedy|Crime': 'M',
    'Action|Sci-Fi|War': 'M',
    'Comedy|Fantasy|Romance|Sci-Fi': 'F',
    'Fantasy': 'MF',
    'Action|Mystery|Thriller': 'M',
    'Comedy|Musical': 'F',
    'Action|Adventure|Sci-Fi|Thriller': 'M',
    "Children's|Drama|Fantasy": 'F',
    'Adventure|War': 'M',
    'Musical|Romance': 'F',
    'Comedy|Musical|Romance': 'F',
    'Comedy|Mystery|Romance|Thriller': 'M',
    'Film-Noir|Mystery': 'M',
    'Musical': 'F',
    "Adventure|Children's|Drama|Musical": 'F',
    'Drama|Mystery|Sci-Fi|Thriller': 'M',
    'Romance|Thriller': 'MF',
    'Film-Noir|Romance|Thriller': 'M',
    'Crime|Film-Noir|Mystery': 'M',
    'Adventure|Comedy': 'M',
    'Action|Adventure|Romance|War': 'M',
    'Romance|War': 'MF',
    'Action|Drama|Western': 'M',
    "Children's|Comedy|Western": 'M',
    "Adventure|Children's|Comedy": 'M',
    "Children's|Comedy|Mystery": 'MF',
    "Adventure|Children's|Fantasy|Sci-Fi": 'M',
    "Adventure|Animation|Children's|Musical": 'F',
    "Adventure|Children's|Musical": 'F',
    'Crime|Film-Noir': 'M',
    "Adventure|Children's|Comedy|Fantasy": 'MF',
    "Children's|Drama|Fantasy|Sci-Fi": 'MF',
    'Action|Romance': 'M',
    'Adventure|Western': 'M',
    'Comedy|Fantasy': 'M',
    'Animation|Comedy': 'F',
    'Crime|Drama|Film-Noir': 'M',
    'Action|Adventure|Drama|Sci-Fi|War': 'M',
    'Action|Sci-Fi|Thriller|War': 'M',
    'Action|Western': 'M',
    "Action|Animation|Children's|Sci-Fi|Thriller|War": 'M',
    'Action|Adventure|Romance|Sci-Fi|War': 'M',
    'Action|Horror|Sci-Fi|Thriller': 'M',
    'Action|Adventure|Comedy|Horror|Sci-Fi': 'M',
    'Action|Comedy|Musical': 'F',
    'Mystery|Sci-Fi': 'M',
    'Film-Noir|Mystery|Thriller': 'M',
    'Adventure|Comedy|Drama': 'M' ,
    'Action|Adventure|Comedy|Horror': 'M',
    'Action|Drama|Mystery|Romance|Thriller': 'M',
    'Comedy|Mystery|Thriller': 'M',
    'Adventure|Animation|Sci-Fi|Thriller': 'M',
    'Action|Drama|Romance': 'F',
    'Action|Adventure|Drama': 'M',
    'Comedy|Drama|Musical': 'F',
    'Documentary|War': 'M',
    'Drama|Musical|War': 'F',
    'Action|Horror': 'M',
    'Horror|Romance': 'M',
    'Action|Comedy|Sci-Fi|War': 'M',
    'Crime|Drama|Sci-Fi': 'M',
    'Action|Romance|War': 'MF',
    'Action|Comedy|Crime|Drama': 'M',
    'Action|Drama|Thriller|War': 'M',
    "Action|Adventure|Children's": 'M',
    "Action|Adventure|Children's|Fantasy": 'M',
    "Adventure|Animation|Children's|Comedy|Musical": 'F',
    'Crime|Drama|Mystery': 'M',
    'Action|Adventure|Comedy|Sci-Fi': 'M',
    "Children's|Fantasy": 'F',
    'Action|Mystery|Sci-Fi|Thriller': 'M',
    'Action|Mystery|Romance|Thriller': 'M',
    'Adventure|Thriller': 'M',
    'Action|Thriller|War': 'M',
    'Action|Crime|Mystery': 'M',
    'Horror|Mystery|Thriller': 'M',
    'Crime|Horror|Mystery|Thriller': 'M',
    'Comedy|Drama|Thriller': 'M',
    'Drama|Sci-Fi|Thriller': 'M',
    'Drama|Romance|Thriller': 'M',
    'Action|Adventure|Sci-Fi|War': 'M',
    'Comedy|Crime|Drama|Mystery': 'MF',
    'Comedy|Crime|Mystery|Thriller': 'M',
    'Film-Noir|Sci-Fi|Thriller': 'M',
    'Adventure|Sci-Fi|Thriller': 'M',
    'Crime|Drama|Mystery|Thriller': 'M',
    'Comedy|Documentary': 'MF',
    'Documentary|Musical': 'F',
    'Action|Drama|Sci-Fi|Thriller': 'M',
    "Adventure|Animation|Children's|Fantasy": 'F',
    'Adventure|Comedy|Romance': 'F',
    'Mystery|Sci-Fi|Thriller': 'M',
    'Action|Comedy|Crime': 'M',
    "Animation|Children's|Fantasy|War": 'MF',
    'Action|Crime|Drama|Thriller': 'M',
    'Comedy|Sci-Fi|Western': 'M',
    "Children's|Fantasy|Musical": 'F',
    'Fantasy|Sci-Fi': 'M',
    "Children's|Comedy|Sci-Fi": 'M',
    "Action|Adventure|Children's|Comedy": 'M',
    "Adventure|Children's|Drama|Romance": 'F',
    "Adventure|Children's|Sci-Fi": 'M',
    "Adventure|Children's|Comedy|Fantasy|Sci-Fi": 'MF',
    "Animation|Children's|Comedy|Musical|Romance": 'F',
    "Children's|Musical": 'F',
    'Drama|Fantasy': 'F',
    "Animation|Children's|Fantasy|Musical": 'F',
    'Adventure|Comedy|Musical': 'F',
    "Children's|Sci-Fi": 'MF',
    "Children's|Horror": 'M',
    'Comedy|Fantasy|Romance': 'F',
    'Comedy|Crime|Thriller': 'M',
    "Adventure|Animation|Children's|Sci-Fi": 'MF',
    'Action|Crime|Mystery|Thriller': 'M',
    'Adventure|Musical': 'F',
    "Animation|Children's|Drama|Fantasy": 'F',
    "Children's|Fantasy|Sci-Fi": 'F',
    'Adventure|Fantasy|Romance': 'F',
    'Crime|Horror': 'M',
    'Action|Adventure|Horror': 'M',
    'Adventure|Fantasy|Sci-Fi': 'M',
    'Drama|Film-Noir|Thriller': 'M',
    'Action|Comedy|Fantasy': 'MF',
    'Sci-Fi|Thriller|War': 'M',
    'Action|Adventure|Sci-Fi|Thriller|War': 'M',
    'Action|Adventure|Drama|Thriller': 'M',
    'Crime|Horror|Thriller': 'M',
    'Animation|Musical': 'F',
    'Action|War': 'M',
    'Action|Comedy|Romance|Thriller': 'MF',
    'Comedy|Horror|Thriller': 'M',
    'Drama|Horror|Thriller': 'M',
    'Action|Sci-Fi|Thriller|Western': 'M',
    'Drama|Romance|Sci-Fi': 'MF',
    'Action|Adventure|Horror|Thriller': 'M',
    'Comedy|Film-Noir|Thriller': 'M',
    'Comedy|Horror|Musical|Sci-Fi': 'MF',
    'Comedy|Romance|Sci-Fi': 'MF',
    'Action|Comedy|Sci-Fi|Thriller': 'M',
    'Action|Sci-Fi|Western': 'M',
    'Comedy|Horror|Musical': 'F',
    'Crime|Mystery': 'M',
    'Animation|Mystery': 'MF',
    'Action|Horror|Thriller': 'M',
    'Action|Drama|Fantasy|Romance': 'F',
    'Horror|Mystery': 'M',
    "Adventure|Animation|Children's": 'F',
    'Musical|Romance|War': 'F',
    'Adventure|Drama|Romance': 'F',
    'Adventure|Animation|Film-Noir': 'M',
    'Action|Adventure|Animation': 'MF',
    'Comedy|Drama|Western': 'M',
    'Adventure|Comedy|Sci-Fi': 'M',
    'Drama|Romance|Western': 'F',
    'Comedy|Drama|Sci-Fi': 'M',
    'Action|Drama|Romance|Thriller': 'MF',
    'Adventure|Romance|Sci-Fi': 'MF',
    'Film-Noir|Horror': 'M',
    'Crime|Drama|Film-Noir|Thriller': 'M',
    'Action|Adventure|War': 'M',
    'Romance|Western': 'MF',
    "Action|Children's|Fantasy": 'F',
    'Adventure|Drama|Thriller': 'M',
    'Adventure|Fantasy': 'MF',
    'Musical|War': 'F',
    'Adventure|Musical|Romance': 'F',
    'Action|Romance|Sci-Fi': 'MF',
    'Drama|Film-Noir': 'M',
    'Comedy|Horror|Sci-Fi': 'M',
    'Adventure|Drama|Romance|Sci-Fi': 'MF',
    'Adventure|Animation|Sci-Fi': 'M',
    'Adventure|Crime|Sci-Fi|Thriller': 'M'
}

In [13]:
for genre, annot in GENRE_ANNOT.items():
    if annot == 'MF' and len(genre_movielist[genre]) > 5:
        print(genre, len(genre_movielist[genre]))

Animation|Children's 35
Drama 843


In [14]:
movie_gender_stereotype = []
for row in movies.itertuples():
    if row.title in movie_gendered:
        movie_gender_stereotype.append(movie_gendered[row.title])
    else:
        movie_gender_stereotype.append(GENRE_ANNOT[row.genres])

`model_attr` naming is used just because this is what is hard-coded into Wan et al.'s framework

In [15]:
movies['model_attr'] = movie_gender_stereotype

In [16]:
movies.head()

Unnamed: 0,movie_id,title,genres,model_attr
0,1,Toy Story (1995),Animation|Children's|Comedy,F
1,2,Jumanji (1995),Adventure|Children's|Fantasy,F
2,3,Grumpier Old Men (1995),Comedy|Romance,F
3,4,Waiting to Exhale (1995),Comedy|Drama,F
4,5,Father of the Bride Part II (1995),Comedy,M


In [17]:
movies.to_csv(os.path.join(DATA_DIR, 'movies.attr.csv'), index=False)

In [23]:
nohorror_movies = movies[~movies['movie_id'].isin(horror_movieids)]
nohorror_movies.to_csv(os.path.join(DATA_DIR, 'movies.attr.no_horror.csv'), index=False)

Create ratings file that can be used directly in Wan et al.'s framework

In [18]:
ratings = pd.merge(left=ratings, right=movies, left_on='item_id', right_on='movie_id')
ratings = pd.merge(left=ratings, right=users[['user_id', 'gender']], left_on='user_id', right_on='user_id')
ratings.head()

Unnamed: 0,user_id,item_id,rating,timestamp,movie_id,title,genres,model_attr,gender
0,1,1193,5,978300760,1193,One Flew Over the Cuckoo's Nest (1975),Drama,MF,F
1,1,661,3,978302109,661,James and the Giant Peach (1996),Animation|Children's|Musical,F,F
2,1,914,3,978301968,914,My Fair Lady (1964),Musical|Romance,F,F
3,1,3408,4,978300275,3408,Erin Brockovich (2000),Drama,F,F
4,1,2355,5,978824291,2355,"Bug's Life, A (1998)",Animation|Children's|Comedy,F,F


In [20]:
ratings.rename(columns={'gender': 'user_attr'}, inplace=True)
ratings.head()

Unnamed: 0,user_id,item_id,rating,timestamp,movie_id,title,genres,model_attr,user_attr
0,1,1193,5,978300760,1193,One Flew Over the Cuckoo's Nest (1975),Drama,MF,F
1,1,661,3,978302109,661,James and the Giant Peach (1996),Animation|Children's|Musical,F,F
2,1,914,3,978301968,914,My Fair Lady (1964),Musical|Romance,F,F
3,1,3408,4,978300275,3408,Erin Brockovich (2000),Drama,F,F
4,1,2355,5,978824291,2355,"Bug's Life, A (1998)",Animation|Children's|Comedy,F,F


In [24]:
nohorror_ratings = ratings[~ratings['movie_id'].isin(horror_movieids)]

In [25]:
def get_split(df, is_datetime=False, test_only=165):
    """
    Split data into training, validation, and test samples,
    where training is the earliest set of user ratings,
    validation is the middle set, and test is the latest set
    of user ratings.
    
    `test_only=165` default was selected after seeing that the
    mean number of ratings per user is around 165
    """
    
    if is_datetime: 
        df['timestamp'] = pd.to_datetime(df['timestamp'])
    df = df.sort_values(by='timestamp')
    
    user_ratings = []
    splits = []
    indices = []
    
    for g_id, group in df.groupby(by='user_id'):
        indices.extend(group.index)
        user_ratings.append(len(group))
        
        div = int(len(group) * 0.8)
        if len(group) < test_only:
            splits.extend([0] * div)
            splits.extend([2] * (len(group) - div))
        else:
            splits.extend([0] * div)
            rem = len(group) - div 
            splits.extend([1] * (rem // 2))
            splits.extend([2] * (rem - rem // 2))
            
    return np.array(splits), np.array(indices), np.array(user_ratings)

In [28]:
splits, indices, user_rcount = get_split(ratings, is_datetime=False)
nh_splits, nh_indices, nh_user_rcount = get_split(
    nohorror_ratings, is_datetime=False, test_only=150)

In [30]:
sorted_ratings = ratings.ix[indices]
sorted_ratings['split'] = splits
sorted_ratings.head()

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


Unnamed: 0,user_id,item_id,rating,timestamp,movie_id,title,genres,model_attr,user_attr,split
31,1,3186,4,978300019,3186,"Girl, Interrupted (1999)",Drama,F,F,0
27,1,1721,4,978300055,1721,Titanic (1997),Drama|Romance,M,F,0
37,1,1022,5,978300055,1022,Cinderella (1950),Animation|Children's|Musical,F,F,0
22,1,1270,5,978300055,1270,Back to the Future (1985),Comedy|Sci-Fi,M,F,0
24,1,2340,3,978300103,2340,Meet Joe Black (1998),Romance,F,F,0


In [33]:
nh_sorted_ratings = nohorror_ratings.ix[nh_indices]
nh_sorted_ratings['split'] = nh_splits
nh_sorted_ratings.head()

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


Unnamed: 0,user_id,item_id,rating,timestamp,movie_id,title,genres,model_attr,user_attr,split
31,1,3186,4,978300019,3186,"Girl, Interrupted (1999)",Drama,F,F,0
22,1,1270,5,978300055,1270,Back to the Future (1985),Comedy|Sci-Fi,M,F,0
27,1,1721,4,978300055,1721,Titanic (1997),Drama|Romance,M,F,0
37,1,1022,5,978300055,1022,Cinderella (1950),Animation|Children's|Musical,F,F,0
24,1,2340,3,978300103,2340,Meet Joe Black (1998),Romance,F,F,0


In [34]:
sorted_ratings.to_csv(os.path.join(OUT_DIR, 'df_movielens_1m.csv'), index=False)

In [35]:
nh_sorted_ratings.to_csv(os.path.join(OUT_DIR, 'df_movielens_1m.no_horror.csv'), index=False)