## Generate Train/Test/Val Splits

In [27]:
import h5py
import sys
import gc
import os
import pandas as pd
import numpy as np

In [2]:
# Directory that receives the train/test/val CSV files written at the end of the notebook.
OUT_DIR = '/data/lisatmp4/jethahan/datasets/nflx_full/tf/'

In [3]:
def load_data(fn):
    """Read the entire 'nflx_full' dataset of an HDF5 file into memory.

    Parameters
    ----------
    fn : str
        Path to the HDF5 file. It must contain a dataset named 'nflx_full'.

    Returns
    -------
    The full contents of the dataset as read by ``dataset[()]``. The data is
    read while the file is open, so the result remains usable after the file
    has been closed.
    """
    # The context manager closes the file handle even if the read fails;
    # previously the handle was never closed.
    with h5py.File(fn, 'r') as ds:
        print('Loading data into memory')
        # Load entire ds into memory
        data_arr = ds['nflx_full'][()]
    return data_arr

In [4]:
# Load the full ratings array into memory and wrap it in a DataFrame
ds = load_data('/data/lisatmp4/jethahan/datasets/nflx_full/nflx_full.h5')
df = pd.DataFrame(ds)
# Drop our own reference to the raw array before forcing a collection pass.
# NOTE(review): memory is only released if the DataFrame holds its own copy
# of the data rather than a view of the array -- confirm.
ds = None
# Explicitly request a garbage collection; binding the return value to `_`
# keeps the cell from displaying it.
_ = gc.collect()

Loading data into memory


In [5]:
# Give the three columns meaningful names, then preview the first rows.
df.columns = ['user_id', 'movie_id', 'rating']
df.head()

Unnamed: 0,user_id,movie_id,rating
0,1228213.0,10817.0,5.0
1,2032427.0,10817.0,5.0
2,476805.0,10817.0,2.0
3,503548.0,10817.0,5.0
4,2236799.0,10817.0,3.0


In [6]:
# Calculate sparsity
# Sorted unique ids; their sorted order is what the reindexing dicts
# (uid2uix / mid2mix) enumerate over.
unique_user_ids = sorted(pd.unique(df['user_id']))
unique_movie_ids = sorted(pd.unique(df['movie_id']))
num_users = len(unique_user_ids)
num_movies = len(unique_movie_ids)
# Percentage of the (user x movie) grid that actually holds a rating,
# i.e. the fill rate of the rating matrix, reported under the label "sparsity".
sparsity = (df.shape[0] / (num_users*num_movies))*100
# '%%' is the documented escape for a literal percent sign. The previous
# '% %' produced the same text only because a space is accepted as a
# conversion flag in front of the '%' conversion.
print("sparsity  %.2f%%" % sparsity)

sparsity  1.18%


In [7]:
# Reindex users & movies
# Each raw id is mapped to its position in the sorted list of unique ids,
# giving contiguous zero-based indices.
uid2uix = {user_id: ix for ix, user_id in enumerate(unique_user_ids)}
mid2mix = {movie_id: ix for ix, movie_id in enumerate(unique_movie_ids)}

In [8]:
# Generate train & test DFs
# A fixed seed makes the random 20% hold-out repeatable.
np.random.seed(12345)
num_ratings = len(df)
n_test = int(0.20 * num_ratings)
# Row positions drawn without replacement become the test set.
test_ixs = np.random.choice(num_ratings, size=n_test, replace=False)
test_mask = np.zeros(num_ratings, dtype=bool)
test_mask[test_ixs] = True
# Everything selected by the mask is test; the complement is train.
test_df = df.loc[test_mask]
train_df = df.loc[np.logical_not(test_mask)]

In [9]:
# Data validation checks
# Distinct users in the training split vs. the full data; a smaller training
# count means some users have no training ratings.
n_train_users = len(pd.unique(train_df['user_id']))
n_all_users = len(pd.unique(df['user_id']))
print("USERS:\ntrain_unique: %d\nfull_unique %d" % (n_train_users, n_all_users))

USERS:
train_unique: 479844
full_unique 480189


In [10]:
# Distinct movies in the training split vs. the full data.
n_train_movies = len(pd.unique(train_df['movie_id']))
n_all_movies = len(pd.unique(df['movie_id']))
print("MOVIES:\ntrain_unique: %d\nfull_unique %d" % (n_train_movies, n_all_movies))

MOVIES:
train_unique: 17770
full_unique 17770


In [11]:
# Find user_ids not in train set
train_ids = set(pd.unique(train_df['user_id']))
all_ids = set(pd.unique(df['user_id']))
# Users with no rating in the training split.
move_ids = all_ids - train_ids
# Boolean selector over test_df rows that belong to those users.
move_ixs = test_df['user_id'].isin(move_ids)
# Move those rows from test to train so every user appears in training.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4
# and removed in pandas 2.0.
train_df = pd.concat([train_df, test_df[move_ixs]])
test_df = test_df[~move_ixs]

In [12]:
# Re-check user coverage of the training split against the full data.
n_train_users = len(pd.unique(train_df['user_id']))
n_all_users = len(pd.unique(df['user_id']))
print("USERS:\ntrain_unique: %d\nfull_unique %d" % (n_train_users, n_all_users))

USERS:
train_unique: 480189
full_unique 480189


In [13]:
# Re-check movie coverage of the training split against the full data.
n_train_movies = len(pd.unique(train_df['movie_id']))
n_all_movies = len(pd.unique(df['movie_id']))
print("MOVIES:\ntrain_unique: %d\nfull_unique %d" % (n_train_movies, n_all_movies))

MOVIES:
train_unique: 17770
full_unique 17770


In [14]:
# Create validation set
# A fixed seed makes the random 10% hold-out of the current training rows repeatable.
np.random.seed(138238)
num_ratings = len(train_df)
n_val = int(0.10 * num_ratings)
# Row positions drawn without replacement become the validation set.
val_ixs = np.random.choice(num_ratings, size=n_val, replace=False)
val_mask = np.zeros(num_ratings, dtype=bool)
val_mask[val_ixs] = True
# Split on the mask; val_df must be taken before train_df is rebound.
val_df = train_df.loc[val_mask]
train_df = train_df.loc[np.logical_not(val_mask)]

In [15]:
# User coverage of the training split after carving out the validation rows.
n_train_users = len(pd.unique(train_df['user_id']))
n_all_users = len(pd.unique(df['user_id']))
print("USERS:\ntrain_unique: %d\nfull_unique %d" % (n_train_users, n_all_users))

USERS:
train_unique: 479935
full_unique 480189


In [16]:
# Movie coverage of the training split after carving out the validation rows.
n_train_movies = len(pd.unique(train_df['movie_id']))
n_all_movies = len(pd.unique(df['movie_id']))
print("MOVIES:\ntrain_unique: %d\nfull_unique %d" % (n_train_movies, n_all_movies))

MOVIES:
train_unique: 17770
full_unique 17770


In [17]:
# Find user_ids not in train set
train_ids = set(pd.unique(train_df['user_id']))
all_ids = set(pd.unique(df['user_id']))
# Users with no rating in the training split.
move_ids = all_ids - train_ids
# Boolean selector over val_df rows that belong to those users.
move_ixs = val_df['user_id'].isin(move_ids)
# Move those rows from validation to train so every user appears in training.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4
# and removed in pandas 2.0.
train_df = pd.concat([train_df, val_df[move_ixs]])
val_df = val_df[~move_ixs]

In [18]:
# Final check of user coverage in the training split.
n_train_users = len(pd.unique(train_df['user_id']))
n_all_users = len(pd.unique(df['user_id']))
print("USERS:\ntrain_unique: %d\nfull_unique %d" % (n_train_users, n_all_users))

USERS:
train_unique: 480189
full_unique 480189


In [19]:
# Final check of movie coverage in the training split.
n_train_movies = len(pd.unique(train_df['movie_id']))
n_all_movies = len(pd.unique(df['movie_id']))
print("MOVIES:\ntrain_unique: %d\nfull_unique %d" % (n_train_movies, n_all_movies))

MOVIES:
train_unique: 17770
full_unique 17770


In [20]:
# Replace raw user/movie ids in the training split with their contiguous indices.
# Series.map performs the dictionary lookup in one vectorized pass instead of
# a Python-level call per row.
map_user_ids = train_df['user_id'].map(uid2uix)
map_movie_ids = train_df['movie_id'].map(mid2mix)
# map() yields NaN for an id missing from the lookup, whereas a direct dict
# lookup raises KeyError; keep failing loudly in that case.
if map_user_ids.isnull().any() or map_movie_ids.isnull().any():
    raise KeyError('train_df contains ids missing from uid2uix/mid2mix')
# assign() returns a new frame with the two columns replaced in place.
train_df = train_df.assign(user_id=map_user_ids, movie_id=map_movie_ids)

In [21]:
# Replace raw user/movie ids in the test split with their contiguous indices.
# Series.map performs the dictionary lookup in one vectorized pass instead of
# a Python-level call per row.
map_user_ids = test_df['user_id'].map(uid2uix)
map_movie_ids = test_df['movie_id'].map(mid2mix)
# map() yields NaN for an id missing from the lookup, whereas a direct dict
# lookup raises KeyError; keep failing loudly in that case.
if map_user_ids.isnull().any() or map_movie_ids.isnull().any():
    raise KeyError('test_df contains ids missing from uid2uix/mid2mix')
# test_df is a row selection of another frame, so writing columns into it
# directly can trigger pandas' SettingWithCopyWarning; assign() builds a new
# frame instead.
test_df = test_df.assign(user_id=map_user_ids, movie_id=map_movie_ids)

In [22]:
# Replace raw user/movie ids in the validation split with their contiguous indices.
# Series.map performs the dictionary lookup in one vectorized pass instead of
# a Python-level call per row.
map_user_ids = val_df['user_id'].map(uid2uix)
map_movie_ids = val_df['movie_id'].map(mid2mix)
# map() yields NaN for an id missing from the lookup, whereas a direct dict
# lookup raises KeyError; keep failing loudly in that case.
if map_user_ids.isnull().any() or map_movie_ids.isnull().any():
    raise KeyError('val_df contains ids missing from uid2uix/mid2mix')
# val_df is a row selection of another frame, so writing columns into it
# directly can trigger pandas' SettingWithCopyWarning; assign() builds a new
# frame instead.
val_df = val_df.assign(user_id=map_user_ids, movie_id=map_movie_ids)

In [28]:
# Write the training split to OUT_DIR without the DataFrame index column.
train_csv_path = os.path.join(OUT_DIR, 'train.csv')
train_df.to_csv(train_csv_path, index=False)

In [29]:
# Write the test split to OUT_DIR without the DataFrame index column.
test_csv_path = os.path.join(OUT_DIR, 'test.csv')
test_df.to_csv(test_csv_path, index=False)

In [30]:
# Write the validation split to OUT_DIR without the DataFrame index column.
val_csv_path = os.path.join(OUT_DIR, 'val.csv')
val_df.to_csv(val_csv_path, index=False)