# Preprocess MovieLens

In [1]:
import datetime
import json
import os
import time

import numpy as np

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
%matplotlib inline

import pandas as pd
import scipy.sparse

import seaborn as sns
sns.set(context="paper", font_scale=1.5, rc={"lines.linewidth": 2}, font='DejaVu Serif')
from sklearn.utils import shuffle

In [30]:
dname = 'ml-1m'

In [31]:
def timestamp_to_date(timestamp):
    return datetime.datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S')

In [33]:
if dname == 'ml-100k':
    DATA_DIR = 'data/ml-100k/'
    raw_data = pd.read_csv(os.path.join(DATA_DIR, 'u.data'),sep = '\t', header=0)
elif dname == 'ml-1m':
    DATA_DIR = 'data/ml-1m'
    raw_data = pd.read_csv(os.path.join(DATA_DIR, 'ratings.dat'),sep = '::', header=0,engine = 'python')

In [34]:
raw_data.columns = ['userId','movieId','rating','timestamp']
raw_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,661,3,978302109
1,1,914,3,978301968
2,1,3408,4,978300275
3,1,2355,5,978824291
4,1,1197,3,978302268


In [35]:
raw_data = raw_data[raw_data.rating >= 4]
raw_data = shuffle(raw_data)

In [36]:
raw_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
524693,3238,1242,5,968355304
677143,4058,3359,4,996712145
727831,4352,2027,4,965534478
313114,1866,223,5,974760856
279331,1680,2303,4,974711125


In [37]:
tstamp = np.array(raw_data['timestamp'])

In [38]:
print("Time span of the dataset: From %s to %s" % 
      (timestamp_to_date(np.min(tstamp)), timestamp_to_date(np.max(tstamp))))

Time span of the dataset: From 2000-04-26 08:05:32 to 2003-03-01 02:49:50


In [39]:
train_raw_data = raw_data[:int(0.8 * raw_data.shape[0])]

In [45]:
def get_count(tp, id):
    playcount_groupbyid = tp[[id]].groupby(id, as_index=False)
    count = playcount_groupbyid.size()
    return count

In [46]:
def filter_triplets(tp, min_uc=3, min_sc=0):
    # Only keep the triplets for songs which were listened to by at least min_sc users. 
    if min_sc > 0:
        songcount = get_count(tp, 'movieId')
        tp = tp[tp['movieId'].isin(songcount.index[songcount >= min_sc])]
    
    # Only keep the triplets for users who listened to at least min_uc songs
    # After doing this, some of the songs will have less than min_uc users, but should only be a small proportion
    if min_uc > 0:
        usercount = get_count(tp, 'userId')
        tp = tp[tp['userId'].isin(usercount.index[usercount >= min_uc])]
    
    # Update both usercount and songcount after filtering
    usercount, songcount = get_count(tp, 'userId'), get_count(tp, 'movieId') 
    return tp, usercount, songcount

In [47]:
train_raw_data, user_activity, item_popularity = filter_triplets(train_raw_data)

In [48]:
sparsity = 1. * train_raw_data.shape[0] / (user_activity.shape[0] * item_popularity.shape[0])

print("After filtering, there are %d watching events from %d users and %d movies (sparsity: %.3f%%)" % 
      (train_raw_data.shape[0], user_activity.shape[0], item_popularity.shape[0], sparsity * 100))

After filtering, there are 460220 watching events from 6035 users and 3502 movies (sparsity: 2.178%)


In [49]:
unique_uid = user_activity.index
unique_sid = item_popularity.index

In [50]:
song2id = dict((sid, i) for (i, sid) in enumerate(unique_sid))
user2id = dict((uid, i) for (i, uid) in enumerate(unique_uid))

In [51]:
with open(os.path.join(DATA_DIR, 'pro', 'unique_uid.txt'), 'w') as f:
    for uid in unique_uid:
        f.write('%s\n' % uid)

In [52]:
with open(os.path.join(DATA_DIR, 'pro', 'unique_sid.txt'), 'w') as f:
    for sid in unique_sid:
        f.write('%s\n' % sid)

Split 12.5% (10% of the total ratings) as validation set

Make sure there is no empty users/items

In [53]:
print ("There are total of %d unique users in the training set and %d unique users in the entire dataset" % \
(len(pd.unique(train_raw_data['userId'])), len(unique_uid)))

There are total of 6035 unique users in the training set and 6035 unique users in the entire dataset


In [54]:
print ("There are total of %d unique items in the training set and %d unique items in the entire dataset" % \
(len(pd.unique(train_raw_data['movieId'])), len(unique_sid)))

There are total of 3502 unique items in the training set and 3502 unique items in the entire dataset


In [55]:
train_sid = set(pd.unique(train_raw_data['movieId']))

In [56]:
left_sid = list()
for i, sid in enumerate(unique_sid):
    if sid not in train_sid:
        left_sid.append(sid)

In [57]:
print ("There are total of %d unique items in the training set and %d unique items in the entire dataset" % \
(len(pd.unique(train_raw_data['movieId'])), len(unique_sid)))

There are total of 3502 unique items in the training set and 3502 unique items in the entire dataset


For test data, only keep the users and items that appear in the training/validation sets

In [58]:
test_raw_data = raw_data[int(0.8 * len(raw_data)):]

In [59]:
test_raw_data = test_raw_data[test_raw_data['movieId'].isin(unique_sid)]
test_raw_data = test_raw_data[test_raw_data['userId'].isin(unique_uid)]

In [60]:
print (len(train_raw_data), len(test_raw_data))

460220 115020


Basic data information: what's the timespan for train/test?

### Numerize the data into (timestamp, user_index, item_index) format

In [61]:
def numerize(tp):
    otp = tp.copy()
    otp['uid'] = tp.userId.apply(lambda x : user2id[x])
    otp['sid'] = tp.movieId.apply(lambda x : song2id[x])
    return otp[['timestamp', 'uid', 'sid','rating']]

In [62]:
test_data = numerize(test_raw_data)
test_data.to_csv(os.path.join(DATA_DIR, 'pro', 'test.csv'), index=False)

In [63]:
train_data = numerize(train_raw_data)
train_data.to_csv(os.path.join(DATA_DIR, 'pro', 'train.csv'), index=False)