# Preprocess MovieLens

In [3]:
import datetime
import json
import os
import time

import numpy as np

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
%matplotlib inline

import pandas as pd
import scipy.sparse

import seaborn as sns
sns.set(context="paper", font_scale=1.5, rc={"lines.linewidth": 2}, font='DejaVu Serif')
from sklearn.utils import shuffle

This call to matplotlib.use() has no effect because the backend has already
been chosen; matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.

The backend was *originally* set to 'module://ipykernel.pylab.backend_inline' by the following code:
  File "/home/ita/anaconda3/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/home/ita/anaconda3/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/ita/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/home/ita/anaconda3/lib/python3.6/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/home/ita/anaconda3/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 478, in start
    self.io_loop.start()
  File "/home/ita/anaconda3/lib/python3.6/site-packages/zmq/eventloop/ioloop.py",

In [4]:
dname = 'ml-100k'

In [5]:
def timestamp_to_date(timestamp):
    return datetime.datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S')

In [6]:
if dname == 'ml-100k':
    DATA_DIR = 'data/ml-100k/'
    raw_data = pd.read_csv(os.path.join(DATA_DIR, 'u.data'),sep = '\t', header=0)
elif dname == 'ml-1m':
    DATA_DIR = 'data/ml-1m'
    raw_data = pd.read_csv(os.path.join(DATA_DIR, 'ratings.dat'),sep = '::', header=0,engine = 'python')

In [7]:
raw_data.columns = ['userId','movieId','rating','timestamp']
raw_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,186,302,3,891717742
1,22,377,1,878887116
2,244,51,2,880606923
3,166,346,1,886397596
4,298,474,4,884182806


In [8]:
raw_data = raw_data[raw_data.rating >= 4]
raw_data = shuffle(raw_data)

In [9]:
raw_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
8166,328,162,4,885048004
63268,626,258,4,878771243
9092,226,14,5,883889691
30511,230,216,4,880484444
48501,345,258,4,884916532


In [10]:
tstamp = np.array(raw_data['timestamp'])

In [11]:
print("Time span of the dataset: From %s to %s" % 
      (timestamp_to_date(np.min(tstamp)), timestamp_to_date(np.max(tstamp))))

Time span of the dataset: From 1997-09-20 12:05:10 to 1998-04-23 08:10:38


In [12]:
train_raw_data = raw_data[:int(0.8 * raw_data.shape[0])]

In [13]:
def get_count(tp, id):
    playcount_groupbyid = tp[[id]].groupby(id, as_index=False)
    count = playcount_groupbyid.size()
    return count

In [14]:
def filter_triplets(tp, min_uc=0, min_sc=0):
    # Only keep the triplets for songs which were listened to by at least min_sc users. 
    if min_sc > 0:
        songcount = get_count(tp, 'movieId')
        tp = tp[tp['movieId'].isin(songcount.index[songcount >= min_sc])]
    
    # Only keep the triplets for users who listened to at least min_uc songs
    # After doing this, some of the songs will have less than min_uc users, but should only be a small proportion
    if min_uc > 0:
        usercount = get_count(tp, 'userId')
        tp = tp[tp['userId'].isin(usercount.index[usercount >= min_uc])]
    
    # Update both usercount and songcount after filtering
    usercount, songcount = get_count(tp, 'userId'), get_count(tp, 'movieId') 
    return tp, usercount, songcount

In [15]:
train_raw_data, user_activity, item_popularity = filter_triplets(train_raw_data)

In [16]:
sparsity = 1. * train_raw_data.shape[0] / (user_activity.shape[0] * item_popularity.shape[0])

print("After filtering, there are %d watching events from %d users and %d movies (sparsity: %.3f%%)" % 
      (train_raw_data.shape[0], user_activity.shape[0], item_popularity.shape[0], sparsity * 100))

After filtering, there are 44300 watching events from 942 users and 1417 movies (sparsity: 3.319%)


In [17]:
unique_uid = user_activity.index
unique_sid = item_popularity.index

In [18]:
song2id = dict((sid, i) for (i, sid) in enumerate(unique_sid))
user2id = dict((uid, i) for (i, uid) in enumerate(unique_uid))

In [21]:
with open(os.path.join(DATA_DIR, 'pro', 'unique_uid.txt'), 'w') as f:
    for uid in unique_uid:
        f.write('%s\n' % uid)

In [22]:
with open(os.path.join(DATA_DIR, 'pro', 'unique_sid.txt'), 'w') as f:
    for sid in unique_sid:
        f.write('%s\n' % sid)

Split 12.5% (10% of the total ratings) as validation set

Make sure there is no empty users/items

In [23]:
print ("There are total of %d unique users in the training set and %d unique users in the entire dataset" % \
(len(pd.unique(train_raw_data['userId'])), len(unique_uid)))

There are total of 942 unique users in the training set and 942 unique users in the entire dataset


In [24]:
print ("There are total of %d unique items in the training set and %d unique items in the entire dataset" % \
(len(pd.unique(train_raw_data['movieId'])), len(unique_sid)))

There are total of 1417 unique items in the training set and 1417 unique items in the entire dataset


In [25]:
train_sid = set(pd.unique(train_raw_data['movieId']))

In [26]:
left_sid = list()
for i, sid in enumerate(unique_sid):
    if sid not in train_sid:
        left_sid.append(sid)

In [27]:
print ("There are total of %d unique items in the training set and %d unique items in the entire dataset" % \
(len(pd.unique(train_raw_data['movieId'])), len(unique_sid)))

There are total of 1417 unique items in the training set and 1417 unique items in the entire dataset


For test data, only keep the users and items that appear in the training/validation sets

In [28]:
test_raw_data = raw_data[int(0.8 * len(raw_data)):]

In [29]:
test_raw_data = test_raw_data[test_raw_data['movieId'].isin(unique_sid)]
test_raw_data = test_raw_data[test_raw_data['userId'].isin(unique_uid)]

In [30]:
print (len(train_raw_data), len(test_raw_data))

44300 11043


Basic data information: what's the timespan for train/test?

### Numerize the data into (timestamp, user_index, item_index) format

In [31]:
def numerize(tp):
    otp = tp.copy()
    otp['uid'] = tp.userId.apply(lambda x : user2id[x])
    otp['sid'] = tp.movieId.apply(lambda x : song2id[x])
    return otp[['timestamp', 'uid', 'sid','rating']]

In [32]:
test_data = numerize(test_raw_data)
test_data.to_csv(os.path.join(DATA_DIR, 'pro', 'test.csv'), index=False)

In [33]:
train_data = numerize(train_raw_data)
train_data.to_csv(os.path.join(DATA_DIR, 'pro', 'train.csv'), index=False)