# Collaborative Filtering Recommendations via Matrix Factorization

## Step 1: Set parameters and download dataset if it does not exist

In [1]:
import os
import urllib.request
import zipfile

In [2]:
# Root folder that holds every downloaded dataset (trailing slash is intentional,
# the paths below are built by plain concatenation).
data_dir = 'data/'
# Release to work with; valid choices: 100k, 1m, 10m, 20m, latest-small, latest
dataset = '100k'
# Folder the selected release lives in, e.g. 'data/ml-100k'.
input_dir = data_dir + 'ml-' + dataset

In [3]:
def download_dataset(dataset, out_dir):
    """Download a MovieLens zip archive and extract it into ``out_dir``.

    The archive URL is built as ``<base_url>ml-<dataset>.zip``. The archive is
    fetched to a temporary file, unpacked, and the temporary file is removed
    again whether or not the extraction succeeds.

    Args:
        dataset: Dataset identifier inserted into the archive name
            (for example ``'100k'`` gives ``ml-100k.zip``).
        out_dir: Directory the archive is extracted into. It is created,
            including missing parents, when it does not exist yet.

    Raises:
        urllib.error.URLError: If the archive cannot be downloaded.
        zipfile.BadZipFile: If the downloaded file is not a valid zip archive.
    """
    zip_format = '{}ml-{}.zip'
    # NOTE(review): plain HTTP download; consider HTTPS if the host offers it.
    base_url = 'http://files.grouplens.org/datasets/movielens/'

    zip_url = zip_format.format(base_url, dataset)

    # Create the directory we extract into (the parameter, not a notebook global).
    os.makedirs(out_dir, exist_ok=True)

    # With no target filename, urlretrieve stores the download in a temp file.
    zip_file, _ = urllib.request.urlretrieve(zip_url)

    try:
        with zipfile.ZipFile(zip_file, 'r') as zip_ref:
            zip_ref.extractall(out_dir)
    finally:
        # Always drop the temporary archive, even when extraction fails.
        os.unlink(zip_file)

In [4]:
# Only hit the network when the extracted dataset folder is missing.
if os.path.exists(input_dir):
    print("Database already exists.")
else:
    download_dataset(dataset, data_dir)
    print("Database downloaded.")

Database already exists.


## Step 2: Read the rating data as a coordinate list

In [5]:
import numpy as np

In [6]:
# File names (relative to input_dir) of the training and test splits.
train_input_file, test_input_file = 'u1.base', 'u1.test'

In [15]:
def get_file_data(input_file, separator='\t', ignore_extra=True):
    """Read a delimited text file of integer records into a 2-D int32 array.

    Every non-blank line is split on ``separator`` and the retained fields are
    parsed with ``int``. Blank lines are skipped.

    Args:
        input_file: Path of the text file to read.
        separator: Field delimiter used within each line.
        ignore_extra: When true, only the first three fields of each line are
            kept; later fields are dropped without being parsed, so they do
            not have to be integers.

    Returns:
        ``np.ndarray`` of dtype ``int32`` with one row per non-blank line.
        A file without any records yields an empty array of shape ``(0, 3)``
        (``(0, 0)`` when ``ignore_extra`` is false) rather than an error.

    Raises:
        ValueError: If a retained field is not an integer literal.
    """
    kept_columns = 3
    rows = []
    with open(input_file, 'r') as in_file:
        for line in in_file:
            # A blank line would otherwise reach int('') and raise.
            if not line.strip():
                continue
            fields = line.split(separator)
            if ignore_extra:
                # Truncate before parsing so discarded columns are never converted.
                fields = fields[:kept_columns]
            rows.append([int(v) for v in fields])
    if not rows:
        # np.array([]) would be 1-D; keep the result 2-D so callers can slice columns.
        return np.empty((0, kept_columns if ignore_extra else 0), dtype=np.int32)
    return np.array(rows, dtype=np.int32)

In [16]:
# Load both splits the same way: resolve the file inside input_dir, then parse it.
train_data, test_data = (
    get_file_data(os.path.join(input_dir, split_file))
    for split_file in (train_input_file, test_input_file)
)

In [17]:
# Sanity check: show the (rows, columns) shape of the train and test arrays.
print(*(split.shape for split in (train_data, test_data)))

(80000, 3) (20000, 3)
