In [None]:
import pandas as pd
import io, csv, os, sys
from sklearn.utils import shuffle
import numpy as np
from collections import defaultdict

In [0]:
# Use the directory the notebook was launched from as the project root and
# put it on the import path so the project-local modules below can be found.
base_path = os.getcwd()
sys.path.append(base_path)

# Project-local helper modules (expected to live in base_path).
import spectral_clustering
import save_utils
import validate

In [None]:
class options:
    """Run-time switches for this notebook.

    Attributes set on every instance:
        save_objs: when true, intermediate objects are written under ``obj/``.
        load_objs: when true, intermediate objects are read back from ``obj/``.
        split_dataset: when true, the raw data is split 0.8 / rest into
            train and validation frames.
    """

    def __init__(self):
        # Caching of intermediate objects is off by default.
        self.save_objs, self.load_objs = False, False
        # Hold out part of the data for validation by default.
        self.split_dataset = True


# Single shared configuration instance read by the cells below.
opt = options()

In [None]:
# The cache directory is only needed when objects are saved or loaded;
# `pre` is the path prefix used by the save/load calls in later cells.
if opt.save_objs or opt.load_objs:
    pre = base_path + '/obj/'
    os.makedirs(pre, exist_ok=True)

In [None]:
# Read the ratings table that sits in the notebook's working directory.
# os.path.join is required here: base_path comes from os.getcwd() and has no
# trailing separator, so plain concatenation would point at "<cwd>train.csv".
raw_df = pd.read_csv(os.path.join(base_path, "train.csv"))

# Randomise row order, then renumber rows 0..n-1.
# NOTE(review): shuffle() is called without a random_state, so the split
# below differs from run to run.
raw_df = shuffle(raw_df)
raw_df = raw_df.reset_index(drop=True)

# Fraction handed to split_df: 0.8 when a validation split is requested,
# 0.0 otherwise.
# NOTE(review): how split_df interprets 0.0 is not visible from this notebook
# - confirm it yields the intended train/validate frames.
split_fraction = 0.8 if opt.split_dataset else 0.0
train_df, validate_df = spectral_clustering.split_df(raw_df, split_fraction)

In [None]:
# Pull the two id columns out of the training frame.
train_customers = train_df['customer-id']
train_movies = train_df['movie-id']

# Sorted copies of each id column, fed to create_dict below.
train_cid_sorted = train_customers.sort_values()
train_mid_sorted = train_movies.sort_values()

# Lookup dictionaries built by the project helper from the sorted ids.
train_cid = spectral_clustering.create_dict(train_cid_sorted)
train_mid = spectral_clustering.create_dict(train_mid_sorted)

# Sizes of the two dictionaries (movies first, then customers).
train_cust_n = len(train_cid)
train_movies_n = len(train_mid)
print(train_movies_n, train_cust_n)

In [None]:
# Build the ratings matrix from the training frame and the two id dictionaries.
train_ratings = spectral_clustering.make_matrix(train_df, train_cid, train_mid)

# Optional persistence: `pre` only exists when one of the flags is set,
# so the cache path is built inside the guard.
if opt.save_objs or opt.load_objs:
    ratings_cache = pre + 'train_ratings'
    if opt.save_objs:
        save_utils.save_obj(train_ratings, ratings_cache)
    if opt.load_objs:
        # Replaces the freshly computed matrix with the stored one.
        train_ratings = save_utils.load_obj(ratings_cache)

In [None]:
# Pipeline: ratings matrix -> preprocessed matrix -> movie-by-movie affinity
# matrix -> cluster labels (plus a label map).
# NOTE(review): process_matrix, create_affinity_matrix and cluster are called
# unqualified but are not defined or imported in the visible cells; on a fresh
# kernel they would raise NameError unless defined elsewhere - confirm where
# they come from.
preprocessed_matrix = process_matrix(train_ratings)

train_movie_by_movie = create_affinity_matrix(preprocessed_matrix)

cluster_labels, label_map = cluster(train_movie_by_movie)

# Optional persistence of the cluster labels.
# NOTE(review): label_map is neither saved nor restored, so after a load it
# still comes from the cluster() call above.
if opt.save_objs:
    save_utils.save_obj(cluster_labels, pre+'cluster_labels')
if opt.load_objs:
    # The stored object is the cluster labels, so it must be restored into
    # cluster_labels; assigning it to train_ratings would overwrite the
    # ratings matrix that the prediction step still needs.
    cluster_labels = save_utils.load_obj(pre+'cluster_labels')

In [None]:
# Run prediction over the validation frame using the training artefacts.
result = validate.predict(
    validate_df, train_cid, train_mid, train_ratings, train_movie_by_movie,
    cluster_labels, label_map, block_size=10000,
)

# Write the rounded values of column 2 of `result` (presumably the predicted
# rating - confirm against validate.predict) to the working directory.
# os.path.join is required: base_path has no trailing separator, so plain
# concatenation would write "<cwd>pred_final.txt" next to the directory.
np.savetxt(os.path.join(base_path, 'pred_final.txt'), np.around(result[:, 2]))

# NOTE(review): calculate_metric is called unqualified and is not defined or
# imported in the visible cells - confirm where it comes from.
mse, rmse = calculate_metric(result, validate_df)