In [26]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, csgraph, csc_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from sklearn.cluster import SpectralClustering

In [2]:
# load dataset
df = pd.read_csv("file:/home/jvanover/Documents/ucd/271/programming-assignments/recommendation-system/data/my_train.csv")

# sort by customer-id and movie-id
df.sort_values(by=['customer-id','movie-id'], inplace=True)

# dicts for mapping ids to indices in the customer_nodes matrix
cid_to_index = {key:value for value, key in enumerate(df["customer-id"].unique())}
mid_to_index = {key:value for value, key in enumerate(df["movie-id"].unique())}
# mid_to_index = {x:x-1 for x in range(1,17771)}

# construct customer_nodes matrix
row = np.array(df['customer-id'].apply(lambda x : cid_to_index[x]).tolist())
col = np.array(df['movie-id'].apply(lambda x : mid_to_index[x]).tolist())
data = np.array(df['rating'].tolist())
customer_nodes = csc_matrix( (data,(row,col)), shape=(len(cid_to_index), len(mid_to_index)))

In [3]:
# calculate cosine similarities to make the affinity matrix
W = cosine_similarity(customer_nodes)

In [82]:
# perform clustering
clustering =  SpectralClustering(
                                    n_clusters=25,
                                    n_components=7, # number of eigenvectors to use for spectral embedding
                                    random_state=25, 
                                    n_init=10,       # number of k-means iterations
                                    affinity='precomputed_nearest_neighbors',
                                    n_neighbors=500
                                ).fit(W)

In [None]:
# let's see that clustering distribution!
 df_cid_clustering = pd.DataFrame.from_dict(
                                             { key:value for key,value in zip( df["customer-id"].unique() , clustering.labels_ ) },
                                             orient='index',
                                             columns=["cluster"]
                                         ).reset_index()
 df_cid_clustering["cluster"].value_counts()

In [83]:
# create mapping from cid to cluster and vice-versa
cid_to_cluster = { key:value for key,value in zip( df["customer-id"].unique() , clustering.labels_ ) }
cluster_to_cids = [[] for x in range(len(clustering.labels_))]
for cid, cluster_no in cid_to_cluster.items():
    cluster_to_cids[cluster_no].append(cid)

In [113]:
def estimate_rating(cid, mid):
    
    # what other customers are in this cluster?
    cids = cluster_to_cids[cid_to_cluster[cid]]
    
    cid_indices = [cid_to_index[x] for x in cids]
    mid_index = mid_to_index[mid]
    
    # get slice of cids from the same cluster for the particular mid
    temp = customer_nodes.getcol(mid_index).tocsr()[cid_indices]
    
    # eliminate zeros and calculate mean of nonzero data
    temp.eliminate_zeros()
    if temp.nnz > 0:
        average_rating = round(temp.data.mean())
    else:
        average_rating = 0
        
    return average_rating

In [114]:
df_eval = pd.read_csv("file:/home/jvanover/Documents/ucd/271/programming-assignments/recommendation-system/data/my_eval_blank.csv")
df_eval_answers = pd.read_csv("file:/home/jvanover/Documents/ucd/271/programming-assignments/recommendation-system/data/my_eval_answers.csv")

df_eval["rating"] = df_eval.apply(lambda row : estimate_rating(row["customer-id"],row["movie-id"]), axis=1)

mean_squared_error(df_eval_answers["rating"].tolist(),df_eval["rating"])

1.5394103136379655

In [115]:
df_eval["rating"].value_counts()

4.0    56415
3.0    28146
5.0     3279
0.0     2956
2.0     2520
1.0      279
6.0       47
7.0        1
Name: rating, dtype: int64

In [112]:
df_eval_answers["rating"].value_counts()

4    31755
3    26585
5    22226
2     9044
1     4033
Name: rating, dtype: int64