In [120]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
from scipy import sparse
import seaborn as sns
import random

In [2]:
# Load the raw ratings file. It has no header row, so the three column
# names are supplied explicitly.
data = pd.read_csv(
    "./data/combined_data_1.txt",
    header=None,
    names=["cust_id", "rating", "date"],
)

In [153]:
# Report the size of the full dataset, then take a small prefix of it to
# develop the cleaning logic on.
full_shape = data.shape
print("full dataset:" + str(full_shape))
# Only the first 10,000 rows (~0.04% of the ~24M-row file) are used here.
df = data.iloc[:10000]
sample_shape = df.shape
print("for cleaning logic purposes: " + str(sample_shape))

full dataset:(24058263, 3)
for cleaning logic purposes: (10000, 3)


In [154]:
# Walk the raw rows once and collect one record (dict) per rating, then build
# the cleaned DataFrame from the whole list in a single constructor call.
#
# Rows whose rating is NaN are treated as movie header rows: their first
# column holds the external movie id followed by one trailing character,
# which is stripped. Every following row is a rating for that movie until
# the next header row appears.
#
# External movie / customer ids are remapped to dense 0-based internal ids in
# order of first appearance (movie_conv_dict / cust_conv_dict).

movie_id = -1  # stays -1 for any rating row seen before the first header row
append_dict = {}
cust_conv_dict = {}   # external customer id (float) -> internal 0-based id
movie_conv_dict = {}  # external movie id (str)      -> internal 0-based id
append_matrix = []    # list of {"rating", "cust_id", "movie_id"} records

for row in df.itertuples():
    # row[0] is the index, row[1] the cust_id column, row[2] the rating column
    if math.isnan(row[2]):
        # Header row: switch the "current movie" and move on.
        movie_ext_id = row[1][:-1]
        if movie_ext_id not in movie_conv_dict:
            movie_conv_dict[movie_ext_id] = len(movie_conv_dict)
        movie_id = movie_conv_dict[movie_ext_id]
        continue

    cust_ext_id = float(row[1])
    if cust_ext_id not in cust_conv_dict:
        cust_conv_dict[cust_ext_id] = len(cust_conv_dict)
    cust_id = cust_conv_dict[cust_ext_id]

    # A fresh dict per record, so no defensive .copy() is needed.
    append_dict = {
        "rating": float(row[2]),
        "cust_id": cust_id,
        "movie_id": movie_id,
    }
    append_matrix.append(append_dict)

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; building
# the frame directly from the record list is the supported equivalent.
# The float64 cast keeps the column dtypes identical to what the previous
# append-onto-an-empty-frame approach produced.
df_clean = pd.DataFrame(
    append_matrix, columns=["rating", "cust_id", "movie_id"]
).astype("float64")

In [155]:
# Plain-text preview of the first ten cleaned rating records.
print(df_clean.head(n=10))

   rating  cust_id  movie_id
0     3.0      0.0       0.0
1     5.0      1.0       0.0
2     4.0      2.0       0.0
3     4.0      3.0       0.0
4     3.0      4.0       0.0
5     3.0      5.0       0.0
6     4.0      6.0       0.0
7     3.0      7.0       0.0
8     4.0      8.0       0.0
9     3.0      9.0       0.0


In [156]:
# Count the distinct customers and movies present after cleaning.
unique_customers = df_clean["cust_id"].nunique()
unique_movies = df_clean["movie_id"].nunique()
print("Customers: " + str(unique_customers))
print("Movies: " + str(unique_movies))
# Ratio of distinct customers to rows in the raw slice.
# NOTE(review): df.shape[0] also counts the movie header rows that the
# cleaning loop skips, so the denominator is slightly larger than the number
# of ratings -- confirm whether len(df_clean) was intended.
print("overlap%: " + str(unique_customers / df.shape[0]))

Customers: 9619
Movies: 8
overlap%: 0.9619


In [157]:
# Create sparse matrix and test train splits
#
# A single seeded RandomState drives both the shuffle and the train/test
# mask, so a fresh "Restart & Run All" reproduces the same split.
SEED = 42
rng = np.random.RandomState(SEED)

df_clean = df_clean.sample(frac=1, random_state=rng)
n_users = df_clean["cust_id"].nunique()
n_movies = df_clean["movie_id"].nunique()
train_percent = 0.8
# Each row independently lands in train with probability train_percent.
msk = rng.rand(len(df_clean)) < train_percent
df_raw_train = df_clean[msk]
df_raw_test = df_clean[~msk]


def _to_rating_matrix(frame, shape):
    """Build a (users x movies) sparse COO matrix of ratings from `frame`.

    `frame` must have "rating", "cust_id" and "movie_id" columns. The id
    columns are cast to int64 explicitly because they are used as row /
    column indices and may be stored as floats.
    """
    rows = frame["cust_id"].astype(np.int64).values
    cols = frame["movie_id"].astype(np.int64).values
    return sparse.coo_matrix((frame["rating"].values, (rows, cols)), shape=shape)


# The shape relies on ids being dense and 0-based (0 .. n-1), which is how the
# cleaning cell assigns them (len(dict) at first appearance).
df_train = _to_rating_matrix(df_raw_train, (n_users, n_movies))
df_test = _to_rating_matrix(df_raw_test, (n_users, n_movies))

In [158]:
# FIRST REC ALGO!!
from sklearn.metrics.pairwise import pairwise_distances

# User Similarity Matrix
# The sparse training matrix is densified with .toarray() (a plain ndarray)
# rather than .todense() (an np.matrix), which recent scikit-learn releases
# refuse to accept. The result is a dense n_users x n_users array, so this
# only scales to small user counts.
user_correlation = 1 - pairwise_distances(df_train.toarray(), metric='correlation')
# Correlation distance is NaN for rows with zero variance (for example users
# with no ratings in the training split); treat those as "no similarity".
user_correlation[np.isnan(user_correlation)] = 0

# Function to predict ratings
def predict(ratings, similarity):
    """Predict ratings as each row's mean plus a similarity-weighted deviation.

    For every row (one row per user, as implied by the per-row mean):

        pred = row_mean + (similarity @ (ratings - row_mean)) / sum(|similarity|)

    Parameters
    ----------
    ratings : array-like, shape (n_rows, n_cols)
        Dense rating matrix. Converted to a float ndarray, so an np.matrix
        is accepted as well.
    similarity : array-like, shape (n_rows, n_rows)
        Row-to-row similarity weights.

    Returns
    -------
    numpy.ndarray, shape (n_rows, n_cols)
        Predicted ratings. A row whose similarity weights are all zero has no
        neighbours to borrow from and falls back to its own mean rating
        instead of producing NaN from a 0/0 division.
    """
    ratings = np.asarray(ratings, dtype=float)
    similarity = np.asarray(similarity, dtype=float)

    # keepdims=True keeps these as column vectors so they broadcast row-wise.
    mean_user_rating = ratings.mean(axis=1, keepdims=True)
    ratings_diff = ratings - mean_user_rating

    weighted_diff = similarity.dot(ratings_diff)
    weight_sum = np.abs(similarity).sum(axis=1, keepdims=True)

    # Divide only where the normaliser is non-zero; elsewhere the adjustment
    # stays 0 so the prediction is just the row mean.
    adjustment = np.divide(
        weighted_diff,
        weight_sum,
        out=np.zeros_like(weighted_diff),
        where=weight_sum > 0,
    )
    pred = mean_user_rating + adjustment
    return pred

