Run Import_Data_Process.ipynb before running this notebook; it produces the pre-processed data that this notebook reads.

## Import Packages/Dataset & Data Pre-Processing

In [None]:
# Importing Necessary Packages

from implicit.nearest_neighbours import tfidf_weight
from scipy.sparse import coo_matrix, csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime
from pathlib import Path
from numpy import bincount, log, sqrt

import scipy.sparse as sparse
import implicit
import pandas as pd
import numpy as np
import pickle
import time
import heapq

In [None]:
# Load the raw CSV tables from the working directory
df_products = pd.read_csv("products.csv")
df_orders = pd.read_csv("orders.csv")
df_order_products_train = pd.read_csv("order_products__train.csv")
df_order_products_prior = pd.read_csv("order_products__prior.csv")

# Attach the product columns to every prior order line; the left join keeps
# order lines whose product_id has no match in products.csv
df_merged_prior = df_order_products_prior.merge(df_products, on="product_id", how="left")

In [None]:
# Split off a validation set: one held-out prior order per user.
# NOTE(review): the original cell used the undefined names `order_prior` and
# `order_product`, indexed with a tuple instead of a list of columns, and called
# pd.merge with a single argument, so it could not run. The code below is a
# best-guess reconstruction of the intent - confirm before relying on it.
order_prior = df_orders.loc[df_orders["eval_set"] == "prior", ["user_id", "order_id"]]

# Keep the largest order_id of each user, as the original `.max()` did.
# NOTE(review): that is only the latest order if order ids grow over time.
validation_order = order_prior.groupby("user_id", as_index=False).max()[["user_id", "order_id"]]

# Products contained in each held-out order
validation_set = pd.merge(
    validation_order,
    df_order_products_prior[["order_id", "product_id"]],
    on="order_id",
)

In [None]:
# Load the pre-processed tables pickled by earlier runs
# NOTE(review): despite its name, df_tfidf is read from "df_user_cos.pkl"
df_tfidf = pd.read_pickle("df_user_cos.pkl")
df_prior_user_products = pd.read_pickle("df_user_products_prior.pkl")

# Wrap the pickled frequencies in a DataFrame whose "product_id" column,
# if present, is exposed as "frequency"
df_product_frequency = pd.DataFrame(pd.read_pickle("df_product_frequency.pkl")).rename(
    columns={"product_id": "frequency"}
)

In [None]:
# Load the saved test data and report how many rows (users) it holds
test_data_path = "user_products__test.csv"
df_user_products_test = pd.read_csv(test_data_path)

index = df_user_products_test.index
number_of_users = len(index)
print(number_of_users)

## Create Necessary Matrices

In [None]:
# Make user_product dataframe

In [None]:
def user_product_prior(filepath, df_orders, df_order_products_prior):
    """
    Writes a CSV of how many times each user bought each product in "prior" orders

    filepath: destination of the CSV
    df_orders: orders table with order_id, user_id and eval_set columns
    df_order_products_prior: order lines with order_id and product_id columns
    Output columns: user_id, product_id, quantity
    """
    # Orders that belong to the "prior" evaluation set, reduced to the join keys
    is_prior = df_orders["eval_set"] == "prior"
    prior_orders = df_orders.loc[is_prior, ["order_id", "user_id"]]

    # One row per purchased order line, tagged with the buying user
    order_lines = df_order_products_prior[["order_id", "product_id"]]
    purchases = prior_orders.merge(order_lines, on="order_id")

    # Collapse repeated (user, product) pairs into a purchase count
    quantities = (
        purchases.groupby(["user_id", "product_id"])
        .size()
        .reset_index(name="quantity")
    )

    # index_label=False leaves the index column without a header field
    quantities.to_csv(filepath, index_label=False)


In [None]:
# Users, products and purchase quantities: generate the CSV only when it is
# missing, then load it with both id columns as categoricals
matrix_df_path = "user_products__prior.csv"
if not Path(matrix_df_path).is_file():
    user_product_prior(matrix_df_path, df_orders, df_order_products_prior)
df_user_product_prior = pd.read_csv(matrix_df_path).astype(
    {"user_id": "category", "product_id": "category"}
)

In [None]:
# Make weighted utility matrix
def product_user_matrix(matrix_path, df_user_product_prior):
    """
    Builds the sparse utility matrix from purchase history and saves it as .npz
    Rows: products Columns: users Values: quantity bought

    The id columns must be categorical: their category codes are the indices
    """
    quantities = df_user_product_prior["quantity"]
    product_codes = df_user_product_prior["product_id"].cat.codes.copy()
    user_codes = df_user_product_prior["user_id"].cat.codes.copy()

    # COO layout: (values, (row indices, column indices))
    utility = sparse.coo_matrix((quantities, (product_codes, user_codes)))

    sparse.save_npz(matrix_path, utility)

In [None]:
# Get the `product x user` matrix
# Build and save the matrix only if no saved copy exists, then load it from disk
matrix_path = "product_user_matrix.npz"
if not Path(matrix_path).is_file():
    product_user_matrix(matrix_path, df_user_product_prior)
# NOTE(review): this rebinds the name `product_user_matrix` from the builder
# function to the loaded CSR matrix, so the function can no longer be called
# under that name once this line has run
product_user_matrix = sparse.load_npz(matrix_path).tocsr()

In [None]:
# Flip the axes so rows are users and columns are products
user_product_matrix = product_user_matrix.transpose()

In [None]:
def tfidf(tf):
    """
    Generates TF-IDF weight matrix with given user x product matrix
    Document = user (row)
    Term = product (column)
    tf = square root of the count of term in document (dampens large counts)
    idf = log(# of documents/# of documents with t + 1). Plus one on denominator to avoid dividing by 0.

    Note that idf is zero or negative for a product that all, or all but one,
    of the users bought.

    Returns a new matrix in COO format holding tf * idf for every stored entry.
    """
    tf_idf = coo_matrix(tf)

    # Number of users
    N = float(tf_idf.shape[0])

    # Every stored entry is one (user, product) pair, so counting the column
    # indices gives the number of users with an entry for each product.
    # minlength keeps one slot per product column, including trailing columns
    # that have no entries at all.
    no_users_prod = bincount(tf_idf.col, minlength=tf_idf.shape[1])
    idf = log(N / (1 + no_users_prod))

    # `data` is rebound to a fresh array, not modified in place
    tf_idf.data = sqrt(tf_idf.data) * idf[tf_idf.col]

    return tf_idf

In [None]:
# Weight the user x product counts and store the result in Compressed Sparse
# Row format
tf_idf = tfidf(user_product_matrix).tocsr()

## Make Recommendation

In [None]:
# Recommendation for one target user 
def _purchased_products(user_id):
    """Return the set of products recorded for `user_id`, empty if there is no record."""
    rows = df_prior_user_products.loc[df_prior_user_products['user_id'] == user_id, 'product_id']
    if rows.empty:
        return set()
    # The first matching cell is iterated as a collection of product ids
    return set(rows.iloc[0])


def recommend(target_user_id, target_user, cos_sim, K, N):
    """
    Generates up to N recommendations for the target user

    Arguments:
        target_user_id: id of the target user; position p of cos_sim is treated
            as user id p + 1
        target_user: row of tf_idf matrix for the target user (not used here,
            kept so existing calls keep working)
        cos_sim: cosine similarity vector between the target user and every user
        K: number of similar users to consider
        N: number of products to recommend
    Returns:
        (products bought by the target user, list of recommended product ids
        ordered from most to least frequently bought)
    """
    # Select the similar users with the highest cosine similarity score.
    # One extra is taken, presumably because the target user is their own
    # best match and gets skipped below.
    K_similar = heapq.nlargest(K + 1, range(len(cos_sim)), cos_sim.take)

    # Find products bought by the target user
    products_target_user = _purchased_products(target_user_id)

    # Candidate products in first-seen order; dict keys drop products that
    # several similar users have in common
    candidates = {}
    for similar_user in K_similar:
        # Convert the 0-based position into a user id before comparing
        similar_user_id = similar_user + 1
        if similar_user_id == target_user_id:
            continue
        # Products bought by the similar user that the target user did not buy
        sim_recs = _purchased_products(similar_user_id) - products_target_user
        if not sim_recs:
            continue
        candidates.update(dict.fromkeys(sim_recs))
        # Stop once there are more candidates than slots to fill
        if len(candidates) > N:
            break

    def popularity(product):
        # The product id breaks ties between equally frequent products
        return (df_product_frequency.loc[product]['frequency'], product)

    # Pick the N most popular candidates, most popular first
    recommendations = heapq.nlargest(N, candidates, key=popularity)

    return products_target_user, recommendations

In [None]:
#Function for avg_precision
def avg_precision(actual, predicted):
    """
    Average precision of a ranked prediction list

    actual: collection of relevant items
    predicted: ranked list of predicted items
    A prediction is a hit when it is in `actual` and did not already appear
    earlier in `predicted`. The precision at each hit is summed and divided by
    the number of hits; 0.0 is returned when there is no hit.
    """
    hits = 0.0
    precision_sum = 0.0
    for rank, item in enumerate(predicted, start=1):
        # Misses and repeated predictions do not contribute
        if item not in actual or item in predicted[:rank - 1]:
            continue
        hits += 1.0
        precision_sum += hits / rank

    return precision_sum / hits if hits else 0.0

In [None]:
# Table of per-user cosine similarity vectors, filled in by cos_sim()
user_cos_df = pd.DataFrame(columns=['User', 'Cos_sim'])


def cos_sim(num_user):
    """
    Records, for users 1 through num_user, the cosine similarity between that
    user's tf_idf row and every row of tf_idf

    Each result is stored in the module-level user_cos_df under the row label
    equal to the user id: 'User' holds the id and 'Cos_sim' the similarity array.
    User id u corresponds to row u - 1 of tf_idf.
    """
    # The upper bound is inclusive so the last user is not left out
    for user_id in range(1, num_user + 1):
        target_user = tf_idf[user_id - 1]
        # Keep the computation sparse, then densify into one column per user
        similarity = cosine_similarity(tf_idf, target_user, dense_output=False).toarray()
        user_cos_df.at[user_id, 'User'] = user_id
        user_cos_df.at[user_id, 'Cos_sim'] = similarity


# NOTE(review): the original call passed `num_user`, which is not defined in
# this notebook; number_of_users (the row count of the test data) is assumed to
# be the intended value - confirm. Every stored vector has one entry per row of
# tf_idf, so memory use grows with num_user times the number of users.
cos_sim(number_of_users)

In [None]:
# Function to save to disk 
def save_data_to_disk(dataframe, df_name):
    """Pickles `dataframe` to the file df_<df_name>.pkl in the working directory"""
    dataframe.to_pickle(f"df_{df_name}.pkl")

# Persist the cosine similarity table as df_user_cos.pkl
save_data_to_disk(dataframe=user_cos_df, df_name="user_cos")

In [None]:
#run model on one user 
def execute(i, K=7, N=10):
    """
    Runs the model on one user and scores the result

    i: user id, starting at 1
    K: number of similar users to consider (default 7)
    N: number of products to recommend (default 10)
    Returns the average precision of the recommendations against the
    products listed for the user in df_user_products_test.
    """
    # Local import: only the parsing step below needs it
    import re

    target_user_index = i
    target_user = tf_idf[target_user_index - 1]

    # User i sits at position i - 1, the same convention used for tf_idf
    # and for the test data below.
    # NOTE(review): assumes df_tfidf has one row per user starting at user 1
    similarities = df_tfidf.iloc[i - 1, 1]

    # Pick K neighbors and N products to recommend
    products_target, recommendations = recommend(target_user_index, target_user, similarities, K, N)

    # The test cell is a string holding the product ids. Pull out every run of
    # digits, which avoids depending on the exact brackets, separators or
    # trailing characters (slicing off the last two characters would drop a
    # digit from a plain "[a, b]" literal).
    actual = df_user_products_test.iloc[i - 1, 1]
    actual = [int(token) for token in re.findall(r"\d+", actual)]

    # Calculate AP
    return avg_precision(actual, recommendations)

In [None]:
# Average precision for every evaluated user: ids 1 through 499
avg_precesion_list = [execute(user_id) for user_id in range(1, 500)]

In [None]:
# Mean average precision (MAP) over the evaluated users.
# Stored as `mean_avg_precision` rather than `map` so that the builtin map()
# stays usable in the rest of the session.
mean_avg_precision = sum(avg_precesion_list) / len(avg_precesion_list)
print(mean_avg_precision)

# Evaluation

In [None]:
# Plot MAP against the number of similar users for three list sizes
import matplotlib.pyplot as plt

# Hard-coded scores, one value per entry of C
# NOTE(review): presumably copied from earlier evaluation runs - confirm
C = list(range(2, 10))
y1 = [
    0.3323911724314137, 0.34789690524096945, 0.34940595956692515, 0.33977,
    0.3384513429785058, 0.3518761592092234, 0.3518651592092234, 0.3518651592092234,
]
y2 = [
    0.3123911724314137, 0.32789690524096945, 0.32940595956692515, 0.32877,
    0.3336828329283057, 0.3428761592092234, 0.3428651592092234, 0.3428651592092234,
]
y3 = [
    0.3223911724314137, 0.32689690524096945, 0.33540595956692515, 0.33477,
    0.3334513429785058, 0.3418761592092234, 0.3418651592092234, 0.3418651592092234,
]

plt.figure(figsize=(6, 6))
plt.plot(C, y1, label='10 item')
plt.plot(C, y2, label='11 item')
plt.plot(C, y3, label='9 item')
plt.legend()
plt.xlabel('Number of similar user selected')
plt.ylabel("MAP")
plt.show()


In [None]:
# Clear the current figure, then draw a quarter period of sine and cosine
# with a text label next to each curve
plt.clf()
x = np.linspace(0, 1, num=101)
y1 = np.sin(x * np.pi / 2)
y2 = np.cos(x * np.pi / 2)
plt.plot(x, y1, label='sin')
plt.text(0.08, 0.2, 'sin')
plt.plot(x, y2, label='cos')
plt.text(0.9, 0.2, 'cos')

In [None]:
# Baseline: the 10 most popular products
def k_popular(k, df_merged_prior):
    """
    Returns the ids of the `k` most popular products based on purchase count,
    most purchased first

    k: number of products to return
    df_merged_prior: dataframe with one row per purchased item and a
        product_id column
    Fewer than `k` ids are returned when there are fewer distinct products.
    """
    # value_counts sorts by count, highest first
    purchase_counts = df_merged_prior["product_id"].value_counts()
    return purchase_counts.head(k).index

In [None]:
# Baseline recommendation: ids of the 10 most purchased products
popular_products = k_popular(k=10, df_merged_prior=df_merged_prior)
popular_products