In [1]:
import matplotlib.pyplot as plt
import math
from math import log
import pandas as pd
import numpy as np
import random

### Load data

#### MovieLens

In [50]:
df_links = pd.read_csv('data/movielens/ml-latest-small/links.csv')
df_movies = pd.read_csv('data/movielens/ml-latest-small/movies.csv')
df_ratings = pd.read_csv('data/movielens/ml-latest-small/ratings.csv')
df_tags = pd.read_csv('data/movielens/ml-latest-small/tags.csv')

In [51]:
# df_tags.head()
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


#### Instacart

In [4]:
#instacart
df_products = pd.read_csv('data/instacart/products.csv')
df_carts_prior = pd.read_csv('data/instacart/order_products__prior.csv')
df_carts_train = pd.read_csv('data/instacart/order_products__train.csv')
df_carts = pd.concat([df_carts_prior, df_carts_train])
df_carts.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


In [5]:
df_carts.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


In [6]:
# transforms the data into a long list of lists, each containing product IDs 
# that were purchased in the same order
carts = df_carts[['order_id', 'product_id']].groupby('order_id')['product_id'].apply(list).to_list()
carts[0:3]

[[49302, 11109, 10246, 49683, 43633, 13176, 47209, 22035],
 [33120, 28985, 9327, 45918, 30035, 17794, 40141, 1819, 43668],
 [33754, 24838, 17704, 21903, 17668, 46667, 17461, 32665]]

### Ranking

Recommend 10 movies based on recent ratings

Use techniques to make sure the recommendation is reliable

In [116]:

def myfun(arr, damp_value):
    my = np.mean(arr)
    N = len(arr)
    rating_sum = np.sum(arr)
    r_i = (rating_sum + (my*damp_value)) / (N + damp_value)
    return r_i

global_damp_value = 100

data = df_ratings
ratings = pd.DataFrame(data.groupby("movieId")["rating"].mean())

ratings['num of ratings'] = pd.DataFrame(data.groupby('movieId')['rating'].count())
ratings['damped mean'] = pd.DataFrame(data.groupby('movieId')['rating'].apply(lambda arr: myfun(arr, global_damp_value)))

ratings.head()

Unnamed: 0_level_0,rating,num of ratings,damped mean
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,3.92093,215,3.92093
2,3.431818,110,3.431818
3,3.259615,52,3.259615
4,2.357143,7,2.357143
5,3.071429,49,3.071429


In [117]:
global_mean = df_ratings["rating"].mean()
global_damp_value = 100


def myfun(arr):
    N = len(arr)
    rating_sum = np.sum(arr)
    r_i = (rating_sum + (global_mean*global_damp_value)) / (N + global_damp_value)
    return r_i


data = df_ratings
ratings = pd.DataFrame(data.groupby("movieId")["rating"].mean())

ratings['num of ratings'] = pd.DataFrame(data.groupby('movieId')['rating'].count())
ratings['damped mean'] = pd.DataFrame(data.groupby('movieId')['rating'].apply(lambda arr: myfun(arr)))

ratings.head()

Unnamed: 0_level_0,rating,num of ratings,damped mean
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,3.92093,215,3.787796
2,3.431818,110,3.465027
3,3.259615,52,3.418787
4,2.357143,7,3.426689
5,3.071429,49,3.360105


### Association rule mining

Calculate the number of frequent itemsets with varying levels for support

Try to guess what value of minimum support would be reasonable

Calculate association rules and find the one whose subsequent item has the least support (the one more in the tail)

#### Priori (Apyori)

In [None]:
from apyori import apriori

In [None]:
association_rules = apriori(carts, min_support=XXX, 
                            min_confidence=XXX,
                            min_lift=XXX, min_length=XXX)
association_rules = list(association_rules)

In [None]:
association_rules

In [None]:
idx = 5 #prints the 5th association rule

rule = association_rules[idx]
frequent_itemset = rule.items
support = rule.support

antecedent = rule.ordered_statistics[0].items_base
antecedent = [df_products.iloc[a-1]['product_name'] for a in antecedent]
consequent = rule.ordered_statistics[0].items_add
consequent = [df_products.iloc[c-1]['product_name'] for c in consequent]
lift = rule.ordered_statistics[0].lift
confidence = rule.ordered_statistics[0].confidence

print(f'{antecedent}->{consequent}')
print(f'support = {support}')
print(f'confidence = {confidence}')
print(f'lift = {lift}')

#### FP-growth (mlxtend)

In [None]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, fpmax, fpgrowth
from mlxtend.frequent_patterns import association_rules

In [None]:
# encode the dataset into a orders x items binary sparse matrix
te = TransactionEncoder()
te_data = te.fit(carts).transform(carts, sparse=True)
df = pd.DataFrame.sparse.from_spmatrix(te_data, columns=te.columns_)
# product indices must either start from 0 or be strings
df.columns = [str(i) for i in df.columns] 
# alternatively, reduce ids by 1
#carts_modified = [[carts[l][i]-1 for i in range(0, len(carts[l]))] for l in range(0, len(carts))]

In [None]:
frequent_itemsets = fpgrowth(df, min_support=xxxx, use_colnames=True, verbose=1)

In [None]:
rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=xxxx)
rules