**Each solution should finish running within 5 minutes.**

In [1]:
import numpy as np
import pandas as pd
from numpy.testing import assert_equal, assert_array_equal

In [2]:
try:
    import mlxtend
    print('existing')
except:
    !pip install mlxtend

existing


In [3]:
from mlxtend.preprocessing import TransactionEncoder
import tqdm
from scipy.spatial.distance import cosine
import fim

In [4]:
df_pos = pd.read_excel('pos_data.xlsx')

# Problem 1

For this problem, you will work with the POS data (`pos_data.xlsx`) of a real restaurant. **Do not** redistribute this dataset.

## Problem 1a

Create a function `most_frequent` that returns the 10 most frequent maximal itemsets of the dataset wherein each item is identified by the item ID. Return itemsets by decreasing support then by increasing smallest item ID in the itemset. Sort items in an itemset in increasing order of their numerical value.

In [5]:
def most_frequent():
    
    from mlxtend.frequent_patterns import fpmax
    transactions = [
        x for x in df_pos.groupby('BILL_ID')['ITEM_ID']
        .apply(lambda x: list(x))
    ]
    TE = TransactionEncoder()
    TE_fit = TE.fit(transactions).transform(transactions)
    df = pd.DataFrame(TE_fit, columns = TE.columns_)
    df_out = (fpmax(df, min_support=0.01, use_colnames=True)
              .sort_values(by='support', ascending=False)
             )
    df_out['itemsets'] = df_out['itemsets'].apply(lambda x: tuple(sorted(x)))
    return list(df_out['itemsets'])


In [6]:
max_itemsets = most_frequent()
assert_equal(max_itemsets[:5], 
                   [(587,), (369, 383), (386,), (172,), (378, 383)])

## Problem 1b

Create a function `most_lift` that returns the 10 association rules with the most lift. Each rule should have a minimum relative support of 0.01 and minimum confidence of 0.6. List each rule as a tuple of antecedent and consequent. Items in the antecedent should be sorted by increasing numerical value. Sort the rules by decreasing lift, increasing consequent and increasing smallest item ID in the antecedent.

In [7]:
def most_lift():
    transactions = df_pos.groupby('BILL_ID')['ITEM_ID'].apply(set)
    rules = fim.arules(transactions,
                       supp = 1,
                       conf = 60,
                       report = 'l',
                       eval = 'l' )
    return [(tuple(sorted(val[1])), val[0]) for val in 
            sorted(rules, key = lambda x: (x[2],x[0]),reverse = True)]

In [8]:
rules = most_lift()

assert_equal(
    rules[:5],
    [((654, 657), 651),
     ((654, 656), 651),
     ((651, 657), 654),
     ((646, 651), 654),
     ((651, 656), 654)]
)

## Problem 1c [3 pts]

Provide three recommendations to the owner of the restaurant based on the frequent itemset mining and association rule mining performed on the POS dataset. Each recommendation should be supported by relevant FIM or ARM results.

### Recommendation 1

Given the top maximal itemsets, it is recommended that most frequently bought items such as mineral water, potato fries, brewed coffee, Coke 1.5 Liter should always have enough stocks in their inventory in order to maximize sales and to minimize opportunity costs.

In [9]:
from mlxtend.frequent_patterns import fpmax
transactions = [
    x for x in df_pos.groupby('BILL_ID')['ITEM_ID']
    .apply(lambda x: list(x))
]
TE = TransactionEncoder()
TE_fit = TE.fit(transactions).transform(transactions)
df = pd.DataFrame(TE_fit, columns = TE.columns_)
df_out = (fpmax(df, min_support=0.01, use_colnames=True)
          .sort_values(by='support', ascending=False)
         )
df_out['itemsets'] = df_out['itemsets'].apply(lambda x: tuple(sorted(x)))
df_out['itemsets'] = (df_out['itemsets']
                       .apply(lambda x: 
                              [df_pos.iloc[df_pos[df_pos['ITEM_ID']==y]
                                           .drop_duplicates('ITEM_ID')
                                           .index,2]
                               .tolist()[0] for y in x]))

df_out.head(10)

Unnamed: 0,support,itemsets
84,0.045486,[MINERAL WATER]
80,0.037557,"[SINIGANG NA ISDA, STEAMED RICE]"
68,0.031196,[POTATO FRIES]
65,0.02954,[BREWED COFFEE]
76,0.029366,"[NATIVE CHICKEN TINOLA, STEAMED RICE]"
85,0.02893,"[STEAMED RICE, CARAFFE LEMON CUCUMBER]"
64,0.028581,[TILAPIA 150 PER KILO]
63,0.028494,[HAWAIIAN SUPREME]
61,0.027013,[COKE 1.5 LITER]
69,0.026926,"[BEEF BULALO, STEAMED RICE]"


### Recommendation 2 and 3

Given the association rules, it is recommended to further promote their breakfast food in general. It is also recommended to bundle sunny eggs and juice with garlic rice. If the customer does not want juice, the store should recommend coffee as an alternative.  Lastly, it is recommended to bundle up any other breakfast items into a meal bundle that consists of "ulam", garlic rice, and a beverage and for non-breakfast items, create a meal bundle that consists of "ulam" and just plain steamed rice.

In [10]:
df_pos = pd.read_excel('pos_data.xlsx')
transactions = df_pos.groupby('BILL_ID')['ITEM_ID'].apply(set)
rules = fim.arules(transactions, supp = 1, conf = 60, report = 'l', eval = 'l' )


In [11]:
df_lift = (pd.DataFrame(sorted(rules, key=lambda x: -x[2]))
           .rename({0:'consequent', 1:'antecedent', 2:'lift'}, axis=1)
           [['antecedent', 'consequent', 'lift']]
          )
df_lift['antecedent'] = (df_lift['antecedent']
                         .apply(lambda x: [df_pos
                                           .iloc[df_pos[df_pos['ITEM_ID']==y]
                                                 .drop_duplicates('ITEM_ID')
                                                 .index,2]
                                           .tolist()[0] for y in x]
                               )
                        )
df_lift['consequent'] = (df_lift['consequent']
                         .apply(lambda x: 
                                df_pos
                                .iloc[df_pos[df_pos['ITEM_ID']==x]
                                      .drop_duplicates('ITEM_ID')
                                      .index,2].tolist()[0]
                               )
                        )

df_lift

Unnamed: 0,antecedent,consequent,lift
0,"[WITH SUNNY EGG - COMP, WITH JUICE - COMP]",WITH GARLIC RICE - COMP,39.338718
1,"[WITH SUNNY EGG - COMP, WITH COFFEE - COMP]",WITH GARLIC RICE - COMP,38.510067
2,"[WITH GARLIC RICE - COMP, WITH JUICE - COMP]",WITH SUNNY EGG - COMP,37.843672
3,"[WITH GARLIC RICE - COMP, FRIED DAING NA BANGU...",WITH SUNNY EGG - COMP,36.310278
4,"[WITH GARLIC RICE - COMP, WITH COFFEE - COMP]",WITH SUNNY EGG - COMP,35.45749
5,[WITH GARLIC RICE - COMP],WITH SUNNY EGG - COMP,34.245358
6,[WITH SUNNY EGG - COMP],WITH GARLIC RICE - COMP,34.245358
7,"[WITH GARLIC RICE - COMP, FRIED DAING NA BANGU...",WITH COFFEE - COMP,10.188673
8,"[WITH GARLIC RICE - COMP, WITH SUNNY EGG - COMP]",WITH COFFEE - COMP,9.563746
9,[WITH GARLIC RICE - COMP],WITH COFFEE - COMP,9.236804


# Problem 2

For this problem, you will work with the first 50000 lines of the [Book-Crossing](http://www2.informatik.uni-freiburg.de/~cziegler/BX) dataset (`/mnt/data/public/book-crossing/BX-Book-Ratings.csv`). The file itself follows `latin1` encoding. Treat zero as explicit ratings. When sorting, set `kind='mergesort'`.

## Problem 2a [2 pts]

Create a function `user_recommend` that accepts a user ID and returns the 10 best `ISBN` recommendations for that `user` by a user-based recommender system using $k=31$ nearest neighbors. Sort by decreasing estimated rating then by lexicographic order of the ISBN code.

In [12]:
def user_recommend(U):
    n=31
#     U=99
    df_utility = (pd.read_csv('/mnt/data/public/book-crossing/BX-Book-Ratings.csv',
                              nrows=50_000,
                              encoding='latin1',
                              sep=';')
                  .pivot(index='User-ID', columns='ISBN', values='Book-Rating'))
    df_out = df_utility.copy()
    df_centered = df_utility.apply(lambda x: x-x.mean(), axis=1)


    df_others = df_centered.drop(U)
    items_to_predict =df_centered.columns.difference(df_centered.loc[U].dropna().index)

    df_filtered = df_centered.loc[:, df_centered.columns.difference(items_to_predict)].dropna(axis='rows', how='all')

    d={}

    from scipy.spatial.distance import cosine
    for o in tqdm.tqdm_notebook(df_filtered.drop(U).index):
        df = df_centered.loc[[U,o]].dropna(axis='columns')
        d[o] = 1 - cosine(df.loc[U], df.loc[o])

    top_n_users = pd.Series(d).sort_values(kind='mergesort', ascending=(False))[:n]


    for items in tqdm.tqdm_notebook(items_to_predict):
        s_ratings = df_centered.loc[top_n_users.index, items].dropna()
        s_dist = top_n_users[s_ratings.index]
        df_out.loc[U,items] = ((s_ratings * (s_dist)).sum()/(s_dist).sum()) + df_utility.loc[U].mean()

    df_preds = df_out.loc[U, items_to_predict].to_frame().reset_index().sort_values([99, 'ISBN'], kind='mergesort', ascending=(False,True))
    return df_preds['ISBN'].tolist()[:10]

In [13]:
recos_user = user_recommend(99)

assert_equal(
    recos_user[:5], 
    ['0060090367', '0060922532', '0060934425', '0060956445', '006109286X'])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for o in tqdm.tqdm_notebook(df_filtered.drop(U).index):


  0%|          | 0/34 [00:00<?, ?it/s]

  dist = 1.0 - uv / np.sqrt(uu * vv)
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for items in tqdm.tqdm_notebook(items_to_predict):


  0%|          | 0/36270 [00:00<?, ?it/s]

  df_out.loc[U,items] = ((s_ratings * (s_dist)).sum()/(s_dist).sum()) + df_utility.loc[U].mean()
  df_out.loc[U,items] = ((s_ratings * (s_dist)).sum()/(s_dist).sum()) + df_utility.loc[U].mean()


## Problem 2b [2 pts]

Create a function `item_recommend` that accepts a user ID and returns the 10 best ISBN recommendations for that user by an item-based recommender system using $k=25$ nearest neighbors. Sort the columns of the utility matrix lexicographically. Consider only the 13000th to 14000th (exclusive) items/columns (index starts from 0). Recommend only items from the first 100 columns (items) of the resulting reduced utility matrix. Sort by decreasing estimated rating then by lexicographic order of the ISBN code.

In [14]:
from scipy.spatial.distance import cosine
n=25
U=11676
df_utility = (pd.read_csv('/mnt/data/public/book-crossing/BX-Book-Ratings.csv',
                          nrows=50_000,
                          encoding='latin1',
                          sep=';')
              .pivot(index='User-ID', columns='ISBN', values='Book-Rating')
              )

df_utility = df_utility[pd.Series(df_utility.columns).sort_values(kind='mergesort').tolist()[13000:14000]]
df_out = df_utility.copy()

### user mean centering
df_centered = df_utility.apply(lambda x: x-x.mean(), axis=1)

items_to_predict = df_utility.iloc[:,:100].columns.intersection(df_utility.loc[U].drop(df_utility.loc[U].dropna().index).index)

for item in tqdm.tqdm_notebook(items_to_predict):

    df_filtered = df_centered.loc[df_centered.loc[:, [item]].dropna(axis='rows').index,:].dropna(axis='columns', how='all')


    d = {}
    if len(df_filtered.columns.difference([item])) == 0:
        continue
    for o in (df_filtered.columns.difference([item])):
        df = df_centered.loc[:, [item, o]].dropna(axis='rows')
        dist = 1-cosine(df[item], df[o])
        d[o] = dist

    top_n_items = pd.Series(d).sort_values(kind='mergesort', ascending=False)[:n]
    ratings = df_centered.loc[U, top_n_items.index].dropna()
    if len(ratings) == 0 :
        continue

    sim = top_n_items[ratings.index]
    
    ### user mean centering
    try:
        1/sum(sim)
    except:
        continue

    else:
        pred_i = (sum(ratings*sim)/(sum(sim))) + df_utility.loc[U,:].mean()
    
    df_out.loc[U, item] = pred_i
#     return df_out.loc[U, items_to_predict].sort_values(kind='mergesort', ascending=False)[:10]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for item in tqdm.tqdm_notebook(items_to_predict):


  0%|          | 0/53 [00:00<?, ?it/s]

In [15]:
df_out.loc[U, items_to_predict].sort_values(kind='mergesort', ascending=False)

ISBN
0425176339    13.000000
0425178773     9.000000
0425177807     8.500000
0425183009     8.500000
0425182142     8.250000
0425181928     8.142857
0425179265     8.000000
0425177351     6.250000
0425181863     6.000000
0425177424     5.200000
0425183270     5.200000
0425181111     5.083578
0425182673     5.000000
0425183750     4.500000
0425180034     4.000000
0425180298     4.000000
0425181200     4.000000
0425181979     4.000000
0425182932     4.000000
0425183025     4.000000
0425176037     3.875000
0425180042     3.800000
0425181480     3.333333
0425177009     2.615385
042518045X     2.571429
0425178234     2.000000
0425176053     0.000000
0425183181     0.000000
042518269X    -1.500000
0425180905    -4.000000
0425175456    -8.000000
0425176428          NaN
0425176614          NaN
0425176673          NaN
0425176789          NaN
0425176940          NaN
0425177696          NaN
0425178242          NaN
0425178552          NaN
0425178781          NaN
0425178838          NaN
0425178854 

In [16]:
def item_recommend(U):
    from scipy.spatial.distance import cosine
    n=25
#     U=11676
    ### reading file and converting to user-item utility matrix
    df_utility = (pd.read_csv('/mnt/data/public/book-crossing/BX-Book-Ratings.csv',
                              nrows=50_000,
                              encoding='latin1',
                              sep=';')
                  .pivot(index='User-ID', columns='ISBN', values='Book-Rating')
                  )
    ### Including only 13k-th column up to 13999th column once it is lexicographically sorted
    df_utility = df_utility[pd.Series(df_utility.columns)
                            .sort_values(kind='mergesort')
                            .tolist()[13000:14000]]
    ### copying utility matrix to input predictions later
    df_out = df_utility.copy()
    ### user mean centering
    df_centered = df_utility.apply(lambda x: x-x.mean(), axis=1)
    ### Knowing what items to predict for user U
    items_to_predict = (df_utility
                        .iloc[:,:100]
                        .columns
                        .intersection(df_utility
                                      .loc[U]
                                      .drop(df_utility.loc[U]
                                            .dropna()
                                            .index)
                                      .index)
                       )
    ### Looping through items to predict
    for item in tqdm.tqdm_notebook(items_to_predict):
        ### knowing which items can be used to calculate cosine distance with variable "item"
        df_filtered = (df_centered
                       .loc[df_centered
                            .loc[:, [item]]
                            .dropna(axis='rows')
                            .index,:]
                       .dropna(axis='columns', how='all'))
        d = {}
        ### sometimes there are no items that can be cosine distance'd with var "item"
        if len(df_filtered.columns.difference([item])) == 0:
            continue
        ### if there are items that can, loop through them and calculate cosine similarity which is 1-cosdist    
        for o in (df_filtered.columns.difference([item])):
            df = df_centered.loc[:, [item, o]].dropna(axis='rows')
            dist = 1-cosine(df[item], df[o])
            d[o] = dist
        ### sorting top n items that are most similar with var "item"
        top_n_items = (pd.Series(d)
                       .sort_values(kind='mergesort', ascending=False)[:n])
        ### ratings of user U for the top n items
        ratings = df_centered.loc[U, top_n_items.index].dropna()
        ### sometimes user U hasn't rated top_n_items yet
        if len(ratings) == 0 :
            continue
        ### getting cosine similarities of corresponding top n items that user U has rated before
        sim = top_n_items[ratings.index]

        ### check if sum of cosine similarities is non-zero. If zero, then skip
        try:
            1/sum(sim)
        except:
            continue
        
        #### prediction for var "item" = calculate weighted average + user mean to reverse user-mean-centering. Weights are the cosine similarities
        else:
            pred_i = ((sum(ratings*sim)/(sum(sim))) 
                      + df_utility.loc[U,:].mean())
        df_out.loc[U, item] = pred_i
    return (df_out
            .loc[U, items_to_predict]
            .sort_values(kind='mergesort', ascending=False)
            .index.tolist()[:10])

In [17]:
recos_item = item_recommend(11676)

assert_equal(
    recos_item[:5],
    ['0425176339', '0425178773', '0425177807', '0425183009', '0425182142']
)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for item in tqdm.tqdm_notebook(items_to_predict):


  0%|          | 0/53 [00:00<?, ?it/s]

## Problem 2c

Create a function `svd_recommend` that accepts a user ID and returns the 10 best ISBN recommendations for that user by an SVD latent factor model recommender system. Use surprise to implement `svd_recommend` with 100 factors, 20 epochs, biased `True` and random_state `1337`. Consider all of `BX-Book-Ratings.csv` not just the first 50000 rows. Treat zero as an explicit rating.

In [18]:
def svd_recommend(user):
    from surprise import Reader, Dataset, SVD
    df = pd.read_csv('/mnt/data/public/book-crossing/BX-Book-Ratings.csv',
                     encoding='latin-1', sep=';')

    reader = Reader(rating_scale=(-10, 10))
    data = Dataset.load_from_df(df, reader)
    trainset = data.build_full_trainset()

    a = SVD(random_state=1337)
    a.fit(trainset)

    isbn = np.array(df['ISBN'].unique())
    rated = np.array(df[df["User-ID"] == user]['ISBN'].unique())
    test = np.setdiff1d(isbn, rated)

    testset = [[user, i, 3] for i in test]
    predictions = a.test(testset)
    pred_ratings = np.array([pred.est for pred in predictions])

    top = pred_ratings.argsort()[::-1][:10]
    return test[top]

In [19]:
recos_svd = svd_recommend(198711)

assert_equal(
    set(recos_svd[:6]),
    {'0006513220',
     '0060248025',
     '0615116426',
     '1844262553',
     '8445071416',
     '8826703132'})