# Chapter 15 
 Machine Learning for Business Analytics<br>
Concepts, Techniques, and Applications in Python<br>
by Galit Shmueli, Peter C. Bruce, Peter Gedeck, Nitin R. Patel

Publisher: Wiley; 2nd edition (2024) <br>
<!-- ISBN-13: 978-3031075650 -->

(c) 2024 Galit Shmueli, Peter C. Bruce, Peter Gedeck, Nitin R. Patel

The code needs to be executed in sequence.

Python packages and Python itself change over time. This can cause warnings or errors.
"Warnings" are for information only and can usually be ignored.
"Errors" will stop execution and need to be fixed in order to get results.

If you come across an issue with the code, please follow these steps:

- Check the repository (https://gedeck.github.io/sdsa-code-solutions/) to see if the code has been updated. This might solve the problem.
- Report the problem using the issue tracker at https://github.com/gedeck/sdsa-code-solutions/issues
- Paste the error message into Google and see if someone else has already found a solution

In [2]:
import heapq
import random
from collections import defaultdict

import matplotlib.pyplot as plt
import mlba
import pandas as pd
import surprise
from mlxtend.frequent_patterns import apriori, association_rules
from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split
%matplotlib inline

In [3]:
# Load the Faceplate transactions, index them by transaction number and convert
# the purchase indicators to boolean (the boolean form is used for improved
# performance of the itemset search). Built in one chain so that re-running the
# cell always starts from the freshly loaded file.
fp_df = (
    mlba.load_data('Faceplate.csv')
    .set_index('Transaction')
    .astype('bool')
)
print(fp_df)

# Frequent itemsets with a minimum support of 0.2; use_colnames=True reports the
# itemsets by column name instead of column position.
itemsets = apriori(fp_df, min_support=0.2, use_colnames=True)

# Convert the itemsets into rules, keeping those with confidence >= 0.5.
# The second argument is the number of rows (transactions) in fp_df.
rules = association_rules(itemsets, len(fp_df), metric='confidence', min_threshold=0.5)

# Report the six rules with the highest lift. (A bare sort expression used to
# sit here as well; being in the middle of the cell its value was discarded, so
# only the print below ever produced output.)
print('Rules\n\n', rules.sort_values(by=['lift'], ascending=False)
      .loc[:, ['antecedents', 'consequents', 'support', 'confidence', 'lift', 'leverage']]
      .head(6))

               Red  White   Blue  Orange  Green  Yellow
Transaction                                            
1             True   True  False   False   True   False
2            False   True  False    True  False   False
3            False   True   True   False  False   False
4             True   True  False    True  False   False
5             True  False   True   False  False   False
6            False   True   True   False  False   False
7             True  False   True   False  False   False
8             True   True   True   False   True   False
9             True   True   True   False  False   False
10           False  False  False   False  False    True
Rules

        antecedents   consequents  support  confidence      lift  leverage
13    (Red, White)       (Green)      0.2         0.5  2.500000      0.12
15         (Green)  (Red, White)      0.2         1.0  2.500000      0.12
4          (Green)         (Red)      0.2         1.0  1.666667      0.08
14  (Green, White)      

 Partial output

In [4]:
# Prepare the dataset for table 14.6 based on table 14.5
# NOTE(review): the notebook header says Chapter 15 while the table numbers
# above start with 14 - confirm the numbering against the current edition.
from itertools import chain

# 50 transactions, each given as the set of item numbers it contains
# (two of them are empty).
randomTransactions = [{8}, {3,4,8}, {8}, {3,9}, {9}, {1,8}, {6,9}, {3,5,7,9}, {8}, set(),
                      {1,7,9}, {1,4,5,8,9}, {5,7,9}, {6,7,8}, {3,7,9}, {1,4,9}, {6,7,8}, {8}, set(), {9},
                      {2,5,6,8}, {4,6,9}, {4,9}, {8,9}, {6,8}, {1,6,8}, {5,8}, {4,8,9}, {9}, {8},
                      {1,5,8}, {3,6,9}, {7,9}, {7,8,9}, {3,4,6,8}, {1,4,8}, {4,7,8}, {8,9}, {4,5,7,9}, {2,8,9},
                      {2,5,9}, {1,2,7,9}, {5,8}, {1,7,8}, {8}, {2,7,9}, {4,6,9}, {9}, {9}, {6,7,8}]
print(randomTransactions)

# Sorted list of every item that occurs in at least one transaction; these
# become the columns of the incidence matrix.
uniqueItems = sorted(set(chain.from_iterable(randomTransactions)))

# Boolean incidence matrix (one row per transaction, one column per item),
# created in a single step: a membership test per (transaction, item) pair
# replaces writing the True cells one at a time with .loc.
randomData = pd.DataFrame(
    [[item in transaction for item in uniqueItems] for transaction in randomTransactions],
    columns=uniqueItems,
)
randomData.head()

[{8}, {8, 3, 4}, {8}, {9, 3}, {9}, {8, 1}, {9, 6}, {9, 3, 5, 7}, {8}, set(), {1, 9, 7}, {1, 4, 5, 8, 9}, {9, 5, 7}, {8, 6, 7}, {9, 3, 7}, {1, 4, 9}, {8, 6, 7}, {8}, set(), {9}, {8, 2, 5, 6}, {9, 4, 6}, {9, 4}, {8, 9}, {8, 6}, {8, 1, 6}, {8, 5}, {8, 9, 4}, {9}, {8}, {8, 1, 5}, {9, 3, 6}, {9, 7}, {8, 9, 7}, {8, 3, 4, 6}, {8, 1, 4}, {8, 4, 7}, {8, 9}, {9, 4, 5, 7}, {8, 9, 2}, {9, 2, 5}, {1, 2, 9, 7}, {8, 5}, {8, 1, 7}, {8}, {9, 2, 7}, {9, 4, 6}, {9}, {9}, {8, 6, 7}]


Unnamed: 0,1,2,3,4,5,6,7,8,9
0,False,False,False,False,False,False,False,True,False
1,False,False,True,True,False,False,False,True,False
2,False,False,False,False,False,False,False,True,False
3,False,False,True,False,False,False,False,False,True
4,False,False,False,False,False,False,False,False,True


In [5]:
# Minimum support expressed as "two transactions out of all of them"
two_transaction_support = 2/len(randomData)
itemsets = apriori(randomData, use_colnames=True, min_support=two_transaction_support)

# Turn the itemsets into rules, keeping those with confidence of at least 0.7
rules = association_rules(itemsets, len(randomData), min_threshold=0.7, metric='confidence')

# Print the six rules with the highest lift, restricted to the reported columns
reported = ['antecedents', 'consequents', 'support', 'confidence', 'lift', 'leverage']
strongest_rules = rules.sort_values(by=['lift'], ascending=False)[reported].head(6)
print(strongest_rules)

  antecedents consequents  support  confidence      lift  leverage
3      (8, 3)         (4)     0.04         1.0  4.545455    0.0312
1      (1, 5)         (8)     0.04         1.0  1.851852    0.0184
2      (2, 7)         (9)     0.04         1.0  1.851852    0.0184
4      (3, 4)         (8)     0.04         1.0  1.851852    0.0184
5      (3, 7)         (9)     0.04         1.0  1.851852    0.0184
6      (4, 5)         (9)     0.04         1.0  1.851852    0.0184


In [6]:
# Read the Charles Book Club data
all_books_df = mlba.load_data('CharlesBookClub.csv')

# Columns that are left out of the incidence matrix
ignore = ['Seq#', 'ID#', 'Gender', 'M', 'R', 'F', 'FirstPurch', 'Related Purchase',
          'Mcode', 'Rcode', 'Fcode', 'Yes_Florence', 'No_Florence']

# Binary incidence matrix: drop the ignored columns, set every positive entry
# to 1 and cast the result to bool. Written as one chain so that no
# intermediate frame is modified in place.
count_books = (
    all_books_df
    .drop(columns=ignore)
    .mask(lambda counts: counts > 0, 1)
    .astype('bool')
)

# Frequent itemsets (minimum support 200/4000) and rules with confidence >= 0.5
itemsets = apriori(count_books, use_colnames=True, min_support=200/4000)
rules = association_rules(itemsets, len(count_books), min_threshold=0.5, metric='confidence')

# The 25 rules with the highest lift; `columns` selects what is shown
columns = ['antecedents', 'consequents', 'support', 'confidence', 'lift', 'leverage']
rules.loc[:, columns].sort_values(by='lift', ascending=False).head(25)

Unnamed: 0,antecedents,consequents,support,confidence,lift,leverage
64,"(YouthBks, RefBks)","(ChildBks, CookBks)",0.05525,0.68,2.809917,0.035588
73,"(DoItYBks, RefBks)","(ChildBks, CookBks)",0.06125,0.662162,2.736207,0.038865
60,"(DoItYBks, YouthBks)","(ChildBks, CookBks)",0.067,0.64891,2.681448,0.042014
80,"(GeogBks, RefBks)","(ChildBks, CookBks)",0.05025,0.614679,2.539995,0.030467
69,"(YouthBks, GeogBks)","(ChildBks, CookBks)",0.06325,0.605263,2.501087,0.037961
77,"(DoItYBks, GeogBks)","(ChildBks, CookBks)",0.0605,0.59901,2.475248,0.036058
66,"(ChildBks, CookBks, GeogBks)",(YouthBks),0.06325,0.577626,2.424452,0.037162
72,"(ChildBks, CookBks, RefBks)",(DoItYBks),0.06125,0.591787,2.323013,0.034883
48,"(DoItYBks, GeogBks)",(YouthBks),0.0545,0.539604,2.264864,0.030437
61,"(ChildBks, CookBks, RefBks)",(YouthBks),0.05525,0.533816,2.240573,0.030591


 Partial output (25 rules with highest lift)

In [7]:
# Print the rule table ordered by decreasing lift. The display limits
# (up to 100 rows, 100 characters wide) apply only inside the with-block.
rules_by_lift = rules[columns].sort_values(by=['lift'], ascending=False)
with pd.option_context('display.width', 100, 'display.max_rows', 100):
    print(rules_by_lift)

                       antecedents          consequents  support  confidence      lift  leverage
64              (YouthBks, RefBks)  (ChildBks, CookBks)  0.05525    0.680000  2.809917  0.035588
73              (DoItYBks, RefBks)  (ChildBks, CookBks)  0.06125    0.662162  2.736207  0.038865
60            (DoItYBks, YouthBks)  (ChildBks, CookBks)  0.06700    0.648910  2.681448  0.042014
80               (GeogBks, RefBks)  (ChildBks, CookBks)  0.05025    0.614679  2.539995  0.030467
69             (YouthBks, GeogBks)  (ChildBks, CookBks)  0.06325    0.605263  2.501087  0.037961
77             (DoItYBks, GeogBks)  (ChildBks, CookBks)  0.06050    0.599010  2.475248  0.036058
66    (ChildBks, CookBks, GeogBks)           (YouthBks)  0.06325    0.577626  2.424452  0.037162
72     (ChildBks, CookBks, RefBks)           (DoItYBks)  0.06125    0.591787  2.323013  0.034883
48             (DoItYBks, GeogBks)           (YouthBks)  0.05450    0.539604  2.264864  0.030437
61     (ChildBks, CookBks, Ref

In [8]:
# Simulated ratings table; the fixed seed makes the draws repeatable
random.seed(0)
nratings = 5000

# Inclusive integer range for each column. The columns are drawn one after the
# other in this order (all item ids, then all user ids, then all ratings), so
# the sequence of random numbers is consumed in a fixed order.
column_ranges = {'itemID': (0, 99), 'userID': (0, 999), 'rating': (1, 5)}
randomData = pd.DataFrame({
    column: [random.randint(low, high) for _ in range(nratings)]
    for column, (low, high) in column_ranges.items()
})

def get_top_n(predictions, n=10):
    """Group predictions by user and keep each user's ``n`` highest estimates.

    Parameters
    ----------
    predictions : iterable
        Objects exposing a ``uid`` attribute (used for grouping) and an ``est``
        attribute (used for ranking).
    n : int, default 10
        Maximum number of predictions kept per user.

    Returns
    -------
    collections.defaultdict
        Maps every ``uid`` seen in ``predictions`` to a list of at most ``n``
        predictions, ordered from highest to lowest ``est``.
    """
    per_user = defaultdict(list)
    for prediction in predictions:
        per_user[prediction.uid].append(prediction)

    # Only values of existing keys are replaced here, so the dictionary keeps
    # its size while it is being iterated.
    for user in per_user:
        per_user[user] = heapq.nlargest(
            n, per_user[user], key=lambda candidate: candidate.est)
    return per_user

In [9]:
# Put the ratings into the form the surprise package works with: exactly three
# columns in the order user id, item id, rating, plus a Reader that declares
# the 1-5 rating scale
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(randomData[['userID', 'itemID', 'rating']], reader)

# Keep 25% of the ratings aside as holdout set (fixed random_state)
trainset, holdoutset = train_test_split(data, test_size=0.25, random_state=1)

## User-based filtering
# KNNBasic configured with cosine similarity computed between users
sim_options = dict(name='cosine', user_based=True)
algo = KNNBasic(sim_options=sim_options)
algo.fit(trainset)

# Estimate a rating for every (user, item) pair of the holdout set
predictions = algo.test(holdoutset)

# Keep the four highest estimates per user and print them for the first five
# users, one line of items per user
top_n = get_top_n(predictions, n=4)
print('Top-4 recommended items for each user')
for user, recommended in list(top_n.items())[:5]:
    print(f'User {user}')
    print(''.join(f'  Item {p.iid:2d} ({p.est:.2f})' for p in recommended))

Computing the cosine similarity matrix...
Done computing similarity matrix.
Top-4 recommended items for each user
User 6
  Item  6 (5.00)  Item 77 (2.50)  Item 60 (1.00)
User 222
  Item 77 (3.50)  Item 75 (2.78)
User 424
  Item 14 (3.50)  Item 45 (3.10)  Item 54 (2.34)
User 87
  Item 27 (3.00)  Item 54 (3.00)  Item 82 (3.00)  Item 32 (1.00)
User 121
  Item 98 (3.48)  Item 32 (2.83)


 Partial output

 Rebuild model using the full dataset

In [10]:
# Use every rating in `data` for training (no holdout) and switch to
# item-based cosine similarity (user_based=False)
trainset = data.build_full_trainset()
sim_options = dict(name='cosine', user_based=False)
algo = KNNBasic(sim_options=sim_options)
algo.fit(trainset)

# Estimated rating for user 383 and item 7
algo.predict(uid=383, iid=7)

Computing the cosine similarity matrix...
Done computing similarity matrix.


Prediction(uid=383, iid=7, r_ui=None, est=2.3661840936304324, details={'actual_k': 4, 'was_impossible': False})

 Partial output

In [11]:
# Load the MovieLens ratings and the movie list
ratings = mlba.load_data('MovieLensRatings.csv')
movies = mlba.load_data('MovieLensMovies.csv')

# Lookup table movieId -> title. Pairing the two columns directly avoids
# iterrows(), which builds a full Series object for every row just to read two
# fields. As before, a movieId that occurs more than once keeps its last title.
moviesById = dict(zip(movies['movieId'], movies['title']))

# Convert the ratings into a dataset suitable for scikit-surprise (columns in the
# order user, item, rating) and split it 80/20 into training and holdout sets.
# NOTE(review): the recorded output of the following cell shows half-star
# actual ratings (e.g. 2.5 and 4.5); confirm that rating_scale=(1, 5) matches
# the lowest rating present in the file.
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
trainset, holdoutset = train_test_split(data, test_size=0.2, random_state=1)

In [12]:
def show_top_n(heading, top_n_by_user, n_users=3):
    """Print ``heading`` followed by the kept predictions of the first ``n_users`` users.

    Each prediction is printed with its estimated rating, its actual rating
    (``r_ui``) and the movie title looked up in ``moviesById``.
    """
    print(heading)
    for user, kept in list(top_n_by_user.items())[:n_users]:
        print(f'User {user}')
        for p in kept:
            print(f'  Est. {p.est:.2f} / act. {p.r_ui}: {moviesById[p.iid]:40s}')


# UBCF: cosine similarity between users
sim_options = dict(name='cosine', user_based=True)
ubcf = KNNBasic(sim_options=sim_options)
ubcf.fit(trainset)

# Estimate the holdout ratings and keep the four highest estimates per user
predictions = ubcf.test(holdoutset)
top_n = get_top_n(predictions, n=4)
show_top_n('UBCF Top-4 recommended items for each user', top_n)

# IBCF: same procedure with similarity computed between items
sim_options = dict(name='cosine', user_based=False)
ibcf = KNNBasic(sim_options=sim_options)
ibcf.fit(trainset)

top_n = get_top_n(ibcf.test(holdoutset), n=4)
show_top_n('IBCF Top-4 recommended items for each user', top_n)

Computing the cosine similarity matrix...
Done computing similarity matrix.
UBCF Top-4 recommended items for each user
User 469
  Est. 4.45 / act. 4.0: Usual Suspects, The (1995)              
  Est. 4.36 / act. 5.0: Fargo (1996)                            
  Est. 4.36 / act. 3.0: Star Wars: Episode VI - Return of the Jedi (1983)
  Est. 4.33 / act. 5.0: Harold and Maude (1971)                 
User 187
  Est. 4.39 / act. 4.5: Taxi Driver (1976)                      
  Est. 4.34 / act. 3.0: Fight Club (1999)                       
  Est. 4.33 / act. 5.0: Sunset Blvd. (a.k.a. Sunset Boulevard) (1950)
  Est. 4.32 / act. 4.5: Aliens (1986)                           
User 399
  Est. 4.24 / act. 5.0: Forrest Gump (1994)                     
  Est. 4.18 / act. 5.0: Lord of the Rings: The Two Towers, The (2002)
  Est. 4.11 / act. 5.0: Back to the Future (1985)               
  Est. 4.07 / act. 2.5: Terminator 2: Judgment Day (1991)       
Computing the cosine similarity matrix...
Done computin

 Partial output

In [13]:
# Holdout predictions of the two fitted models
ubcf_pred = ubcf.test(holdoutset)
ibcf_pred = ibcf.test(holdoutset)

# Random baseline with one entry per holdout rating: for each entry the
# "actual" value is drawn first and the "estimate" second, both as integers
# between 1 and 5
random_pred = []
for _ in range(len(holdoutset)):
    drawn_actual = random.randint(1, 5)
    drawn_estimate = random.randint(1, 5)
    random_pred.append(surprise.Prediction(uid=0, iid=0, r_ui=drawn_actual,
                                           est=drawn_estimate, details=None))

# Accuracy table: one row per model, one column per measure, rounded to 3 decimals
accuracy_measures = {'RMSE': surprise.accuracy.rmse,
                     'MSE': surprise.accuracy.mse,
                     'MAE': surprise.accuracy.mae}
predictions_by_model = {'UBCF': ubcf_pred, 'IBCF': ibcf_pred, 'Random': random_pred}
pd.DataFrame({
    model: {measure: compute(model_predictions, verbose=False)
            for measure, compute in accuracy_measures.items()}
    for model, model_predictions in predictions_by_model.items()
}).T.round(3)

Unnamed: 0,RMSE,MSE,MAE
UBCF,0.973,0.948,0.75
IBCF,0.978,0.957,0.762
Random,2.008,4.033,1.608
