In [1]:
%matplotlib inline

from pathlib import Path
import heapq
from collections import defaultdict

import numpy as np
import pandas as pd
import matplotlib.pylab as plt
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

from surprise import Dataset
from surprise import Reader
from surprise import KNNBasic
from surprise.model_selection import train_test_split

# Directory the notebook reads its CSV inputs from: the current working
# directory ('.'), resolved to an absolute path.
DATA = Path('.').resolve()

In [47]:
# Load the course-topics table, then normalise every column label by
# trimming surrounding whitespace and replacing inner spaces with underscores.
course_df = pd.read_csv(DATA / 'CourseTopics.csv')
course_df = course_df.rename(columns=lambda label: label.strip().replace(' ', '_'))
course_df.head()

Unnamed: 0,Intro,DataMining,Survey,Cat_Data,Regression,Forecast,DOE,SW
0,1,1,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0
2,0,1,0,1,1,0,0,1
3,1,0,0,0,0,0,0,0
4,1,1,0,0,0,0,0,0


# Discussion Task: association rules

In [48]:
# Mine frequent itemsets from the course matrix (minimum support 0.03),
# reporting itemsets by column name rather than column position.
itemsets = apriori(course_df, min_support=0.03, use_colnames=True)
# Convert the itemsets into association rules, keeping rules whose
# confidence is at least 0.5.
rules = association_rules(itemsets, metric='confidence', min_threshold=0.5)

# Rank the rules by lift once and show the top six. The antecedent/consequent
# support columns are dropped so the printed table stays narrow.
top_rules_by_lift = (
    rules
    .sort_values(by=['lift'], ascending=False)
    .drop(columns=['antecedent support', 'consequent support'])
    .head(6)
)
print(top_rules_by_lift)


              antecedents consequents   support  confidence      lift  \
5            (Intro, DOE)        (SW)  0.030137    0.647059  2.915759   
0         (Intro, Survey)        (SW)  0.032877    0.545455  2.457912   
3     (Intro, Regression)        (SW)  0.038356    0.538462  2.426401   
4        (SW, Regression)     (Intro)  0.038356    0.700000  1.774306   
1            (SW, Survey)     (Intro)  0.032877    0.666667  1.689815   
2  (Regression, Cat_Data)     (Intro)  0.032877    0.600000  1.520833   

   leverage  conviction  
5  0.019801    2.204566  
0  0.019501    1.711781  
3  0.022548    1.685845  
4  0.016739    2.018265  
1  0.013421    1.816438  
2  0.011259    1.513699  


# Question 1: Apply user-based collaborative filtering to the data. All recommendations will be 1. Explain why this happens. (10 points)

In [60]:
# One-row frame describing a new student: flags set for Regression and
# Forecast only.
newStudent = pd.DataFrame(
    [[0, 0, 0, 0, 1, 1, 0, 0]],
    columns=['Intro', 'DataMining', 'Survey', 'Cat_Data', 'Regression', 'Forecast',
             'DOE', 'SW'])

# Reshape the wide course matrix into long (customer, course, rating) records,
# skipping zero entries. Series.items() is used because Series.iteritems()
# was removed in pandas 2.0.
ratings = pd.DataFrame(
    [
        (customer, course, value)
        for customer, row in course_df.iterrows()
        for course, value in row.items()
        if value != 0
    ],
    columns=['customer', 'course', 'rating'],
)

# The rating scale is declared as 1..1: only a single rating value is expected
# (assumes course_df holds 0/1 flags, as its head() suggests -- TODO confirm).
reader = Reader(rating_scale=(1, 1))
data = Dataset.load_from_df(ratings[['customer', 'course', 'rating']], reader)
trainset = data.build_full_trainset()

## User-based filtering
# compute cosine similarity between users
sim_options = {'name': 'cosine', 'user_based': True}
algo = KNNBasic(sim_options=sim_options)
algo.fit(trainset)

# Estimate a rating for every course, one row per entry of newStudent.index.
# NOTE(review): the user ids passed to predict() come from newStudent.index
# (0 here), which also appears to be a customer id in the training ratings;
# newStudent's own course flags are never given to the model -- confirm this
# is the intended way to score a new student.
predictions = pd.DataFrame(
    [
        [algo.predict(user, course).est for course in newStudent.columns]
        for user in newStudent.index
    ],
    # Label the estimates with the same columns that were iterated above.
    columns=newStudent.columns,
)
predictions

Computing the cosine similarity matrix...
Done computing similarity matrix.


Unnamed: 0,Intro,DataMining,Survey,Cat_Data,Regression,Forecast,DOE,SW
0,1,1,1,1,1,1,1,1


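All recommendations are 1 because the collaborative filtering method cannot generate meaningful recommendations from this data: every rating in the training set equals 1 (the rating scale is 1 to 1, since a 0 only means the course was not taken and is dropped), so any similarity-weighted average of the neighbours' ratings is always 1. In addition, collaborative filtering cannot generate personalised recommendations for new users, because a new user has no rating history from which similarities can be computed.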

# Question 2: Apply association rules to Cosmetics.csv (use min_support=0.1 and min_threshold=0.8).

In [63]:
# Read the cosmetics data, strip whitespace and periods from the column
# labels, and index the rows by the 'Trans' column.
cosmetics_df = (
    pd.read_csv(DATA / 'Cosmetics.csv')
    .rename(columns=lambda label: label.strip().replace('.', ''))
    .set_index('Trans')
)
cosmetics_df.head()

Unnamed: 0_level_0,Bag,Blush,Nail Polish,Brushes,Concealer,Eyebrow Pencils,Bronzer,Lip liner,Mascara,Eye shadow,Foundation,Lip Gloss,Lipstick,Eyeliner
Trans,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,0,1,1,1,1,0,1,1,1,0,0,0,0,1
2,0,0,1,0,1,0,1,1,0,0,1,1,0,0
3,0,1,0,0,1,1,1,1,1,1,1,1,1,0
4,0,0,1,1,1,0,1,0,0,0,1,0,0,1
5,0,1,0,0,1,0,1,1,1,1,0,1,1,0


In [71]:
# Frequent itemsets in the cosmetics data (minimum support 0.1), reported by
# column name.
itemsets = apriori(cosmetics_df, min_support=0.1, use_colnames=True)
# Rules with confidence of at least 0.8, ordered from highest to lowest lift,
# without the antecedent/consequent support columns.
rules = (
    association_rules(itemsets, metric='confidence', min_threshold=0.8)
    .sort_values(by=['lift'], ascending=False)
    .drop(columns=['antecedent support', 'consequent support'])
)
rules.head(3)

Unnamed: 0,antecedents,consequents,support,confidence,lift,leverage,conviction
0,(Brushes),(Nail Polish),0.149,1.0,3.571429,0.10728,inf
22,"(Blush, Eye shadow, Concealer)",(Mascara),0.119,0.959677,2.688172,0.074732,15.9464
5,"(Blush, Eye shadow)",(Mascara),0.169,0.928571,2.60104,0.104026,9.002


Interpret the first three rules in the output in words. (6 points)

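The first rule, Brushes → Nail Polish, has a confidence of 1: every transaction that contains the antecedent (Brushes) also contains the consequent (Nail Polish). Its lift of about 3.57 means that such transactions are about 3.6 times more likely to contain Nail Polish than a randomly selected transaction, and the infinite conviction reflects that the if-then rule always holds. The second and third rules give similar results, with high confidence (0.96 and 0.93) and high conviction; their lifts of about 2.7 and 2.6 mean that buyers of the antecedent items are roughly 2.6–2.7 times more likely to buy Mascara than a randomly selected customer. However, the third rule is redundant with the second: both have the same consequent, and the third rule's antecedent (Blush, Eye shadow) is a subset of the second rule's antecedent (Blush, Eye shadow, Concealer).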

In [72]:
# Display the lift-sorted rule table so the leading rules can be reviewed.
rules

Unnamed: 0,antecedents,consequents,support,confidence,lift,leverage,conviction
0,(Brushes),(Nail Polish),0.149,1.0,3.571429,0.10728,inf
22,"(Blush, Eye shadow, Concealer)",(Mascara),0.119,0.959677,2.688172,0.074732,15.9464
5,"(Blush, Eye shadow)",(Mascara),0.169,0.928571,2.60104,0.104026,9.002
7,"(Nail Polish, Eye shadow)",(Mascara),0.119,0.908397,2.544529,0.072233,7.019417
12,"(Eye shadow, Concealer)",(Mascara),0.179,0.890547,2.49453,0.107243,5.874682
14,"(Eye shadow, Bronzer)",(Mascara),0.124,0.879433,2.463397,0.073663,5.333118
24,"(Eye shadow, Concealer, Eyeliner)",(Mascara),0.114,0.876923,2.456367,0.06759,5.224375
4,"(Mascara, Blush)",(Eye shadow),0.169,0.918478,2.410704,0.098896,7.593067
18,"(Eye shadow, Lipstick)",(Mascara),0.11,0.852713,2.388552,0.063947,4.365632
17,"(Mascara, Lipstick)",(Eye shadow),0.11,0.909091,2.386065,0.063899,6.809


Reviewing the first couple of dozen rules, comment on their redundancy and how you would assess their utility. (4 points)

When two rules have the same consequent and the antecedent of one is a subset of the antecedent of the other, the rules carry redundant information, so the rule set can be compressed. This does not mean that the rules are not useful, but removing the redundancy reduces the number of itemsets that need to be considered for possible action from a business perspective.