In [57]:
!pip install mlxtend
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules



In [58]:
df = pd.read_csv('stream_viewing.csv')
df.head(5)

Unnamed: 0,Stream ID,1,2,3,4,5
0,10001,Cobra Kai,Lupin,12 Monkeys,Sherlock,
1,10002,Lost,Jack Ryan,The Flash,Game of thrones,
2,10003,Sex Education,Dr. House,Kingdom,The Walking Dead,
3,10004,Ozark,Sex Education,Constantine,Preacher,
4,10005,Naruto,,,,


# 1. What were the 10 most popular shows (based on 'support')?

In [59]:
#Filling the table with dummy values
# turning our categorical data into banary representations
shows = pd.get_dummies(df)
shows.head(1)

Unnamed: 0,Stream ID,1_12 Monkeys,1_Absentia,1_Alice in Borderland,1_Altered Carbon,1_Archer,1_Arrow,1_Atypical,1_Banshee,1_Berlin Station,...,4_The Witcher,4_Travellers,4_Two and a half men,4_Upload,4_Vikings,4_Westworld,4_White Collar,4_X-Files,5_Better Call Saul,5_Peaky Blinders
0,10001,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [60]:
# Stream ID is irrelevant to creating our show sets
show_sets = shows.drop("Stream ID", axis = 1)
show_sets.head(3)

Unnamed: 0,1_12 Monkeys,1_Absentia,1_Alice in Borderland,1_Altered Carbon,1_Archer,1_Arrow,1_Atypical,1_Banshee,1_Berlin Station,1_Better Call Saul,...,4_The Witcher,4_Travellers,4_Two and a half men,4_Upload,4_Vikings,4_Westworld,4_White Collar,4_X-Files,5_Better Call Saul,5_Peaky Blinders
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [61]:
# We build association rules between our shows using the Apriori algorithm, using only items with greater than 2% support (0.02) of the total show set.

# the support of an association rule is the percentage of groups that contain all the items listed in the association rule.

apriori(show_sets, min_support=0.02).head()



Unnamed: 0,support,itemsets
0,0.028056,(1)
1,0.024048,(5)
2,0.02004,(6)
3,0.054108,(14)
4,0.082164,(16)


In [62]:
# Showing shows with a popularity greater than 2% in the store
# Tail shows 5 show sets occurring with a frequency greater than 2%
apriori(show_sets, min_support=0.03, use_colnames=True).sort_values('support', ascending=False)



Unnamed: 0,support,itemsets
1,0.082164,(1_Daredevil)
8,0.064128,(1_Sex Education)
9,0.06012,(1_The Blacklist)
0,0.054108,(1_Cobra Kai)
13,0.054108,(2_Ozark)
14,0.0501,(2_Sex Education)
3,0.046092,(1_Hanna)
17,0.044088,(2_Two and a half men)
20,0.042084,(4_Sex Education)
6,0.042084,(1_Ozark)


# The 10 most popular shows were:
- Daredevil
- Sex Education
- The Blacklist
- Cobra Kai
- Ozark
- Sex Education
- Hanna
- Two and a half Men
- Sex Education
- Ozark

# 2. Display all the combinations of the 'show' sets (viewed together)

In [63]:
common_showsets = apriori(show_sets, min_support=0.002, use_colnames=True)

#Create a Length column to include combinations of all the item sets.
#So use code 'lambda x:len' go to end of the dataset.
common_showsets['length'] = common_showsets['itemsets'].apply(lambda x:len(x))

# Has to be greater than two to be a set, so using >= 3
common_showsets[common_showsets['length'] >=3].sort_values('support', ascending=False)



Unnamed: 0,support,itemsets,length
2718,0.006012,"(2_Mr. Robot, 3_Succession, 4_Ozark)",3
1914,0.006012,"(1_Cobra Kai, 2_The Blacklist, 3_Demon Slayer)",3
2773,0.006012,"(3_Ozark, 4_Sex Education, 2_Rick And Morty)",3
2015,0.006012,"(3_Outer Banks, 2_Two and a half men, 1_Darede...",3
1919,0.006012,"(4_Atypical, 1_Cobra Kai, 2_The Blacklist)",3
...,...,...,...
2255,0.002004,"(2_12 Monkeys, 3_Stranger Things, 1_Mr. Robot)",3
2254,0.002004,"(2_Banshee, 1_Mirzapur, 3_The Wheel of Time)",3
2253,0.002004,"(3_Startup, 2_Big Little Lies, 1_Mindhunter)",3
2252,0.002004,"(4_The Alienist, 3_Absentia, 1_Mare of Easttown)",3


# 3. Identify the ‘shows’ most likely to be viewed by the same user ID (based on ‘confidence’).

In [68]:
assoc_rules = association_rules(common_showsets, metric='confidence', min_threshold=0.5).drop_duplicates()
assoc_rules.head(10)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(1_12 Monkeys),(2_Banshee),0.004008,0.008016,0.002004,0.5,62.375,0.001972,1.983968
1,(1_12 Monkeys),(2_Dark),0.004008,0.012024,0.002004,0.5,41.583333,0.001956,1.975952
2,(1_12 Monkeys),(3_How I met your mother),0.004008,0.012024,0.002004,0.5,41.583333,0.001956,1.975952
3,(2_Brooklyn Nine Nine),(1_Absentia),0.004008,0.028056,0.002004,0.5,17.821429,0.001892,1.943888
4,(2_Lucifer),(1_Absentia),0.01002,0.028056,0.006012,0.6,21.385714,0.005731,2.42986
5,(1_Alice in Borderland),(2_The Walking Dead),0.002004,0.01002,0.002004,1.0,99.8,0.001984,inf
6,(1_Alice in Borderland),(3_How I met your mother),0.002004,0.012024,0.002004,1.0,83.166667,0.00198,inf
7,(1_Altered Carbon),(2_24),0.002004,0.004008,0.002004,1.0,249.5,0.001996,inf
8,(2_24),(1_Altered Carbon),0.004008,0.002004,0.002004,0.5,249.5,0.001996,1.995992
9,(1_Altered Carbon),(3_Stranger Things),0.002004,0.01002,0.002004,1.0,99.8,0.001984,inf


# 4. List the top 3 combinations of ‘shows’ viewed together in the same transaction as opposed to separate transactions (based on ‘lift’).

In [70]:
# The business would benefit by recommending consequents in this list to those who have watched the antecedents
# E.g., a user finishes watching 'Alice in Borderland' should also be recommended 'The Walking Dead'
rules = association_rules(common_showsets, metric='confidence', min_threshold=0.5)
rules.nlargest(3, 'confidence').head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
5,(1_Alice in Borderland),(2_The Walking Dead),0.002004,0.01002,0.002004,1.0,99.8,0.001984,inf
6,(1_Alice in Borderland),(3_How I met your mother),0.002004,0.012024,0.002004,1.0,83.166667,0.00198,inf
7,(1_Altered Carbon),(2_24),0.002004,0.004008,0.002004,1.0,249.5,0.001996,inf


# 5. Display the top 5 combined ‘shows’ in one transaction (based on 'lift').

In [72]:
# Identify the top 5 combined products in 1 transaction - based on highest 'lift' value
# This is useful information for determining which promotions to run, for example.
# E.g., users who purchase 'Billions' to watch will get a 50% discount on 'How to Get Away With Murder', likely users would avail of this offer
rules[(rules['lift'] > 1.0) & (rules['confidence'] > 0.5)].nlargest(5, 'lift')

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
33,(1_Billions),(4_How to get away with murder),0.002004,0.002004,0.002004,1.0,499.0,0.002,inf
34,(4_How to get away with murder),(1_Billions),0.002004,0.002004,0.002004,1.0,499.0,0.002,inf
93,(1_Fringe),(2_Shooter),0.002004,0.002004,0.002004,1.0,499.0,0.002,inf
94,(2_Shooter),(1_Fringe),0.002004,0.002004,0.002004,1.0,499.0,0.002,inf
140,(3_Startup),(1_Mindhunter),0.002004,0.002004,0.002004,1.0,499.0,0.002,inf


# 6. Include any extra code (or plot any graph) that you think would help StreamMedia management interpret their behaviour data.