## Apriori Algorithm

Question: Can we predict which products a customer will most likely purchase together within various product segments?

Goal: Help Amazon identify products that customers frequently buy together, so that it can increase sales and revenue through cross-selling, by analyzing Amazon Marketplace segment data.

In [1]:
# Import dependencies
import os
from urllib.parse import quote

import pandas as pd
from sqlalchemy import create_engine

In [2]:
# 2 Connect to postgres/RDS database
# Each setting is read from an environment variable of the same name so that
# real credentials never have to be committed with the notebook. The literals
# are only fallbacks used when the variable is not set.
POSTGRES_ADDRESS = os.environ.get(
    'POSTGRES_ADDRESS', 'mypostgresdb.cwuhtytzosg8.us-east-2.rds.amazonaws.com')
POSTGRES_PORT = os.environ.get('POSTGRES_PORT', '5432')
POSTGRES_USERNAME = os.environ.get('POSTGRES_USERNAME', 'root')
POSTGRES_PASSWORD = os.environ.get('POSTGRES_PASSWORD', 'XXXXXX')
POSTGRES_DBNAME = os.environ.get('POSTGRES_DBNAME', 'my_data_class_db')

# A long string that contains the necessary Postgres login information.
# Username and password are percent-encoded: characters such as '@', ':' or
# '/' would otherwise be parsed as URL delimiters and break the connection.
postgres_str = 'postgresql://{username}:{password}@{ipaddress}:{port}/{dbname}'.format(
    username=quote(POSTGRES_USERNAME, safe=''),
    password=quote(POSTGRES_PASSWORD, safe=''),
    ipaddress=POSTGRES_ADDRESS,
    port=POSTGRES_PORT,
    dbname=POSTGRES_DBNAME,
)
# Create the connection
cnx = create_engine(postgres_str)

In [3]:
# 3 Load the customer/product/quantity rows used for the Apriori analysis
df = pd.read_sql_query(sql='SELECT * FROM apriori_analysis', con=cnx)
df.head()

Unnamed: 0,customer_id,product_id,quantity
0,25551507,0788812807,1
1,27446106,0788806270,1
2,27446106,6303646689,1
3,52562651,6304022492,1
4,194466,B00000IBN2,1


In [4]:
# 4 Build the customer x product matrix the algorithm runs on:
# one row per customer_id, one column per product_id, each cell holding the
# summed quantity (0 where the customer has no row for that product).
apriori_table = (
    df.groupby(["customer_id", "product_id"])["quantity"]
    .sum()
    .unstack()
    .reset_index()
    .fillna(0)
    .set_index("customer_id")
)
apriori_table

product_id,0738920525,076783822X,0783116640,0783215126,0783222955,0784001847,0788802194,0788806270,078881172X,0788812408,...,B000056MO2,B00005LQ1J,B00005LQ1K,B00005LQ1L,B00005NRQG,B00005T33H,B000083C59,B00008DDJ9,B00066J4PU,B000FP5PPA
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
194466,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
418042,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
510910,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
528765,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
595247,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53087420,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53088112,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53090048,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53095639,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# 5 Importing apriori dependencies
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [7]:
# 6 Cleaning Apriori table: convert purchase quantities into 0/1 flags (bought / not bought)

def encode_units(x):
    """Collapse a purchase quantity into a 0/1 flag.

    Parameters
    ----------
    x : number
        Quantity stored in one cell of the customer x product table.

    Returns
    -------
    int
        1 when ``x >= 1``, otherwise 0. Values between 0 and 1, and NaN,
        also map to 0, so the function never falls through and returns
        ``None``.
    """
    # Any comparison with NaN is False, so NaN lands in the 0 branch.
    return 1 if x >= 1 else 0
# Apply encode_units to every cell. DataFrame.applymap is deprecated since
# pandas 2.1 in favour of DataFrame.map, so use the new name when this pandas
# version provides it and fall back to applymap on older versions.
# The check is made on the class, not the instance, so a column that happens
# to be called "map" cannot be mistaken for the method.
if hasattr(pd.DataFrame, "map"):
    apriori_cleaned = apriori_table.map(encode_units)
else:
    apriori_cleaned = apriori_table.applymap(encode_units)
apriori_cleaned

product_id,0738920525,076783822X,0783116640,0783215126,0783222955,0784001847,0788802194,0788806270,078881172X,0788812408,...,B000056MO2,B00005LQ1J,B00005LQ1K,B00005LQ1L,B00005NRQG,B00005T33H,B000083C59,B00008DDJ9,B00066J4PU,B000FP5PPA
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
194466,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
418042,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
510910,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
528765,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
595247,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53087420,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
53088112,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
53090048,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
53095639,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# 7 Mine frequent itemsets from the cleaned (0/1) dataset.

# Minimum support an itemset needs in order to be kept.
# NOTE(review): the smallest supports in the displayed result (0.000313) may
# correspond to itemsets seen for a single customer - confirm this threshold
# is intentionally that permissive.
MIN_SUPPORT = 0.0001
item_association = apriori(
    apriori_cleaned,
    min_support=MIN_SUPPORT,
    use_colnames=True,
)
item_association

Unnamed: 0,support,itemsets
0,0.009091,(0738920525)
1,0.009404,(076783822X)
2,0.009091,(0783116640)
3,0.008464,(0783215126)
4,0.010972,(0783222955)
...,...,...
1377,0.000313,"(078881172X, 6302526574, 0788806270, 630215809..."
1378,0.000313,"(6302526574, 0788806270, 6302158095, 630331401..."
1379,0.000313,"(078881172X, 6302526574, 6302158095, 630331401..."
1380,0.000313,"(078881172X, 6302526574, 0788806270, 630215809..."


In [9]:
#8 Derive association rules from the frequent itemsets, filtering on the
# "lift" metric with a minimum threshold of 1.
apriori_rules = association_rules(
    item_association,
    metric="lift",
    min_threshold=1,
)

In [10]:
# Rank the rules from highest to lowest confidence and display them
apriori_rules = apriori_rules.sort_values(by="confidence", ascending=False)
apriori_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
27856,"(0788806270, 078881172X, 6302787068)","(6302526574, 0788812807, 0788812408, 0788802194)",0.000313,0.000313,0.000313,1.000000,3190.000000,0.000313,inf
35798,"(6303314015, 078881172X, 6302787068)","(6302158095, 6302526574, 0788812408, 0788802194)",0.000313,0.000313,0.000313,1.000000,3190.000000,0.000313,inf
35776,"(6302158095, 6302526574, 6303314015, 0788812408)","(078881172X, 0788802194, 6302787068)",0.000313,0.000313,0.000313,1.000000,3190.000000,0.000313,inf
35777,"(6302158095, 6302526574, 6302787068, 0788802194)","(6303314015, 078881172X, 0788812408)",0.000313,0.000313,0.000313,1.000000,3190.000000,0.000313,inf
35778,"(6302158095, 6302526574, 6302787068, 0788812408)","(6303314015, 078881172X, 0788802194)",0.000313,0.000313,0.000313,1.000000,3190.000000,0.000313,inf
...,...,...,...,...,...,...,...,...,...
1313,(6303182135),"(6301413229, B00066J4PU)",0.018809,0.000313,0.000313,0.016667,53.166667,0.000308,1.016630
132,(094567189X),(B00004YZH1),0.019122,0.008150,0.000313,0.016393,2.011349,0.000158,1.008380
130,(094567189X),(6305650691),0.019122,0.008777,0.000313,0.016393,1.867681,0.000146,1.007743
128,(094567189X),(6305242143),0.019122,0.013793,0.000313,0.016393,1.188525,0.000050,1.002644


Interpretation of the 1st row of results (VHS titles bought together):

0788806270: Bambi

078881172X: Peter Pan

6302787068: Aladdin

6302526574: Beauty and the Beast

0788812807: Lady and the Tramp

0788812408: Little Mermaid

0788802194: Cinderella 