## Apriori Algorithm

Question: Can we predict which products a customer will most likely purchase together within various product segments?

Goal: Help Amazon identify products frequently bought together by customers to increase sales and revenues (cross sell) by analyzing Amazon Marketplace segment data.

In [1]:
# 1 Import dependencies
from sqlalchemy import create_engine
import pandas as pd

# Importing apriori dependencies
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [2]:
# 2 Connect to postgres/RDS database 
POSTGRES_ADDRESS = 'mypostgresdb.cwuhtytzosg8.us-east-2.rds.amazonaws.com' 
POSTGRES_PORT = '5432'
POSTGRES_USERNAME = 'root' 
POSTGRES_PASSWORD = 'XXXX' 
POSTGRES_DBNAME = 'my_data_class_db'

# A long string that contains the necessary Postgres login information
postgres_str = ('postgresql://{username}:{password}@{ipaddress}:{port}/{dbname}'
.format(username=POSTGRES_USERNAME,
password=POSTGRES_PASSWORD,
ipaddress=POSTGRES_ADDRESS,
port=POSTGRES_PORT,
dbname=POSTGRES_DBNAME))
# Create the connection
cnx = create_engine(postgres_str)

In [3]:
# 3 Load database for sentiment/topic analysis
df = pd.read_sql_query('''SELECT * FROM apriori_analysis''', con=cnx)
df.head()

Unnamed: 0,customer_id,product_id,quantity
0,25551507,0788812807,1
1,27446106,0788806270,1
2,27446106,6303646689,1
3,52562651,6304022492,1
4,194466,B00000IBN2,1


In [4]:
# 4 Create pivot table to run algorithm 
apriori_table = df.groupby(["customer_id","product_id"])["quantity"].sum().unstack().reset_index().fillna(0).set_index("customer_id")
apriori_table

product_id,0738920525,076783822X,0783116640,0783215126,0783222955,0784001847,0788802194,0788806270,078881172X,0788812408,...,B000056MO2,B00005LQ1J,B00005LQ1K,B00005LQ1L,B00005NRQG,B00005T33H,B000083C59,B00008DDJ9,B00066J4PU,B000FP5PPA
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
194466,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
418042,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
510910,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
528765,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
595247,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53087420,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53088112,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53090048,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53095639,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# 5 Cleaning Apriori table to remove unnecessary numbers

def encode_units(x):
    if x <= 0:
        return 0
    if x >= 1:
        return 1
apriori_cleaned = apriori_table.applymap(encode_units)
apriori_cleaned

product_id,0738920525,076783822X,0783116640,0783215126,0783222955,0784001847,0788802194,0788806270,078881172X,0788812408,...,B000056MO2,B00005LQ1J,B00005LQ1K,B00005LQ1L,B00005NRQG,B00005T33H,B000083C59,B00008DDJ9,B00066J4PU,B000FP5PPA
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
194466,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
418042,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
510910,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
528765,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
595247,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53087420,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
53088112,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
53090048,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
53095639,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# 6 Running apriori algorithm on cleaned dataset. 

item_association = apriori(apriori_cleaned, min_support=0.0001, use_colnames=True)
item_association

Unnamed: 0,support,itemsets
0,0.009091,(0738920525)
1,0.009404,(076783822X)
2,0.009091,(0783116640)
3,0.008464,(0783215126)
4,0.010972,(0783222955)
...,...,...
1377,0.000313,"(6302787068, 0788812807, 6302158095, 078880627..."
1378,0.000313,"(6302787068, 0788812807, 6302158095, 078880627..."
1379,0.000313,"(6302787068, 0788812807, 6302158095, 155890641..."
1380,0.000313,"(6302787068, 0788812807, 6302158095, 078880627..."


In [7]:
# 7 Apriori association results table and confidence levels. 
apriori_rules = association_rules(item_association, metric="lift", min_threshold=1)

In [8]:
# 8 Show output by descending order of confidence level
apriori_rules = apriori_rules.sort_values(["confidence"], ascending=False)
apriori_rules 

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
27855,"(6302787068, 0788812807, 0788802194)","(078881172X, 0788812408, 6302526574, 0788806270)",0.000313,0.000313,0.000313,1.000000,3190.000000,0.000313,inf
35798,"(6302787068, 6303314015, 0788812408)","(6302526574, 078881172X, 6302158095, 0788802194)",0.000313,0.000313,0.000313,1.000000,3190.000000,0.000313,inf
35776,"(6302526574, 0788812408, 6302158095, 0788802194)","(6302787068, 6303314015, 078881172X)",0.000313,0.000313,0.000313,1.000000,3190.000000,0.000313,inf
35777,"(6303314015, 6302526574, 078881172X, 6302158095)","(6302787068, 0788812408, 0788802194)",0.000313,0.000313,0.000313,1.000000,3190.000000,0.000313,inf
35778,"(6303314015, 6302526574, 6302158095, 0788802194)","(6302787068, 078881172X, 0788812408)",0.000313,0.000313,0.000313,1.000000,3190.000000,0.000313,inf
...,...,...,...,...,...,...,...,...,...
1362,(6303182135),"(6302872162, B00066J4PU)",0.018809,0.000313,0.000313,0.016667,53.166667,0.000308,1.016630
132,(094567189X),(B00004YZH1),0.019122,0.008150,0.000313,0.016393,2.011349,0.000158,1.008380
131,(094567189X),(6305650691),0.019122,0.008777,0.000313,0.016393,1.867681,0.000146,1.007743
129,(094567189X),(6305242143),0.019122,0.013793,0.000313,0.016393,1.188525,0.000050,1.002644


Interpreataion of 1st row of results, VHS bought together:

0788806270: Bambi

078881172X: Peter Pan

6302787068: Aladdin

6302526574: Beauty and the Beast

0788812807: Lady and the Tramp

0788812408: Little Mermaid

0788802194: Cinderella 

#### Testing data to understand limits of pivot table (# of columns)

In [9]:
# 9 Run test with personal care data ~12k products
df_personal_care = pd.read_sql_query('''SELECT * FROM personal_care_appliances_apriori''', con=cnx)
df_personal_care.head()

Unnamed: 0,customer_id,review_id,product_id,quantity
0,32114233,R1QX6706ZWJ1P5,B00OYRW4UE,1
1,18125776,R3QWMLJHIW6P37,B0000537JQ,1
2,19917519,R14Z1VR1N0Z9G6,B00HXXO332,1
3,18277171,R25ZRJL0GH0U0,B00EOB0JA2,1
4,2593270,R3837KYH7AZNIY,B00OC2O1UC,1


In [10]:
# 10 Create pivot table to run algorithm 
apriori_table_personal_care = df_personal_care.groupby(["customer_id","product_id"])["quantity"].sum().unstack().reset_index().fillna(0).set_index("customer_id")
apriori_table_personal_care

product_id,097459363X,1574998005,1574998021,1933622199,3979000532,3979002411,3979002632,3979002829,3979004813,7391000442,...,B010Y5G5EU,B0118Q011M,B011J79PNK,B011JCDFOA,B011M6UYMO,B011OI71X0,B01291WU3W,B012BO5ETY,B012E7L3UM,B01FWK8ARW
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10470,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11344,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12185,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12674,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53095659,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53095725,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53095826,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53095923,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# 11 Cleaning Apriori table to remove unnecessary numbers

def encode_units(x):
    if x <= 0:
        return 0
    if x >= 1:
        return 1
apriori_cleaned_pcare = apriori_table_personal_care.applymap(encode_units)
apriori_cleaned_pcare

product_id,097459363X,1574998005,1574998021,1933622199,3979000532,3979002411,3979002632,3979002829,3979004813,7391000442,...,B010Y5G5EU,B0118Q011M,B011J79PNK,B011JCDFOA,B011M6UYMO,B011OI71X0,B01291WU3W,B012BO5ETY,B012E7L3UM,B01FWK8ARW
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10470,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11344,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11396,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12185,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12674,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53095659,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
53095725,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
53095826,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
53095923,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# 12 Running apriori algorithm on cleaned dataset. 
item_association_pcare = apriori(apriori_cleaned_pcare, min_support=0.0001, use_colnames=True)
item_association_pcare

MemoryError: Unable to allocate 813. GiB for an array with shape (883785, 2, 61769) and data type int64

In [None]:
# 13 Apriori association results table and confidence levels. 
apriori_rules_pcare = association_rules(item_association_pcar, metric="lift", min_threshold=1)

# Show output by descending order of confidence level
apriori_rules_pcare = apriori_rules_pcare.sort_values(["confidence"], ascending=False)
apriori_rules_pcare