## Apriori Algorithm

Question: Can we predict which products a customer will most likely purchase together within various product segments?

Goal: Help Amazon identify products frequently bought together by customers to increase sales and revenues (cross sell) by analyzing Amazon Marketplace segment data.

In [1]:
# Import dependencies
from sqlalchemy import create_engine
import pandas as pd

# Importing apriori dependencies
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [2]:
# Connect to postgres/RDS database 
POSTGRES_ADDRESS = 'mypostgresdb.cwuhtytzosg8.us-east-2.rds.amazonaws.com' 
POSTGRES_PORT = '5432'
POSTGRES_USERNAME = 'root' 
POSTGRES_PASSWORD = 'XXXX' 
POSTGRES_DBNAME = 'my_data_class_db'

# A long string that contains the necessary Postgres login information
postgres_str = ('postgresql://{username}:{password}@{ipaddress}:{port}/{dbname}'
.format(username=POSTGRES_USERNAME,
password=POSTGRES_PASSWORD,
ipaddress=POSTGRES_ADDRESS,
port=POSTGRES_PORT,
dbname=POSTGRES_DBNAME))
# Create the connection
cnx = create_engine(postgres_str)

### Video Analysis 

In [3]:
# Load database for sentiment/topic analysis
df_videos = pd.read_sql_query('''SELECT * FROM videos_apriori_analysis''', con=cnx)
df_videos.head()

Unnamed: 0,customer_id,product_id,quantity
0,49033728,6302503213,1
1,25551507,0788812807,1
2,25551507,6302320402,1
3,13124772,B00000INCI,1
4,31354506,6301442733,1


In [4]:
# Create pivot table to run algorithm 
apriori_table = df_videos.groupby(["customer_id","product_id"])["quantity"].sum().unstack().reset_index().fillna(0).set_index("customer_id")
apriori_table

product_id,0615115187,0738920525,0738920967,0767800958,0767813871,0767815963,076781598X,076783822X,0767849493,0773386777,...,B000TOSN7Y,B000TSHPTC,B000UFEGH8,B000WAXYGK,B0019KBJOQ,B004J0SG6C,B008Q13SL8,B00AVPYKFU,B00B9LNBWS,B00BBND56G
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
15160,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19893,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20767,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26626,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37762,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53095537,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53095639,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53095663,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53096090,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Cleaning Apriori table to remove unnecessary numbers
def encode_units(x):
    if x <= 0:
        return 0
    if x >= 1:
        return 1
apriori_cleaned_videos = apriori_table.applymap(encode_units)
apriori_cleaned_videos

product_id,0615115187,0738920525,0738920967,0767800958,0767813871,0767815963,076781598X,076783822X,0767849493,0773386777,...,B000TOSN7Y,B000TSHPTC,B000UFEGH8,B000WAXYGK,B0019KBJOQ,B004J0SG6C,B008Q13SL8,B00AVPYKFU,B00B9LNBWS,B00BBND56G
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
15160,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19893,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20767,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26626,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
37762,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53095537,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
53095639,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
53095663,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
53096090,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Running apriori algorithm on cleaned dataset. 
item_association_videos = apriori(apriori_cleaned_videos, min_support=0.0001, use_colnames=True)
item_association_videos

MemoryError: Unable to allocate 169. GiB for an array with shape (719400, 2, 15766) and data type int64

In [None]:
# Apriori association results table and confidence levels. 
apriori_rules = association_rules(item_association, metric="lift", min_threshold=1)

In [None]:
# Show output by descending order of confidence level
apriori_rules = apriori_rules.sort_values(["confidence"], ascending=False)
apriori_rules 

Interpreataion of 1st row of results, VHS bought together:

0788806270: Bambi

078881172X: Peter Pan

6302787068: Aladdin

6302526574: Beauty and the Beast

0788812807: Lady and the Tramp

0788812408: Little Mermaid

0788802194: Cinderella 

### Personal Care Analysis

In [7]:
# Run test with personal care data ~12k products
df_personal_care = pd.read_sql_query('''SELECT * FROM personal_care_appliances_apriori_analysis''', con=cnx)
df_personal_care.head()

Unnamed: 0,customer_id,product_id,quantity
0,32114233,B00OYRW4UE,1
1,18125776,B0000537JQ,1
2,19917519,B00HXXO332,1
3,18277171,B00EOB0JA2,1
4,2592955,B00HES9CMS,1


In [8]:
# Create pivot table to run algorithm 
apriori_table_personal_care = df_personal_care.groupby(["customer_id","product_id"])["quantity"].sum().unstack().reset_index().fillna(0).set_index("customer_id")
apriori_table_personal_care

product_id,B0000532OT,B0000532OV,B000053569,B0000537JN,B0000537JP,B0000537JQ,B00005B6ZC,B00005JHVY,B000068PBJ,B000068PBL,...,B00QR4JRHU,B00R2YQFO2,B00R3PFF4Q,B00RWIWFFQ,B00S02EJZW,B00TOYNBA4,B00UMAAWCY,B00V1QG4F2,B00V422E70,B00XZJ2G46
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10470,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11344,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12674,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13044,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14147,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53092777,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53094082,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53094709,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53095826,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Cleaning Apriori table to remove unnecessary numbers
def encode_units(x):
    if x <= 0:
        return 0
    if x >= 1:
        return 1
apriori_cleaned_pcare = apriori_table_personal_care.applymap(encode_units)
apriori_cleaned_pcare

product_id,B0000532OT,B0000532OV,B000053569,B0000537JN,B0000537JP,B0000537JQ,B00005B6ZC,B00005JHVY,B000068PBJ,B000068PBL,...,B00QR4JRHU,B00R2YQFO2,B00R3PFF4Q,B00RWIWFFQ,B00S02EJZW,B00TOYNBA4,B00UMAAWCY,B00V1QG4F2,B00V422E70,B00XZJ2G46
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10470,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11344,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12674,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13044,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14147,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53092777,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
53094082,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
53094709,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
53095826,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Running apriori algorithm on cleaned dataset. 
item_association_pcare = apriori(apriori_cleaned_pcare, min_support=0.0001, use_colnames=True)
item_association_pcare

MemoryError: Unable to allocate 99.2 GiB for an array with shape (179700, 2, 37032) and data type int64

In [None]:
# 13 Apriori association results table and confidence levels. 
apriori_rules_pcare = association_rules(item_association_pcar, metric="lift", min_threshold=1)

# Show output by descending order of confidence level
apriori_rules_pcare = apriori_rules_pcare.sort_values(["confidence"], ascending=False)
apriori_rules_pcare

### Apparel Apriori Analysis

In [None]:
#  Upload df_apparel
df_apparel = pd.read_sql_query('''SELECT * FROM apparel_apriori_analysis''', con=cnx)
df_apparel

In [None]:
# Create pivot table to run algorithm 
apriori_apparel = df_apparel.groupby(["customer_id","product_id"])["quantity"].sum().unstack().reset_index().fillna(0).set_index("customer_id")
apriori_apparel

In [None]:
# 11 Cleaning Apriori table to remove unnecessary numbers

def encode_units(x):
    if x <= 0:
        return 0
    if x >= 1:
        return 1
apriori_apparel = apriori_apparel.applymap(encode_units)
apriori_apparel

In [None]:
# Running apriori algorithm on cleaned dataset. 
item_association_apparel = apriori(apriori_apparel, min_support=0.0001, use_colnames=True)
item_association_apparel

In [None]:
# 7 Apriori association results table and confidence levels. 
apriori_rules_apparel = association_rules(item_association_apparel, metric="lift", min_threshold=1)

In [None]:
# 8 Show output by descending order of confidence level
apriori_rules_apparel = apriori_rules_apparel.sort_values(["confidence"], ascending=False)
apriori_rules_apparel

### Furniture Analysis

In [None]:
#  Upload df_apparel
df_furniture = pd.read_sql_query('''SELECT * FROM furniture_apriori_analysis''', con=cnx)
df_furniture.head()

In [None]:
# Create pivot table to run algorithm 
apriori_furniture = df_furniture.groupby(["customer_id","product_id"])["quantity"].sum().unstack().reset_index().fillna(0).set_index("customer_id")
apriori_furniture

In [None]:
# 11 Cleaning Apriori table to remove unnecessary numbers

def encode_units(x):
    if x <= 0:
        return 0
    if x >= 1:
        return 1
apriori_furniture = apriori_furniture.applymap(encode_units)
apriori_furniture

In [None]:
# Running apriori algorithm on cleaned dataset. 
item_association_furniture = apriori(apriori_furniture, min_support=0.0001, use_colnames=True)
item_association_furniture

In [None]:
#  Apriori association results table and confidence levels. 
apriori_rules_furniture = association_rules(item_association_furniture, metric="lift", min_threshold=1)

In [None]:
# Show output by descending order of confidence level
apriori_rules_furniture = apriori_rules_furniture.sort_values(["confidence"], ascending=False)
apriori_rules_furniture