## Apriori Algorithm

Question: Can we predict which products a customer will most likely purchase together within various product segments?

Goal: Help Amazon identify products frequently bought together by customers to increase sales and revenues (cross sell) by analyzing Amazon Marketplace segment data.

In [1]:
# Import dependencies
from sqlalchemy import create_engine
import pandas as pd

# Importing apriori dependencies
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [2]:
# Connect to postgres/RDS database 
POSTGRES_ADDRESS = 'mypostgresdb.cwuhtytzosg8.us-east-2.rds.amazonaws.com' 
POSTGRES_PORT = '5432'
POSTGRES_USERNAME = 'root' 
POSTGRES_PASSWORD = 'XXXX' 
POSTGRES_DBNAME = 'my_data_class_db'

# A long string that contains the necessary Postgres login information
postgres_str = ('postgresql://{username}:{password}@{ipaddress}:{port}/{dbname}'
.format(username=POSTGRES_USERNAME,
password=POSTGRES_PASSWORD,
ipaddress=POSTGRES_ADDRESS,
port=POSTGRES_PORT,
dbname=POSTGRES_DBNAME))
# Create the connection
cnx = create_engine(postgres_str)

### Video Analysis 

In [18]:
# Load database for sentiment/topic analysis
df_videos = pd.read_sql_query('''SELECT * FROM videos_apriori_analysis''', con=cnx)
df_videos.head()

Unnamed: 0,customer_id,product_id,quantity
0,25551507,0788812807,1
1,25551507,6302320402,1
2,31354506,6301442733,1
3,42622115,B00003CX7L,1
4,27446106,0788806270,1


In [19]:
# Create pivot table to run algorithm 
apriori_table = df_videos.groupby(["customer_id","product_id"])["quantity"].sum().unstack().reset_index().fillna(0).set_index("customer_id")
apriori_table

product_id,0738920525,0767800958,0767815963,076781598X,076783822X,0773386777,0780614097,078062128X,0780625900,0780626028,...,B000E4C2TY,B000FP5PPA,B000H61X62,B000QX1SUC,B000TOSN7Y,B0019KBJOQ,B004J0SG6C,B00AVPYKFU,B00B9LNBWS,B00BBND56G
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
15160,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19893,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20767,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26626,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
41909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53094662,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53094728,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53095639,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53096090,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# Cleaning Apriori table to remove unnecessary numbers
def encode_units(x):
    if x <= 0:
        return 0
    if x >= 1:
        return 1
apriori_cleaned_videos = apriori_table.applymap(encode_units)
apriori_cleaned_videos

product_id,0738920525,0767800958,0767815963,076781598X,076783822X,0773386777,0780614097,078062128X,0780625900,0780626028,...,B000E4C2TY,B000FP5PPA,B000H61X62,B000QX1SUC,B000TOSN7Y,B0019KBJOQ,B004J0SG6C,B00AVPYKFU,B00B9LNBWS,B00BBND56G
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
15160,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19893,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20767,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26626,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
41909,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53094662,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
53094728,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
53095639,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
53096090,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# Running apriori algorithm on cleaned dataset. 
item_association_videos = apriori(apriori_cleaned_videos, min_support=0.0001, use_colnames=True)
item_association_videos

Unnamed: 0,support,itemsets
0,0.002396,(0738920525)
1,0.001322,(0767800958)
2,0.001652,(0767815963)
3,0.001652,(076781598X)
4,0.002479,(076783822X)
...,...,...
1615,0.000165,"(0788802194, 0788812807, 155890641X, 630027419..."
1616,0.000165,"(0788802194, 0788812807, 078881172X, 155890641..."
1617,0.000165,"(0788802194, 0788812807, 078881172X, 155890641..."
1618,0.000165,"(0788812807, 078881172X, 155890641X, 630027419..."


In [22]:
# Apriori association results table and confidence levels. 
apriori_rules_videos = association_rules(item_association_videos, metric="lift", min_threshold=1)

In [24]:
# Show output by descending order of confidence level
apriori_rules_videos = apriori_rules_videos.sort_values(["confidence"], ascending=False)
apriori_rules_videos 

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
9175,"(6304401132, 0788802194, 0788812408)","(0788806270, 0788805533, 0788812807)",0.000165,0.000165,0.000165,1.000000,6052.000000,0.000165,inf
11817,"(0788802194, 0788812807, 6300274195, 630440113...",(155890641X),0.000165,0.002148,0.000165,1.000000,465.538462,0.000165,inf
11819,"(0788812807, 155890641X, 6300274195, 630440113...",(0788802194),0.000165,0.002396,0.000165,1.000000,417.379310,0.000165,inf
11820,"(0788802194, 0788812807, 155890641X, 6300274195)","(6304401132, 0788812408)",0.000165,0.000248,0.000165,1.000000,4034.666667,0.000165,inf
11821,"(6304401132, 0788802194, 0788812807, 155890641X)","(0788812408, 6300274195)",0.000165,0.000248,0.000165,1.000000,4034.666667,0.000165,inf
...,...,...,...,...,...,...,...,...,...
431,(B00004U8H5),(6305107807),0.004957,0.001404,0.000165,0.033333,23.733333,0.000158,1.033030
387,(6303182135),(6303182232),0.004957,0.002396,0.000165,0.033333,13.912644,0.000153,1.032004
323,(6302208661),(6302967945),0.004957,0.002726,0.000165,0.033333,12.226263,0.000152,1.031662
175,(094567189X),(6303243606),0.005040,0.002892,0.000165,0.032787,11.338642,0.000151,1.030909


Interpreataion of 1st row of results, VHS bought together:

0788806270: Bambi

078881172X: Peter Pan

6302787068: Aladdin

6302526574: Beauty and the Beast

0788812807: Lady and the Tramp

0788812408: Little Mermaid

0788802194: Cinderella 

### Personal Care Analysis

In [10]:
# Run test with personal care data ~12k products
df_personal_care = pd.read_sql_query('''SELECT * FROM personal_care_appliances_apriori_analysis''', con=cnx)
df_personal_care.head()

Unnamed: 0,customer_id,product_id,quantity
0,32114233,B00OYRW4UE,1
1,18125776,B0000537JQ,1
2,19917519,B00HXXO332,1
3,18277171,B00EOB0JA2,1
4,2592955,B00HES9CMS,1


In [11]:
# Create pivot table to run algorithm 
apriori_table_personal_care = df_personal_care.groupby(["customer_id","product_id"])["quantity"].sum().unstack().reset_index().fillna(0).set_index("customer_id")
apriori_table_personal_care

product_id,B0000532OT,B0000532OV,B0000537JP,B0000537JQ,B00005B6ZC,B00005JHVY,B000068PBJ,B00008J1ZZ,B00008KA7Q,B000094ZGG,...,B00QH8QJ2C,B00QH96JQC,B00QH9M6QY,B00QR4JRHU,B00R3PFF4Q,B00RWIWFFQ,B00S02EJZW,B00TOYNBA4,B00UMAAWCY,B00XZJ2G46
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10470,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11344,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12674,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13044,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14147,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53092777,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53094082,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53094709,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53095826,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Cleaning Apriori table to remove unnecessary numbers
apriori_cleaned_pcare = apriori_table_personal_care.applymap(encode_units)

In [13]:
# Running apriori algorithm on cleaned dataset. 
item_association_pcare = apriori(apriori_cleaned_pcare, min_support=0.0001, use_colnames=True)
item_association_pcare

MemoryError: Unable to allocate 66.0 GiB for an array with shape (124750, 2, 35491) and data type int64

In [None]:
# Apriori association results table and confidence levels. 
apriori_rules_pcare = association_rules(item_association_pcar, metric="lift", min_threshold=1)

# Show output by descending order of confidence level
apriori_rules_pcare = apriori_rules_pcare.sort_values(["confidence"], ascending=False)
apriori_rules_pcare

In [15]:
apriori_rules_apparel.to_csv(r'C:\Users\li_mi\Class\final_project\dev\apriori_rules_apparel.csv', index = False)

### Video games Analysis 

In [3]:
#  Upload df
df_video_games = pd.read_sql_query('''SELECT * FROM video_games_apriori_analysis''', con=cnx)
df_video_games.head()

Unnamed: 0,customer_id,product_id,quantity
0,12039526,B001CXYMFS,1
1,48880662,B0053OLY9O,1
2,45205407,B00KVP78FE,1
3,10548951,B00273Z9WM,1
4,50165446,B003O6E800,1


In [4]:
# Create pivot table to run algorithm 
apriori_video_games = df_video_games.groupby(["customer_id","product_id"])["quantity"].sum().unstack().reset_index().fillna(0).set_index("customer_id")
apriori_video_games

product_id,B0009VXBAQ,B000ERVMI8,B000FQ9R4E,B000M4KIME,B000NUBY0C,B000OYMYZQ,B000XJNTNS,B000ZKA0J6,B0013OL0BK,B0015AARJI,...,B00JK00S0S,B00KSQHX1K,B00KTNSKZU,B00KVOVBGM,B00KVP78FE,B00KVSQ848,B00MU1YENG,B00NFXON1Q,B00O9JLAX4,B00RSXRLUE
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10018,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10481,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10670,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10866,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
11026,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53092633,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53092767,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53093124,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53093730,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Cleaning Apriori table to remove unnecessary numbers
apriori_video_games = apriori_video_games.applymap(encode_units)

In [6]:
# Running apriori algorithm on cleaned dataset. 
item_association_video_games = apriori(apriori_video_games, min_support=0.0001, use_colnames=True)

In [7]:
#  Apriori association results table and confidence levels. 
apriori_rules_video_games = association_rules(item_association_video_games, metric="lift", min_threshold=1)

In [8]:
# Show output by descending order of confidence level
apriori_rules_video_games  = apriori_rules_video_games.sort_values(["confidence"], ascending=False)
apriori_rules_video_games

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
691,"(B00JK00S0S, B00BGA9X9W)",(B00BGA9WK2),0.000213,0.058590,0.000117,0.551724,9.416636,0.000105,2.100068
674,"(B00BI83EVU, B00BGA9X9W)",(B00BGA9WK2),0.000308,0.058590,0.000161,0.523810,8.940199,0.000143,1.976960
668,"(B00BGAA3S2, B00BGA9X9W)",(B00BGA9WK2),0.000896,0.058590,0.000382,0.426230,7.274737,0.000329,1.640743
662,"(B0086V5UF0, B003O6FV8S)",(B0053BG26C),0.000257,0.009579,0.000103,0.400000,41.757548,0.000100,1.650701
678,"(B00BGA9WK2, B00ENFVJJO)",(B00BGA9X9W),0.000749,0.013763,0.000279,0.372549,27.068716,0.000269,1.571815
...,...,...,...,...,...,...,...,...,...
659,(B0050SXKU4),"(B003O6CBIG, B007XVTR5S)",0.020259,0.000448,0.000110,0.005435,12.137741,0.000101,1.005014
681,(B00BGA9WK2),"(B00ENFVJJO, B00BGA9X9W)",0.058590,0.000844,0.000279,0.004761,5.639746,0.000229,1.003935
675,(B00BGA9WK2),"(B00BI83EVU, B00BGA9X9W)",0.058590,0.000308,0.000161,0.002756,8.940199,0.000143,1.002455
694,(B00BGA9WK2),"(B00JK00S0S, B00BGA9X9W)",0.058590,0.000213,0.000117,0.002005,9.416636,0.000105,1.001795


In [10]:
apriori_rules_video_games.to_csv(r'C:\Users\li_mi\Class\final_project\dev\apriori_rules_video_games.csv', index = False)

### Watches Analysis 

In [11]:
#  Upload df
df_watches = pd.read_sql_query('''SELECT * FROM watches_apriori_analysis''', con=cnx)
df_watches.head()

Unnamed: 0,customer_id,product_id,quantity
0,27324930,B00DKYC7TK,1
1,7211452,B000EQS1JW,1
2,912779,B005JVP0FU,1
3,805483,B000JQJS6M,1
4,32555369,B00NC8PMUK,1


In [12]:
# Create pivot table to run algorithm 
apriori_watches = df_watches.groupby(["customer_id","product_id"])["quantity"].sum().unstack().reset_index().fillna(0).set_index("customer_id")

In [13]:
# Cleaning Apriori table to remove unnecessary numbers
apriori_watches = apriori_watches.applymap(encode_units)

In [14]:
# Running apriori algorithm on cleaned dataset. 
item_association_watches = apriori(apriori_watches, min_support=0.0001, use_colnames=True)

In [15]:
#  Apriori association results table and confidence levels. 
apriori_rules_watches = association_rules(item_association_watches, metric="lift", min_threshold=1)

In [16]:
# Show output by descending order of confidence level
apriori_rules_watches  = apriori_rules_watches.sort_values(["confidence"], ascending=False)
apriori_rules_watches

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
205,"(B005KKLEWS, B004P0UUE2)",(B004P0UUBK),0.000275,0.004040,0.000122,0.444444,110.016835,0.000121,1.792728
194,"(B003EKNMAI, B003EKIS4S)",(B008D902Q2),0.000286,0.017822,0.000122,0.428571,24.047346,0.000117,1.718812
204,"(B005KKLEWS, B004P0UUBK)",(B004P0UUE2),0.000296,0.004458,0.000122,0.413793,92.819380,0.000121,1.698277
198,"(B003EKIU3W, B003EKNMAI)",(B008D902Q2),0.000337,0.017822,0.000122,0.363636,20.403809,0.000116,1.543423
199,"(B003EKIU3W, B008D902Q2)",(B003EKNMAI),0.000418,0.007753,0.000122,0.292683,37.750321,0.000119,1.402832
...,...,...,...,...,...,...,...,...,...
195,(B008D902Q2),"(B003EKNMAI, B003EKIS4S)",0.017822,0.000286,0.000122,0.006869,24.047346,0.000117,1.006629
122,(B005JVP0LE),(B004D35W8A),0.019740,0.004142,0.000122,0.006202,1.497308,0.000041,1.002073
108,(B004YM2FV2),(B003S7T8NM),0.028115,0.004254,0.000153,0.005443,1.279419,0.000033,1.001195
45,(B000T9VK56),(B000HFRO8O),0.042540,0.003387,0.000224,0.005276,1.557706,0.000080,1.001899


In [17]:
apriori_rules_watches.to_csv(r'C:\Users\li_mi\Class\final_project\dev\apriori_rules_watches.csv', index = False)

higher confidence analysis:

XOXO Women's XO5429 Rhinestone-Accented Two-Tone Bracelet Watch (B005KKLEWS)

XOXO Women's XO5302A Rhinestone-Accented Gold-Tone Bracelet Watch (B004P0UUE2)

XOXO Women's XO5301A Rhinestone-Accented Silver-Tone Bracelet Watch (B004P0UUBK)