## Apriori Algorithm

Question: Can we predict which products a customer will most likely purchase together within various product segments?

Goal: Help Amazon identify products frequently bought together by customers to increase sales and revenues (cross sell) by analyzing Amazon Marketplace segment data.

In [1]:
# Import dependencies
import os
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine

# Importing apriori dependencies
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [2]:
# Connect to postgres/RDS database.
# Every setting is read from the environment variable of the same name so that
# real credentials never have to be saved inside the notebook; the literal is
# only the fallback used when the variable is not set.
POSTGRES_ADDRESS = os.environ.get('POSTGRES_ADDRESS', 'mypostgresdb.cwuhtytzosg8.us-east-2.rds.amazonaws.com')
POSTGRES_PORT = os.environ.get('POSTGRES_PORT', '5432')
POSTGRES_USERNAME = os.environ.get('POSTGRES_USERNAME', 'root')
POSTGRES_PASSWORD = os.environ.get('POSTGRES_PASSWORD', 'XXXX')
POSTGRES_DBNAME = os.environ.get('POSTGRES_DBNAME', 'my_data_class_db')

# A long string that contains the necessary Postgres login information.
# Username and password are URL-escaped so that characters such as '@', ':'
# or '/' in a credential cannot corrupt the connection URL.
postgres_str = (
    'postgresql://{username}:{password}@{ipaddress}:{port}/{dbname}'
    .format(
        username=quote_plus(POSTGRES_USERNAME),
        password=quote_plus(POSTGRES_PASSWORD),
        ipaddress=POSTGRES_ADDRESS,
        port=POSTGRES_PORT,
        dbname=POSTGRES_DBNAME,
    )
)
# Create the SQLAlchemy engine that the read_sql_query calls below use as `con`
cnx = create_engine(postgres_str)

### Apparel Apriori Analysis

In [3]:
# Load the apparel purchase records (customer_id, product_id, quantity) from the database
df_apparel = pd.read_sql_query(
    sql="SELECT * FROM apparel_apriori_analysis",
    con=cnx,
)
df_apparel

Unnamed: 0,customer_id,product_id,quantity
0,22145489,B00ORZIYBQ,1
1,27094225,B00ORZIYBQ,1
2,12194341,B00ORZIYBQ,1
3,7639933,B00ORZIYBQ,1
4,15603921,B00ORZIYBQ,1
...,...,...,...
92237,49261910,B0002TOS7A,1
92238,46583704,B0002TOS7A,1
92239,48423435,B0002TOS7A,1
92240,39861437,B0002TOS7A,1


In [4]:
# Build the customer x product matrix the algorithm runs on:
# total quantity per (customer, product) pair, products spread into columns,
# and pairs that never occurred filled with 0.
apriori_apparel = (
    df_apparel
    .groupby(["customer_id", "product_id"])["quantity"]
    .sum()
    .unstack()
    .reset_index()
    .fillna(0)
    .set_index("customer_id")
)
apriori_apparel

product_id,B0002TOS7A,B0002TOZ1E,B0002TOZ1O,B0002TOZ1Y,B0002TP1CQ,B0002TP1D0,B00080M1OU,B00080NYU0,B000QW6LE6,B000QWA2GY,...,B00HJKU3WE,B00HLRPQZO,B00HNXNTDW,B00JM8MWUC,B00JSJHQP6,B00K5AFQ22,B00LLIVQM6,B00LMI9A6Y,B00MAVN0R2,B00ORZIYBQ
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10151,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10393,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11299,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11539,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11757,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53095899,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
53096054,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
53096456,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53096471,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Cleaning Apriori table to remove unnecessary numbers
def encode_units(x):
    """Collapse a purchased quantity into a 0/1 "bought at all" flag.

    Parameters
    ----------
    x : number
        Quantity found in one cell of the customer x product matrix.

    Returns
    -------
    int
        1 when the quantity is positive, otherwise 0.

    Notes
    -----
    Always returns 0 or 1. Values that are neither ``<= 0`` nor ``>= 1``
    (a fractional quantity, or NaN) previously fell through both branches and
    produced ``None``; a fractional positive quantity now counts as a
    purchase (1) and NaN counts as no purchase (0).
    """
    # NaN compares False against everything, so it lands in the 0 branch.
    return 1 if x > 0 else 0
# Apply encode_units to every cell of the apparel matrix
apriori_apparel = apriori_apparel.applymap(func=encode_units)

In [None]:
# Mine frequent itemsets from the cleaned apparel matrix.
# min_support=1e-4 keeps itemsets present in at least 0.01% of the rows;
# use_colnames=True reports product ids instead of column positions.
item_association_apparel = apriori(
    apriori_apparel,
    min_support=1e-4,
    use_colnames=True,
)
item_association_apparel

In [None]:
# Turn the frequent itemsets into association rules (with support, confidence,
# lift, ...), keeping only rules whose lift reaches the threshold of 1.
apriori_rules_apparel = association_rules(
    item_association_apparel,
    metric="lift",
    min_threshold=1,
)

In [None]:
# Order the apparel rules from highest to lowest confidence
apriori_rules_apparel = apriori_rules_apparel.sort_values(
    by="confidence",
    ascending=False,
)
apriori_rules_apparel

In [33]:
# Tag every apparel rule with its segment name (used by the visualization)
apriori_rules_apparel = apriori_rules_apparel.assign(Category='Apparel')
apriori_rules_apparel

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,Category
33,(B00817DYUM),(B00817DZWO),0.003326,0.005259,0.000607,0.182432,34.68789,0.000589,1.216708,Apparel
32,(B00817DZWO),(B00817DYUM),0.005259,0.003326,0.000607,0.115385,34.68789,0.000589,1.126675,Apparel
35,(B0083QLPY8),(B0083QLQ0G),0.004113,0.005844,0.000348,0.084699,14.494357,0.000324,1.086153,Apparel
36,(B0083QLQ10),(B0083QLQ0G),0.004428,0.005844,0.000303,0.068528,11.726972,0.000278,1.067296,Apparel
20,(B003828R16),(B00382E71U),0.003326,0.003371,0.000202,0.060811,18.037703,0.000191,1.061159,Apparel
21,(B00382E71U),(B003828R16),0.003371,0.003326,0.000202,0.06,18.037703,0.000191,1.060291,Apparel
34,(B0083QLQ0G),(B0083QLPY8),0.005844,0.004113,0.000348,0.059615,14.494357,0.000324,1.059021,Apparel
37,(B0083QLQ0G),(B0083QLQ10),0.005844,0.004428,0.000303,0.051923,11.726972,0.000278,1.050097,Apparel
50,(B00GUBHBRS),(B00GBT8QM8),0.003866,0.005956,0.000146,0.037791,6.344987,0.000123,1.033085,Apparel
26,(B005GYGFZO),(B005GYGEXC),0.003394,0.006922,0.000124,0.036424,5.261708,0.0001,1.030617,Apparel


In [34]:
# Export the apparel rules to CSV without the row index
apriori_rules_apparel.to_csv(
    path_or_buf=r'C:\Users\li_mi\Class\final_project\dev\apriori_rules_apparel.csv',
    index=False,
)

Interpretation of 1st row of results:
    
B00817DYUM: Fruit of the Loom Men's Pocket T-Shirt Multipack

B00817DZWO: Fruit of the Loom Men's Pocket Crew Neck T-Shirt

### Furniture Analysis

In [10]:
# Load the furniture purchase records (customer_id, product_id, quantity) from the database
df_furniture = pd.read_sql_query(
    sql="SELECT * FROM furniture_apriori_analysis",
    con=cnx,
)
df_furniture.head()

Unnamed: 0,customer_id,product_id,quantity
0,45284262,B005G02ESA,1
1,26622950,B006MISZOC,1
2,23665632,B0046EC1D0,1
3,20252477,B003HEPHUE,1
4,12546512,B007EEG7M0,1


In [11]:
# Build the customer x product matrix the algorithm runs on:
# total quantity per (customer, product) pair, products spread into columns,
# and pairs that never occurred filled with 0.
apriori_furniture = (
    df_furniture
    .groupby(["customer_id", "product_id"])["quantity"]
    .sum()
    .unstack()
    .reset_index()
    .fillna(0)
    .set_index("customer_id")
)
apriori_furniture

product_id,B000067PTO,B000209YT6,B00024JOVA,B0002KNM08,B0002KNM9O,B0002KNPFU,B000BHB09W,B000ELQY7S,B000EWZ1Q2,B000IHJF60,...,B00BUKEC4E,B00D93AT24,B00EYN1NEG,B00GEXJH4W,B00GTCAY04,B00HETRYOU,B00IGGJQ6O,B00JJZ18PS,B00PBQ7EA2,B00Q52VN1W
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10813,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11972,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12070,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12209,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53095873,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53095923,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53096362,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53096363,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Reduce every cell of the furniture matrix to a 0/1 flag via encode_units
apriori_furniture = apriori_furniture.applymap(func=encode_units)

In [13]:
# Mine frequent itemsets from the cleaned furniture matrix.
# min_support=1e-4 keeps itemsets present in at least 0.01% of the rows;
# use_colnames=True reports product ids instead of column positions.
item_association_furniture = apriori(
    apriori_furniture,
    min_support=1e-4,
    use_colnames=True,
)
item_association_furniture

Unnamed: 0,support,itemsets
0,0.006322,(B000067PTO)
1,0.005060,(B000209YT6)
2,0.003156,(B00024JOVA)
3,0.005177,(B0002KNM08)
4,0.003498,(B0002KNM9O)
...,...,...
227,0.000150,"(B005G5RZIY, B006MISZOC)"
228,0.000150,"(B00HETRYOU, B006MIPW70)"
229,0.000225,"(B00B9JIGOS, B006MIUM20)"
230,0.000267,"(B00GTCAY04, B006MIUM20)"


In [14]:
# Turn the frequent itemsets into association rules (with support, confidence,
# lift, ...), keeping only rules whose lift reaches the threshold of 1.
apriori_rules_furniture = association_rules(
    item_association_furniture,
    metric="lift",
    min_threshold=1,
)

In [15]:
# Order the furniture rules from highest to lowest confidence
apriori_rules_furniture = apriori_rules_furniture.sort_values(
    by="confidence",
    ascending=False,
)
apriori_rules_furniture

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
36,(B004UMDLPK),(B004UME2PS),0.003134,0.003936,0.000396,0.12628,32.080233,0.000383,1.140026
32,(B006MIPW70),(B004LQ1RJ2),0.004065,0.018976,0.00046,0.113158,5.963242,0.000383,1.106199
46,(B005A4OP8Y),(B006MIUM20),0.008696,0.028346,0.000888,0.102091,3.601579,0.000641,1.08213
37,(B004UME2PS),(B004UMDLPK),0.003936,0.003134,0.000396,0.100543,32.080233,0.000383,1.108298
17,(B003R50PL4),(B00302KB5O),0.004321,0.004867,0.000417,0.096535,19.834583,0.000396,1.101462
16,(B00302KB5O),(B003R50PL4),0.004867,0.004321,0.000417,0.085714,19.834583,0.000396,1.089023
3,(B000NPTXNW),(B000NPSJP0),0.003412,0.004236,0.000278,0.081505,19.24149,0.000264,1.084125
56,(B00GTCAY04),(B006MIUM20),0.003733,0.028346,0.000267,0.071633,2.527085,0.000162,1.046627
58,(B00JJZ18PS),(B006MIUM20),0.003295,0.028346,0.000235,0.071429,2.519865,0.000142,1.046396
35,(B006MISZOC),(B004LQ1RKQ),0.011199,0.012184,0.000781,0.069723,5.722736,0.000644,1.061852


Interpretation of 1st row of results:
    
B004UMDLPK: Furinno Coffee Table with Bins, Espresso/Brown

B004UME2PS: Furinno End Table Bedroom Night Stand w/Bin Drawer, Espresso/Brown

In [35]:
# Tag every furniture rule with its segment name (used by the visualization)
apriori_rules_furniture = apriori_rules_furniture.assign(Category='Furniture')
apriori_rules_furniture

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,Category
36,(B004UMDLPK),(B004UME2PS),0.003134,0.003936,0.000396,0.12628,32.080233,0.000383,1.140026,Furniture
32,(B006MIPW70),(B004LQ1RJ2),0.004065,0.018976,0.00046,0.113158,5.963242,0.000383,1.106199,Furniture
46,(B005A4OP8Y),(B006MIUM20),0.008696,0.028346,0.000888,0.102091,3.601579,0.000641,1.08213,Furniture
37,(B004UME2PS),(B004UMDLPK),0.003936,0.003134,0.000396,0.100543,32.080233,0.000383,1.108298,Furniture
17,(B003R50PL4),(B00302KB5O),0.004321,0.004867,0.000417,0.096535,19.834583,0.000396,1.101462,Furniture
16,(B00302KB5O),(B003R50PL4),0.004867,0.004321,0.000417,0.085714,19.834583,0.000396,1.089023,Furniture
3,(B000NPTXNW),(B000NPSJP0),0.003412,0.004236,0.000278,0.081505,19.24149,0.000264,1.084125,Furniture
56,(B00GTCAY04),(B006MIUM20),0.003733,0.028346,0.000267,0.071633,2.527085,0.000162,1.046627,Furniture
58,(B00JJZ18PS),(B006MIUM20),0.003295,0.028346,0.000235,0.071429,2.519865,0.000142,1.046396,Furniture
35,(B006MISZOC),(B004LQ1RKQ),0.011199,0.012184,0.000781,0.069723,5.722736,0.000644,1.061852,Furniture


In [16]:
# Export the furniture rules to CSV without the row index
apriori_rules_furniture.to_csv(
    path_or_buf=r'C:\Users\li_mi\Class\final_project\dev\apriori_rules_furniture.csv',
    index=False,
)

### Music Analysis

In [17]:
# Load the music purchase records (customer_id, product_id, quantity) from the database
df_music = pd.read_sql_query(
    sql="SELECT * FROM music_apriori_analysis",
    con=cnx,
)
df_music.head()

Unnamed: 0,customer_id,product_id,quantity
0,27664622,B00B6QXN6U,1
1,16794688,B00N1F0BKK,1
2,3285047,B00005YW4H,1
3,28049396,B00GFXRKHW,1
4,14400668,B00MIA0KGY,1


In [18]:
# Build the customer x product matrix the algorithm runs on:
# total quantity per (customer, product) pair, products spread into columns,
# and pairs that never occurred filled with 0.
apriori_music = (
    df_music
    .groupby(["customer_id", "product_id"])["quantity"]
    .sum()
    .unstack()
    .reset_index()
    .fillna(0)
    .set_index("customer_id")
)
apriori_music

product_id,B000000XB9,B000001DZO,B000006AKD,B00000J2PH,B00000JFG3,B00004R84V,B00004XQ83,B00005JGA4,B00005YW4H,B0000669JL,...,B00O0MBH42,B00O0MBJPO,B00O3UBB1U,B00OPMI04M,B00QR7ZM8A,B00R5DXFS4,B00RHYBCN6,B00TKNQQHO,B00UCFVIDQ,B00VXGTJMU
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10807,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16080,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16485,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17241,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17990,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53095471,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53095663,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53095821,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53095826,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# Reduce every cell of the music matrix to a 0/1 flag via encode_units
apriori_music = apriori_music.applymap(func=encode_units)
apriori_music

product_id,B000000XB9,B000001DZO,B000006AKD,B00000J2PH,B00000JFG3,B00004R84V,B00004XQ83,B00005JGA4,B00005YW4H,B0000669JL,...,B00O0MBH42,B00O0MBJPO,B00O3UBB1U,B00OPMI04M,B00QR7ZM8A,B00R5DXFS4,B00RHYBCN6,B00TKNQQHO,B00UCFVIDQ,B00VXGTJMU
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10807,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16080,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16485,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17241,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17990,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53095471,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
53095663,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
53095821,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
53095826,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# Mine frequent itemsets from the cleaned music matrix.
# min_support=1e-4 keeps itemsets present in at least 0.01% of the rows;
# use_colnames=True reports product ids instead of column positions.
item_association_music = apriori(
    apriori_music,
    min_support=1e-4,
    use_colnames=True,
)
item_association_music

Unnamed: 0,support,itemsets
0,0.005069,(B000000XB9)
1,0.003631,(B000001DZO)
2,0.003941,(B000006AKD)
3,0.005726,(B00000J2PH)
4,0.005020,(B00000JFG3)
...,...,...
632,0.000161,"(B00MIA0KGY, B00NCGXYJS, B00MU79IL8)"
633,0.000186,"(B009G7ZYPY, B003GAMPWM, B005G618JU, B0026P3G12)"
634,0.000112,"(B003GAMPWM, B00F0O8SZK, B005G618JU, B0026P3G12)"
635,0.000149,"(B003GAMPWM, B009G7ZYPY, B00F0O8SZK, B005G618JU)"


In [21]:
# Turn the frequent itemsets into association rules (with support, confidence,
# lift, ...), keeping only rules whose lift reaches the threshold of 1.
apriori_rules_music = association_rules(
    item_association_music,
    metric="lift",
    min_threshold=1,
)

In [22]:
# Order the music rules from highest to lowest confidence
apriori_rules_music = apriori_rules_music.sort_values(
    by="confidence",
    ascending=False,
)
apriori_rules_music

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
725,"(B007ZRZWOA, B001BKVWYG)",(B001TCHDPS),0.000806,0.006730,0.000657,0.815385,121.158946,6.514539e-04,5.380213
974,"(B003GAMPWM, B009G7ZYPY, B00F0O8SZK)",(B005G618JU),0.000186,0.005912,0.000149,0.800000,135.320755,1.476275e-04,4.970441
990,"(B00NCFX7NM, B00F0O8SZK, B005G618JU)",(B009G7ZYPY),0.000149,0.009444,0.000112,0.750000,79.414370,1.101403e-04,3.962223
724,"(B007ZRZWOA, B001TCHDPS)",(B001BKVWYG),0.000917,0.008267,0.000657,0.716216,86.638539,6.492937e-04,3.494679
947,"(B003GAMPWM, B009G7ZYPY, B0026P3G12)",(B005G618JU),0.000260,0.005912,0.000186,0.714286,120.822102,1.843695e-04,3.479308
...,...,...,...,...,...,...,...,...,...
735,(B0026P3G12),"(B003GAMPWM, B004UB2WAQ)",0.021950,0.000235,0.000112,0.005082,21.580582,1.063761e-04,1.004871
648,(B00MIA0KGY),(B00MHS56AC),0.030762,0.003854,0.000136,0.004432,1.149803,1.776218e-05,1.000580
920,(B00NEJ7MMI),"(B00HQF9UZ8, B00NJQ007U)",0.028865,0.000967,0.000124,0.004294,4.441490,9.603400e-05,1.003341
932,(B00NEJ7MMI),"(B00KR6332Y, B00NJQ007U)",0.028865,0.000186,0.000112,0.003864,20.786174,1.061786e-04,1.003693


In [37]:
# Tag every music rule with its segment name (used by the visualization)
apriori_rules_music = apriori_rules_music.assign(Category='Music')
apriori_rules_music

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,Category
725,"(B007ZRZWOA, B001BKVWYG)",(B001TCHDPS),0.000806,0.006730,0.000657,0.815385,121.158946,6.514539e-04,5.380213,Music
974,"(B003GAMPWM, B009G7ZYPY, B00F0O8SZK)",(B005G618JU),0.000186,0.005912,0.000149,0.800000,135.320755,1.476275e-04,4.970441,Music
990,"(B00NCFX7NM, B00F0O8SZK, B005G618JU)",(B009G7ZYPY),0.000149,0.009444,0.000112,0.750000,79.414370,1.101403e-04,3.962223,Music
724,"(B007ZRZWOA, B001TCHDPS)",(B001BKVWYG),0.000917,0.008267,0.000657,0.716216,86.638539,6.492937e-04,3.494679,Music
947,"(B003GAMPWM, B009G7ZYPY, B0026P3G12)",(B005G618JU),0.000260,0.005912,0.000186,0.714286,120.822102,1.843695e-04,3.479308,Music
...,...,...,...,...,...,...,...,...,...,...
735,(B0026P3G12),"(B003GAMPWM, B004UB2WAQ)",0.021950,0.000235,0.000112,0.005082,21.580582,1.063761e-04,1.004871,Music
648,(B00MIA0KGY),(B00MHS56AC),0.030762,0.003854,0.000136,0.004432,1.149803,1.776218e-05,1.000580,Music
920,(B00NEJ7MMI),"(B00HQF9UZ8, B00NJQ007U)",0.028865,0.000967,0.000124,0.004294,4.441490,9.603400e-05,1.003341,Music
932,(B00NEJ7MMI),"(B00KR6332Y, B00NJQ007U)",0.028865,0.000186,0.000112,0.003864,20.786174,1.061786e-04,1.003693,Music


In [38]:
# Export the music rules to CSV without the row index
apriori_rules_music.to_csv(
    path_or_buf=r'C:\Users\li_mi\Class\final_project\dev\apriori_rules_music.csv',
    index=False,
)

Interpretation of 1st row of results:

B001BKVWYG: Cold Fact, Sixto Rodriguez

B007ZRZWOA: Coming from Reality, Sixto Rodriguez

### Office Products

In [24]:
# Load the office-products purchase records (customer_id, product_id, quantity) from the database
df_office = pd.read_sql_query(
    sql="SELECT * FROM office_products_apriori_analysis",
    con=cnx,
)
df_office.head()

Unnamed: 0,customer_id,product_id,quantity
0,52782374,B00D7H8XB6,1
1,50773856,B00EANURCE,1
2,14300387,B000050FZP,1
3,16457777,B00GTXWZN2,1
4,26344256,B000QX77WK,1


In [25]:
# Build the customer x product matrix the algorithm runs on:
# total quantity per (customer, product) pair, products spread into columns,
# and pairs that never occurred filled with 0.
apriori_office = (
    df_office
    .groupby(["customer_id", "product_id"])["quantity"]
    .sum()
    .unstack()
    .reset_index()
    .fillna(0)
    .set_index("customer_id")
)

In [26]:
# Reduce every cell of the office matrix to a 0/1 flag via encode_units
apriori_office = apriori_office.applymap(func=encode_units)

In [27]:
# Mine frequent itemsets from the cleaned office matrix.
# min_support=1e-4 keeps itemsets present in at least 0.01% of the rows;
# use_colnames=True reports product ids instead of column positions.
item_association_office = apriori(
    apriori_office,
    min_support=1e-4,
    use_colnames=True,
)

In [28]:
# Turn the frequent itemsets into association rules (with support, confidence,
# lift, ...), keeping only rules whose lift reaches the threshold of 1.
apriori_rules_office = association_rules(
    item_association_office,
    metric="lift",
    min_threshold=1,
)

In [29]:
# Order the office rules from highest to lowest confidence
apriori_rules_office = apriori_rules_office.sort_values(
    by="confidence",
    ascending=False,
)
apriori_rules_office

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
36,(B007VBXB48),(B0010JEJPC),0.007572,0.017735,0.003242,0.428224,24.145568,0.003108,1.717919
145,"(B003H2GBM4, B00CIDQ470)",(B003YT6RNS),0.000393,0.013529,0.000154,0.390625,28.874156,0.000148,1.618825
146,"(B00CIDQ470, B003YT6RNS)",(B003H2GBM4),0.000491,0.010827,0.000154,0.312500,28.864329,0.000148,1.438798
128,(B00ATZ9JZI),(B00AVWKUJS),0.006030,0.028291,0.001548,0.256619,9.070570,0.001377,1.307148
37,(B0010JEJPC),(B007VBXB48),0.017735,0.007572,0.003242,0.182825,24.145568,0.003108,1.214463
...,...,...,...,...,...,...,...,...,...
54,(B003H2GBM4),(B00E3KP7HO),0.010827,0.005122,0.000104,0.009643,1.882757,0.000049,1.004565
67,(B003YT6RNS),(B003YGZIY0),0.013529,0.005822,0.000117,0.008625,1.481476,0.000038,1.002827
97,(B0062ZNYFK),(B00EU6VACM),0.013713,0.007394,0.000117,0.008509,1.150806,0.000015,1.001125
31,(B00007M1TZ),(B00BP7SW0C),0.022464,0.006835,0.000190,0.008475,1.239900,0.000037,1.001654


In [39]:
# Tag every office rule with its segment name (used by the visualization)
apriori_rules_office = apriori_rules_office.assign(Category='Office')
apriori_rules_office

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,Category
36,(B007VBXB48),(B0010JEJPC),0.007572,0.017735,0.003242,0.428224,24.145568,0.003108,1.717919,Office
145,"(B003H2GBM4, B00CIDQ470)",(B003YT6RNS),0.000393,0.013529,0.000154,0.390625,28.874156,0.000148,1.618825,Office
146,"(B00CIDQ470, B003YT6RNS)",(B003H2GBM4),0.000491,0.010827,0.000154,0.312500,28.864329,0.000148,1.438798,Office
128,(B00ATZ9JZI),(B00AVWKUJS),0.006030,0.028291,0.001548,0.256619,9.070570,0.001377,1.307148,Office
37,(B0010JEJPC),(B007VBXB48),0.017735,0.007572,0.003242,0.182825,24.145568,0.003108,1.214463,Office
...,...,...,...,...,...,...,...,...,...,...
54,(B003H2GBM4),(B00E3KP7HO),0.010827,0.005122,0.000104,0.009643,1.882757,0.000049,1.004565,Office
67,(B003YT6RNS),(B003YGZIY0),0.013529,0.005822,0.000117,0.008625,1.481476,0.000038,1.002827,Office
97,(B0062ZNYFK),(B00EU6VACM),0.013713,0.007394,0.000117,0.008509,1.150806,0.000015,1.001125,Office
31,(B00007M1TZ),(B00BP7SW0C),0.022464,0.006835,0.000190,0.008475,1.239900,0.000037,1.001654,Office


In [40]:
# Export the office rules to CSV without the row index
apriori_rules_office.to_csv(
    path_or_buf=r'C:\Users\li_mi\Class\final_project\dev\apriori_rules_office.csv',
    index=False,
)

Interpretation of 1st row of results:

B007VBXB48: Scotch Thermal Laminating Pouches, 100-Pack, 8.9 x 11.4 Inches, Letter Size Sheets (TP3854-100)
    
B0010JEJPC: Scotch Thermal Laminator, 2 Roller System for a Professional Finish, Use for Home, Office or School, Suitable for use with Photos (TL901X)

In [58]:
# Stack the four per-segment rule tables into one frame for visualization
df_viz_1 = pd.concat(
    [
        apriori_rules_apparel,
        apriori_rules_furniture,
        apriori_rules_music,
        apriori_rules_office,
    ],
    axis=0,
)
df_viz_1

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,Category
33,(B00817DYUM),(B00817DZWO),0.003326,0.005259,0.000607,0.182432,34.687890,0.000589,1.216708,Apparel
32,(B00817DZWO),(B00817DYUM),0.005259,0.003326,0.000607,0.115385,34.687890,0.000589,1.126675,Apparel
35,(B0083QLPY8),(B0083QLQ0G),0.004113,0.005844,0.000348,0.084699,14.494357,0.000324,1.086153,Apparel
36,(B0083QLQ10),(B0083QLQ0G),0.004428,0.005844,0.000303,0.068528,11.726972,0.000278,1.067296,Apparel
20,(B003828R16),(B00382E71U),0.003326,0.003371,0.000202,0.060811,18.037703,0.000191,1.061159,Apparel
...,...,...,...,...,...,...,...,...,...,...
54,(B003H2GBM4),(B00E3KP7HO),0.010827,0.005122,0.000104,0.009643,1.882757,0.000049,1.004565,Office
67,(B003YT6RNS),(B003YGZIY0),0.013529,0.005822,0.000117,0.008625,1.481476,0.000038,1.002827,Office
97,(B0062ZNYFK),(B00EU6VACM),0.013713,0.007394,0.000117,0.008509,1.150806,0.000015,1.001125,Office
31,(B00007M1TZ),(B00BP7SW0C),0.022464,0.006835,0.000190,0.008475,1.239900,0.000037,1.001654,Office


In [59]:
# Keep only the rules whose confidence is at least 60%
is_high_confidence = df_viz_1['confidence'] >= 0.6
df_viz_1 = df_viz_1[is_high_confidence]
df_viz_1

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,Category
725,"(B007ZRZWOA, B001BKVWYG)",(B001TCHDPS),0.000806,0.00673,0.000657,0.815385,121.158946,0.000651,5.380213,Music
974,"(B003GAMPWM, B009G7ZYPY, B00F0O8SZK)",(B005G618JU),0.000186,0.005912,0.000149,0.8,135.320755,0.000148,4.970441,Music
990,"(B00NCFX7NM, B00F0O8SZK, B005G618JU)",(B009G7ZYPY),0.000149,0.009444,0.000112,0.75,79.41437,0.00011,3.962223,Music
724,"(B007ZRZWOA, B001TCHDPS)",(B001BKVWYG),0.000917,0.008267,0.000657,0.716216,86.638539,0.000649,3.494679,Music
947,"(B003GAMPWM, B009G7ZYPY, B0026P3G12)",(B005G618JU),0.00026,0.005912,0.000186,0.714286,120.822102,0.000184,3.479308,Music
976,"(B003GAMPWM, B005G618JU, B00F0O8SZK)",(B009G7ZYPY),0.000211,0.009444,0.000149,0.705882,74.742937,0.000147,3.36789,Music
923,"(B00MIA0LDQ, B00KR6332Y)",(B00NEJ7MMI),0.000942,0.028865,0.000657,0.697368,24.159369,0.00063,3.208967,Music
846,"(B008R3EYWM, B005G618JU)",(B004UB2WAQ),0.000186,0.007548,0.000124,0.666667,88.325123,0.000123,2.977356,Music
961,"(B003GAMPWM, B00F0O8SZK, B0026P3G12)",(B005G618JU),0.000174,0.005912,0.000112,0.642857,108.739892,0.000111,2.783447,Music
802,"(B00NCFX7NM, B003GAMPWM)",(B005G618JU),0.000174,0.005912,0.000112,0.642857,108.739892,0.000111,2.783447,Music


In [60]:
# Strip the metric columns the visualization does not use.
# errors="ignore" keeps this cell re-runnable: on a second execution the
# columns are already gone and a plain drop would raise KeyError.
df_viz_1 = df_viz_1.drop(
    columns=[
        'antecedent support',
        'consequent support',
        'support',
        'lift',
        'leverage',
        'conviction',
    ],
    errors="ignore",
)
df_viz_1

Unnamed: 0,antecedents,consequents,confidence,Category
725,"(B007ZRZWOA, B001BKVWYG)",(B001TCHDPS),0.815385,Music
974,"(B003GAMPWM, B009G7ZYPY, B00F0O8SZK)",(B005G618JU),0.8,Music
990,"(B00NCFX7NM, B00F0O8SZK, B005G618JU)",(B009G7ZYPY),0.75,Music
724,"(B007ZRZWOA, B001TCHDPS)",(B001BKVWYG),0.716216,Music
947,"(B003GAMPWM, B009G7ZYPY, B0026P3G12)",(B005G618JU),0.714286,Music
976,"(B003GAMPWM, B005G618JU, B00F0O8SZK)",(B009G7ZYPY),0.705882,Music
923,"(B00MIA0LDQ, B00KR6332Y)",(B00NEJ7MMI),0.697368,Music
846,"(B008R3EYWM, B005G618JU)",(B004UB2WAQ),0.666667,Music
961,"(B003GAMPWM, B00F0O8SZK, B0026P3G12)",(B005G618JU),0.642857,Music
802,"(B00NCFX7NM, B003GAMPWM)",(B005G618JU),0.642857,Music


In [61]:
# Export the visualization table to CSV without the row index
df_viz_1.to_csv(
    path_or_buf=r'C:\Users\li_mi\Class\final_project\dev\df_viz_1.csv',
    index=False,
)