**Imports and Reading Dataset**

In [2]:
!pip install mlxtend

Collecting mlxtend
  Downloading mlxtend-0.21.0-py2.py3-none-any.whl (1.3 MB)
Installing collected packages: mlxtend
Successfully installed mlxtend-0.21.0


In [6]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from mlxtend.frequent_patterns import apriori, association_rules

# Display settings: show every column, cap printed rows at 10 and keep wide
# frames on a single line instead of wrapping them.
for option_name, option_value in {
    'display.max_columns': None,
    'display.max_rows': 10,
    'display.expand_frame_repr': False,
}.items():
    pd.set_option(option_name, option_value)

# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")

# Load the raw records from the working directory and preview the first rows.
df = pd.read_csv("armut_data.csv")
df.head()

Unnamed: 0,UserId,ServiceId,CategoryId,CreateDate
0,25446,4,5,2017-08-06 16:11:00
1,22948,48,5,2017-08-06 16:12:00
2,10618,0,8,2017-08-06 16:13:00
3,7256,9,4,2017-08-06 16:14:00
4,25446,48,5,2017-08-06 16:16:00


**General Information About Dataset**

In [7]:
def information(df):
    """Print a quick structural overview of ``df`` to stdout.

    Six banner-separated sections are written in order: shape, column dtypes,
    first rows, last rows, per-column null counts, and the transposed
    ``describe`` table at the 0/5/50/95/99/100 percentiles.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    None
    """
    # Each section is a (banner, thunk) pair; the thunk defers the computation
    # so every value is evaluated right before it is printed.
    sections = (
        ("###############################    Shape  ##################################",
         lambda: df.shape),
        ("###############################    Types  ##################################",
         lambda: df.dtypes),
        ("###############################    Head   ##################################",
         lambda: df.head()),
        ("###############################    Tail   ##################################",
         lambda: df.tail()),
        ("###############################    NA     ##################################",
         lambda: df.isnull().sum()),
        ("############################### Quantiles ##################################",
         lambda: df.describe([0, 0.05, 0.50, 0.95, 0.99, 1]).T),
    )
    for banner, compute in sections:
        print(banner)
        print(compute())

# Print the structural overview (shape, dtypes, head/tail, nulls, quantiles) of the loaded frame.
information(df)

###############################    Shape  ##################################
(162523, 4)
###############################    Types  ##################################
UserId         int64
ServiceId      int64
CategoryId     int64
CreateDate    object
dtype: object
###############################    Head   ##################################
   UserId  ServiceId  CategoryId           CreateDate
0   25446          4           5  2017-08-06 16:11:00
1   22948         48           5  2017-08-06 16:12:00
2   10618          0           8  2017-08-06 16:13:00
3    7256          9           4  2017-08-06 16:14:00
4   25446         48           5  2017-08-06 16:16:00
###############################    Tail   ##################################
        UserId  ServiceId  CategoryId           CreateDate
162518   10591         25           0  2018-08-06 14:40:00
162519   10591          2           0  2018-08-06 14:43:00
162520   10591         31           6  2018-08-06 14:47:00
162521   12666        

**Data Preparation**

The same ServiceId refers to a different service in each CategoryId, so neither column identifies a service on its own. Let's combine ServiceId and CategoryId with "_" to create a new variable that uniquely identifies each service.

In [9]:
# Build the service key "<ServiceId>_<CategoryId>" from the two id columns.
df['Service'] = df['ServiceId'].astype(str).str.cat(df['CategoryId'].astype(str), sep='_')
df.head()

Unnamed: 0,UserId,ServiceId,CategoryId,CreateDate,Service
0,25446,4,5,2017-08-06 16:11:00,4_5
1,22948,48,5,2017-08-06 16:12:00,48_5
2,10618,0,8,2017-08-06 16:13:00,0_8
3,7256,9,4,2017-08-06 16:14:00,9_4
4,25446,48,5,2017-08-06 16:16:00,48_5


The data set only records the date and time at which each service was received; it contains no basket definition (invoice, etc.). To apply Association Rule Learning, a basket definition must be created first.

We define a basket as the set of services a customer receives within the same calendar month.

Let's create a new date variable that contains only the year and month.

In [None]:
# CreateDate strings look like "2017-08-06 16:11:00". Parse them with the format
# that matches the whole string (the partial '%Y-%m' leaves unconverted data and
# is rejected by strict format parsing), then keep only the year-month period.
df['New_Date'] = pd.to_datetime(df['CreateDate'], format='%Y-%m-%d %H:%M:%S').dt.to_period('M')
df.head()

Unnamed: 0,UserId,ServiceId,CategoryId,CreateDate,Service,New_Date
0,25446,4,5,2017-08-06 16:11:00,4_5,2017-08
1,22948,48,5,2017-08-06 16:12:00,48_5,2017-08
2,10618,0,8,2017-08-06 16:13:00,0_8,2017-08
3,7256,9,4,2017-08-06 16:14:00,9_4,2017-08
4,25446,48,5,2017-08-06 16:16:00,48_5,2017-08


Let's combine UserId and the newly created date variable with "_" and assign the result to a new variable called Basket_Id.

In [13]:
# Basket key "<UserId>_<YYYY-MM>": one basket per user per calendar month.
df['Basket_Id'] = df['UserId'].astype(str).str.cat(df['New_Date'].astype(str), sep='_')
df.head()

Unnamed: 0,UserId,ServiceId,CategoryId,CreateDate,Service,New_Date,Basket_Id
0,25446,4,5,2017-08-06 16:11:00,4_5,2017-08,25446_2017-08
1,22948,48,5,2017-08-06 16:12:00,48_5,2017-08,22948_2017-08
2,10618,0,8,2017-08-06 16:13:00,0_8,2017-08,10618_2017-08
3,7256,9,4,2017-08-06 16:14:00,9_4,2017-08,7256_2017-08
4,25446,48,5,2017-08-06 16:16:00,48_5,2017-08,25446_2017-08


**Creating Association Rules**

We will use Apriori Algorithm which is an algorithm that allows the implementation of Association Rules

Apriori Algorithm consists of Support, Confidence, Lift metrics

Support: Probability of X and Y being bought together

Confidence: Probability of buying Y when X is bought

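Lift: How many times more likely Y is to be bought when X is bought, compared with how often Y is bought overall, i.e. Confidence(X → Y) / Support(Y). A lift above 1 means X and Y appear together more often than they would if they were independent.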

In [15]:
# Preview the basket x service matrix: one row per basket, one column per
# service, cell = number of times the service occurs in the basket (NaN if never).
(
    df.groupby(["Basket_Id", "Service"])
    .agg({"Service": "count"})
    .unstack()
)

Unnamed: 0_level_0,Service,Service,Service,Service,Service,Service,Service,Service,Service,Service,Service,Service,Service,Service,Service,Service,Service,Service,Service,Service,Service,Service,Service,Service,Service,Service,Service,Service,Service,Service,Service,Service,Service,Service,Service,Service,Service,Service,Service,Service,Service,Service,Service,Service,Service,Service,Service,Service,Service,Service
Service,0_8,10_9,11_11,12_7,13_11,14_7,15_1,16_8,17_5,18_4,19_6,1_4,20_5,21_5,22_0,23_10,24_10,25_0,26_7,27_7,28_4,29_0,2_0,30_2,31_6,32_4,33_4,34_6,35_11,36_1,37_0,38_4,39_10,3_5,40_8,41_3,42_1,43_2,44_0,45_6,46_4,47_7,48_5,49_1,4_5,5_11,6_7,7_3,8_5,9_4
Basket_Id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2,Unnamed: 31_level_2,Unnamed: 32_level_2,Unnamed: 33_level_2,Unnamed: 34_level_2,Unnamed: 35_level_2,Unnamed: 36_level_2,Unnamed: 37_level_2,Unnamed: 38_level_2,Unnamed: 39_level_2,Unnamed: 40_level_2,Unnamed: 41_level_2,Unnamed: 42_level_2,Unnamed: 43_level_2,Unnamed: 44_level_2,Unnamed: 45_level_2,Unnamed: 46_level_2,Unnamed: 47_level_2,Unnamed: 48_level_2,Unnamed: 49_level_2,Unnamed: 50_level_2
0_2017-08,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,1.0,,,,,,,
0_2017-09,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,1.0,,,,,
0_2018-01,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,,,,,,,,,,,,1.0,,,,,,,,,,1.0,,
0_2018-04,,,,,,1.0,,,,,,,,,,,,,,,,,,1.0,,,,,,,,,,,,,,1.0,,,,,,,,,,,,
10000_2017-08,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99_2017-12,3.0,,,,,,,,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
99_2018-01,1.0,,,,,,,,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
99_2018-02,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
9_2018-03,,,,,,,,,,,,,,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


We need to write 1 instead of values greater than 0 in the dataframe. And also we need to change NaN to 0

In [17]:
# Binary basket x service matrix for apriori: 1 if the service occurs in the
# basket at least once, otherwise 0.
# Vectorised replacement for fillna(0).applymap(lambda ...): applymap is
# deprecated in recent pandas and evaluates a Python lambda for every cell.
df1 = (
    df.groupby(['Basket_Id', "Service"])["Service"]
    .count()
    .unstack(fill_value=0)  # combinations that never occur become 0 instead of NaN
    .gt(0)
    .astype("int64")
)
df1.head()

Service,0_8,10_9,11_11,12_7,13_11,14_7,15_1,16_8,17_5,18_4,19_6,1_4,20_5,21_5,22_0,23_10,24_10,25_0,26_7,27_7,28_4,29_0,2_0,30_2,31_6,32_4,33_4,34_6,35_11,36_1,37_0,38_4,39_10,3_5,40_8,41_3,42_1,43_2,44_0,45_6,46_4,47_7,48_5,49_1,4_5,5_11,6_7,7_3,8_5,9_4
Basket_Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1
0_2017-08,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0
0_2017-09,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0
0_2018-01,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0
0_2018-04,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
10000_2017-08,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0


In [20]:
# Rows of the matrix are baskets, columns are services.
print("Total Number of Baskets -->", len(df1.index))
print("Total Number of Services -->", len(df1.columns))

Total Number of Baskets --> 71220
Total Number of Services --> 50


Apriori Algorithm:

support > proportion of baskets that contain the itemset (we set min_support = 0.001, so only itemsets found in at least 0.1% of the baskets are returned)

In [21]:
# Mine the itemsets whose support reaches min_support; use_colnames=True
# reports them by service key instead of column position.
frequent_itemsets = apriori(df1, min_support=0.001, use_colnames=True)

# Most frequent itemsets first.
frequent_itemsets.sort_values(by="support", ascending=False)

Unnamed: 0,support,itemsets
9,0.238121,(18_4)
22,0.130286,(2_0)
6,0.120963,(15_1)
43,0.067762,(49_1)
31,0.066568,(38_4)
...,...,...
365,0.001011,"(4_5, 43_2)"
407,0.001011,"(2_0, 31_6, 11_11)"
265,0.001011,"(25_0, 34_6)"
53,0.001011,"(0_8, 38_4)"


In [22]:
# Derive association rules from the frequent itemsets, keeping the rules whose
# support reaches the 0.001 threshold.
rules = association_rules(frequent_itemsets, metric="support", min_threshold=0.001)

rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(0_8),(15_1),0.019728,0.120963,0.001334,0.067616,0.558977,-0.001052,0.942784
1,(15_1),(0_8),0.120963,0.019728,0.001334,0.011027,0.558977,-0.001052,0.991203
2,(0_8),(16_8),0.019728,0.014659,0.002429,0.123132,8.399845,0.002140,1.123705
3,(16_8),(0_8),0.014659,0.019728,0.002429,0.165709,8.399845,0.002140,1.174976
4,(2_0),(0_8),0.130286,0.019728,0.001474,0.011316,0.573606,-0.001096,0.991492
...,...,...,...,...,...,...,...,...,...
1279,"(2_0, 22_0)","(29_0, 25_0)",0.016568,0.005378,0.001123,0.067797,12.606983,0.001034,1.066958
1280,(29_0),"(25_0, 2_0, 22_0)",0.026580,0.005041,0.001123,0.042261,8.383916,0.000989,1.038863
1281,(25_0),"(29_0, 2_0, 22_0)",0.042895,0.002387,0.001123,0.026187,10.970636,0.001021,1.024440
1282,(2_0),"(29_0, 25_0, 22_0)",0.130286,0.002162,0.001123,0.008622,3.987219,0.000842,1.006515


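antecedents > the item(s) already in the basket (the "if" side of the rule; may contain more than one service)

consequents > the item(s) suggested next (the "then" side of the rule; may contain more than one service)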

antecedent support > proportion of baskets that contain the antecedent (regardless of what else is in the basket)

consequent support > proportion of baskets that contain the consequent (regardless of what else is in the basket)

support > probability of seeing 2 products together

confidence > probability of getting 2nd item when 1st item is bought

lift > When the 1st item is bought, the probability of getting the 2nd item increases x times


**Let's suggest a service to a user who has received the 2_0 service in the last 1 month**

In [23]:
# Most recent purchases of service 2_0 (CreateDate sorted descending).
(
    df[df["Service"].eq("2_0")]
    .sort_values(by="CreateDate", ascending=False)
    .head()
)

Unnamed: 0,UserId,ServiceId,CategoryId,CreateDate,Service,New_Date,Basket_Id
162519,10591,2,0,2018-08-06 14:43:00,2_0,2018-08,10591_2018-08
162502,11769,2,0,2018-08-06 09:30:00,2_0,2018-08,11769_2018-08
162497,12022,2,0,2018-08-06 08:47:00,2_0,2018-08,12022_2018-08
162484,11656,2,0,2018-08-06 07:17:00,2_0,2018-08,11656_2018-08
162469,18900,2,0,2018-08-06 04:30:00,2_0,2018-08,18900_2018-08


In [26]:
# Rules whose antecedent is exactly {2_0}, highest confidence first.
(
    rules.loc[rules["antecedents"] == {'2_0'}]
    .sort_values(by="confidence", ascending=False)
    .head()
)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
158,(2_0),(15_1),0.130286,0.120963,0.033951,0.260588,2.154278,0.018191,1.188833
360,(2_0),(22_0),0.130286,0.047515,0.016568,0.127169,2.676409,0.010378,1.09126
425,(2_0),(25_0),0.130286,0.042895,0.013437,0.103136,2.404371,0.007849,1.067168
90,(2_0),(13_11),0.130286,0.056627,0.012819,0.098394,1.737574,0.005442,1.046325
506,(2_0),(38_4),0.130286,0.066568,0.011191,0.085893,1.290295,0.002518,1.02114


In [27]:
# Rules whose antecedent is exactly {2_0}, highest lift first.
(
    rules.loc[rules["antecedents"] == {'2_0'}]
    .sort_values(by="lift", ascending=False)
    .head()
)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
1268,(2_0),"(15_1, 25_0, 22_0)",0.130286,0.001797,0.001095,0.008406,4.677194,0.000861,1.006665
1253,(2_0),"(25_0, 22_0, 13_11)",0.130286,0.001881,0.001137,0.008729,4.639605,0.000892,1.006908
1003,(2_0),"(15_1, 37_0)",0.130286,0.003145,0.001797,0.013795,4.385941,0.001387,1.010798
955,(2_0),"(15_1, 22_0)",0.130286,0.006908,0.003651,0.02802,4.056104,0.002751,1.021721
974,(2_0),"(25_0, 15_1)",0.130286,0.00469,0.002471,0.018968,4.04452,0.00186,1.014554


**Functionalization**

In [28]:
def arl_recommender(rules_df, product_id, rec_count=1):
    """Recommend consequent item lists for ``product_id`` from association rules.

    The rules are ranked by descending ``lift``. Every rule whose antecedent
    contains ``product_id`` contributes its consequent, converted to a list,
    to the result.

    Parameters
    ----------
    rules_df : pandas.DataFrame
        Rule table with ``antecedents``, ``consequents`` and ``lift`` columns,
        where each antecedent/consequent is an iterable of item ids.
    product_id : object
        Item id to look for inside the rule antecedents.
    rec_count : int, default 1
        Maximum number of consequent lists to return.

    Returns
    -------
    list of list
        Up to ``rec_count`` consequent lists, best lift first; empty when no
        rule's antecedent contains ``product_id``.
    """
    sorted_rules = rules_df.sort_values("lift", ascending=False)
    recommendation_list = []
    # Walk the two columns in lockstep instead of doing a positional .iloc
    # row lookup for every matching rule.
    for antecedent, consequent in zip(sorted_rules["antecedents"], sorted_rules["consequents"]):
        if any(item == product_id for item in antecedent):
            recommendation_list.append(list(consequent))
            # Rules are already ranked, so stop once enough have been collected.
            if len(recommendation_list) == rec_count:
                break
    return recommendation_list[0:rec_count]

In [29]:
# Best-lift rule for a customer who took service 2_0; in the run shown below it suggests 25_0 and 22_0.
arl_recommender(rules, product_id='2_0', rec_count=1)

[['25_0', '22_0']]

In [40]:
# Three best-lift rules for a customer who took service 15_1; in the run shown
# below they suggest [2_0, 22_0], [25_0, 2_0] and [11_11].
arl_recommender(rules, product_id='15_1', rec_count=3)

[['2_0', '22_0'], ['25_0', '2_0'], ['11_11']]