In [1]:
import numpy as np
import pandas as pd 
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder

In [2]:
df = pd.read_csv('satislar.csv',sep=';',low_memory=False, header=None)

In [3]:
df.columns = ['BranchId', 'PosId', 'InvoiceDate', 'InvoiceNo','StockCode','Line','Quantity','CategoryCode','CategoryName']

In [4]:
df.dropna(inplace= True)

In [5]:
# Stripping extra spaces in the description 
df['CategoryName'] = df['CategoryName'].str.strip().str.strip(',')

In [6]:
df.head()

Unnamed: 0,BranchId,PosId,InvoiceDate,InvoiceNo,StockCode,Line,Quantity,CategoryCode,CategoryName
0,1145,2236,2020-05-01 08:29:49.000,11450001261903,426410,0,1,05.07.014,Tıraş Bıçak ve Jiletleri
1,1129,2156,2020-05-01 08:30:44.000,11290001181201,59010,0,15,03.03.002,Dana Eti Kg
2,1141,2216,2020-05-01 08:36:56.000,11410001201443,314720,0,1,02.02.004,Meyve Suları Küçük Boy
3,1141,2216,2020-05-01 08:36:56.000,11410001201443,180122,1,1,01.07.002,Çikolata
4,1141,2216,2020-05-01 08:36:56.000,11410001201443,162169,2,1,01.04.002,Bisküvi Çeşitleri


In [7]:
df.InvoiceNo.nunique()

256495

In [9]:
basket = df.groupby(['InvoiceNo', 'CategoryName'])['Quantity'].count()

In [10]:
basket = basket.unstack()

In [11]:
basket = basket.reset_index().fillna(0).set_index('InvoiceNo')

In [12]:
for col in basket.columns:
    basket[col] = basket[col].apply(lambda x : 0 if x<=0 else 1)    
    
basket_sets = basket

In [13]:
basket_sets.head()

CategoryName,Agız Bakım Suyu,Ahsap Temizleyici,Ahsap Urunler,Aluminyum Folyo,Ampul Cesitleri,Arap Sabunu,Ayakkabı Boya Malzemeleri,Ayran,Ayçiçek Yağı,Ağda Malzemeleri,...,Çay Bardak,Çay Bitki,Çay Demlik,Çay Kg,Çikolata,Çocuk Bezi,Çorba,İrmik,Şalgam Suyu,Şampuan
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11010001071064,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11010001071065,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11010001071066,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11010001071067,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11010001071068,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
frequent_itemsets = apriori(basket_sets, min_support=0.005, use_colnames=True)

In [15]:
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.010608,(Ayran)
1,0.017182,(Ayçiçek Yağı)
2,0.018928,(Baharat Paketli)
3,0.006893,(Banyo Sabunları)
4,0.006039,(Bebek Bisküvi ve Ekmeği)
5,0.067303,(Bisküvi Çeşitleri)
6,0.017248,(Bulaşık Sıvı Yıkama Deterjanı)
7,0.008351,(Bulgur)
8,0.044964,(Cips)
9,0.090719,(Dana Eti Kg)


In [16]:
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Baharat Paketli),(Sebze),0.018928,0.304661,0.00731,0.3862,1.267638,0.001543,1.132843
1,(Sebze),(Baharat Paketli),0.304661,0.018928,0.00731,0.023994,1.267638,0.001543,1.00519
2,(Bisküvi Çeşitleri),(Cips),0.067303,0.044964,0.009185,0.136477,3.035258,0.006159,1.105976
3,(Cips),(Bisküvi Çeşitleri),0.044964,0.067303,0.009185,0.204283,3.035258,0.006159,1.172147
4,(Gofret),(Bisküvi Çeşitleri),0.024203,0.067303,0.009228,0.381282,5.665121,0.007599,1.507467


In [17]:
rules[ (rules['lift'] >= 6) &
       (rules['confidence'] >= 0.8)]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
150,"(Sebze, Sarf Urunler Et )",(Dana Eti Kg),0.006772,0.090719,0.006039,0.891767,9.829983,0.005425,8.401175
