## Source Code 1 (Frequent Itemset & Association Rules)
Program untuk menghasilkan frequent itemset dan association rules menggunakan algoritma Apriori.

## Import Library

In [1]:
#import library
import pandas as pd
import numpy as np 
from mlxtend.frequent_patterns import apriori, association_rules
import warnings
warnings.filterwarnings('ignore')

## Display Dataset

In [2]:
# Display the dataset.
# Lift pandas' row/column display limits so printed frames are never truncated.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

# Load the CSV from the working directory.
df = pd.read_csv("BreadBasket_DMS.csv")

# Preview the first and last rows, then report (rows, columns).
print("\nRaw Dataset:")
print(df.head(), "\n")
print(df.tail())
print(df.shape)


Raw Dataset:
         Date      Time  Transaction           Item
0  10/30/2016   9:58:11            1          Bread
1  10/30/2016  10:05:34            2   Scandinavian
2  10/30/2016  10:05:34            2   Scandinavian
3  10/30/2016  10:07:57            3  Hot chocolate
4  10/30/2016  10:07:57            3            Jam 

           Date      Time  Transaction       Item
21288  4/9/2017  14:32:58         9682     Coffee
21289  4/9/2017  14:32:58         9682        Tea
21290  4/9/2017  14:57:06         9683     Coffee
21291  4/9/2017  14:57:06         9683     Pastry
21292  4/9/2017  15:04:24         9684  Smoothies
(21293, 4)


## Missing Value Detection

In [3]:
# Missing-value detection.
# Re-read the CSV so that these extra tokens are parsed as NaN.
missing_values = ["NaN","NA"," ","NONE"]
df = pd.read_csv('BreadBasket_DMS.csv', na_values=missing_values)

# Per-column count of missing cells, their grand total, and the row count.
missingCount = df.isnull().sum()
totalMissing = missingCount.sum()
totalRow = df.shape[0]

print("\nTotal Missing value:")
print(missingCount)
print(df.shape)

# Note: the percentage is total missing cells divided by the number of rows
# (not by the number of cells).
total_mv = round(totalMissing / totalRow * 100, 2)
print(f"\nDataset BreadBasket_DMS Memiliki = {total_mv}% missing values\n")


Total Missing value:
Date             0
Time             0
Transaction      0
Item           786
dtype: int64
(21293, 4)

Dataset BreadBasket_DMS Memiliki = 3.69% missing values



## Data Cleaning

In [4]:
# Data cleaning using the row-removal method: every row containing a missing value is dropped.
print("Dataset setelah proses cleaning:")
# In-place drop: the object bound to df itself loses the incomplete rows.
df.dropna(inplace = True)
# Normalise item names to lower case.
df['Item'] = df['Item'].str.lower()
# NOTE: this binds a second name to the same DataFrame object (no copy is made),
# so any later in-place change made through df is also visible through df_clean.
df_clean = df
# Confirm that no missing values remain and show the resulting shape.
print(df_clean.isnull().sum())
print(df_clean.shape)
# Column dtypes after cleaning.
print("\nTipe data tiap Atribut:")
print(df_clean.dtypes,"\n")

Dataset setelah proses cleaning:
Date           0
Time           0
Transaction    0
Item           0
dtype: int64
(20507, 4)

Tipe data tiap Atribut:
Date           object
Time           object
Transaction     int64
Item           object
dtype: object 



## Data Reduction

In [5]:
# Data reduction: remove the Date and Time attributes.
to_drop = ['Date',
           'Time']
# errors='ignore' keeps this cell re-runnable: on a second execution the columns
# are already gone and a plain drop would raise KeyError.
# The drop is deliberately kept in place so the object bound to df is modified
# rather than replaced by a new frame.
df.drop(columns=to_drop, inplace=True, errors='ignore')
print("Dataset setelah reduksi:")
print(df.head())
print(df.shape)
print("\nTipe data tiap Atribut:")
print(df.dtypes)

Dataset setelah reduksi:
   Transaction           Item
0            1          bread
1            2   scandinavian
2            2   scandinavian
3            3  hot chocolate
4            3            jam
(20507, 2)

Tipe data tiap Atribut:
Transaction     int64
Item           object
dtype: object


## Display Total Sales of Each Item

In [6]:
# Show how often each item occurs: value_counts() counts the rows per item,
# most frequent first.
total_items = df_clean['Item'].value_counts()
print("\nPenjualan masing-masing item: ", total_items, sep="\n")

# Grand total over all per-item counts, followed by one blank line.
a1 = total_items.sum()
print(f"\nTotal Item yang Terjual: {a1:,} pcs", end="\n\n")


Penjualan masing-masing item: 
coffee                           5471
bread                            3325
tea                              1435
cake                             1025
pastry                            856
sandwich                          771
medialuna                         616
hot chocolate                     590
cookies                           540
brownie                           379
farm house                        374
muffin                            370
juice                             369
alfajores                         369
soup                              342
scone                             327
toast                             318
scandinavian                      277
truffles                          193
coke                              185
spanish brunch                    172
fudge                             159
baguette                          152
jam                               149
tiffin                            146
mineral water     

## Grouped Data

In [7]:
# Group the data by transaction: one row per transaction, items joined by commas.
# 'Item' is selected explicitly so that only item names are joined, whatever
# other columns df_clean happens to carry.
trx_data = (
    df_clean.groupby('Transaction')['Item']
    .agg(','.join)
    .reset_index()
)
print(f"Mengelompokan Data berdasarkan Transaksinya: \n{trx_data.head()}")

Mengelompokan Data berdasarkan Transaksinya: 
   Transaction                       Item
0            1                      bread
1            2  scandinavian,scandinavian
2            3  hot chocolate,jam,cookies
3            4                     muffin
4            5        coffee,pastry,bread


## Data Transformation

In [8]:
# Data transformation: reshape the rows into a Transaction x Item count table.
# Step 1: number of rows for each (Transaction, Item) pair.
df = (
    df.groupby(['Transaction', 'Item'])
    .size()
    .reset_index(name='count')
)
# Step 2: pivot items into columns; pairs that never occur become 0.
pair_totals = df.groupby(['Transaction', 'Item'])['count'].sum()
item_matrix = pair_totals.unstack().reset_index()
itemset = item_matrix.fillna(0).set_index('Transaction')
def encode_units(x):
    """Map a purchase count to a binary presence flag.

    Args:
        x: Numeric count for one (transaction, item) cell.

    Returns:
        1 if ``x`` is at least 1, otherwise 0. Every other value (zero,
        negatives, fractions below 1, NaN) yields 0, so the function always
        returns an int and never falls through to ``None``.
    """
    # Any comparison with NaN is False, so NaN also takes the 0 branch.
    return 1 if x >= 1 else 0
# Binarise the count table: 1 where the item occurs in the transaction, else 0.
# A vectorised comparison is used instead of DataFrame.applymap(encode_units):
# applymap is deprecated in recent pandas releases, and for count data the
# comparison yields the same 0/1 integers without a per-cell Python call.
basket = (itemset >= 1).astype('int64')
print(f"\nTransformasi menjadi bentuk biner: \n{basket.head()}\n")


Transformasi menjadi bentuk biner: 
Item         alfajores  argentina night  bacon  baguette  bakewell  \
Transaction                                                          
1                    0                0      0         0         0   
2                    0                0      0         0         0   
3                    0                0      0         0         0   
4                    0                0      0         0         0   
5                    0                0      0         0         0   

Item         bare popcorn  bowl nic pitt  bread  bread pudding  \
Transaction                                                      
1                       0              0      1              0   
2                       0              0      0              0   
3                       0              0      0              0   
4                       0              0      0              0   
5                       0              0      1              0   

Item     

## Generate Frequent Itemsets and Association Rules

In [9]:
# Apriori with support as the performance measure: mine itemsets with a
# minimum support of 0.02.
frequent_items = apriori(
    basket,
    min_support=0.02,
    use_colnames=True,  # itemsets are reported with item names
    verbose=1,
)
print(frequent_items, "\n")

# Association rules filtered on lift, with a minimum threshold of 1.
df_ar = association_rules(
    frequent_items,
    metric="lift",
    min_threshold=1,
)
print(f"Aturan Asosiasi: \n{df_ar}")

Processing 174 combinations | Sampling itemset size 3
     support                 itemsets
0   0.036344              (alfajores)
1   0.327205                  (bread)
2   0.040042                (brownie)
3   0.103856                   (cake)
4   0.478394                 (coffee)
5   0.054411                (cookies)
6   0.039197             (farm house)
7   0.058320          (hot chocolate)
8   0.038563                  (juice)
9   0.061807              (medialuna)
10  0.038457                 (muffin)
11  0.086107                 (pastry)
12  0.071844               (sandwich)
13  0.029054           (scandinavian)
14  0.034548                  (scone)
15  0.034443                   (soup)
16  0.142631                    (tea)
17  0.033597                  (toast)
18  0.020285               (truffles)
19  0.023349            (bread, cake)
20  0.090016          (bread, coffee)
21  0.029160          (pastry, bread)
22  0.028104             (bread, tea)
23  0.054728           (coffee, ca

## Generate Best Rule

In [10]:
# Filter the best association rules: lift >= 1 and confidence >= 0.7.
# (Only these two measures are filtered in this cell.)
strong_enough = df_ar['lift'].ge(1) & df_ar['confidence'].ge(0.7)
result = df_ar.loc[strong_enough]

# Highest-confidence rules first.
best_ar = result.sort_values('confidence', ascending=False)
print(f"\nHasil Aturan Asosiasi Terbaik: \n{best_ar}")


Hasil Aturan Asosiasi Terbaik: 
   antecedents consequents  antecedent support  consequent support   support  \
18     (toast)    (coffee)            0.033597            0.478394  0.023666   

    confidence      lift  leverage  conviction  
18    0.704403  1.472431  0.007593    1.764582  


## Saving to Excel

In [11]:
# Save every table to one Excel workbook, one sheet per table, in this order.
# Note: the sheet called 'frequent_items' holds the per-item counts (total_items),
# while the Apriori output (frequent_items) is written to 'support_items'.
sheets = [
    ('data_clean', df_clean),
    ('frequent_items', total_items),
    ('data_transaction', trx_data),
    ('data_transform', basket),
    ('support_items', frequent_items),
    ('association_rules', df_ar),
    ('best_ar', best_ar),
]
with pd.ExcelWriter('ar.xlsx') as writer:
    for sheet_name, table in sheets:
        table.to_excel(writer, sheet_name=sheet_name)