In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv("D1.csv", na_values = '?', low_memory=False)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19663 entries, 0 to 19662
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   InvoiceNo    19663 non-null  int64  
 1   StockCode    19663 non-null  object 
 2   Description  19075 non-null  object 
 3   Quantity     19663 non-null  int64  
 4   InvoiceDate  19663 non-null  object 
 5   UnitPrice    19663 non-null  float64
 6   CustomerID   15678 non-null  float64
 7   Country      19663 non-null  object 
dtypes: float64(2), int64(2), object(4)
memory usage: 1.2+ MB


In [4]:
missing_values = df.isna() 
print(missing_values.sum())

InvoiceNo         0
StockCode         0
Description     588
Quantity          0
InvoiceDate       0
UnitPrice         0
CustomerID     3985
Country           0
dtype: int64


In [5]:
df.drop(['StockCode', 'Quantity', 'UnitPrice', 'CustomerID', 'Country'], axis=1, inplace=True)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19663 entries, 0 to 19662
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   InvoiceNo    19663 non-null  int64 
 1   Description  19075 non-null  object
 2   InvoiceDate  19663 non-null  object
dtypes: int64(1), object(2)
memory usage: 461.0+ KB


In [7]:
df.dropna(subset=['Description'], inplace=True)

In [8]:
missing_values = df.isna() 
print(missing_values.sum())

InvoiceNo      0
Description    0
InvoiceDate    0
dtype: int64


In [9]:
df['InvoiceNo']=df['InvoiceNo'].astype(str)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 19075 entries, 0 to 19662
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   InvoiceNo    19075 non-null  object
 1   Description  19075 non-null  object
 2   InvoiceDate  19075 non-null  object
dtypes: object(3)
memory usage: 596.1+ KB


In [11]:
transactions = df.groupby(['InvoiceNo'])['Description'].apply(list)

In [12]:
print(transactions.head(10))

InvoiceNo
536365                 [WHITE HANGING HEART T-LIGHT HOLDER]
536367                      [ASSORTED COLOUR BIRD ORNAMENT]
536373                 [WHITE HANGING HEART T-LIGHT HOLDER]
536375                 [WHITE HANGING HEART T-LIGHT HOLDER]
536378    [JUMBO BAG PINK POLKADOT, LUNCH BAG RED RETROS...
536384    [NATURAL SLATE HEART CHALKBOARD , HEART OF WIC...
536386                            [JUMBO BAG RED RETROSPOT]
536388                              [HEART OF WICKER SMALL]
536390    [WHITE HANGING HEART T-LIGHT HOLDER, JUMBO BAG...
536392                      [ASSORTED COLOUR BIRD ORNAMENT]
Name: Description, dtype: object


In [13]:
from apyori import apriori

In [14]:
transaction_list = list(transactions)

In [15]:
results = list(apriori(transaction_list, min_support=0.05))
print(results[:5])

[RelationRecord(items=frozenset({'ASSORTED COLOUR BIRD ORNAMENT'}), support=0.15206939799331104, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'ASSORTED COLOUR BIRD ORNAMENT'}), confidence=0.15206939799331104, lift=1.0)]), RelationRecord(items=frozenset({'HEART OF WICKER SMALL'}), support=0.12552257525083613, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'HEART OF WICKER SMALL'}), confidence=0.12552257525083613, lift=1.0)]), RelationRecord(items=frozenset({'JUMBO BAG PINK POLKADOT'}), support=0.12729933110367894, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'JUMBO BAG PINK POLKADOT'}), confidence=0.12729933110367894, lift=1.0)]), RelationRecord(items=frozenset({'JUMBO BAG RED RETROSPOT'}), support=0.21864548494983277, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'JUMBO BAG RED RETROSPOT'}), confidence=0.21864548494983277, lift=1.0)]), Relatio

In [16]:
def convert_apriori_results_to_pandas_df(results):
    rules = []
    
    for rule_set in results:
        for rule in rule_set.ordered_statistics:
            rules.append([','.join(rule.items_base), ','.join(rule.items_add),
                         rule_set.support, rule.confidence, rule.lift]) 
    
    return pd.DataFrame(rules, columns=['Left_side', 'Right_side', 'Support', 'Confidence', 'Lift']) 

result_df = convert_apriori_results_to_pandas_df(results)
print (result_df.head(20))

                  Left_side                                       Right_side  \
0                                              ASSORTED COLOUR BIRD ORNAMENT   
1                                                      HEART OF WICKER SMALL   
2                                                    JUMBO BAG PINK POLKADOT   
3                                                    JUMBO BAG RED RETROSPOT   
4                                                    LUNCH BAG  BLACK SKULL.   
5                                                    LUNCH BAG RED RETROSPOT   
6                                            NATURAL SLATE HEART CHALKBOARD    
7                                            PACK OF 72 RETROSPOT CAKE CASES   
8                                                              PARTY BUNTING   
9                                                   REGENCY CAKESTAND 3 TIER   
10                                         SET OF 3 CAKE TINS PANTRY DESIGN    
11                                      

In [17]:
result_df = result_df.sort_values(by='Lift', ascending=False)
print(result_df.head(10))

                  Left_side                                       Right_side  \
14  JUMBO BAG RED RETROSPOT                          JUMBO BAG PINK POLKADOT   
13  JUMBO BAG PINK POLKADOT                          JUMBO BAG RED RETROSPOT   
20  LUNCH BAG RED RETROSPOT                          LUNCH BAG  BLACK SKULL.   
19  LUNCH BAG  BLACK SKULL.                          LUNCH BAG RED RETROSPOT   
16  JUMBO BAG RED RETROSPOT                          LUNCH BAG RED RETROSPOT   
17  LUNCH BAG RED RETROSPOT                          JUMBO BAG RED RETROSPOT   
11                                        WHITE HANGING HEART T-LIGHT HOLDER   
18                           LUNCH BAG  BLACK SKULL.,LUNCH BAG RED RETROSPOT   
15                           JUMBO BAG RED RETROSPOT,LUNCH BAG RED RETROSPOT   
12                           JUMBO BAG RED RETROSPOT,JUMBO BAG PINK POLKADOT   

     Support  Confidence      Lift  
14  0.086225    0.394359  3.097891  
13  0.086225    0.677340  3.097891  
20  0.06

In [18]:
results2 = list(apriori(transaction_list, min_support=0.01))
result2_df = convert_apriori_results_to_pandas_df(results2)
# sort all acquired rules descending by lift
result2_df = result2_df.sort_values(by='Lift', ascending=False)
print(result2_df.head(10))

                                             Left_side  \
607    JUMBO BAG RED RETROSPOT,LUNCH BAG  BLACK SKULL.   
606    JUMBO BAG PINK POLKADOT,LUNCH BAG RED RETROSPOT   
608    JUMBO BAG RED RETROSPOT,LUNCH BAG RED RETROSPOT   
605    LUNCH BAG  BLACK SKULL.,JUMBO BAG PINK POLKADOT   
610  JUMBO BAG RED RETROSPOT,LUNCH BAG  BLACK SKULL...   
603                            LUNCH BAG RED RETROSPOT   
613  JUMBO BAG RED RETROSPOT,LUNCH BAG  BLACK SKULL...   
600                            JUMBO BAG PINK POLKADOT   
485   REGENCY CAKESTAND 3 TIER,LUNCH BAG  BLACK SKULL.   
482                    PACK OF 72 RETROSPOT CAKE CASES   

                                            Right_side   Support  Confidence  \
607    JUMBO BAG PINK POLKADOT,LUNCH BAG RED RETROSPOT  0.013064    0.332447   
606    JUMBO BAG RED RETROSPOT,LUNCH BAG  BLACK SKULL.  0.013064    0.386997   
608    LUNCH BAG  BLACK SKULL.,JUMBO BAG PINK POLKADOT  0.013064    0.215889   
605    JUMBO BAG RED RETROSPOT,LUNCH BAG 

In [19]:
results3 = list(apriori(transaction_list, min_support=0.025))
result3_df = convert_apriori_results_to_pandas_df(results3)
# sort all acquired rules descending by lift
result3_df = result3_df.sort_values(by='Lift', ascending=False)
print(result3_df.head(11))

                                           Left_side  \
128  JUMBO BAG PINK POLKADOT,LUNCH BAG RED RETROSPOT   
125                          JUMBO BAG RED RETROSPOT   
129  JUMBO BAG RED RETROSPOT,LUNCH BAG RED RETROSPOT   
124                          JUMBO BAG PINK POLKADOT   
28                           JUMBO BAG PINK POLKADOT   
29                           JUMBO BAG RED RETROSPOT   
64                           LUNCH BAG  BLACK SKULL.   
65                           LUNCH BAG RED RETROSPOT   
23                   NATURAL SLATE HEART CHALKBOARD    
22                             HEART OF WICKER SMALL   
127  JUMBO BAG RED RETROSPOT,JUMBO BAG PINK POLKADOT   

                                          Right_side   Support  Confidence  \
128                          JUMBO BAG RED RETROSPOT  0.026024    0.770898   
125  JUMBO BAG PINK POLKADOT,LUNCH BAG RED RETROSPOT  0.026024    0.119025   
129                          JUMBO BAG PINK POLKADOT  0.026024    0.430052   
124  JUMBO BAG 

In [20]:
results4 = list(apriori(transaction_list, min_support=0.05))
result4_df = convert_apriori_results_to_pandas_df(results4)
# sort all acquired rules descending by lift
result4_df = result4_df.sort_values(by='Lift', ascending=False)
print(result4_df.head(10))

                  Left_side                                       Right_side  \
14  JUMBO BAG RED RETROSPOT                          JUMBO BAG PINK POLKADOT   
13  JUMBO BAG PINK POLKADOT                          JUMBO BAG RED RETROSPOT   
20  LUNCH BAG RED RETROSPOT                          LUNCH BAG  BLACK SKULL.   
19  LUNCH BAG  BLACK SKULL.                          LUNCH BAG RED RETROSPOT   
16  JUMBO BAG RED RETROSPOT                          LUNCH BAG RED RETROSPOT   
17  LUNCH BAG RED RETROSPOT                          JUMBO BAG RED RETROSPOT   
11                                        WHITE HANGING HEART T-LIGHT HOLDER   
18                           LUNCH BAG  BLACK SKULL.,LUNCH BAG RED RETROSPOT   
15                           JUMBO BAG RED RETROSPOT,LUNCH BAG RED RETROSPOT   
12                           JUMBO BAG RED RETROSPOT,JUMBO BAG PINK POLKADOT   

     Support  Confidence      Lift  
14  0.086225    0.394359  3.097891  
13  0.086225    0.677340  3.097891  
20  0.06

In [21]:
ans3 = result3_df.loc[result3_df['Left_side']=='JUMBO BAG PINK POLKADOT']
ans3.sort_values(by = 'Lift', ascending = False).head(5)

Unnamed: 0,Left_side,Right_side,Support,Confidence,Lift
124,JUMBO BAG PINK POLKADOT,"JUMBO BAG RED RETROSPOT,LUNCH BAG RED RETROSPOT",0.026024,0.204433,3.378272
28,JUMBO BAG PINK POLKADOT,JUMBO BAG RED RETROSPOT,0.086225,0.67734,3.097891
34,JUMBO BAG PINK POLKADOT,LUNCH BAG RED RETROSPOT,0.033758,0.265189,1.622332
31,JUMBO BAG PINK POLKADOT,LUNCH BAG BLACK SKULL.,0.026756,0.210181,1.579739
37,JUMBO BAG PINK POLKADOT,WHITE HANGING HEART T-LIGHT HOLDER,0.02686,0.211002,0.893303


In [22]:
# Convert the "InvoiceDate" column to datetime with the correct format
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'], format='%d/%m/%Y %H:%M')

# Sort the DataFrame by "InvoiceDate"
df = df.sort_values(by='InvoiceDate')

In [23]:
transactions = pd.DataFrame(transactions).reset_index()
transactions['count']=transactions['Description'].apply(lambda x: str(x).count(','))
print (transactions)


     InvoiceNo                                        Description  count
0       536365               [WHITE HANGING HEART T-LIGHT HOLDER]      0
1       536367                    [ASSORTED COLOUR BIRD ORNAMENT]      0
2       536373               [WHITE HANGING HEART T-LIGHT HOLDER]      0
3       536375               [WHITE HANGING HEART T-LIGHT HOLDER]      0
4       536378  [JUMBO BAG PINK POLKADOT, LUNCH BAG RED RETROS...      2
...        ...                                                ...    ...
9563    581498  [JUMBO BAG PINK POLKADOT, NATURAL SLATE HEART ...      2
9564    581538  [HEART OF WICKER SMALL, LUNCH BAG  BLACK SKULL...      2
9565    581579                          [JUMBO BAG PINK POLKADOT]      0
9566    581583                          [LUNCH BAG RED RETROSPOT]      0
9567    581585                    [ASSORTED COLOUR BIRD ORNAMENT]      0

[9568 rows x 3 columns]


In [24]:
transactions = df.groupby(['InvoiceNo'])['Description'].apply(list)
sequences = transactions.values.tolist()
# show the first 5 sequences
print(sequences[:5])

[['WHITE HANGING HEART T-LIGHT HOLDER'], ['ASSORTED COLOUR BIRD ORNAMENT'], ['WHITE HANGING HEART T-LIGHT HOLDER'], ['WHITE HANGING HEART T-LIGHT HOLDER'], ['JUMBO BAG PINK POLKADOT', 'LUNCH BAG RED RETROSPOT', 'PACK OF 72 RETROSPOT CAKE CASES']]


In [25]:
from collections import defaultdict
import subprocess
import re

In [26]:
''' Uses SPMF to find association rules in supplied transactions '''
def get_association_rules(sequences, min_sup, min_conf):
    # step 1: create required input for SPMF

    # prepare a dict to uniquely assign each item in the transactions to an int ID
    item_dict = defaultdict(int)
    output_dict = defaultdict(str)
    item_id = 1

    # write your sequences in SPMF format
    with open('seq_rule_input.txt', 'w+') as f:
        for sequence in sequences:
            z = []
            for itemset in sequence:
                # if there are multiple items in one itemset
                if isinstance(itemset, list):
                    for item in itemset:
                        if item not in item_dict:
                            item_dict[item] = item_id
                            item_id += 1
                        z.append(item_dict[item])
                else:
                    if itemset not in item_dict:
                        item_dict[itemset] = item_id
                        output_dict[str(item_id)] = itemset
                        item_id += 1
                    z.append(item_dict[itemset])

                # end of itemset
                z.append(-1)

            # end of a sequence
            z.append(-2)
            f.write(' '.join([str(x) for x in z]))
            f.write('\n')

    # run SPMF with supplied parameters
    supp_param = '{}%'.format(int(min_sup * 100))
    conf_param = '{}%'.format(int(min_conf * 100))
    subprocess.call(['java', '-jar', 'spmf.jar', 'run', 'RuleGrowth', 'seq_rule_input.txt', 'seq_rule_output.txt', supp_param, conf_param], shell=True)

    # read back the output rules
    outputs = open('seq_rule_output.txt', 'r').read().strip().split('\n')
    output_rules = []
    for rule in outputs:
        print("****", rule)
        match = re.search(pattern=r'([0-9\,]+) ==> ([0-9\,]+) #SUP: ([0-9]+) #CONF: ([0-9\,]+)', string=rule)
        if match:
            left, right, sup, conf = match.groups()
            sup = int(sup) / len(sequences)
            conf = float(conf)
            output_rules.append([[output_dict[x] for x in left.split(',')], [output_dict[x] for x in right.split(',')], sup, conf])
        else:
            # Handle the case when no match is found in the rule string
            print(f"Warning: No match found for rule: {rule}")

    # Return a pandas DataFrame
    return pd.DataFrame(output_rules, columns=['Left_rule', 'Right_rule', 'Support', 'Confidence'])


In [28]:
get_association_rules(sequences, 0.01,0.01)

사용법: java [-options] class [args...]
           (클래스 실행)
   또는  java [-options] -jar jarfile [args...]
           (jar 파일 실행)
여기서 options는 다음과 같습니다.
    -d32	  사용 가능한 경우 32비트 데이터 모델을 사용합니다.
    -d64	  사용 가능한 경우 64비트 데이터 모델을 사용합니다.
    -server	  "server" VM을 선택합니다.
                  기본 VM은 server입니다.,
                  서버급 시스템에서 실행 중이기 때문입니다.


    -cp <디렉토리 및 zip/jar 파일의 클래스 검색 경로>
    -classpath <디렉토리 및 zip/jar 파일의 클래스 검색 경로>
                  클래스 파일을 검색할 :(으)로 구분된 디렉토리,
                  JAR 아카이브 및 ZIP 아카이브 목록입니다.
    -D<name>=<value>
                  시스템 속성을 설정합니다.
    -verbose:[class|gc|jni]
                  상세 정보 출력을 사용으로 설��합니다.
    -version      제품 버전을 인쇄한 후 종료합니다.
    -version:<value>
                  경고: 이 기능은 사용되지 않으며
                  이후 릴리스에서 제거됩니다.
                  실행할 버전을 지정해야 합니다.
    -showversion  제품 버전을 인쇄한 후 계속합니다.
    -jre-restrict-search | -no-jre-restrict-search
                  경고: 이 기능은 사용되지 않으며
                  이후 릴리스에서 제거됩니다.
                  버전 검색에서 사용자

FileNotFoundError: [Errno 2] No such file or directory: 'seq_rule_output.txt'

In [None]:
#def mine_sequences(seq_rule_input, seq_rule_output, min_support):
    # Convert min_support to a percentage string
 #   min_support_str = f"{min_support}%"

    # Define the SPMF command and parameters
  #  spmf_command = [
        'java', '-jar', 'spmf.jar', 'run', 'PrefixSpan', seq_rule_input, seq_rule_output, min_support_str
    ]

    # Execute the SPMF command
   # subprocess.run(spmf_command)

# Call the function to mine sequences with a min_support of 2%
#mine_sequences('seq_rule_input.txt', 'seq_rule_output.txt', min_support=2)

In [None]:
import re

def get_association_rules(sequences, outputs, min_sup, min_conf):
    # ... (previous code) ...

    # Initialize an empty list to store the extracted rules
    output_rules = []

    for rule in outputs:
        print("****", rule)
        match = re.search(pattern=r'([0-9\,]+) ==> ([0-9\,]+) #SUP: ([0-9]+) #CONF: ([0-9\,]+)', string=rule)
        if match:
            left, right, sup, conf = match.groups()
            sup = int(sup) / len(sequences)
            conf = float(conf)
            output_rules.append([[output_dict[x] for x in left.split(',')], [output_dict[x] for x in right.split(',')], sup, conf])
        else:
            # Handle the case when no match is found in the rule string
            print(f"Warning: No match found for rule: {rule}")

     Return a pandas DataFrame
    return pd.DataFrame 


In [None]:
df.info()
