# Data source (SPMF datasets page): https://www.philippe-fournier-viger.com/spmf/index.php?link=datasets.php

In [10]:
import requests
import numpy as np
import pandas as pd
import io

In [8]:
# Both files live under the same SPMF datasets directory.
SPMF_DATASETS_BASE = "https://www.philippe-fournier-viger.com/spmf/datasets"

url_transactions = SPMF_DATASETS_BASE + "/fruithut_original.txt"
url_taxonomy = SPMF_DATASETS_BASE + "/Fruithut_taxonomy_data.txt"

In [9]:
# Download both raw text files.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error (404/500 ...) into an exception here
# instead of letting an error page be parsed as data in the cells below.
# NOTE(review): `response_transations` is misspelled; the name is kept as-is
# because other cells may reference it.
response_transations = requests.get(url_transactions, timeout=60)
response_transations.raise_for_status()
transactions_and_elements_txt = response_transations.text

response_taxonomy = requests.get(url_taxonomy, timeout=60)
response_taxonomy.raise_for_status()
taxonomy_txt = response_taxonomy.text

In [10]:
# Parse the comma-separated taxonomy text into a numeric array, wrap it in a
# DataFrame, and cast every column to int, in a single expression.
df_taxonomy = pd.DataFrame(
    np.genfromtxt(io.StringIO(taxonomy_txt), delimiter=',')
).astype(int)

In [11]:
# Collect the "@ITEM=<number>=<description>" lines that follow the first line
# of the transactions file.
#   elements_data : list of [item number (int), description (str)]
#   last_index    : count of @ITEM lines consumed; a later cell slices the
#                   transaction lines from index last_index + 1 onwards.
ITEM_PREFIX = "@ITEM="

elements_data = []
last_index = 0
transactions_and_elements_list = transactions_and_elements_txt.split('\n')

# The first line is always skipped (presumably a format header, verify
# against the file); parsing stops at the first line that is not an @ITEM line.
for line in transactions_and_elements_list[1:]:
    if not line.startswith(ITEM_PREFIX):
        break

    # Strip only the leading prefix and split on the FIRST '=' so that a
    # description which itself contains '=' or "@ITEM=" is kept intact.
    number, _, description = line[len(ITEM_PREFIX):].partition("=")
    elements_data.append([int(number), description.replace('\r', '')])
    last_index += 1

In [12]:
# Order the (item number, description) pairs by numeric item number.
# np.array() over mixed int/str rows produces an array of strings, hence the
# astype(int) so the sort is numeric rather than lexicographic.
df_elements = np.array(elements_data)
sorted_indices = np.argsort(df_elements[:, 0].astype(int))
df_elements = df_elements[sorted_indices]

# Transaction lines start after the first line plus the last_index @ITEM lines.
# Carriage returns are removed, and blank lines are dropped: a trailing newline
# in the download otherwise leaves an empty final line that would be encoded
# as a spurious all-zero transaction row.
transactions_list = [
    line.replace('\r', '')
    for line in transactions_and_elements_list[last_index + 1:]
]
transactions_list = [line for line in transactions_list if line.strip()]

# Maps item number (as string) -> description; iteration order is the sorted
# order above and defines the column order of the one-hot rows.
element_numbers = {row[0]: row[1] for row in df_elements}

# Plain-str copies of the keys, used for the set membership tests below.
item_ids = [str(element) for element in element_numbers]

# One 0/1 vector per transaction: 1 where the item number occurs in the line.
transactions_data = []
for line in transactions_list:
    # A set makes each membership test O(1) instead of scanning the token list
    # once per item.
    transaction = set(line.split())
    transaction_data = np.array([1 if item_id in transaction else 0 for item_id in item_ids])
    transactions_data.append(transaction_data)

In [13]:
# One row per transaction, one column per item number (in dict order).
df_transactions = pd.DataFrame(data=transactions_data, columns=[*element_numbers.keys()])

In [14]:
# Show the taxonomy table under its heading.
print("Taxonomy:", df_taxonomy, sep="\n")

Taxonomy:
         0    1
0     1001  110
1     1002  150
2     1003  150
3     1004  150
4     1005  130
...    ...  ...
1290   237  230
1291   210  200
1292   220  200
1293   230  200
1294   240  200

[1295 rows x 2 columns]


In [15]:
# Show the sorted (item number, description) array under its heading.
print("\n\nElements descriptions:", df_elements, sep="\n")



Elements descriptions:
[['1001' ' Australian Asparagus green']
 ['1002' 'Beans green']
 ['1003' 'Beans baby']
 ...
 ['9996' 'Almond Bread']
 ['9997' 'Chocolate Almond Bread']
 ['9998' 'Chilli Powder Unr 200g']]


In [16]:
# Show the one-hot transaction table under its heading.
print("\n\nTransactions:", df_transactions, sep="\n")



Transactions:
        1001  1002  1003  1004  1005  1007  1008  1009  1010  1011  ...  9989  \
0          0     0     0     0     0     0     0     0     0     0  ...     0   
1          0     0     0     0     0     0     0     0     0     0  ...     0   
2          1     0     0     0     0     0     0     0     0     0  ...     0   
3          1     0     0     0     0     0     0     0     0     0  ...     0   
4          1     0     1     0     0     0     0     0     0     0  ...     0   
...      ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...   ...   
181966     0     0     0     0     0     0     0     0     0     0  ...     0   
181967     0     0     0     0     0     0     0     0     1     0  ...     0   
181968     0     1     0     0     0     0     1     0     0     0  ...     0   
181969     0     0     0     0     0     0     0     0     0     0  ...     0   
181970     0     0     0     0     0     0     0     0     0     0  ...     0   

        999

In [17]:
# Write both tables to CSV files in the working directory, without the row index.
for frame, csv_path in (
    (df_taxonomy, 'taxonomy.csv'),
    (df_transactions, 'transactions.csv'),
):
    frame.to_csv(csv_path, index=False)