#https://www.philippe-fournier-viger.com/spmf/index.php?link=datasets.php

In [7]:
import requests
import numpy as np
import pandas as pd
import io

In [8]:
url_transactions = "https://www.philippe-fournier-viger.com/spmf/datasets/fruithut_original.txt"
url_taxonomy = "https://www.philippe-fournier-viger.com/spmf/datasets/Fruithut_taxonomy_data.txt"

In [9]:
response_transations = requests.get(url_transactions)
transactions_and_elements_txt = response_transations.text

response_taxonomy = requests.get(url_taxonomy)
taxonomy_txt = response_taxonomy.text

In [10]:
df_taxonomy = np.genfromtxt(io.StringIO(taxonomy_txt), delimiter=',')
df_taxonomy = pd.DataFrame(df_taxonomy)
df_taxonomy = df_taxonomy.astype(int)

In [11]:
elements_data = []
skip_first_line = True
last_index = 0
transactions_and_elements_list = transactions_and_elements_txt.split('\n')

for line in transactions_and_elements_list:
    if skip_first_line:
        skip_first_line = False
        continue

    if line.startswith("@ITEM="):
        line = line.replace("@ITEM=", "")
        parts = line.split("=")
        number = int(parts[0])
        description = parts[1].replace('\r', '')
        elements_data.append([number, description])
        last_index += 1
    elif not line.startswith("@ITEM="):
        break

In [12]:
df_elements = np.array(elements_data)
sorted_indices = np.argsort(df_elements[:, 0].astype(int))
df_elements = df_elements[sorted_indices]
transactions_list = transactions_and_elements_list[last_index+1:]

for i in range(len(transactions_list)):
    transactions_list[i] = transactions_list[i].replace('\r', '')

element_numbers = {row[0]: row[1] for row in df_elements}

transactions_data = []
for line in transactions_list:
    transaction = line.split(' ')
    transaction_data = np.array([1 if element in transaction else 0 for element in element_numbers])
    transactions_data.append(transaction_data)

In [13]:
df_transactions = pd.DataFrame(transactions_data, columns=list(element_numbers.keys()))

In [14]:
print("Taxonomy:")
print(df_taxonomy)

Taxonomy:
         0    1
0     1001  110
1     1002  150
2     1003  150
3     1004  150
4     1005  130
...    ...  ...
1290   237  230
1291   210  200
1292   220  200
1293   230  200
1294   240  200

[1295 rows x 2 columns]


In [15]:
print("\n\nElements descriptions:")
print(df_elements)



Elements descriptions:
[['1001' ' Australian Asparagus green']
 ['1002' 'Beans green']
 ['1003' 'Beans baby']
 ...
 ['9996' 'Almond Bread']
 ['9997' 'Chocolate Almond Bread']
 ['9998' 'Chilli Powder Unr 200g']]


In [16]:
print("\n\nTransactions:")
print(df_transactions)



Transactions:
        1001  1002  1003  1004  1005  1007  1008  1009  1010  1011  ...  9989  \
0          0     0     0     0     0     0     0     0     0     0  ...     0   
1          0     0     0     0     0     0     0     0     0     0  ...     0   
2          1     0     0     0     0     0     0     0     0     0  ...     0   
3          1     0     0     0     0     0     0     0     0     0  ...     0   
4          1     0     1     0     0     0     0     0     0     0  ...     0   
...      ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...   ...   
181966     0     0     0     0     0     0     0     0     0     0  ...     0   
181967     0     0     0     0     0     0     0     0     1     0  ...     0   
181968     0     1     0     0     0     0     1     0     0     0  ...     0   
181969     0     0     0     0     0     0     0     0     0     0  ...     0   
181970     0     0     0     0     0     0     0     0     0     0  ...     0   

        999

In [17]:
df_taxonomy.to_csv('taxonomy.csv', index=False)
df_transactions.to_csv('transactions.csv', index=False)

# ADD TAXONOMY TO TRANSACTION DATASET

In [30]:
import csv

In [31]:
def read_csv_as_list(file_path):
    data = []
    with open(file_path, 'r') as file:
        csv_reader = csv.reader(file)
        next(csv_reader)  # Skip the header row
        for row in csv_reader:
            data.append([int(value) for value in row])
    return data

In [32]:
file_path = 'taxonomy.csv'
df_taxonomy_basic = read_csv_as_list(file_path)
df_transactions = pd.read_csv('transactions.csv')

In [33]:
class TaxonomyDictionary:
    def __init__(self, arr, taxonomy_groups):
        self.dictionary = {}
        self.build_dictionary(arr)
        self.extend_dictionary(taxonomy_groups)

    def build_dictionary(self, arr):
        for item in arr:
            key, parent = item
            self.dictionary[key] = [parent]

    def extend_dictionary(self, taxonomy_groups):
        for key, value in self.dictionary.items():
            while True:
                found_match = False
                for group in taxonomy_groups:
                    if value[-1] == group[0]:
                        value.append(group[1])
                        found_match = True
                if not found_match:
                    break
    def get_dictionary(self):
        return self.dictionary
    def get_value(self, key):
        if isinstance(key, list):
            if len(key) > 1:
                return self.dictionary[key[0]][-1]
            nested_dict = self.dictionary
            for k in key:
                nested_dict = nested_dict.get(k, {})
                if not isinstance(nested_dict, dict):
                    return nested_dict
            return nested_dict
        else:
            return self.dictionary.get(key)

def split_base_taxonomy(base):
    first_level = []
    taxonomy_groups = []

    for item in base:
        value_1, value_2 = item
        value_1_str = str(value_1)
        if len(value_1_str) == 4:
            first_level.append(item)
        else:
            taxonomy_groups.append(item)

    return first_level, taxonomy_groups

## Split taxonomy to first level (items -> parents) and others (parents -> parents)

In [34]:
taxonomy_elements_parents, taxonomy_groups = split_base_taxonomy(df_taxonomy_basic)
taxonomy_dict = TaxonomyDictionary(taxonomy_elements_parents, taxonomy_groups)
print("Taxonomy:\n")
print(taxonomy_dict.get_dictionary())
print("Get value from key=1012: ")
print(taxonomy_dict.get_value([1012]))
print("Get value from key=1012,152,150: ")
print(taxonomy_dict.get_value([1012, 152, 150]))

Taxonomy:

{1001: [110, 100], 1002: [150, 100], 1003: [150, 100], 1004: [150, 100], 1005: [130, 100], 1007: [120, 100], 1008: [150, 100], 1009: [120, 100], 1010: [110, 100], 1011: [150, 100], 1012: [152, 150, 100], 1013: [152, 150, 100], 1014: [152, 150, 100], 1015: [152, 150, 100], 1016: [159, 150, 100], 1017: [159, 150, 100], 1018: [159, 150, 100], 1020: [130, 100], 1021: [131, 130, 100], 1022: [131, 130, 100], 1023: [150, 100], 1024: [130, 100], 1025: [110, 100], 1026: [120, 100], 1027: [158, 150, 100], 1028: [158, 150, 100], 1029: [158, 150, 100], 1030: [150, 100], 1031: [155, 150, 100], 1032: [155, 150, 100], 1033: [150, 100], 1034: [150, 100], 1035: [110, 100], 1036: [150, 100], 1037: [153, 150, 100], 1038: [153, 150, 100], 1039: [150, 100], 1040: [150, 100], 1041: [150, 100], 1042: [157, 150, 100], 1045: [157, 150, 100], 1048: [304], 1049: [157, 150, 100], 1050: [100], 1051: [157, 150, 100], 1052: [157, 150, 100], 1053: [157, 150, 100], 1054: [157, 150, 100], 1056: [156, 150, 10

### Create a list of unique values in taxonomy

In [38]:
def check_parents_length():
    unique_values = set(value for sublist in taxonomy_elements_parents for value in sublist)
    unique_values2 = set(value for sublist in taxonomy_groups for value in sublist)
    print(unique_values)
    print(unique_values2)
    data = [130, 131, 132, 140, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 200, 210, 211, 212, 213, 220, 100, 230, 231, 232, 233, 234, 235, 236, 237, 110, 240, 120,100, 110, 120, 130, 131, 132, 140, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 200, 210, 211, 212, 213, 220, 230, 231, 232, 233, 234, 235, 236, 237, 240, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310]
    data2 = [110, 120, 131, 132, 140, 151, 152, 153, 154, 155, 156, 157, 158, 159, 211, 212, 213, 220, 231, 232, 233, 234, 235, 236, 237, 240, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 130, 150 , 210, 230, 100, 200]
    if len(set(data)) == len(set(data2)):
        print("Equal lengths")
    else:
        print("Different lengths")

check_parents_length()

{100, 110, 120, 130, 131, 132, 140, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 200, 210, 211, 212, 213, 220, 230, 231, 232, 233, 234, 235, 236, 237, 240, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 9000, 9001, 9002, 9003, 9004, 9005, 9006, 9010, 9011, 9012, 9013, 9014, 9015, 9016, 9017, 9018, 9019, 9020, 9021, 9022, 9023, 9024, 9025, 9026, 9027, 9028, 9029, 9030, 9031, 9032, 9033, 9044, 9045, 9046, 9047, 9048, 9049, 9050, 9051, 9052, 9053, 9054, 9055, 9056, 9057, 9058, 9059, 9060, 9061, 9062, 9063, 9064, 9065, 9066, 9067, 9068, 9069, 9070, 9071, 9072, 9073, 9074, 9075, 9076, 9077, 9078, 9079, 9080, 9081, 9082, 9083, 9084, 9085, 9086, 9087, 9088, 9089, 9090, 9091, 9092, 9093, 9094, 9095, 9096, 9097, 9098, 9099, 9100, 9101, 9102, 9103, 9104, 9105, 9106, 9107, 9108, 9109, 9110, 9111, 9112, 9113, 9114, 9115, 9116, 9117, 9118, 9119, 9120, 9121, 9122, 9123, 9124, 9125, 9126, 9127, 9128, 9129, 9130, 9131, 9132, 9133, 9134, 9135, 9136, 9137, 9138, 9139, 9140, 9141, 9142, 9144

In [41]:
parents_names = [110, 120, 131, 132, 140, 151, 152, 153, 154, 155, 156, 157, 158, 159, 211, 212, 213, 220, 231, 232, 233, 234, 235, 236, 237, 240, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 130, 150, 210, 230, 100, 200]

In [42]:
# Append the new rows to the existing DataFrame
df_transactions_with_parents = df_transactions.copy(deep=False)
# Add new columns with specified names and values of 0
for name in parents_names:
    df_transactions_with_parents[name] = 0

df_transactions_with_parents.to_csv('transactions_with_parents.csv', index=False)

In [47]:
if df_transactions_with_parents.shape[0] == df_transactions.shape[0]:
    print("Equal")
else:
    print("Not Equal")

Equal


In [49]:
if df_transactions_with_parents.shape[1] - len(parents_names) == df_transactions.shape[1]:
    print("Equal")
else:
    print("Not Equal")

Equal


In [50]:
df_transactions_with_parents

Unnamed: 0,1001,1002,1003,1004,1005,1007,1008,1009,1010,1011,...,307,308,309,310,130,150,210,230,100,200
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
181966,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
181967,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
181968,0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
181969,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### DO ROZSZERZONEJ O TAKSONOMIE LISTY NALEZY DODAC INFORMACJE O CZESTOSCI WYSTAPIEN POSZCZEGOLNYCH GRUP W POSZCZEGOLNYCH KOMORKACH

In [52]:
taxonomy = taxonomy_dict.get_dictionary()

# DO POPRAWY COS ZLE LICZY

In [55]:
# Iterate over each row
for i, row in df_transactions.iterrows():
    # Initialize the counts for each row
    row_counts = {}

    # Iterate over the taxonomy dictionary
    for group, descendants in taxonomy.items():
        count = 0
        for descendant in descendants:
            if row[descendant] == 1:
                count += 1
        if count > 0:
            row_counts[group] = count

    # Add the counts to the appropriate cells in the dataset
    for group, count in row_counts.items():
        df_transactions_with_parents.at[i, group] = count

    # Add the counts for descendants at higher levels
    for group in row_counts.keys():
        for descendant in taxonomy[group]:
            if descendant != group:
                df_transactions_with_parents.at[i, descendant] += count

print(df_transactions_with_parents)

  self.obj[key] = infer_fill_value(value)


        1001  1002  1003  1004  1005  1007  1008  1009  1010  1011  ...  7017  \
0          0     0     0     0     0     0     0     0     0     0  ...   NaN   
1          0     0     0     0     0     0     0     0     0     0  ...   NaN   
2          1     0     0     0     0     0     0     0     0     0  ...   NaN   
3          1     0     0     0     0     0     0     0     0     0  ...   NaN   
4          1     0     1     0     0     0     0     0     0     0  ...   NaN   
...      ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...   ...   
181966     0     0     0     0     0     0     0     0     0     0  ...   NaN   
181967     0     0     0     0     0     0     0     0     1     0  ...   NaN   
181968     0     1     0     0     0     0     1     0     0     0  ...   NaN   
181969     0     0     0     0     0     0     0     0     0     0  ...   NaN   
181970     0     0     0     0     0     0     0     0     0     0  ...   NaN   

        7018  8019  8020  9

In [None]:
df_transactions_with_parents.to_csv('transactions_with_parents.csv', index=False)

# POROBIC HISTOGRAMY I INNE BAJERY Z TYCH DATAFRAMOW