In [None]:
import pandas as pd 

# Directory that holds the pre-processed data files.
file_path = 'SA2C_code/Kaggle/data/'

# Names of the data files in that directory; only the first entry is read in this cell.
files = [
    'sorted_events.df',
    'data_statis.df',
    'replay_buffer.df',
    'sampled_val.df',
]

# Read the pickled events frame ('sorted_events.df').
# NOTE(review): unpickling can execute arbitrary code, so this file must come from a trusted source.
sorted_events = pd.read_pickle(f'{file_path}{files[0]}')

# Distinct values of the 'item_id' column, converted to a plain Python list.
unique_item_id = sorted_events['item_id'].unique().tolist()


In [None]:
# Locations of the two item-property CSV parts.
indata1 = 'archive/item_properties_part1.csv'
indata2 = 'archive/item_properties_part2.csv'

# Load each part into its own DataFrame.
df1, df2 = (pd.read_csv(csv_path) for csv_path in (indata1, indata2))

# Stack the rows of df2 below those of df1.
item_properties = pd.concat([df1, df2])

# Keep only the rows describing the 'categoryid' property, then discard the
# 'property' column (now constant) and the 'timestamp' column (not used afterwards).
item_properties = (
    item_properties
    .loc[item_properties['property'] == 'categoryid']
    .drop(columns=['property', 'timestamp'])
)


In [None]:
# Index the 'value' entries of item_properties by item id in a single pass.
# Filtering the whole frame once per unique item id repeats the same boolean
# scan for every item; one pass plus dictionary lookups yields the same values
# in the same row order.
# NOTE(review): this join assumes the ids in unique_item_id (taken from
# sorted_events['item_id']) live in the same id space as item_properties['itemid']
# -- confirm the events were not re-encoded upstream.
wanted_item_ids = set(unique_item_id)
values_by_item = {}
for prop_item_id, prop_value in zip(item_properties['itemid'].tolist(),
                                    item_properties['value'].tolist()):
    # Only keep values for items that actually occur in the events data.
    if prop_item_id in wanted_item_ids:
        values_by_item.setdefault(prop_item_id, []).append(prop_value)

# Build one record per unique item id, preserving the order of unique_item_id.
new_data = []
for item_id in unique_item_id:
    # None when the item has no 'categoryid' row at all.
    values = values_by_item.get(item_id)
    # A single matching row is stored as a bare value; several rows stay a list.
    if values is not None and len(values) == 1:
        values = values[0]
    new_data.append({'itemid': item_id, 'value': values})

# Convert the list of records into a DataFrame with columns 'itemid' and 'value'.
new_data_df = pd.DataFrame(new_data)
# Show dataframe
new_data_df

Unnamed: 0,itemid,value
0,43511,1179
1,54408,209
2,10006,1694
3,49432,1280
4,39563,1196
...,...,...
70847,12116,1255
70848,43011,1397
70849,14255,479
70850,29598,1287


In [None]:
# Load the category tree; later cells read its 'categoryid' and 'parentid' columns.
parent = pd.read_csv('archive/category_tree.csv')

# The 'value' column of new_data_df holds the values of the 'categoryid' property,
# so relabel it accordingly. The frame is modified in place.
new_data_df.rename({'value': 'categoryid'}, axis='columns', inplace=True)


In [None]:
# Translate a category id (or a list of category ids) into the matching parent id(s)
# found in the 'parent' DataFrame.
def map_parentid(categoryid):
    """Look up the parent id(s) of ``categoryid`` in the module-level ``parent`` frame.

    Args:
        categoryid: ``None``, a single value convertible with ``int()``, or a
            list of such values. ``None`` entries inside a list are skipped.

    Returns:
        ``None`` when ``categoryid`` is ``None``. For a list, a list with one
        entry per non-``None`` element: the parent id, or ``None`` when the
        category does not occur in ``parent``. Otherwise the parent id of the
        single category, or ``None`` when it does not occur. Parent ids are
        returned exactly as stored in the ``'parentid'`` column.

    Raises:
        Whatever ``int()`` raises for a value it cannot convert.
    """
    def first_parent_of(cat_id):
        # 'parentid' of the first row whose 'categoryid' equals cat_id, else None.
        parent_ids = parent.loc[parent['categoryid'] == cat_id, 'parentid']
        if parent_ids.empty:
            return None
        return parent_ids.iloc[0]

    # Nothing to map for a missing category.
    if categoryid is None:
        return None

    # Several categories: map each non-None entry independently.
    if isinstance(categoryid, list):
        return [first_parent_of(int(val)) for val in categoryid if val is not None]

    # Single category.
    return first_parent_of(int(categoryid))

# Normalise scalar category ids to int so they can be matched in map_parentid;
# lists and missing (None) entries pass through untouched.
new_data_df['categoryid'] = new_data_df['categoryid'].apply(
    lambda cat: cat if (cat is None or isinstance(cat, list)) else int(cat)
)

# Derive the parent id(s) of every row from its category id(s) and store them
# in a new 'parentid' column.
new_data_df['parentid'] = new_data_df['categoryid'].apply(map_parentid)

# Show data frame
new_data_df

Unnamed: 0,itemid,categoryid,parentid
0,43511,1179,580.0
1,54408,209,293.0
2,10006,1694,1621.0
3,49432,1280,605.0
4,39563,1196,1181.0
...,...,...,...
70847,12116,1255,587.0
70848,43011,1397,1416.0
70849,14255,479,520.0
70850,29598,1287,1009.0


In [None]:
# Collapse a collection of parent-id entries (scalars, lists, or missing values)
# into one list of unique parent ids.
def aggregate_parentids(series):
    """Return the unique, non-missing parent ids contained in ``series``.

    Args:
        series: Iterable (e.g. a pandas Series) whose elements are either a
            single parent id, a list of parent ids, or a missing value.

    Returns:
        A list of the distinct parent ids, in no particular order. Missing
        entries (``None`` / NaN) are never included, whether they appear as a
        bare element or inside a list.
    """
    aggregated_parentids = set()
    for pid in series:
        # Treat a bare value as a one-element list so both shapes share one filter.
        candidates = pid if isinstance(pid, list) else [pid]
        # Lists may carry None for categories without a match, and scalar
        # columns may carry NaN; neither is a real parent id, and keeping them
        # would later create a bogus column in the one-hot matrix.
        aggregated_parentids.update(
            candidate for candidate in candidates if not pd.isna(candidate)
        )
    return list(aggregated_parentids)

# Collapse the 'parentid' entries of each 'itemid' group into one list per item.
aggregated_data = new_data_df.groupby('itemid')['parentid'].agg(aggregate_parentids)

# Every distinct parent id seen across all items; each becomes one matrix column.
all_parentids = {pid for item_pids in aggregated_data for pid in item_pids}
all_parentids_list = list(all_parentids)

# Start from an all-zero frame: one row per item id, one column per parent id.
one_hot_matrix = pd.DataFrame(
    0,
    index=aggregated_data.index,
    columns=all_parentids_list,
)

# Flag, row by row, the parent ids that belong to each item.
for item, item_parentids in aggregated_data.items():
    one_hot_matrix.loc[item, item_parentids] = 1

# Display the one_hot_matrix DataFrame.
one_hot_matrix

Unnamed: 0_level_0,1027.0,1028.0,1031.0,520.0,8.0,1034.0,1546.0,9.0,1036.0,1550.0,...,1516.0,1518.0,1519.0,1008.0,1009.0,500.0,1012.0,506.0,1531.0,1534.0
itemid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70847,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
70848,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
70849,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
70850,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Write 'one_hot_matrix' to 'one_hot_parent.csv'.
# NOTE(review): index=False leaves the 'itemid' index out of the file, so rows can
# only be matched back to items by position -- confirm consumers rely on row order.
one_hot_matrix.to_csv(path_or_buf='one_hot_parent.csv', index=False)
