### How many 'Forgotten items' baskets are the re in the coop_data.json dataset?

#### Definition  of 'Forgotten items basket': a small basket with 'X' number of products (1-2) bought 'Y' number of days after a 'large' purchase of 'Z' amount of products.

In [45]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
from datetime import datetime, timedelta
from glob import glob
from tqdm import tqdm
import json

import os
os.environ["OMP_NUM_THREADS"] = "50"

In [46]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

### Reading function

In [77]:
def process_shopping_history(file_path):
    """
    Process a JSON file containing shopping history and return a structured DataFrame.
    
    Parameters:
    file_path (str): Path to the JSON file where each line is a separate JSON object
    
    Returns:
    pandas.DataFrame: DataFrame with columns customer_id, transaction_date, and basket_content
    """
    # Lists to store the extracted data
    records = []
    
    # Read the file line by line
    with open(file_path, 'r') as file:
        for line in file:
            if line.strip():  # Skip empty lines
                # Parse JSON object
                customer_data = json.loads(line)
                customer_id = customer_data['customer_id']
                
                # Process each transaction date in the customer's data
                for date_key, transaction in customer_data['data'].items():
                    # Extract date from the key (format: YYYY_MM_DD_XX)
                    date_parts = date_key.split('_')
                    transaction_date = datetime(
                        int(date_parts[0]),
                        int(date_parts[1]),
                        int(date_parts[2])
                    )
                    
                    # Extract basket items (only IDs)
                    basket_items = set(str(item_id) for item_id in transaction['basket'].keys())
                    
                    # Create a record
                    records.append({
                        'customer_id': customer_id,
                        'transaction_date': transaction_date,
                        'basket_content': basket_items
                    })
    
    # Create DataFrame
    df = pd.DataFrame(records)
    
    # Sort the DataFrame by customer_id and transaction_date
    df_sorted = df.sort_values(['customer_id', 'transaction_date'])
    
    # Reset index after sorting
    df_sorted = df_sorted.reset_index(drop=True)
    
    return df_sorted

In [78]:
path = r"E:\Thesis\Legrottaglie Forgotten Items\Model\dataset\Full dataset\clean\coop_data_clean.json"


# path = r"E:\Thesis\Legrottaglie Forgotten Items\Model\dataset\tafeng\tafeng.json"

In [79]:
# Load the data
df = process_shopping_history(path)

In [80]:
# Preview the data
df.head()

Unnamed: 0,customer_id,transaction_date,basket_content
0,67656,2007-01-02,"{1830, 3226, 1826, 2134, 476, 505, 5082, 622}"
1,67656,2007-01-08,"{4436, 140, 642, 4831, 5069, 633, 1579, 1577, 4117, 5012, 657, 4196, 3607, 5028}"
2,67656,2007-01-11,"{2090, 2980, 140, 4031, 1261, 4003, 142, 2302, 2095, 2074}"
3,67656,2007-01-17,"{1012, 2038, 178, 140, 4626, 5945, 3143, 1870, 141, 2502, 234, 5082}"
4,67656,2007-01-20,"{1621, 140, 216, 1570, 1261, 1564, 1577, 1467, 4900, 142, 2551, 657, 1622, 1090, 891, 1989}"


In [81]:
def analyze_unique_items(df):
    """
    Analyzes DataFrame containing basket content data and returns the number of unique items
    across all baskets where basket contents are stored as sets.
    
    Args:
        df (pandas.DataFrame): DataFrame containing a column with basket contents as sets
        
    Returns:
        int: Number of unique items across all baskets
    """
    # Create a single set to store all unique items
    all_unique_items = set()
    
    # Process each basket set
    for basket_set in df['basket_content']:
        # Update the all_unique_items set with items from current basket
        all_unique_items.update(basket_set)
    
    return len(all_unique_items)

# Example usage:
total_unique_items = analyze_unique_items(df)
print(f"Total number of unique items: {total_unique_items}")

Total number of unique items: 5552


In [82]:
df.shape

(10867976, 3)

In [83]:
# Sort by the first column and then by the second column
# Sort by the first and second column, and reset the index
df = df.sort_values(by=[df.columns[0], df.columns[1]]).reset_index(drop=True)



In [84]:
df.head()

Unnamed: 0,customer_id,transaction_date,basket_content
0,67656,2007-01-02,"{1830, 3226, 1826, 2134, 476, 505, 5082, 622}"
1,67656,2007-01-08,"{4436, 140, 642, 4831, 5069, 633, 1579, 1577, 4117, 5012, 657, 4196, 3607, 5028}"
2,67656,2007-01-11,"{2090, 2980, 140, 4031, 1261, 4003, 142, 2302, 2095, 2074}"
3,67656,2007-01-17,"{1012, 2038, 178, 140, 4626, 5945, 3143, 1870, 141, 2502, 234, 5082}"
4,67656,2007-01-20,"{1621, 140, 216, 1570, 1261, 1564, 1577, 1467, 4900, 142, 2551, 657, 1622, 1090, 891, 1989}"


In [85]:
# Find the minimum and maximum values
min_date = df['transaction_date'].min()
max_date = df['transaction_date'].max()

print(f"Minimum date: {min_date}")
print(f"Maximum date: {max_date}")

Minimum date: 2007-01-02 00:00:00
Maximum date: 2016-08-31 00:00:00


In [86]:
df.shape

(10867976, 3)

In [87]:
df['customer_id'].nunique()

17132

### Calculating avg, number of baskets

In [88]:
def calculate_avg_baskets_per_customer(df):
    """
    Calculate the average number of baskets (transactions) per customer.
    
    Parameters:
    df (pandas.DataFrame): DataFrame containing columns 'customer_id' and 'transaction_date'
                         as created by process_shopping_history()
    
    Returns:
    dict: Dictionary containing:
          - 'avg_baskets': Average number of baskets per customer
          - 'total_customers': Total number of unique customers
          - 'total_baskets': Total number of baskets/transactions
    """
    # Count number of transactions per customer
    baskets_per_customer = df.groupby('customer_id').size()
    
    # Calculate metrics
    total_customers = len(baskets_per_customer)
    total_baskets = baskets_per_customer.sum()
    avg_baskets = total_baskets / total_customers
    
    return {
        'avg_baskets': round(avg_baskets, 2),
        'total_customers': total_customers,
        'total_baskets': total_baskets
    }

In [89]:
stats = calculate_avg_baskets_per_customer(df)

print(f"Average baskets per customer: {stats['avg_baskets']}")
print(f"Total customers: {stats['total_customers']}")
print(f"Total baskets: {stats['total_baskets']}")

Average baskets per customer: 634.37
Total customers: 17132
Total baskets: 10867976


### Calculating median number of baskets

In [90]:
def calculate_median_baskets_per_customer(df):
    """
    Calculate the median number of baskets (transactions) per customer.
    
    Parameters:
    df (pandas.DataFrame): DataFrame containing columns 'customer_id' and 'transaction_date'
                         as created by process_shopping_history()
    
    Returns:
    dict: Dictionary containing:
          - 'median_baskets': Median number of baskets per customer
          - 'q1_baskets': First quartile (25th percentile) of baskets per customer
          - 'q3_baskets': Third quartile (75th percentile) of baskets per customer
          - 'total_customers': Total number of unique customers
          - 'total_baskets': Total number of baskets/transactions
          - 'min_baskets': Minimum number of baskets for any customer
          - 'max_baskets': Maximum number of baskets for any customer
    """
    # Count number of transactions per customer
    baskets_per_customer = df.groupby('customer_id').size()
    
    # Calculate metrics
    total_customers = len(baskets_per_customer)
    total_baskets = baskets_per_customer.sum()
    
    # Calculate quartile statistics
    median_baskets = baskets_per_customer.median()
    q1_baskets = baskets_per_customer.quantile(0.25)
    q3_baskets = baskets_per_customer.quantile(0.75)
    min_baskets = baskets_per_customer.min()
    max_baskets = baskets_per_customer.max()
    
    # Calculate interquartile range (IQR)
    iqr = q3_baskets - q1_baskets
    
    return {
        'median_baskets': round(median_baskets, 2),
        'q1_baskets': round(q1_baskets, 2),
        'q3_baskets': round(q3_baskets, 2),
        'total_customers': total_customers,
        'total_baskets': total_baskets,
        'min_baskets': min_baskets,
        'max_baskets': max_baskets,
        'iqr': round(iqr, 2)
    }

In [91]:
# Calculate median baskets per customer
stats = calculate_median_baskets_per_customer(df)

print(f"Median baskets per customer: {stats['median_baskets']}")
print(f"First quartile (25%): {stats['q1_baskets']} baskets")
print(f"Third quartile (75%): {stats['q3_baskets']} baskets")
print(f"Interquartile range: {stats['iqr']} baskets")
print(f"Range: {stats['min_baskets']} to {stats['max_baskets']} baskets")
print(f"Total customers: {stats['total_customers']}")
print(f"Total baskets: {stats['total_baskets']}")

Median baskets per customer: 452.0
First quartile (25%): 220.75 baskets
Third quartile (75%): 856.0 baskets
Interquartile range: 635.25 baskets
Range: 4 to 3696 baskets
Total customers: 17132
Total baskets: 10867976


### Calculate avg timespan between first and last purchase in the dataset

In [92]:
def calculate_avg_customer_timespan(df):
    """
    Calculate the average time span between first and last purchase for each customer.
    
    Parameters:
    df (pandas.DataFrame): DataFrame containing columns 'customer_id' and 'transaction_date'
                         as created by process_shopping_history()
    
    Returns:
    dict: Dictionary containing:
          - 'avg_days': Average number of days between first and last purchase
          - 'avg_months': Average number of months between first and last purchase
          - 'total_active_customers': Number of customers with more than one purchase
          - 'single_purchase_customers': Number of customers with only one purchase
          - 'max_timespan_days': Longest customer timespan in days
          - 'min_timespan_days': Shortest customer timespan in days (excluding single-purchase customers)
    """
    # Group by customer and get their first and last purchase dates
    customer_spans = df.groupby('customer_id').agg({
        'transaction_date': ['min', 'max', 'count']
    })
    
    # Flatten column names
    customer_spans.columns = ['first_purchase', 'last_purchase', 'purchase_count']
    
    # Calculate time difference for each customer
    customer_spans['timespan_days'] = (
        customer_spans['last_purchase'] - customer_spans['first_purchase']
    ).dt.total_seconds() / (24 * 60 * 60)  # Convert to days
    
    # Count customers with single purchase
    single_purchase_customers = (customer_spans['purchase_count'] == 1).sum()
    
    # Filter for customers with more than one purchase for average calculation
    multiple_purchase_spans = customer_spans[customer_spans['purchase_count'] > 1]
    
    # Calculate metrics
    if len(multiple_purchase_spans) > 0:
        avg_days = multiple_purchase_spans['timespan_days'].mean()
        avg_months = avg_days / 30.44  # Using average month length
        max_timespan = multiple_purchase_spans['timespan_days'].max()
        min_timespan = multiple_purchase_spans['timespan_days'].min()
    else:
        avg_days = 0
        avg_months = 0
        max_timespan = 0
        min_timespan = 0
    
    return {
        'avg_days': round(avg_days, 2),
        'avg_months': round(avg_months, 2),
        'total_active_customers': len(multiple_purchase_spans),
        'single_purchase_customers': single_purchase_customers,
        'max_timespan_days': round(max_timespan, 2),
        'min_timespan_days': round(min_timespan, 2)
    }

In [93]:
# Calculate average customer timespan
timespan_stats = calculate_avg_customer_timespan(df)

print(f"Average customer timespan: {timespan_stats['avg_days']} days ({timespan_stats['avg_months']} months)")
print(f"Active customers (>1 purchase): {timespan_stats['total_active_customers']}")
print(f"Single-purchase customers: {timespan_stats['single_purchase_customers']}")
print(f"Longest customer timespan: {timespan_stats['max_timespan_days']} days")
print(f"Shortest customer timespan: {timespan_stats['min_timespan_days']} days")

Average customer timespan: 3078.29 days (101.13 months)
Active customers (>1 purchase): 17132
Single-purchase customers: 0
Longest customer timespan: 3529.0 days
Shortest customer timespan: 9.0 days


### Calculate median timespan between first and last purchase in the dataset

In [94]:
def calculate_median_customer_timespan(df):
    """
    Calculate the median time span between first and last purchase for each customer.
    
    Parameters:
    df (pandas.DataFrame): DataFrame containing columns 'customer_id' and 'transaction_date'
                         as created by process_shopping_history()
    
    Returns:
    dict: Dictionary containing:
          - 'median_days': Median number of days between first and last purchase
          - 'median_months': Median number of months between first and last purchase
          - 'total_active_customers': Number of customers with more than one purchase
          - 'single_purchase_customers': Number of customers with only one purchase
          - 'max_timespan_days': Longest customer timespan in days
          - 'min_timespan_days': Shortest customer timespan in days (excluding single-purchase customers)
          - 'q1_days': First quartile of customer timespan in days
          - 'q3_days': Third quartile of customer timespan in days
    """
    # Group by customer and get their first and last purchase dates
    customer_spans = df.groupby('customer_id').agg({
        'transaction_date': ['min', 'max', 'count']
    })
    
    # Flatten column names
    customer_spans.columns = ['first_purchase', 'last_purchase', 'purchase_count']
    
    # Calculate time difference for each customer
    customer_spans['timespan_days'] = (
        customer_spans['last_purchase'] - customer_spans['first_purchase']
    ).dt.total_seconds() / (24 * 60 * 60)  # Convert to days
    
    # Count customers with single purchase
    single_purchase_customers = (customer_spans['purchase_count'] == 1).sum()
    
    # Filter for customers with more than one purchase for median calculation
    multiple_purchase_spans = customer_spans[customer_spans['purchase_count'] > 1]
    
    # Calculate metrics
    if len(multiple_purchase_spans) > 0:
        median_days = multiple_purchase_spans['timespan_days'].median()
        median_months = median_days / 30.44  # Using average month length
        max_timespan = multiple_purchase_spans['timespan_days'].max()
        min_timespan = multiple_purchase_spans['timespan_days'].min()
        q1_days = multiple_purchase_spans['timespan_days'].quantile(0.25)
        q3_days = multiple_purchase_spans['timespan_days'].quantile(0.75)
    else:
        median_days = 0
        median_months = 0
        max_timespan = 0
        min_timespan = 0
        q1_days = 0
        q3_days = 0
    
    return {
        'median_days': round(median_days, 2),
        'median_months': round(median_months, 2),
        'total_active_customers': len(multiple_purchase_spans),
        'single_purchase_customers': single_purchase_customers,
        'max_timespan_days': round(max_timespan, 2),
        'min_timespan_days': round(min_timespan, 2),
        'q1_days': round(q1_days, 2),
        'q3_days': round(q3_days, 2)
    }

In [95]:
# Calculate median customer timespan
timespan_stats = calculate_median_customer_timespan(df)

print(f"Median customer timespan: {timespan_stats['median_days']} days ({timespan_stats['median_months']} months)")
print(f"Active customers (>1 purchase): {timespan_stats['total_active_customers']}")
print(f"Single-purchase customers: {timespan_stats['single_purchase_customers']}")
print(f"First quartile (25%): {timespan_stats['q1_days']} days")
print(f"Third quartile (75%): {timespan_stats['q3_days']} days")
print(f"Range: {timespan_stats['min_timespan_days']} to {timespan_stats['max_timespan_days']} days")

Median customer timespan: 3489.0 days (114.62 months)
Active customers (>1 purchase): 17132
Single-purchase customers: 0
First quartile (25%): 3019.0 days
Third quartile (75%): 3522.0 days
Range: 9.0 to 3529.0 days


### avg / median number of baskets

In [96]:
def calculate_basket_size_metrics(df):
    """
    Calculate average and median basket sizes (number of items) per customer.
    
    Parameters:
    df (pandas.DataFrame): DataFrame containing columns 'customer_id' and 'basket_content'
                         as created by process_shopping_history()
    
    Returns:
    dict: Dictionary containing:
          - 'overall_avg_basket_size': Average basket size across all transactions
          - 'overall_median_basket_size': Median basket size across all transactions
          - 'avg_basket_size_per_customer': Mean of customer average basket sizes
          - 'median_basket_size_per_customer': Median of customer average basket sizes
          - 'largest_basket': Size of the largest basket
          - 'smallest_basket': Size of the smallest basket
          - 'q1_basket_size': First quartile of basket sizes
          - 'q3_basket_size': Third quartile of basket sizes
    """
    # Calculate basket sizes for each transaction
    df['basket_size'] = df['basket_content'].apply(len)
    
    # Calculate overall metrics (across all transactions)
    overall_avg_size = df['basket_size'].mean()
    overall_median_size = df['basket_size'].median()
    
    # Calculate average basket size for each customer
    customer_avg_sizes = df.groupby('customer_id')['basket_size'].mean()
    
    # Calculate the mean and median of customer averages
    avg_size_per_customer = customer_avg_sizes.mean()
    median_size_per_customer = customer_avg_sizes.median()
    
    # Calculate additional statistics
    largest_basket = df['basket_size'].max()
    smallest_basket = df['basket_size'].min()
    q1_size = df['basket_size'].quantile(0.25)
    q3_size = df['basket_size'].quantile(0.75)
    
    return {
        'overall_avg_basket_size': round(overall_avg_size, 2),
        'overall_median_basket_size': round(overall_median_size, 2),
        'avg_basket_size_per_customer': round(avg_size_per_customer, 2),
        'median_basket_size_per_customer': round(median_size_per_customer, 2),
        'largest_basket': largest_basket,
        'smallest_basket': smallest_basket,
        'q1_basket_size': round(q1_size, 2),
        'q3_basket_size': round(q3_size, 2)
    }

In [97]:
# Calculate basket size metrics
basket_metrics = calculate_basket_size_metrics(df)

print("Overall Metrics (across all transactions):")
print(f"Average basket size: {basket_metrics['overall_avg_basket_size']} items")
print(f"Median basket size: {basket_metrics['overall_median_basket_size']} items")

print("\nPer-Customer Metrics:")
print(f"Average of customer average basket sizes: {basket_metrics['avg_basket_size_per_customer']} items")
print(f"Median of customer average basket sizes: {basket_metrics['median_basket_size_per_customer']} items")

print("\nDistribution:")
print(f"First quartile (25%): {basket_metrics['q1_basket_size']} items")
print(f"Third quartile (75%): {basket_metrics['q3_basket_size']} items")
print(f"Range: {basket_metrics['smallest_basket']} to {basket_metrics['largest_basket']} items")

Overall Metrics (across all transactions):
Average basket size: 10.22 items
Median basket size: 8.0 items

Per-Customer Metrics:
Average of customer average basket sizes: 10.83 items
Median of customer average basket sizes: 9.52 items

Distribution:
First quartile (25%): 4.0 items
Third quartile (75%): 13.0 items
Range: 0 to 136 items


### Flagging Forgotten-Item baskets

In [13]:
import pandas as pd
from datetime import timedelta
import time

def flag_forgotten_items_baskets(df, large_basket, max_days, min_forgotten_items, verbose=True):
    """
    Flag forgotten-item baskets based on specified parameters.
    Only flags the first qualifying transaction after each large basket.
    Handles same-day transactions based on their order in the DataFrame.
    
    Parameters:
    df (pandas.DataFrame): DataFrame with columns customer_id, transaction_date, basket_content
    large_basket (int): Minimum number of items for a basket to be considered large
    max_days (int): Maximum days window after large basket
    min_forgotten_items (int): Minimum items in subsequent basket to be flagged
    verbose (bool): Whether to print progress updates
    
    Returns:
    pandas.DataFrame: Original DataFrame with additional forgotten_item_flag column
    """
    # Create a copy of the DataFrame to avoid modifying the original
    df = df.copy()
    
    # Initialize the forgotten_item_flag column with 0
    df['forgotten_item_flag'] = 0
    
    # Get unique customers
    unique_customers = df['customer_id'].unique()
    total_customers = len(unique_customers)
    
    # Initialize counters for progress tracking
    processed_customers = 0
    last_print_time = time.time()
    
    print(f"\nStarting analysis with parameters:")
    print(f"Large basket threshold: {large_basket} items")
    print(f"Maximum days window: {max_days} days")
    print(f"Minimum forgotten items: {min_forgotten_items} items")
    print(f"\nProcessing {total_customers} customers...")
    
    # Process each customer's transactions
    for customer_id in unique_customers:
        # Get customer's transactions
        customer_df = df[df['customer_id'] == customer_id].copy()
        
        # Sort by transaction date
        customer_df = customer_df.sort_values('transaction_date')
        
        # Get indices of large baskets
        large_basket_mask = customer_df['basket_content'].apply(len) >= large_basket
        large_basket_indices = customer_df[large_basket_mask].index
        
        # For each large basket
        for large_basket_idx in large_basket_indices:
            large_basket_date = customer_df.loc[large_basket_idx, 'transaction_date']
            window_end = large_basket_date + timedelta(days=max_days)
            
            # Find subsequent purchases within the time window
            # Using index comparison to respect transaction order
            subsequent_purchases = customer_df[
                (customer_df.index > large_basket_idx) &  # After current basket in order
                (customer_df['transaction_date'] <= window_end)  # Within time window
            ]
            
            # Check each subsequent purchase until we find the first qualifying one
            for idx, row in subsequent_purchases.iterrows():
                if len(row['basket_content']) >= min_forgotten_items:
                    df.loc[idx, 'forgotten_item_flag'] = 1
                    break  # Exit the loop after finding the first qualifying transaction
        
        processed_customers += 1
        
        # Print progress every 1000 customers or if 5 seconds have passed
        current_time = time.time()
        if verbose and (processed_customers % 1000 == 0 or current_time - last_print_time >= 5):
            # Calculate current number of flagged transactions directly from the DataFrame
            current_flagged = df['forgotten_item_flag'].sum()
            print(f"Processed {processed_customers}/{total_customers} customers. "
                  f"Found {current_flagged} forgotten-item baskets so far. "
                  f"({round(processed_customers/total_customers*100, 2)}% complete)")
            last_print_time = current_time
    
    # Get final count of flagged transactions
    total_flagged = df['forgotten_item_flag'].sum()
    
    # Print final statistics
    print("\nAnalysis complete!")
    print(f"Total purchases analyzed: {len(df)}")
    print(f"Number of forgotten-item baskets identified: {total_flagged}")
    print(f"Percentage of forgotten-item baskets: "
          f"{round(total_flagged/len(df)*100, 2)}%")
    
    return df

In [14]:
# Example usage:

# Assuming df is your original DataFrame from load_shopping_data()
# Set parameters
LARGE_BASKET = 10  # baskets with 20 or more items are considered large
MAX_DAYS = 2      # look at purchases up to 2 days after
MIN_FORGOTTEN_ITEMS = 10  # following basket should have 5 or more items

# Apply the flagging
df_flagged_2 = flag_forgotten_items_baskets(df, 
                                large_basket=LARGE_BASKET,
                                max_days=MAX_DAYS,
                                min_forgotten_items=MIN_FORGOTTEN_ITEMS)

# View results
print("\nTotal purchases:", len(df_flagged_2))
print("Number of forgotten-item baskets:", df_flagged_2['forgotten_item_flag'].sum())
print("Percentage of forgotten-item baskets:", 
      round(df_flagged_2['forgotten_item_flag'].mean() * 100, 2), "%")


Starting analysis with parameters:
Large basket threshold: 10 items
Maximum days window: 2 days
Minimum forgotten items: 10 items

Processing 17132 customers...
Processed 46/17132 customers. Found 4166 forgotten-item baskets so far. (0.27% complete)
Processed 92/17132 customers. Found 6927 forgotten-item baskets so far. (0.54% complete)
Processed 162/17132 customers. Found 8659 forgotten-item baskets so far. (0.95% complete)


KeyboardInterrupt: 

In [15]:
# Optional: Save results to CSV
output_path = "flagged_baskets_results_2_tafeng.csv"
print(f"\nSaving results to {output_path}...")
df_flagged_2.to_csv(output_path, index=False)
print("Results saved successfully!")


Saving results to flagged_baskets_results_2_tafeng.csv...
Results saved successfully!


In [16]:
# Example usage:

# Assuming df is your original DataFrame from load_shopping_data()
# Set parameters
LARGE_BASKET = 10  # baskets with 20 or more items are considered large
MAX_DAYS = 1      # look at purchases up to 2 days after
MIN_FORGOTTEN_ITEMS = 10  # following basket should have 5 or more items

# Apply the flagging
df_flagged_1 = flag_forgotten_items_baskets(df, 
                                large_basket=LARGE_BASKET,
                                max_days=MAX_DAYS,
                                min_forgotten_items=MIN_FORGOTTEN_ITEMS)

# View results
print("\nTotal purchases:", len(df_flagged_1))
print("Number of forgotten-item baskets:", df_flagged_1['forgotten_item_flag'].sum())
print("Percentage of forgotten-item baskets:", 
      round(df_flagged_1['forgotten_item_flag'].mean() * 100, 2), "%")


Starting analysis with parameters:
Large basket threshold: 10 items
Maximum days window: 1 days
Minimum forgotten items: 10 items

Processing 32266 customers...
Processed 1000/32266 customers. Found 31 forgotten-item baskets so far. (3.1% complete)
Processed 2000/32266 customers. Found 43 forgotten-item baskets so far. (6.2% complete)
Processed 3000/32266 customers. Found 53 forgotten-item baskets so far. (9.3% complete)
Processed 4000/32266 customers. Found 70 forgotten-item baskets so far. (12.4% complete)
Processed 5000/32266 customers. Found 85 forgotten-item baskets so far. (15.5% complete)
Processed 6000/32266 customers. Found 94 forgotten-item baskets so far. (18.6% complete)
Processed 7000/32266 customers. Found 98 forgotten-item baskets so far. (21.69% complete)
Processed 8000/32266 customers. Found 106 forgotten-item baskets so far. (24.79% complete)
Processed 9000/32266 customers. Found 118 forgotten-item baskets so far. (27.89% complete)
Processed 10000/32266 customers. Fo

In [17]:
# # Optional: Save results to CSV
output_path = "flagged_baskets_results_1_tafeng.csv"
print(f"\nSaving results to {output_path}...")
df_flagged_1.to_csv(output_path, index=False)
print("Results saved successfully!")


Saving results to flagged_baskets_results_1_tafeng.csv...
Results saved successfully!


In [18]:
# Example usage:

# Assuming df is your original DataFrame from load_shopping_data()
# Set parameters
LARGE_BASKET = 10  # baskets with 20 or more items are considered large
MAX_DAYS = 0      # look at purchases up to 2 days after
MIN_FORGOTTEN_ITEMS = 10  # following basket should have 5 or more items

# Apply the flagging
df_flagged_0 = flag_forgotten_items_baskets(df, 
                                large_basket=LARGE_BASKET,
                                max_days=MAX_DAYS,
                                min_forgotten_items=MIN_FORGOTTEN_ITEMS)

# View results
print("\nTotal purchases:", len(df_flagged_0))
print("Number of forgotten-item baskets:", df_flagged_0['forgotten_item_flag'].sum())
print("Percentage of forgotten-item baskets:", 
      round(df_flagged_0['forgotten_item_flag'].mean() * 100, 2), "%")


Starting analysis with parameters:
Large basket threshold: 10 items
Maximum days window: 0 days
Minimum forgotten items: 10 items

Processing 32266 customers...
Processed 1000/32266 customers. Found 0 forgotten-item baskets so far. (3.1% complete)
Processed 2000/32266 customers. Found 0 forgotten-item baskets so far. (6.2% complete)
Processed 3000/32266 customers. Found 0 forgotten-item baskets so far. (9.3% complete)
Processed 4000/32266 customers. Found 0 forgotten-item baskets so far. (12.4% complete)
Processed 5000/32266 customers. Found 0 forgotten-item baskets so far. (15.5% complete)
Processed 6000/32266 customers. Found 0 forgotten-item baskets so far. (18.6% complete)
Processed 7000/32266 customers. Found 0 forgotten-item baskets so far. (21.69% complete)
Processed 8000/32266 customers. Found 0 forgotten-item baskets so far. (24.79% complete)
Processed 9000/32266 customers. Found 0 forgotten-item baskets so far. (27.89% complete)
Processed 10000/32266 customers. Found 0 forgo

In [19]:
# # Optional: Save results to CSV
output_path = "flagged_baskets_results_0_tafeng.csv"
print(f"\nSaving results to {output_path}...")
df_flagged_0.to_csv(output_path, index=False)
print("Results saved successfully!")


Saving results to flagged_baskets_results_0_tafeng.csv...
Results saved successfully!


In [20]:
# Example usage:

# Assuming df is your original DataFrame from load_shopping_data()
# Set parameters
LARGE_BASKET = 10  # baskets with 20 or more items are considered large
MAX_DAYS = 2      # look at purchases up to 2 days after
MIN_FORGOTTEN_ITEMS = 1  # following basket should have 5 or more items

# Apply the flagging
df_flagged_2_1 = flag_forgotten_items_baskets(df, 
                                large_basket=LARGE_BASKET,
                                max_days=MAX_DAYS,
                                min_forgotten_items=MIN_FORGOTTEN_ITEMS)

# View results
print("\nTotal purchases:", len(df_flagged_2_1))
print("Number of forgotten-item baskets:", df_flagged_2_1['forgotten_item_flag'].sum())
print("Percentage of forgotten-item baskets:", 
      round(df_flagged_2_1['forgotten_item_flag'].mean() * 100, 2), "%")


Starting analysis with parameters:
Large basket threshold: 10 items
Maximum days window: 2 days
Minimum forgotten items: 1 items

Processing 32266 customers...
Processed 1000/32266 customers. Found 168 forgotten-item baskets so far. (3.1% complete)
Processed 2000/32266 customers. Found 278 forgotten-item baskets so far. (6.2% complete)
Processed 3000/32266 customers. Found 364 forgotten-item baskets so far. (9.3% complete)
Processed 4000/32266 customers. Found 469 forgotten-item baskets so far. (12.4% complete)
Processed 5000/32266 customers. Found 566 forgotten-item baskets so far. (15.5% complete)
Processed 6000/32266 customers. Found 679 forgotten-item baskets so far. (18.6% complete)
Processed 7000/32266 customers. Found 780 forgotten-item baskets so far. (21.69% complete)
Processed 8000/32266 customers. Found 852 forgotten-item baskets so far. (24.79% complete)
Processed 9000/32266 customers. Found 936 forgotten-item baskets so far. (27.89% complete)
Processed 10000/32266 custome

In [21]:
# # Optional: Save results to CSV
output_path = "flagged_baskets_results_2_min_forg_1_tafeng.csv"
print(f"\nSaving results to {output_path}...")
df_flagged_2_1.to_csv(output_path, index=False)
print("Results saved successfully!")


Saving results to flagged_baskets_results_2_min_forg_1_tafeng.csv...
Results saved successfully!


In [22]:
# Example usage:

# Assuming df is your original DataFrame from load_shopping_data()
# Set parameters
LARGE_BASKET = 10  # baskets with 20 or more items are considered large
MAX_DAYS = 1      # look at purchases up to 2 days after
MIN_FORGOTTEN_ITEMS = 1  # following basket should have 5 or more items

# Apply the flagging
df_flagged_1_1 = flag_forgotten_items_baskets(df, 
                                large_basket=LARGE_BASKET,
                                max_days=MAX_DAYS,
                                min_forgotten_items=MIN_FORGOTTEN_ITEMS)

# View results
print("\nTotal purchases:", len(df_flagged_1_1))
print("Number of forgotten-item baskets:", df_flagged_1_1['forgotten_item_flag'].sum())
print("Percentage of forgotten-item baskets:", 
      round(df_flagged_1_1['forgotten_item_flag'].mean() * 100, 2), "%")


Starting analysis with parameters:
Large basket threshold: 10 items
Maximum days window: 1 days
Minimum forgotten items: 1 items

Processing 32266 customers...
Processed 1000/32266 customers. Found 104 forgotten-item baskets so far. (3.1% complete)
Processed 2000/32266 customers. Found 170 forgotten-item baskets so far. (6.2% complete)
Processed 3000/32266 customers. Found 231 forgotten-item baskets so far. (9.3% complete)
Processed 4000/32266 customers. Found 300 forgotten-item baskets so far. (12.4% complete)
Processed 5000/32266 customers. Found 356 forgotten-item baskets so far. (15.5% complete)
Processed 6000/32266 customers. Found 423 forgotten-item baskets so far. (18.6% complete)
Processed 7000/32266 customers. Found 472 forgotten-item baskets so far. (21.69% complete)
Processed 8000/32266 customers. Found 514 forgotten-item baskets so far. (24.79% complete)
Processed 9000/32266 customers. Found 559 forgotten-item baskets so far. (27.89% complete)
Processed 10000/32266 custome

In [23]:
# # Optional: Save results to CSV
output_path = "flagged_baskets_results_1_min_forg_1_tafeng.csv"
print(f"\nSaving results to {output_path}...")
df_flagged_1_1.to_csv(output_path, index=False)
print("Results saved successfully!")


Saving results to flagged_baskets_results_1_min_forg_1_tafeng.csv...
Results saved successfully!


In [24]:
# Example usage:

# Assuming df is your original DataFrame from load_shopping_data()
# Set parameters
LARGE_BASKET = 10  # baskets with 20 or more items are considered large
MAX_DAYS = 0      # look at purchases up to 2 days after
MIN_FORGOTTEN_ITEMS = 1  # following basket should have 5 or more items

# Apply the flagging
df_flagged_0_1 = flag_forgotten_items_baskets(df, 
                                large_basket=LARGE_BASKET,
                                max_days=MAX_DAYS,
                                min_forgotten_items=MIN_FORGOTTEN_ITEMS)

# View results
print("\nTotal purchases:", len(df_flagged_0_1))
print("Number of forgotten-item baskets:", df_flagged_0_1['forgotten_item_flag'].sum())
print("Percentage of forgotten-item baskets:", 
      round(df_flagged_0_1['forgotten_item_flag'].mean() * 100, 2), "%")


Starting analysis with parameters:
Large basket threshold: 10 items
Maximum days window: 0 days
Minimum forgotten items: 1 items

Processing 32266 customers...
Processed 1000/32266 customers. Found 0 forgotten-item baskets so far. (3.1% complete)
Processed 2000/32266 customers. Found 0 forgotten-item baskets so far. (6.2% complete)
Processed 3000/32266 customers. Found 0 forgotten-item baskets so far. (9.3% complete)
Processed 4000/32266 customers. Found 0 forgotten-item baskets so far. (12.4% complete)
Processed 5000/32266 customers. Found 0 forgotten-item baskets so far. (15.5% complete)
Processed 6000/32266 customers. Found 0 forgotten-item baskets so far. (18.6% complete)
Processed 7000/32266 customers. Found 0 forgotten-item baskets so far. (21.69% complete)
Processed 8000/32266 customers. Found 0 forgotten-item baskets so far. (24.79% complete)
Processed 9000/32266 customers. Found 0 forgotten-item baskets so far. (27.89% complete)
Processed 10000/32266 customers. Found 0 forgot

In [25]:
# # Optional: Save results to CSV
output_path = "flagged_baskets_results_0_min_forg_1_tafeng.csv"
print(f"\nSaving results to {output_path}...")
df_flagged_0_1.to_csv(output_path, index=False)
print("Results saved successfully!")


Saving results to flagged_baskets_results_0_min_forg_1_tafeng.csv...
Results saved successfully!


#### Identifying the baskets that are considered 'forgotten item' purchases

In [24]:
# Single dataset version
def identify_forgotten_item_baskets(df, max_days=0, max_items=1, large_basket_threshold=10):
    # Sort the dataframe by customer_id and next_basket_id (date)
    df = df.sort_values(['customer_id', 'next_basket_id'])
    
    # Convert next_basket_id to datetime
    df['next_basket_date'] = pd.to_datetime(df['next_basket_id'].str.split('_').str[:3].str.join('-'))
    
    # Initialize the forgotten_item_basket column
    df['forgotten_item_basket'] = 0
    
    # Group by customer_id
    for customer_id, group in df.groupby('customer_id'):
        large_basket_date = None
        
        for index, row in group.iterrows():
            actual_basket = str(row['actual_basket']).split(',')
            
            # Check if it's a large basket
            if len(actual_basket) >= large_basket_threshold:
                large_basket_date = row['next_basket_date']
            elif large_basket_date is not None:
                # Check if it's a potential forgotten-item basket
                date_diff = (row['next_basket_date'] - large_basket_date).days
                if 0 <= date_diff <= max_days and len(actual_basket) <= max_items:
                    df.at[index, 'forgotten_item_basket'] = 1
                    large_basket_date = None  # Reset large_basket_date
                elif date_diff > max_days:
                    large_basket_date = None  # Reset large_basket_date if more than max_days have passed
    
    # Drop the temporary next_basket_date column
    df = df.drop(columns=['next_basket_date'])
    
    return df


### Function fo concatenate datasets

In [25]:
def load_combine_predictions(model_names, lengths, directories, base_path='..'):
    """
    Load, sort, deduplicate, and combine prediction CSV files for specified models, lengths, and directories.
    
    Args:
    model_names (list): List of model names to include.
    lengths (list): List of prediction lengths to include.
    directories (list): List of directories to search in (e.g., ['1fE', '1vR']).
    base_path (str): Base path for the Experiments folder.
    
    Returns:
    pd.DataFrame: Combined dataframe of all matching predictions.
    """
    combined_df = pd.DataFrame()
    
    sort_columns = ['dataset_name', 'category_mode', 'split_mode', 'model_name', 
                    'customer_id', 'pred_length', 'next_basket_number']
    
    for directory in directories:
        for model in model_names:
            for split in ['split_10', 'split_20', 'split_30', 'split_40', 'split_50', 
                          'split_60', 'split_70', 'split_80', 'split_90', 'split_fixed']:
                pattern = os.path.join(base_path, 'Experiments', directory, model, split, 
                                       f'pred_{directory}_*_{model}_*.csv')
                for file in glob(pattern):
                    # Extract length from filename
                    file_length = int(file.split('_')[-1].split('.')[0])
                    
                    if file_length in lengths:
                        df = pd.read_csv(file)
                        
                        # Ensure all sort columns exist in the DataFrame
                        for col in sort_columns:
                            if col not in df.columns:
                                df[col] = None  # or some appropriate default value
                        
                        # Sort the DataFrame
                        df = df.sort_values(by=sort_columns)
                        
                        # Remove duplicates based on the sorting columns
                        df = df.drop_duplicates(subset=sort_columns, keep='first')
                        
                        combined_df = pd.concat([combined_df, df], ignore_index=True)
    
    # Final sort and deduplication of the combined DataFrame
    combined_df = combined_df.sort_values(by=sort_columns)
    combined_df = combined_df.drop_duplicates(subset=sort_columns, keep='first')
    
    # Save the combined dataframe as a CSV file
    output_file = 'combined_predictions_all_directories.csv'
    combined_df.to_csv(output_file, index=False)
    print(f"Combined predictions saved to {output_file}")
    
    return combined_df

# Example usage:
# models = ['tbp', 'nmf', 'hrm', 'fpmc', 'clf', 'markov', 'ibp']
# lengths = list(range(2, 21))  # 2 to 20 inclusive
# directories = ['1fE', '1vR']
# result_df = load_combine_predictions(models, lengths, directories)