In [2]:
import pandas as pd

# Read the cleaned point-of-sale export into a DataFrame.
file_path = 'Cleaned_POS_DATA_BAPT_2023.csv'  # relative path, resolved against the working directory
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the leading rows to check which columns were loaded.
print(data.head(n=5))

   Basket_ID                Date_       Barcode  Quantity  Value_  \
0      20412  2023-05-29 00:00:00  5.200120e+12       1.0    1.19   
1      20412  2023-05-29 00:00:00  5.207066e+09       1.0    3.63   
2      20412  2023-05-29 00:00:00  8.076810e+12       1.0    2.52   
3      20412  2023-05-29 00:00:00  5.200120e+12       1.0    1.35   
4      20413  2023-06-10 00:00:00  5.204420e+12       1.0    0.99   

   LoyaltyCard_ID  
0        28504821  
1        28504821  
2        28504821  
3        28504821  
4        28504821  


In [13]:
# Build a one-row-per-basket summary: total quantity, total value and the
# number of distinct barcodes for every Basket_ID.

# Fail fast, naming every absent column at once rather than only the first.
required_columns = ['Basket_ID', 'Quantity', 'Value_', 'Barcode', 'LoyaltyCard_ID']
missing_columns = [col for col in required_columns if col not in data.columns]
if missing_columns:
    raise ValueError(f"Columns {missing_columns} are missing from the dataset!")

# Line-level value = Quantity * Value_.
# NOTE(review): this assumes Value_ is a per-unit price; if it is already a
# line total, the multiplication double-counts -- confirm against the source.
# The column is still written onto `data`, as before, so any later cell that
# reads data['Total_Value'] keeps working; re-running this cell is idempotent.
data['Total_Value'] = data['Quantity'] * data['Value_']

# A single grouped pass with named aggregations replaces three separate
# groupbys, two in-place renames and two merges. Rows come back sorted by
# Basket_ID with a fresh 0..n-1 index.
summary = (
    data.groupby('Basket_ID')
    .agg(
        Total_Quantity=('Quantity', 'sum'),
        Total_Value_Per_Basket=('Total_Value', 'sum'),
        Product_Variation=('Barcode', 'nunique'),  # nunique ignores missing barcodes
    )
    .reset_index()
)

# Per-metric frames kept under their previous names in case other cells use them.
total_quantity = summary[['Basket_ID', 'Total_Quantity']].copy()
total_value = summary[['Basket_ID', 'Total_Value_Per_Basket']].copy()
product_variation = summary[['Basket_ID', 'Product_Variation']].copy()

print(summary)


       Basket_ID  Total_Quantity  Total_Value_Per_Basket  Product_Variation
0              0             5.0                   13.63                  5
1              1             8.0                   23.17                  8
2              2             2.0                    6.72                  2
3              3             1.0                    1.35                  1
4              4             2.0                    1.81                  2
...          ...             ...                     ...                ...
61265      61265             7.0                   11.54                  7
61266      61266             6.0                   11.77                  3
61267      61267             8.0                   36.82                  3
61268      61268             2.0                    3.22                  2
61269      61269             3.0                    4.89                  3

[61270 rows x 4 columns]


In [17]:
# Derive a compact, 1-based Customer_ID from the loyalty card number.
# Category codes follow the sorted order of the distinct card values, so the
# smallest card number becomes customer 1.
# NOTE(review): card number 0 therefore ends up as Customer_ID 1; presumably 0
# marks purchases made without a loyalty card -- confirm before treating it as
# one real customer.
card_as_category = data['LoyaltyCard_ID'].astype('category')
data['Customer_ID'] = card_as_category.cat.codes + 1

# List each distinct (card, customer) pair once to eyeball the mapping.
pair_columns = ['LoyaltyCard_ID', 'Customer_ID']
loyalty_to_customer = data[pair_columns].drop_duplicates()
print(loyalty_to_customer)


        LoyaltyCard_ID  Customer_ID
0             28504821          814
23            28504989          857
31            28505050          876
35            28505084          884
39            28505150          905
...                ...          ...
373784        28504240          797
374267        28854105         1440
374271        29011589         1727
375431        28713882         1179
376034               0            1

[2408 rows x 2 columns]


In [18]:
# Attach the owning customer to every basket in the summary.

# One (Basket_ID, Customer_ID) pair per distinct combination in the line-level data.
basket_with_customer = data[['Basket_ID', 'Customer_ID']].drop_duplicates()

# validate='one_to_one' makes pandas raise MergeError when a Basket_ID occurs
# more than once on either side. Without it, a basket recorded under several
# customers would silently duplicate its summary row and inflate any totals
# computed from the result.
summary_with_customers = summary.merge(
    basket_with_customer,
    on='Basket_ID',
    validate='one_to_one',
)

# Preview the merged result.
print(summary_with_customers.head())


   Basket_ID  Total_Quantity  Total_Value_Per_Basket  Product_Variation  \
0          0             5.0                   13.63                  5   
1          1             8.0                   23.17                  8   
2          2             2.0                    6.72                  2   
3          3             1.0                    1.35                  1   
4          4             2.0                    1.81                  2   

   Customer_ID  
0            2  
1            2  
2            2  
3            2  
4            2  


In [21]:
# Export the per-basket summary (with customer IDs) to a single-sheet workbook,
# written through the xlsxwriter engine and without the DataFrame index.
summary_with_customers.to_excel(
    'basket_summary_withcust.xlsx',
    sheet_name='baskets with values',
    index=False,
    engine='xlsxwriter',
)