**Goal:** Explore and validate retail transaction data, understand customer purchasing behavior, and define a realistic churn framework aligned with business context.

In [None]:
#Load data
import pandas as pd
import numpy as np

df = pd.read_csv(
    r"..\data\raw\online_retail.csv",
    encoding="latin1"
)

df.shape
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26,3.39,17850.0,United Kingdom


In [21]:
df.info()
df.isnull().sum().head(10)
df["Country"].value_counts().head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    541909 non-null  object 
 1   StockCode    541909 non-null  object 
 2   Description  540455 non-null  object 
 3   Quantity     541909 non-null  int64  
 4   InvoiceDate  541909 non-null  object 
 5   UnitPrice    541909 non-null  float64
 6   CustomerID   406829 non-null  float64
 7   Country      541909 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 33.1+ MB


Country
United Kingdom    495478
Germany             9495
France              8557
EIRE                8196
Spain               2533
Name: count, dtype: int64

In [22]:
#Convert InvoiceDate to datetime
df["InvoiceDate"] = pd.to_datetime(df["InvoiceDate"])

In [23]:
#Remove invalid records
df = df[df["Quantity"] > 0]
df = df[df["UnitPrice"] > 0]
df = df.dropna(subset=["CustomerID"])

In [24]:
#Convert CustomerID to int
df["CustomerID"] = df["CustomerID"].astype(int)

In [26]:
#Defining global timeline
analysis_date = df["InvoiceDate"].max() - pd.Timedelta(days=30)
analysis_date

Timestamp('2011-11-09 12:50:00')

In [27]:
#Define time windows
OBSERVATION_WINDOW = 90  # days
PREDICTION_WINDOW = 30   # days

In [28]:
#Calculate boundaries
observation_end = analysis_date
observation_start = observation_end - pd.Timedelta(days=OBSERVATION_WINDOW)

prediction_end = observation_end + pd.Timedelta(days=PREDICTION_WINDOW)

In [29]:
#Split data by time (NO leakage)
#Observation period (features)
obs_df = df[
    (df["InvoiceDate"] >= observation_start) &
    (df["InvoiceDate"] <= observation_end)
]

In [30]:
#Prediction period (label)
pred_df = df[
    (df["InvoiceDate"] > observation_end) &
    (df["InvoiceDate"] <= prediction_end)
]


In [31]:
#Identifying active customers
active_customers = obs_df["CustomerID"].unique()

In [32]:
#Create churn label -> Customers who purchased again
customers_with_future_purchase = pred_df["CustomerID"].unique()


In [33]:
#Churn definition
churn_labels = pd.DataFrame({"CustomerID": active_customers})

churn_labels["churn"] = ~churn_labels["CustomerID"].isin(customers_with_future_purchase)
churn_labels["churn"] = churn_labels["churn"].astype(int)

churn_labels.head()

Unnamed: 0,CustomerID,churn
0,13427,0
1,17340,0
2,12428,0
3,18077,0
4,17248,1


churn = 1 → customer made NO purchase in next 30 days

churn = 0 → customer returned

In [38]:
#Keeping customers with ≥ 2 purchases in observation window
txn_counts = (
    obs_df
    .groupby("CustomerID")
    .size()
    .reset_index(name="txn_count")
)

active_customers_refined = txn_counts[
    txn_counts["txn_count"] >= 2
]["CustomerID"]

churn_labels = churn_labels[
    churn_labels["CustomerID"].isin(active_customers_refined)
]


In [40]:
#Sanity check
churn_labels["churn"].value_counts(normalize=True)

churn
1    0.568434
0    0.431566
Name: proportion, dtype: float64

In [35]:
pred_df.shape

(65574, 8)

In [36]:
prediction_end = analysis_date + pd.Timedelta(days=30)

pred_df = df[
    (df["InvoiceDate"] > analysis_date) &
    (df["InvoiceDate"] <= prediction_end)
]


In [37]:
pred_df.shape

(65574, 8)

In [41]:
df.to_csv("../data/processed/clean_transactions.csv", index=False)

In [42]:
churn_labels.to_csv("../data/processed/churn_labels.csv", index=False)