# Project invoices
*KUBIK Aleksander - KOBANA Johan - JOUYIT Matthieu - Thomas BOULAINE - DIA4*


**Problem statement:** How can we analyze and visualize an online store's activity from an invoice dataset, and extract key indicators that support data-driven decisions?



In [59]:
import pandas as pd 
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

from sklearn.preprocessing import OneHotEncoder
from category_encoders import TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import plotly_express as px
import plotly.graph_objects as go

from mlxtend.frequent_patterns import fpgrowth, association_rules

from prophet import Prophet


In [60]:
def load_data(file_path):
    """
    Load a CSV file into a pandas DataFrame.

    Input: file_path, the location of the CSV file (anything pd.read_csv accepts)
    Output: a dataframe holding the full content of the file
    """
    # No parsing options are passed: pandas defaults apply (comma separator,
    # first row used as header, dtypes inferred).
    return pd.read_csv(file_path)

In [61]:
def basic_info(data):
    """
    Print a quick overview of a dataframe.

    Input: a dataframe; the columns 'first_name', 'city', 'job', 'qty' and
           'amount' are read by name, so they must be present
    Output: nothing is returned; the shape, column names, dtypes, missing-value
            counts, the 5 most frequent values of each categorical column and
            the mean/sum/std of 'qty' and 'amount' are printed
    """
    ranked_categoricals = ('first_name', 'city', 'job')   # columns ranked by frequency
    ranked_numerics = ['qty', 'amount']                   # columns ranked by total

    # Structure of the dataset
    print("Shape:", data.shape)
    print("\nColumns:", data.columns.tolist())

    print("\nData types:")
    print(data.dtypes)

    print("\nMissing values per column:")
    print(data.isna().sum())

    print("\nDescriptive statistics:")
    # Most frequent customers, dominant cities and most common jobs
    for column_name in ranked_categoricals:
        print(f"\nTop 5 values for {column_name}:")
        frequencies = data[column_name].value_counts()
        print(frequencies.head(5))

    # One row per numeric variable, largest total first, to show which one
    # weighs the most in volume / revenue
    stats = data[ranked_numerics].agg(['mean', 'sum', 'std']).transpose()
    print(stats.sort_values(by='sum', ascending=False))

In [62]:
def preprocess_dates(data):
    """
    Parse the invoice dates and derive calendar fields.

    Input: a dataframe with an 'invoice_date' column of DD/MM/YYYY strings
           and a 'product_id' column
    Output: the same dataframe object, modified in place: 'invoice_date' is a
            datetime column, 'year' and 'month' are added, and 'product_id'
            is cast to string
    """
    parsed_dates = pd.to_datetime(data["invoice_date"], format="%d/%m/%Y")

    data["invoice_date"] = parsed_dates          # replace the raw strings
    data["year"] = parsed_dates.dt.year          # calendar year of each invoice
    data["month"] = parsed_dates.dt.month        # calendar month (1-12)

    # Product ids are identifiers, not quantities: store them as text
    data["product_id"] = data["product_id"].astype(str)
    return data

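Here we convert the `invoice_date` column (a `DD/MM/YYYY` string) into a proper datetime type.  
We also create two new columns, `year` and `month`, and cast `product_id` to a string so that it is handled as an identifier rather than a number.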

These will be useful later when we study trends in sales over time.

In [63]:
def sales_by_month(invoice):
    """
    Aggregate revenue per calendar month.

    Input: a dataframe with 'year', 'month' and 'amount' columns
    Output: a dataframe with one row per (year, month) holding the summed
            'amount' and a 'date' column set to the first day of that month
    """
    totals = invoice.groupby(["year", "month"])["amount"].sum()
    monthly_sales = totals.reset_index()

    # Build a "YYYY-M-01" string per row and let pandas parse it
    year_txt = monthly_sales["year"].astype(str)
    month_txt = monthly_sales["month"].astype(str)
    monthly_sales["date"] = pd.to_datetime(year_txt + "-" + month_txt + "-01")

    return monthly_sales

This function calculates the total sales for each month.  
We group the data by year and month, then sum the amounts.

It gives us our first time-based indicator: how the store’s sales evolve over time.

In [64]:
def top_products(df, n=10):
    """
    Rank products by total revenue and keep the best ones.

    Input:
      df : dataframe containing at least 'product_id' and 'amount'
      n  : how many products to keep (default = 10)
    Output:
      a dataframe with columns 'product_id' and 'amount' (summed per product),
      sorted from highest to lowest revenue and limited to the first n rows
    """
    revenue_per_product = df.groupby("product_id")["amount"].sum()
    revenue_table = revenue_per_product.reset_index()            # flat two-column frame
    ranked = revenue_table.sort_values(by="amount", ascending=False)
    return ranked.head(n)

In [65]:
def average_basket(invoice):
    """
    Compute the mean invoice amount for each customer.

    Input: a dataframe with 'email' (customer identifier) and 'amount' columns
    Output: a dataframe with one row per email and an 'avg_basket' column
            holding the mean of that customer's 'amount' values
    """
    mean_amount = invoice.groupby("email")["amount"].mean()
    return mean_amount.reset_index().rename(columns={"amount": "avg_basket"})

In [66]:
def pattern_mining_by_job(df, min_support=0.01, top_n=10):
    """
    Function to extract association rules between products, using each job
    as one "transaction" (a job contains a product if its summed qty is > 0).

    Input:
      df          : dataframe with columns 'job', 'product_id', 'qty'
      min_support : minimum support passed to fpgrowth
      top_n       : number of rules to keep after ranking
    Output:
      dataframe with the top_n rules ranked by score = lift x confidence, with
      columns antecedent_txt, consequent_txt, confidence, lift, score,
      jobs_supporting_rule, num_jobs.
      If no frequent itemset or no rule is found, a one-row dataframe with a
      single 'message' column is returned instead.

    Note: antecedent_txt / consequent_txt describe the WHOLE itemset (items
    converted to str, sorted, joined with ", "). For a single-item side with
    string product ids this is the product id itself.
    """

    basket = df.groupby(['job', 'product_id'])['qty'].sum().unstack().fillna(0)  # job-product matrix
    basket = basket > 0                                                           # convert quantities to booleans

    itemsets = fpgrowth(basket, min_support=min_support, use_colnames=True)       # frequent itemsets
    if itemsets.empty:                                                            # if nothing is frequent
        return pd.DataFrame({"message": ["No frequent itemsets found"]})          # return message instead

    rules = association_rules(itemsets, metric="lift", min_threshold=0)           # generate association rules
    if rules.empty:                                                               # if no rules are found
        return pd.DataFrame({"message": ["No association rules found"]})          # return message instead

    rules["score"] = rules["lift"] * rules["confidence"]                          # combined score = lift x confidence
    # .copy() so the columns added below are written to an independent frame
    rules = rules.sort_values(by="score", ascending=False).head(top_n).copy()     # keep only top_n best rules

    def _itemset_label(items):
        # A rule side is a frozenset that may hold several products; show all
        # of them, sorted so the label does not depend on set iteration order.
        return ", ".join(sorted(str(item) for item in items))

    rules["antecedent_txt"] = rules["antecedents"].apply(_itemset_label)          # convert antecedent set to text
    rules["consequent_txt"] = rules["consequents"].apply(_itemset_label)          # convert consequent set to text

    # Build the product -> jobs lookup once instead of filtering df per rule
    jobs_by_product = {
        product: set(jobs.dropna())
        for product, jobs in df.groupby("product_id")["job"]
    }

    def get_jobs_supporting_rule(row):                                            # helper: jobs that support a rule
        items = row["antecedents"] | row["consequents"]                          # every product involved in the rule
        job_sets = [jobs_by_product.get(item, set()) for item in items]
        if not job_sets:
            return []
        return sorted(set.intersection(*job_sets))                               # jobs that bought all of them

    rules["jobs_supporting_rule"] = rules.apply(get_jobs_supporting_rule, axis=1) # add supporting jobs to each rule
    rules["num_jobs"] = rules["jobs_supporting_rule"].apply(len)

    return rules[[
        "antecedent_txt",                                                        # product(s) on the left side of rule
        "consequent_txt",                                                        # product(s) on the right side of rule
        "confidence",                                                            # confidence of the rule
        "lift",                                                                  # lift of the rule
        "score",                                                                 # combined score (lift x confidence)
        "jobs_supporting_rule",                                                  # list of jobs supporting the rule
        "num_jobs"
    ]]

In [67]:
def spatial_analysis_by_city(df, city_col='city', amount_col='amount', top_n=15):
    """
    Summarise invoice activity per city.

    Input:
      df         : invoice dataframe
      city_col   : name of the column holding the city
      amount_col : name of the column holding the invoice amount
      top_n      : accepted for compatibility; not used by this function
    Output:
      a dataframe with one row per city and the columns <city_col>,
      'invoice_count' (number of rows for the city) and 'total_revenue'
      (sum of amount_col), sorted by 'invoice_count' in descending order.
      No chart is drawn here; plotting is left to the caller.
    """
    per_city = df.groupby(city_col).agg(
        invoice_count=pd.NamedAgg(column=city_col, aggfunc='count'),
        total_revenue=pd.NamedAgg(column=amount_col, aggfunc='sum'),
    )
    flat = per_city.reset_index()        # city back as a regular column
    return flat.sort_values(by='invoice_count', ascending=False)

In [68]:
def temporal_analysis(df, last_n_years=5, window_months=6, forecast_months=0):
    """
    Function for temporal analysis + optional Prophet forecasting

    Input:
      df              : invoice dataset with a datetime 'invoice_date' column
                        and an 'amount' column
      last_n_years    : number of most recent calendar years to keep
      window_months   : window size (in rows of the monthly series) of the
                        rolling-mean trend
      forecast_months : number of future months to forecast; 0 disables
                        forecasting
    Output:
      monthly  : dataframe with 'date' (first day of the month), 'amount'
                 (monthly revenue) and 'trend' (rolling mean, NaN until the
                 window is full)
      forecast : dataframe returned by Prophet's predict(), or None when
                 forecast_months is 0
    """

    df = df.copy()                                                                  # work on a local copy

    max_year = df["invoice_date"].dt.year.max()                                     # most recent year in dataset
    min_year = max_year - last_n_years + 1                                          # oldest year to keep
    df = df[df["invoice_date"].dt.year >= min_year]                                 # filter selected years

    monthly = (                                                                     # compute monthly revenue
        df.groupby(df["invoice_date"].dt.to_period("M"))["amount"]
        .sum()
        .to_timestamp()                                                             # period -> timestamp at month start
        .reset_index(name="amount")
        .rename(columns={"invoice_date": "date"})                                    # rename for clarity
    )

    monthly["trend"] = monthly["amount"].rolling(window=window_months).mean()        # rolling mean trend

    if forecast_months > 0:                                                          # forecasting enabled?
        prophet_df = monthly.rename(columns={"date": "ds", "amount": "y"})            # Prophet column format
        model = Prophet(yearly_seasonality=True)                                      # Prophet model
        model.fit(prophet_df[["ds", "y"]])                                            # train model
        # The history is stamped on the first day of each month, so the future
        # dates must be month starts ("MS") as well. With month-end dates the
        # first forecast point would fall inside the last observed month and
        # the horizon would end one month early.
        future = model.make_future_dataframe(periods=forecast_months, freq="MS")      # extend timeline
        forecast = model.predict(future)                                              # generate forecast
    else:
        forecast = None                                                               # no forecasting

    return monthly, forecast

In [69]:
# Main pipeline
def main():
    """
    Run the whole analysis and collect every result in one dictionary.

    Reads "invoices.csv" and "CitiesofFrance-VillesdeFrance.csv" from the
    working directory, prints the dataset overview, then computes each
    indicator in turn.

    Output: dict with the keys "invoice", "monthly_sales", "top10",
            "avg_basket", "rules", "monthly", "forecast" and "city_amount"
    """

    # ---- 1. Load and preprocess data ----
    invoice = load_data("invoices.csv")
    city_names = load_data("CitiesofFrance-VillesdeFrance.csv").iloc[:, 0].dropna().unique()

    # The 'city' column is overwritten with cities drawn at random (with
    # replacement) from the first column of the cities file. The draw is
    # unseeded, so city-level results differ from one run to the next.
    invoice['city'] = np.random.choice(city_names, size=len(invoice), replace=True)

    basic_info(invoice)                      # overview is printed on the raw (unparsed) data
    invoice = preprocess_dates(invoice)

    results = {"invoice": invoice}

    # ---- 2. Core indicators ----
    results["monthly_sales"] = sales_by_month(invoice)
    results["top10"] = top_products(invoice)
    results["avg_basket"] = average_basket(invoice)

    # ---- 3. Pattern mining (job-based rules) ----
    results["rules"] = pattern_mining_by_job(invoice, min_support=0.02, top_n=10)

    # ---- 4. Temporal analysis + Prophet forecasting ----
    results["monthly"], results["forecast"] = temporal_analysis(
        invoice,
        last_n_years=10,
        window_months=6,
        forecast_months=12,
    )

    # ---- 5. Per-city aggregation (invoice count and revenue) ----
    results["city_amount"] = spatial_analysis_by_city(invoice)

    print("Pipeline executed successfully.")

    return results

In [71]:
from dash import Dash, dcc, html, dash_table
import dash_bootstrap_components as dbc
import plotly.express as px

# Main execution
# Every statement below depends on the pipeline outputs, so the whole
# dashboard is built under the same guard as the call to main(). Previously
# only `data = main()` was guarded, so importing this file raised a NameError
# on the first use of `data`.
if __name__ == "__main__":
    data = main()

    # --------- Unpack the pipeline outputs ---------
    top10 = data["top10"]
    rules = data["rules"]
    monthly = data["monthly"]
    forecast = data["forecast"]
    city = data["city_amount"]

    # ============================================
    # 1. FIGURE Top 10 Products
    # ============================================
    fig_top10 = px.bar(
        top10,
        x="product_id",
        y="amount",
        title="Top 10 Products"
    )

    # ============================================
    # 2. FIGURE Pattern Mining Rules
    # ============================================
    if "message" in rules.columns:
        # pattern_mining_by_job found nothing and returned a one-column
        # "message" frame; px.scatter would fail on the missing columns, so
        # show the message in an empty figure instead.
        fig_rules = go.Figure()
        fig_rules.update_layout(
            title="Association Rules (Products Bought Together)",
            annotations=[dict(
                text=str(rules["message"].iloc[0]),
                showarrow=False,
                xref="paper",
                yref="paper",
                x=0.5,
                y=0.5
            )]
        )
    else:
        fig_rules = px.scatter(
            rules,
            x="confidence",
            y="lift",
            size="score",
            color="consequent_txt",
            hover_name="antecedent_txt",
            hover_data=["num_jobs"],
            title="Association Rules (Products Bought Together)"
        )

    # ============================================
    # 3. FIGURE Temporal Analysis (monthly + trend + forecast)
    # ============================================

    fig_temp = px.line(
        monthly,
        x="date",
        y="amount",
        title="Monthly Sales (Last 10 Years)"
    )
    # px.line with a single series hides the legend entry; name it and show it
    fig_temp.data[0].name = "Monthly Sales"
    fig_temp.data[0].showlegend = True

    # Add trend line
    fig_temp.add_scatter(
        x=monthly["date"],
        y=monthly["trend"],
        mode="lines",
        name="Trend (Rolling Mean)",
        line=dict(color="orange", width=3)
    )

    # Add forecast
    if forecast is not None:
        fig_temp.add_scatter(
            x=forecast["ds"],
            y=forecast["yhat"],
            mode="lines",
            name="Forecast",
            line=dict(color="green", width=2)
        )
        # Only the dates after the last observed month get an uncertainty band
        future = forecast[forecast["ds"] > monthly["date"].max()]

        # Closed polygon: upper bound left-to-right, then lower bound
        # right-to-left, filled to draw the interval as a shaded area
        fig_temp.add_scatter(
            x=list(future["ds"]) + list(future["ds"][::-1]),
            y=list(future["yhat_upper"]) + list(future["yhat_lower"][::-1]),
            fill="toself",
            fillcolor="rgba(0, 128, 0, 0.15)",
            line=dict(color="rgba(0,0,0,0)"),
            hoverinfo="skip",
            name="Confidence Interval"
        )

    # ============================================
    # 4. FIGURE City activity (invoice count, coloured by revenue)
    # ============================================

    fig_city = px.bar(
        city.head(15),
        x='city',
        y='invoice_count',
        color='total_revenue',
        title='Spatial Distribution of Invoice Activity by City',
        labels={
                'city': 'City',
                'invoice_count': 'Number of Invoices',
                'total_revenue': 'Total Revenue'
        }
        )
    fig_city.update_layout(
        xaxis_tickangle=-45,
        margin=dict(r=0, t=40, l=0, b=0)
        )

    # ============================================
    # DASH APP LAYOUT
    # ============================================

    app = Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])

    app.layout = html.Div([

        html.H1("Invoices Dashboard", style={"textAlign": "center"}),

        html.H2("Top 10 Products"),
        dcc.Graph(figure=fig_top10),

        html.H2("Pattern Mining (Job-Based Rules)"),
        dcc.Graph(figure=fig_rules),

        html.H2("Temporal Analysis (Monthly + Trend + Forecast)"),
        dcc.Graph(figure=fig_temp),

        html.H2("City Clustering (Commercial Activity)"),
        dcc.Graph(figure=fig_city)
    ])

    app.run(debug=True)

Shape: (10000, 11)

Columns: ['first_name', 'last_name', 'email', 'product_id', 'qty', 'amount', 'invoice_date', 'address', 'city', 'stock_code', 'job']

Data types:
first_name       object
last_name        object
email            object
product_id        int64
qty               int64
amount          float64
invoice_date     object
address          object
city             object
stock_code        int64
job              object
dtype: object

Missing values per column:
first_name      0
last_name       0
email           0
product_id      0
qty             0
amount          0
invoice_date    0
address         0
city            0
stock_code      0
job             0
dtype: int64

Descriptive statistics:

Top 5 values for first_name:
first_name
David Williams        6
Daniel Johnson        5
Melissa Johnson       5
Michael Brown         5
Christopher Garcia    5
Name: count, dtype: int64

Top 5 values for city:
city
Brémoncourt           4
Les Auxons            4
Chaillé-les-Marais    4
Vill

10:58:18 - cmdstanpy - INFO - Chain [1] start processing
10:58:18 - cmdstanpy - INFO - Chain [1] done processing


Pipeline executed successfully.
