In [None]:
import re
from glob import glob

import matplotlib.pyplot as plt
import numpy as np

%matplotlib widget

# Raw strings keep the regex escapes (\(, \., "\ ") intact; in a normal string
# literal they are invalid escape sequences and trigger a SyntaxWarning on
# recent Python versions.

# Captures the number following "CO2 (kg) :", with an optional minus sign
# (possibly separated from the digits by one space), digits, commas and dots.
co2_regex = re.compile(r"CO2 \(kg\) : +(-? ?[0-9,\.]+)")
# Captures, on a line of its own, an upper-case part followed by up to three
# capitalised words; the two groups are the upper-case part and the rest.
user_regex = re.compile(r"\n([A-Z ]+) ([A-Z][a-z]+\ ?[A-Z]?[a-z]*\ ?[A-Z]?[a-z]*)\n")


def bilan_carbone(folder, silent=False):
    """Sum the CO2 values found in the ``*.txt`` files of ``folder``.

    Each file is searched with the module-level ``co2_regex`` and
    ``user_regex``; only the first match of each is used. A file contributes
    to the result only when both a CO2 value and a user are found.

    Args:
        folder: Directory holding the text files to scan (not recursive).
        silent: When true, do not print a message for files that are skipped
            because no CO2 value or no user was found.

    Returns:
        A ``(total, per_user)`` tuple. ``total`` is the sum of all counted
        values divided by 1000 (the pattern reads kilograms, so the result is
        in tonnes). ``per_user`` maps the user name (the two captured groups
        joined by a space) to that user's sum, in the same unit.

    Raises:
        ValueError: If the captured CO2 text is not a valid number once
            thousands separators and spaces are removed.
    """
    total = 0
    per_user = {}
    # Sorted so that messages and the float accumulation order are
    # reproducible from one run to the next.
    for file in sorted(glob(f"{folder}/*.txt")):
        with open(file, "r") as f:
            content = f.read()

        co2 = co2_regex.findall(content)
        if not co2:
            if not silent:
                print(f"No CO2 for {file}")
            continue

        # Drop thousands separators and the optional space that the pattern
        # allows between the minus sign and the digits: float("- 12.0") would
        # otherwise raise ValueError.
        co2 = float(co2[0].replace(",", "").replace(" ", "")) / 1000

        user = user_regex.findall(content)
        if not user:
            if not silent:
                print(f"No user {file}")
            continue

        user = " ".join(user[0])
        total += co2
        per_user[user] = per_user.get(user, 0.0) + co2
    return total, per_user


def plot(per_user, year, threshold=2):
    """Draw a bar chart of the per-user footprint for one year.

    Users are shown in ascending order of their value. Three horizontal lines
    are overlaid: ``threshold``, the mean (red) and the median (green). The
    legend title shows the rounded total.

    Args:
        per_user: Mapping of user name to footprint, as returned by
            ``bilan_carbone``.
        year: Value displayed in the figure title.
        threshold: Height of the reference line, in the same unit as the
            values. Defaults to 2, the value previously hard-coded.
            NOTE(review): presumably a per-person yearly target — confirm.

    Returns:
        The result of ``plt.show()``.
    """
    per_user = dict(sorted(per_user.items(), key=lambda item: item[1]))
    users = list(per_user.keys())
    footprints = list(per_user.values())

    fig, ax = plt.subplots(figsize=(18, 6))
    ax.set_title(f"LPP travel CO2 footprint {year}")
    ax.set_ylabel("tCO2e")
    ax.bar(users, footprints)

    # Label every reference line so the legend explains what each one is.
    ax.axhline(y=threshold, label=f"Reference = {threshold} tCO2e")
    if footprints:
        # Mean/median of an empty list are NaN (with a RuntimeWarning), so
        # these lines are skipped when there is no data.
        ax.axhline(y=np.mean(footprints), color="red", label="Mean")
        ax.axhline(y=np.median(footprints), color="green", label="Median")
    ax.tick_params(axis="x", labelrotation=90)

    # The legend is built after the artists exist so it can pick up their labels.
    ax.legend(title=f"Total = {round(np.sum(footprints))} tCO2e")

    fig.tight_layout()
    return plt.show()

# PDF extraction:

Use the following command to extract the text from the travel PDF invoices (shown here for 2021; adapt the input and output folders for other years):

In [None]:
# Convert the 2021 mission PDF invoices to text: `ls` lists the PDFs and GNU
# parallel runs one `pdf2txt` per file. `{1}` is the input PDF path and `{/.}`
# its basename without extension, so `foo.pdf` is written to out_2021/foo.txt.
# NOTE(review): presumably the out_2021 folder must already exist — confirm.
!ls /DATA/LPP_CO2/Missions/Missions_*21/*.pdf | parallel pdf2txt -t text -o /DATA/LPP_CO2/Missions/out_2021/{/.}.txt {1}

# Data extraction and analysis:

In [None]:
# One chart per year from 2018 to 2022 (the upper bound of range() is exclusive).
for year in range(2018, 2023):
    year_folder = f"/DATA/LPP_CO2/Missions/out_{year}/"
    total, per_user = bilan_carbone(folder=year_folder, silent=True)
    plot(per_user=per_user, year=year)