# Five-Number Summary

In [None]:
import json

import numpy as np
import pandas as pd
from IPython.display import display

from constants import RAW_DATA, SUMMARY_DATA, cols, df_types

- https://en.wikipedia.org/wiki/Five-number_summary
- https://numpy.org/doc/stable/reference/generated/numpy.percentile.html
- https://en.wikipedia.org/wiki/Quartile
- https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.unstack.html
- https://stackoverflow.com/a/33233406

## Setup

In [None]:
def get_five_number_summary(data):
    """Return the five-number summary of ``data`` as a plain Python list.

    The result holds the 0th, 25th, 50th, 75th and 100th percentiles
    (minimum, lower quartile, median, upper quartile, maximum), in that
    order, computed with NumPy's ``"midpoint"`` percentile method.
    """
    percentile_ranks = (0, 25, 50, 75, 100)
    summary = np.percentile(data, percentile_ranks, method="midpoint")
    # Convert to built-in Python numbers so the result is JSON-serializable.
    return summary.tolist()

In [None]:
# Load only the columns listed in `cols` from the "Worksheet" sheet, applying
# the dtypes in `df_types` and skipping the last row of the sheet.
df = pd.read_excel(
    RAW_DATA,
    sheet_name="Worksheet",
    usecols=cols,
    dtype=df_types,
    skipfooter=1,
    index_col=None,
    verbose=True,
)

# Drop leading/trailing whitespace from the column names.
df.columns = df.columns.str.strip()

## Data Analysis

In [None]:
# (rows, columns) of the loaded data.
df.shape

In [None]:
# Preview the first rows of the loaded data.
df.head()

In [None]:
# Column dtypes after loading (requested through `df_types` in `read_excel`).
df.dtypes

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Sample of the rows that have no "Bricks produced" value.
df[df["Bricks produced"].isna()].head()

In [None]:
# Sample of the rows whose "CO2 saved" is exactly 0.
df[df["CO2 saved"] == 0].head()

In [None]:
# Shape of the subset with "Total jobs" equal to 0; the first element is the
# number of such rows.
df[df["Total jobs"] == 0].shape

In [None]:
# For each "Type", count how often each "Total jobs" value occurs, then pivot
# the job values into columns. The column display limit is lifted for this
# cell only so the whole table is shown.
with pd.option_context("display.max_columns", None):
    display(df.groupby("Type")["Total jobs"].value_counts().unstack())

In [None]:
# Descriptive statistics for "Bricks produced".
df["Bricks produced"].describe()

In [None]:
# Descriptive statistics for "Total jobs".
df["Total jobs"].describe()

In [None]:
# Descriptive statistics for "CO2 saved".
df["CO2 saved"].describe()

## Data Processing

Filter out projects with missing brick information (and consequently no `CO2 saved` value) before computing the bricks and carbon summaries.

For the `Total jobs` column, keep all values, including the zeros. Keep in mind that there are different `Type`s of projects. Also, some rows have `Total jobs` values greater than 0 even though they have no information about the number of bricks, so the jobs summary is computed from the unfiltered data.

In [None]:
# Keep only the rows that have a "Bricks produced" value.
filtered_df = df.loc[df["Bricks produced"].notna()]

In [None]:
# filtered_df.shape
# filtered_df.isna().sum()

In [None]:
# Five-number summary of "Bricks produced", using only rows that have a value.
bricks_summary = get_five_number_summary(filtered_df["Bricks produced"])
bricks_summary

In [None]:
# Five-number summary of "Total jobs". This reads the unfiltered `df`, so rows
# without brick data still contribute.
# NOTE(review): assumes "Total jobs" has no missing values — confirm against
# the `df.isna().sum()` output.
jobs_summary = get_five_number_summary(df["Total jobs"])
jobs_summary

In [None]:
# Five-number summary of "CO2 saved", restricted to rows that have a
# "Bricks produced" value.
carbon_summary = get_five_number_summary(filtered_df["CO2 saved"])
carbon_summary

## Output

In [None]:
# Collect the three summaries under the keys written to the output file.
output = {
    "bricks": bricks_summary,
    "jobs": jobs_summary,
    "carbon": carbon_summary,
}
output

In [None]:
# Write the summary as compact JSON (no whitespace after separators).
# The encoding is pinned to UTF-8 because `ensure_ascii=False` emits non-ASCII
# characters verbatim, and the platform default encoding is not guaranteed to
# be UTF-8.
with open(SUMMARY_DATA, "w", encoding="utf-8") as fp:
    json.dump(output, fp, ensure_ascii=False, separators=(",", ":"))

---