# 01 Explore data

In [None]:
from pathlib import Path
from collections import Counter

import pandas as pd
import numpy as np
import geopandas as gpd

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [None]:
# Dark grid theme with lighter, dotted grid lines for every plot in this notebook.
sns.set_style(
    "darkgrid",
    rc={"grid.color": ".6", "grid.linestyle": ":"},
)

In [None]:
# Paths are resolved relative to the directory the notebook is run from.
NOTEBOOK_FOLDER = Path.cwd()
# The data directory is a sibling of the notebook directory.
DATA_FOLDER = NOTEBOOK_FOLDER.parent.joinpath("data")
# NOTE(review): the file name is spelled "supertore.csv" — confirm it matches the file on disk.
DATA_PATH = DATA_FOLDER.joinpath("supertore.csv")

## EDA

In [None]:
# Load the raw CSV; the file is read as cp1252 and its first column becomes the index.
data = pd.read_csv(
    DATA_PATH,
    encoding="cp1252",
    index_col=0,
)

In [None]:
# Summary statistics for every column; include="all" also covers the non-numeric ones.
data.describe(include="all")

In [None]:
# Percentage of missing values per column (mean of the boolean NaN mask, scaled to 0-100).
data.isna().mean().mul(100)

In [None]:
# Show each column's dtype, sorted so that columns sharing a dtype are listed together.
data.dtypes.sort_values()

We got a total of 20 columns:
* Main target for the 1st stage is to predict `Sales`.
* There are 3 columns that could be correlated with the target: `["Quantity", "Discount", "Profit"]`. These columns will be discarded at this stage.
* The `Order ID`, `Customer Name` and `Customer ID` columns could create data leakage as well. Worth investigating.
* `Country` is useless, as all deals are in the USA.
* The rest of the columns are:
> * `Categorical`:  
> > * `Spatial`: `["State", "City", "Region"]`
> > * `Product`: `["Product Name", "Category", "Sub-Category", "Product ID"]`
> > * `Other`: `["Ship Mode", "Segment"]`
> * `Numerical`:  
> > * `Temporal`: `["Order Date", "Ship Date"]`
> > * `Spatial`: `["Postal Code"]`

In [None]:
# "Sales" is the prediction target; the other three columns are the ones flagged
# above as potentially correlated with it.
TARGET_COLUMNS = [
    "Sales",
    "Discount",
    "Quantity",
    "Profit",
]

In [None]:
# "Country" carries no information (see the notes above), so remove it.
data = data.drop(["Country"], axis="columns")

In [None]:
# Histogram of the Sales target over 100 bins.
sales_ax = sns.histplot(data=data["Sales"], stat="frequency", bins=100)
sales_ax.set_title("Sales distribution")
plt.show()

In [None]:
# One histogram per column in TARGET_COLUMNS, laid out on a 2x2 grid.
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(16, 8))
for col, target_ax in zip(TARGET_COLUMNS, ax.flat):
    sns.histplot(data=data[col], stat="frequency", ax=target_ax)
    target_ax.set_title(f"{col} distribution")
# Tighten the spacing so subplot titles and tick labels do not overlap.
fig.tight_layout()
plt.show()

* `Sales` seems to follow a positively skewed distribution, highly concentrated on the left side with a long right tail.
* `Discount` is a discrete-valued field, highly concentrated on multiples of 10%.
* `Quantity` follows a sort of Gamma distribution that is concentrated on values between 0 and 7.
* The `Profit` column is highly concentrated close to 0, mostly slightly above 0, but its shape suggests a Gaussian distribution.

In [None]:
# Pairwise scatter plots (with per-column distributions on the diagonal)
# restricted to the columns in TARGET_COLUMNS.
target_frame = data[TARGET_COLUMNS]
sns.pairplot(target_frame)

### Check categorical columns

In [None]:
# Number of rows per state, in order of first appearance in the data.
state_counts = pd.Series(Counter(data["State"]))
state_ax = state_counts.plot.bar()
state_ax.set_title("State distribution")
# Keep the state names vertical so they stay readable.
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

In [None]:
# %%bash
# cd ../data/
# mkdir -p ne_100m
# mv ne_110m_admin_1_states_provinces.zip ne_100m/
# cd ne_100m
# unzip ne_110m_admin_1_states_provinces.zip
# ls

In [None]:
# Shapefile extracted by the (commented-out) bash cell above into data/ne_100m.
shapefile_path = DATA_FOLDER / "ne_100m" / "ne_110m_admin_1_states_provinces.shp"
world = gpd.read_file(shapefile_path)

In [None]:
# Index the geometries by their "name" column so the per-state aggregates assigned
# below align on it — presumably these are state names; confirm against the shapefile.
world = world.set_index("name")

In [None]:
# Mean Profit and Sales per state, computed in a single groupby pass.
state_means = data.groupby("State")[["Profit", "Sales"]].mean()
# Assignment aligns on the index: states missing from `data` end up as NaN.
world["Profit"] = state_means["Profit"]
# Sales is stored on a log scale, so "Sales" here holds log(mean Sales).
world["Sales"] = np.log(state_means["Sales"])

In [None]:
# Map coloured by the "Sales" column, which at this point holds the log of the
# mean Sales per state (see the cell that fills world["Sales"]).
world.explore(column="Sales")