In [None]:
from datetime import datetime
from pathlib import Path
from scipy.stats import mannwhitneyu, alpha

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Put matplotlib's rcParams back to their original values after importing seaborn.
sns.reset_orig()

# Project directories, all derived from the directory the notebook runs in:
# the repository root is two levels up, datasets live under
# common/resources/datasets, and figures sit next to the notebook.
root_path = Path.cwd().parent.parent
data_path = root_path.joinpath("common", "resources", "datasets")
figures_path = Path().resolve().joinpath("figures")

# Generate data

In [None]:
# Synthetic dataset: a linear trend modulated by sin^2, observed with
# Gaussian noise whose standard deviation grows linearly with x (x / 4).
n = 10000
x = np.linspace(0, 100, n)

# Noise-free curve.
y = x * (1 + np.sin(x / 10) ** 2)

# Draw the noise from a dedicated seeded generator so that re-running the
# notebook regenerates exactly the same histogram.csv instead of silently
# overwriting it with different data on every run.
noise_rng = np.random.default_rng(seed=8)
y_noisy = y + x * noise_rng.standard_normal(n) / 4

df = pd.DataFrame({"x": x, "y": y_noisy})
# to_csv keeps its default index=True here, so the row index is written as
# an unnamed first column of the file.
df.to_csv(data_path / "histogram.csv")

In [None]:
# Shared vertical range so the curve panel and the histogram panel line up.
y_range = (0, 220)

fig, axs = plt.subplots(1, 2, figsize=(8, 3))
ax_curve, ax_dist = axs

# --- Left panel: noise-free curve, a +/- x/2 band around it, and the two
# highlighted x-windows whose samples are compared on the right.
ax_curve.plot(x, y, color="firebrick", zorder=1)
ax_curve.fill_between(x, y - x / 2, y + x / 2, color="navy", alpha=0.2, zorder=0)
ax_curve.axvspan(45, 55, alpha=0.5, color="red")
ax_curve.axvspan(75, 85, alpha=0.5, color="navy")
ax_curve.set_ylim(*y_range)

# --- Right panel: noisy samples falling inside each window, drawn as
# horizontal density histograms on the same y scale as the left panel.
in_first_window = (x >= 45) & (x <= 55)
in_second_window = (x >= 75) & (x <= 85)
dist1 = y_noisy[in_first_window]
dist2 = y_noisy[in_second_window]

shared_hist_style = dict(bins=20, orientation="horizontal", alpha=0.8, density=True)
ax_dist.hist(dist1, color="firebrick", **shared_hist_style)
ax_dist.hist(dist2, color="navy", **shared_hist_style)
ax_dist.set_ylim(*y_range)

# Strip the frame and ticks from the histogram panel; only its left spine stays.
ax_dist.spines[["right", "top", "bottom"]].set_visible(False)
ax_dist.set_xticks([])
ax_dist.set_yticks([])

# Pull the two panels close together.
fig.subplots_adjust(wspace=0.05)

# The second version of the task

In [None]:
# Define seasonal multiplier
def seasonal_multiplier(month, generator: np.random.Generator):
    """Draw a random sales multiplier for the season containing ``month``.

    Args:
        month: Calendar month number (the caller passes ``date.month``).
        generator: NumPy random generator; exactly one ``uniform`` sample is
            drawn from it per call.

    Returns:
        A sample from ``uniform(1.5, 2.0)`` for months 6-8 (summer),
        ``uniform(1.0, 1.5)`` for months 3-5 and 9-11 (spring and fall), and
        ``uniform(0.5, 1.0)`` for any other value (winter).
    """
    summer_months = (6, 7, 8)
    spring_fall_months = (3, 4, 5, 9, 10, 11)

    if month in summer_months:
        low, high = 1.5, 2.0
    elif month in spring_fall_months:
        low, high = 1.0, 1.5
    else:
        low, high = 0.5, 1.0
    return generator.uniform(low, high)

# Define daily pattern
def daily_multiplier(hour, generator: np.random.Generator):
    """Draw a random sales multiplier for the given hour of the day.

    Args:
        hour: Hour of day (the caller passes ``date.hour``).
        generator: NumPy random generator; exactly one ``uniform`` sample is
            drawn from it per call.

    Returns:
        A sample from ``uniform(1.5, 2.0)`` for hours 11-20 inclusive (peak),
        ``uniform(1.0, 1.5)`` for hours 7-10 (morning), and
        ``uniform(0.3, 0.8)`` for every other hour (late night / early morning).
    """
    # Anything outside the 07:00-20:00 trading window is a low-sales hour.
    if not 7 <= hour <= 20:
        return generator.uniform(0.3, 0.8)
    # Inside the window, hours before 11 are the moderate morning period.
    if hour < 11:
        return generator.uniform(1.0, 1.5)
    # Remaining hours (11-20) are the lunchtime and evening peak.
    return generator.uniform(1.5, 2.0)

# Define weekly pattern
def weekly_multiplier(day, generator: np.random.Generator):
    """Draw a random sales multiplier for the given day of the week.

    Args:
        day: Day-of-week index in ``datetime.weekday()`` numbering, i.e.
            Monday = 0 through Sunday = 6 (this is what the caller passes).
        generator: NumPy random generator; exactly one ``uniform`` sample is
            drawn from it per call.

    Returns:
        A sample from ``uniform(1.3, 1.8)`` for Friday through Sunday and
        from ``uniform(0.8, 1.2)`` for Monday through Thursday.
    """
    # With Monday = 0, Friday is index 4; a threshold of 5 would leave Friday
    # in the weekday group.
    if day >= 4:  # Higher sales on weekends (Fri-Sun)
        return generator.uniform(1.3, 1.8)
    else:  # Lower sales on weekdays (Mon-Thu)
        return generator.uniform(0.8, 1.2)

def generate_sales(date: datetime, generator: np.random.Generator, min_sales=0, max_sales=30):
    """Simulate the sales count for a single timestamp.

    A random integer base amount is scaled by the seasonal, hourly and
    day-of-week multipliers for ``date`` and rounded to a whole number.

    Args:
        date: Timestamp to simulate; its ``month``, ``hour`` and
            ``weekday()`` select the multipliers.
        generator: NumPy random generator shared by the base draw and the
            three multiplier draws (four draws per call, in that order).
        min_sales: Lower bound of the base amount (inclusive).
        max_sales: Upper bound of the base amount; ``Generator.integers``
            excludes it by default, so the base never equals ``max_sales``.

    Returns:
        The rounded product of the base amount and the three multipliers.
    """
    scaled = generator.integers(min_sales, max_sales)  # base sales amount

    # The tuple is evaluated left to right, so the generator is consumed in
    # the order season -> hour -> weekday.
    factors = (
        seasonal_multiplier(date.month, generator=generator),
        daily_multiplier(date.hour, generator=generator),
        weekly_multiplier(date.weekday(), generator=generator),
    )
    for factor in factors:
        scaled = scaled * factor
    return round(scaled)


# One seeded generator drives both cities, so the order of the two ``apply``
# calls below determines which random draws each city receives.
generator = np.random.default_rng(seed=8)

# Hourly timestamps for Yerevan (second half of 2024). The range ends at 23:00
# on 31 December so the last day contributes a full 24 hourly rows; ending at
# midnight would leave that day with a single row and a spuriously tiny
# daily total after aggregation.
yerevan_sales = pd.DataFrame(
    {
        'date': pd.date_range(datetime(2024, 7, 1), datetime(2024, 12, 31, 23), freq='h'),
        'city': 'Yerevan',
    },
)

# Yerevan uses a lower base-sales ceiling (max_sales=24) than the default 30.
yerevan_sales['sales'] = yerevan_sales['date'].apply(generate_sales, generator=generator, max_sales=24)

# Hourly timestamps for Belgrade (all of 2024), again ending at 23:00 so the
# final day is complete.
belgrade_sales = pd.DataFrame(
    {
        'date': pd.date_range(datetime(2024, 1, 1), datetime(2024, 12, 31, 23), freq='h'),
        'city': 'Belgrade',
    }
)

belgrade_sales['sales'] = belgrade_sales['date'].apply(generate_sales, generator=generator)

# Stack both cities and collapse the hourly rows into one total per
# (calendar day, city).
sales = pd.concat([yerevan_sales, belgrade_sales], axis=0)
sales = sales.groupby([sales['date'].dt.date, 'city'])['sales'].sum().reset_index()

sales.to_csv(data_path / "sales.csv", index=False)

# Mann-Whitney U test p-value comparing the daily totals of the two cities.
print(
    mannwhitneyu(
        sales[sales['city'] == 'Yerevan']['sales'],
        sales[sales['city'] == 'Belgrade']['sales'],
    ).pvalue
)

# Histogram bin edges every 100 units, from the hundred at or below the
# smallest daily total to the hundred above the largest.
start = (sales['sales'].min() // 100) * 100
end = ((sales['sales'].max() // 100) + 1) * 100

bins = list(range(start, end + 1, 100))

# Raw counts first, then per-city probabilities (common_norm=False normalises
# each city separately, which matters because the cities cover different
# numbers of days).
sns.displot(sales, x='sales', hue='city', bins=bins)
sns.displot(sales, x='sales', hue='city', bins=bins, common_norm=False, stat='probability')

In [None]:
from matplotlib import gridspec

# Two stacked panels: a thin strip showing the individual daily totals
# (ax_ind) above the per-city probability histograms (ax_hist).
fig, (ax_ind, ax_hist) = plt.subplots(2, 1, height_ratios=[1, 10])

# Per-city styling: fill colour, outline colour, and on which side of the
# median line the median label is placed.
color_map = {
    "Yerevan": "pink",
    "Belgrade": "grey"
}

edge_color_map = {
    "Yerevan": "crimson",
    "Belgrade": "black"
}

# Horizontal alignment of the median label ...
position_map = {
    "Yerevan": "right",
    "Belgrade": "left",
}

# ... and the direction of its 25-unit offset from the median line.
sign_map = {
    "Yerevan": -1,
    "Belgrade": 1,
}

for city in sales['city'].unique():
    city_sales = sales[sales['city'] == city]['sales']
    city_median = city_sales.median()

    # Every observation weighs 1/N, so each city's bars sum to 1 regardless
    # of how many days it covers.
    weights = np.ones_like(city_sales) / city_sales.shape[0]

    ax_hist.hist(x=city_sales, alpha=0.5, weights=weights, label=city, bins=bins, color=color_map[city], edgecolor=edge_color_map[city], histtype="step")

    ax_hist.axvline(city_median, linestyle='dashed', linewidth=1.5, label='Median', color=edge_color_map[city])

    ax_hist.text(city_median + sign_map[city] * 25, 0.005, city_median, horizontalalignment=position_map[city], color=edge_color_map[city])

    # Individual observations: Yerevan on the row y=0.1, Belgrade on y=0.2.
    ax_ind.scatter(city_sales, np.zeros_like(city_sales) + (0.1 if city == "Belgrade" else 0) + 0.1, alpha=0.1, label=f'{city} Observations',
                    color=color_map[city], edgecolors=edge_color_map[city])


ax_hist.set_ylabel('Probability')
ax_hist.set_xlabel('Sales')
ax_hist.legend()

# The two axes do not share an x-axis, so copy the histogram's limits onto
# the strip; otherwise the points would not line up with the bins below.
ax_ind.set_xlim(ax_hist.get_xlim())
ax_ind.set_ylim(0, 0.3)
ax_ind.spines[["top", "bottom", "left", "right"]].set_visible(False)
ax_ind.set_xticks([])
ax_ind.set_yticks([])

# plt.show() rather than fig.show(): Figure.show() only works with a GUI
# backend and emits a warning under non-GUI notebook backends.
plt.show()