### Big data course project
<strong>T2: Number of rides per year</strong>

Jovana Videnovic & Haris Kupinic

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import dask.dataframe as dd
import duckdb
from pathlib import Path

In [None]:
# Global matplotlib styling shared by every figure in this notebook:
# visible tick marks on the bottom/left axes, serif fonts, fixed label
# sizes and a white axes background.
plt.rcParams.update({
    'xtick.bottom': True,
    'ytick.left': True,
    'font.family': 'serif',
    'axes.labelsize': 12,
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
    'axes.facecolor': 'w',
})

In [None]:
# Service/dataset identifier. It selects the input sub-directory that is read,
# names the output sub-directories for figures and tables, and (capitalized)
# is used as the plot title.
service_type = "fhvhv"

In [None]:
# Root of the partitioned input data; the dataset for the selected service
# type is read from the sub-directory named after `service_type`.
part_df_path = Path("/d/hpc/projects/FRI/bigdata/students/jv8043/partitioned_data")

# Output locations for figures and CSV tables, one sub-directory per service type.
vis_path = Path("/d/hpc/home/jv8043/BD/project/T2/T2_vis", service_type)
tables_path = Path("/d/hpc/home/jv8043/BD/project/T2/T2_tables", service_type)

In [None]:
# Open the parquet dataset of the selected service type as a dask DataFrame
# (nothing is materialised until .compute() is called further down).
# NOTE(review): `assume_missing` is documented for dd.read_csv; confirm that
# read_parquet with the pyarrow engine actually honours it.
df = dd.read_parquet(
    part_df_path.joinpath(service_type),
    engine="pyarrow",
    assume_missing=True,
)

In [None]:
# Parse the pickup timestamps (values that cannot be parsed are coerced to
# NaT), discard rows left without a valid timestamp, then derive the
# calendar year of each ride.
df = df.assign(
    pickup_datetime=dd.to_datetime(df['pickup_datetime'], errors='coerce')
).dropna(subset=['pickup_datetime'])
df = df.assign(year=df['pickup_datetime'].dt.year)

In [None]:
# Count the rows (rides) in each year; .compute() triggers the dask
# computation and returns the per-year counts indexed by year.
rides_per_year = df.groupby('year').size().compute()

In [None]:
# Median of the yearly ride counts; drawn later as a horizontal reference
# line on the bar chart.
overall_median = rides_per_year.median()

In [None]:
# Sort by the year index so the CSV rows and the bars appear in
# chronological order.
rides_per_year = rides_per_year.sort_index()

In [None]:
# Save the per-year ride counts as a CSV table (index column = year,
# value column labelled "rides").
# pandas' to_csv does not create missing parent directories, so make sure
# the output directory exists first (the figure directory gets the same
# treatment before the plot is saved).
os.makedirs(tables_path, exist_ok=True)
rides_per_year.to_csv(tables_path / "rides_per_year.csv", index=True, header=["rides"])

In [None]:
# Bar chart of the number of rides per year, with the overall median drawn
# as a dashed horizontal reference line.
fig, ax = plt.subplots(figsize=(6, 3))
rides_per_year.plot.bar(ax=ax, color='salmon')
ax.axhline(overall_median, label='Overall median', color='gray', linestyle='--')

# Axis labels and title; year tick labels are rotated 45 degrees and
# right-aligned so they do not overlap.
ax.set_xlabel("Year")
ax.set_ylabel("Number of rides")
ax.set_xticklabels(rides_per_year.index, rotation=45, ha='right')
ax.set_title(service_type.capitalize())
ax.legend(loc='upper right')

# Keep only the left/bottom spines, drawn in mid-gray.
for side in ('left', 'bottom'):
    ax.spines[side].set_color('0.5')
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

# Light dashed horizontal grid lines to make bar heights easier to read.
ax.yaxis.grid(True, linestyle='--', color='0.8')

# Optional: annotate each bar with its value (currently disabled).
# for i, val in enumerate(rides_per_year):
#     ax.text(i, val + val * 0.01, f"{val:,}", ha='center', va='bottom', fontsize=8)

# Write the figure to the per-service output directory, creating it if needed.
fig.tight_layout()
os.makedirs(vis_path, exist_ok=True)
fig.savefig(vis_path / "rides_per_year.pdf", dpi=300, bbox_inches='tight')

plt.show()