In [None]:
from pathlib import Path
import pandas as pd

import plotly.express as px

In [None]:
# Absolute, forward-slash path of the "data" folder under the current working
# directory; later cells build CSV paths from this string.
dataset_dir = Path().resolve().joinpath("data").as_posix()

In [None]:
# Load the rail-station CSV from the data folder and preview
# its first five rows (the default for head()).
data = pd.read_csv(f"{dataset_dir}/processed_data_2022_rail_stations.csv")
data.head()

In [None]:
# Print the frame summary (columns, non-null counts, dtypes, memory usage).
data.info()

In [None]:
# Count the missing values in each column.
data.isna().sum()

In [None]:
# Distinct values present in the 'year' column.
data['year'].unique()

In [None]:
# Distinct values present in the 'month' column.
data['month'].unique()

In [None]:
# Distinct values present in the 'line' column.
data['line'].unique()

In [None]:
# Distinct line names as a plain Python list (order of first appearance),
# followed by how many there are.
lines = data['line'].drop_duplicates().tolist()
len(lines)

### Analysis of rail line data

In [None]:
# Keep only the rows whose 'line' value is the M2 line; every later cell
# in this section works on this subset.
line_df = data.loc[data['line'].eq('M2-YENIKAPI-HACIOSMAN')]
line_df

In [None]:
# Sum 'passanger_cnt' per (month, station) and flatten the group keys back
# into ordinary columns.
monthly_passenger_cnts = (
    line_df
    .groupby(['month', 'station_name'])
    .agg({'passanger_cnt': 'sum'})
    .reset_index()
)
monthly_passenger_cnts

In [None]:
# Sum 'passage_cnt' per age group, smallest total first.
# NOTE(review): this cell aggregates 'passage_cnt' while the other cells in this
# notebook aggregate 'passanger_cnt' — confirm this is the intended column.
# `age_frame` is reassigned by a later cell before it is plotted.
age_frame = (
    line_df
    .groupby(['age'])
    .agg({'passage_cnt': 'sum'})
    .sort_values('passage_cnt')
)
age_frame

In [None]:
# Sum 'passanger_cnt' per (date, month, week_number, day_of_week, station).
# Because 'date' is one of the keys, each row is a per-date, per-station total;
# month / week_number / day_of_week are presumably derived from the date and
# are kept as keys so they survive as columns for filtering and colouring.
weekly_passenger_cnts = (
    line_df
    .groupby(['date', 'month', 'week_number', 'day_of_week', 'station_name'])
    .agg({'passanger_cnt': 'sum'})
    .reset_index()
)
weekly_passenger_cnts

In [None]:
# Sample slice used by the chart below: rows with month 'January' and
# week_number 1.
example_data = weekly_passenger_cnts.loc[
    weekly_passenger_cnts['month'].eq('January')
    & weekly_passenger_cnts['week_number'].eq(1)
]
example_data

In [None]:
# Smallest and largest 'date' values in the sample slice; used in the chart title.
min_date, max_date = example_data['date'].min(), example_data['date'].max()

In [None]:
# Bar chart of passenger counts per station for the sample week, with bars
# coloured by day_of_week. The title shows the smallest and largest date found
# in example_data.
fig = px.bar(
    example_data,
    x='station_name',
    y='passanger_cnt',  # data column keeps its 'passanger_cnt' spelling
    color='day_of_week',
    title=f'Number of Passengers for Each Day for Week ({min_date} - {max_date}) by Stations',
)
fig.show()

In [None]:
# Total 'passanger_cnt' per age group, sorted from smallest to largest total.
# The result stays indexed by 'age'.
age_frame = (
    line_df
    .groupby(['age'])
    .agg({'passanger_cnt': 'sum'})
    .sort_values('passanger_cnt')
)
age_frame

In [None]:
# Total 'passanger_cnt' per (age group, station), with the group keys turned
# back into regular columns.
age_stations_frame = (
    line_df
    .groupby(['age', 'station_name'])
    .agg({'passanger_cnt': 'sum'})
    .reset_index()
)
age_stations_frame

In [None]:
# Bar chart of per-station passenger totals, restricted to rows whose 'age'
# value is '20-30'.
fig = px.bar(
    age_stations_frame[age_stations_frame['age'] == '20-30'],
    x='station_name',
    y='passanger_cnt',  # data column keeps its 'passanger_cnt' spelling
    title='Number of Passengers by Stations for Age Group (20-30)',
)
fig.show()

In [None]:
# Bar chart of total passengers per age group; age_frame is indexed by 'age',
# so the index supplies the x axis.
fig = px.bar(
    age_frame,
    x=age_frame.index,
    y='passanger_cnt',  # data column keeps its 'passanger_cnt' spelling
    title='Number of Passengers by Age Group',
)
# Explicit light-to-dark blue palette, one entry per bar.
# NOTE(review): the list has five colours, so this assumes five age groups — confirm.
fig.update_traces(marker_color=['#ADD8E6', '#87CEEB', '#4682B4', '#483D8B', '#191970'])
fig.show()

In [None]:
# Load the holiday table from the data folder; later cells read its
# 'Holiday' and 'Date' columns.
tr_holidays = pd.read_csv(f"{dataset_dir}/tr_holidays.csv")
tr_holidays

In [None]:
# Look up the 'Date' of the holiday whose 'Holiday' value equals `text`.
# Only the first matching row is used; indexing [0] raises IndexError when
# no row matches.
text = "New Year's Day"
holiday_date = tr_holidays.loc[tr_holidays['Holiday'] == text, 'Date'].values[0]
holiday_date

In [None]:
# Per-station passenger totals for the rows whose 'date' equals the holiday date.
# NOTE(review): the equality test assumes both sides use the same date format — confirm.
holiday_passenger_cnt = (
    line_df
    .loc[line_df['date'] == holiday_date]
    .groupby(['station_name'])
    .agg({'passanger_cnt': 'sum'})
)
holiday_passenger_cnt