<a href="https://colab.research.google.com/github/haiderali2017/my_exploratory_data_analyses/blob/main/Data_Indicator_8_PUBLIC_TRANSPORT_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import numpy as np

In [None]:
# Quarterly public-transport indicator CSV hosted on data.smartdublin.ie.
DATA_URL = 'https://data.smartdublin.ie/dataset/4997223b-13b2-4c97-9e88-cd94c6d35aec/resource/2da5e1ce-2a77-40f8-8d8d-c7f03885ab6b/download/indicator-8-public-transport.csv'

# Read the raw file straight from the URL (needs network access).
df = pd.read_csv(DATA_URL)

# Data Exploration


In [None]:
# Peek at the first five rows to see the column layout of the raw file.
df.head()

Unnamed: 0,Quarter,Public Transport million trips,Public Transport million trips2,Bus Éireann,Dublin City Bus,Irish Rail,Luas,YoY Public Transport,QoQ Public Transport,%YoY Public Transport,...,Unnamed: 40,Unnamed: 41,Unnamed: 42,Unnamed: 43,Unnamed: 44,Unnamed: 45,Unnamed: 46,Unnamed: 47,Unnamed: 48,Unnamed: 49
0,Q3 07,,,,,,,,,,...,,,,,,,,,,Q3 24
1,Q4 07,,,,,,,,,,...,,,,,,,,,,69.1
2,Q1 08,,,,,,,,,,...,,,,,,,,,,9.1
3,Q2 08,,,,,,,,,,...,,,,,,,,,,
4,Q3 08,,,,,,,,,,...,,,,,,,,,,


In [None]:
# Inspect the last five rows; the trailing rows of the raw file appear to be empty.
df.tail()

Unnamed: 0,Quarter,Public Transport million trips,Public Transport million trips2,Bus Éireann,Dublin City Bus,Irish Rail,Luas,YoY Public Transport,QoQ Public Transport,%YoY Public Transport,...,Unnamed: 40,Unnamed: 41,Unnamed: 42,Unnamed: 43,Unnamed: 44,Unnamed: 45,Unnamed: 46,Unnamed: 47,Unnamed: 48,Unnamed: 49
70,,,,,,,,,,,...,,,,,,,,,,
71,,,,,,,,,,,...,,,,,,,,,,
72,,,,,,,,,,,...,,,,,,,,,,
73,,,,,,,,,,,...,,,,,,,,,,
74,,,,,,,,,,,...,,,,,,,,,,


In [None]:
# Summarise column names, non-null counts and dtypes to spot sparse or text-typed columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75 entries, 0 to 74
Data columns (total 50 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Quarter                          69 non-null     object 
 1   Public Transport million trips   32 non-null     float64
 2   Public Transport million trips2  59 non-null     float64
 3   Bus Éireann                      59 non-null     float64
 4   Dublin City Bus                  59 non-null     float64
 5   Irish Rail                       59 non-null     float64
 6   Luas                             59 non-null     float64
 7   YoY Public Transport             2 non-null      float64
 8   QoQ Public Transport             58 non-null     float64
 9   %YoY Public Transport            53 non-null     object 
 10  %QoQ Public Transport            58 non-null     object 
 11  %QoQ Bus Eireann                 58 non-null     object 
 12  %QoQ Dub City Bus       

# Data Cleaning
1. Removing unnecessary columns
2. Removing missing values
3. Removing duplicate columns
4. Removing trailing / leading spaces in column names
5. Removing '%' from column values

### 1. Removing unnecessary columns

In this step, we are dropping columns that are Unnamed as well as unnecessary.

In [None]:
# Drop every placeholder column that pandas named 'Unnamed: N' because the CSV
# header was blank there. Selecting them by prefix, rather than by hard-coded
# positions, keeps this step correct if blank columns are added or removed upstream.
unnamed_columns = [name for name in df.columns if str(name).startswith('Unnamed:')]
df = df.drop(columns=unnamed_columns)

### 2. Removing missing values

In this step, we are removing missing values.

In [None]:
# Count the missing values in each column
missing_values = df.isna().sum()
print(missing_values)
# df.info() prints its own report and returns None, so wrapping it in print()
# would append a stray "None" line to the output.
df.info()

Quarter                             6
Public Transport million trips     43
Public Transport million trips2    16
Bus Éireann                        16
Dublin City Bus                    16
Irish Rail                         16
Luas                               16
YoY Public Transport               73
QoQ Public Transport               17
%YoY Public Transport              22
%QoQ Public Transport              17
%QoQ Bus Eireann                   17
%QoQ Dub City Bus                  17
%QoQ Irish Rail                    17
%QoQ Luas                          17
Quarter.1                           6
Bus Éireann.1                      16
Dublin City Bus.1                  16
Irish Rail.1                       16
Luas.1                             16
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75 entries, 0 to 74
Data columns (total 20 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  


We drop the 'YoY Public Transport' column because it contains only 2 non-null values. If it were kept, removing rows with missing values in the next step would discard almost the entire dataset.

In [None]:
# errors='ignore' lets this cell be re-run after the column has already been removed.
df = df.drop(columns=['YoY Public Transport'], errors='ignore')

In [None]:
# Keep only rows that are complete in every remaining column, then renumber the index from 0.
# NOTE(review): 'Public Transport million trips' has far more gaps than the other
# columns and is set aside later in favour of 'Public Transport million trips2', so it
# removes many otherwise complete rows here. Consider dropna(subset=...) — the cells
# that aggregate by year would need re-checking if the surviving quarters change.
df = df.dropna().reset_index(drop=True)

In [None]:
# Show the frame after incomplete rows were removed.
df

Unnamed: 0,Quarter,Public Transport million trips,Public Transport million trips2,Bus Éireann,Dublin City Bus,Irish Rail,Luas,QoQ Public Transport,%YoY Public Transport,%QoQ Public Transport,%QoQ Bus Eireann,%QoQ Dub City Bus,%QoQ Irish Rail,%QoQ Luas,Quarter.1,Bus Éireann.1,Dublin City Bus.1,Irish Rail.1,Luas.1
0,Q3 11,44.0,44.0,1.73,28.01,6.93,7.31,-2.03,-1.80%,-4.40%,-1.80%,-6.10%,-2.10%,-0.20%,Q3 11,1734265,28006304,6927517,7312734
1,Q4 11,43.7,41.4,1.57,26.08,6.59,7.19,-2.55,-7.00%,-5.80%,-9.50%,-6.90%,-4.80%,-1.70%,Q4 11,1569327,26083337,6594505,7185784
2,Q1 12,43.2,43.6,1.7,27.81,6.85,7.28,2.2,-1.80%,5.30%,8.50%,6.60%,3.90%,1.30%,Q1 12,1702324,27806047,6849813,7278816
3,Q2 12,43.2,41.4,1.66,25.71,6.75,7.25,-2.26,-10.10%,-5.20%,-2.50%,-7.50%,-1.50%,-0.30%,Q2 12,1659864,25712304,6746734,7254176
4,Q3 12,43.8,43.6,1.64,27.64,6.89,7.43,2.23,-0.90%,5.40%,-0.90%,7.50%,2.10%,2.40%,Q3 12,1644481,27642452,6885962,7427313
5,Q4 12,43.7,45.6,1.71,29.55,6.84,7.53,2.03,10.10%,4.70%,4.10%,6.90%,-0.70%,1.40%,Q4 12,1712196,29548028,6837058,7533495
6,Q1 13,43.7,43.3,1.6,27.66,6.46,7.57,-2.33,-0.80%,-5.10%,-6.30%,-6.40%,-5.50%,0.40%,Q1 13,1604143,27661888,6463144,7566946
7,Q2 13,43.7,43.9,1.6,28.22,6.5,7.56,0.57,6.00%,1.30%,-0.40%,2.00%,0.50%,-0.10%,Q2 13,1597119,28218750,6495813,7556148
8,Q3 13,43.6,43.7,1.61,27.84,6.57,7.72,-0.13,0.30%,-0.30%,0.90%,-1.40%,1.20%,2.20%,Q3 13,1612266,27836048,6570973,7719576
9,Q4 13,44.2,44.0,1.58,28.36,6.36,7.73,0.29,-3.50%,0.70%,-2.20%,1.90%,-3.20%,0.10%,Q4 13,1577171,28364164,6363401,7728504


### 3. Removing duplicate columns

After a detailed look at the dataset, it is evident that the columns 'Quarter.1' to 'Luas.1' are redundant. The columns 'Bus Éireann.1' to 'Luas.1' hold the same figures as the earlier operator columns; the only difference is that the earlier columns are expressed in millions and rounded.

In the next steps, we will remove these additional columns.

In [None]:
def drop_columns(df, col_name):
  """Return a new DataFrame without column `col_name`; a missing column is ignored."""
  remaining = df.drop(labels=[col_name], axis=1, errors='ignore')
  return remaining

# Redundant copies of the quarter label and the four operator columns.
columns_to_drop = [
    'Quarter.1',
    'Bus Éireann.1',
    'Dublin City Bus.1',
    'Irish Rail.1',
    'Luas.1',
]

# Remove them one at a time through the drop_columns helper.
for duplicate_col in columns_to_drop:
  df = drop_columns(df, duplicate_col)

### 4. Removing trailing / leading spaces in column names


In this step, we are removing trailing and leading spaces in column names.

In [None]:
# Strip leading/trailing whitespace from every header so later lookups by column name match exactly.
df.columns = df.columns.str.strip()

### 5. Removing '%' from column values

In this step, we are removing % from records since it causes misleading visuals.

In [None]:
def replace_percentages(df, col_name):
  """Turn percentage strings such as '-1.80%' in `df[col_name]` into floats, in place.

  Removing the '%' sign alone leaves the column as text (object dtype), which
  plots and aggregations do not treat as numbers, so the cleaned values are
  also converted with `pd.to_numeric`.

  Args:
    df: DataFrame to modify in place.
    col_name: Name of the column holding the percentage values.

  Raises:
    ValueError: If a value is still not numeric after the '%' sign is removed.
  """
  without_sign = df[col_name].replace('%', '', regex=True)
  df[col_name] = pd.to_numeric(without_sign)

# Every column whose values carry a trailing '%' sign.
columns_ = [
    '%YoY Public Transport',
    '%QoQ Public Transport',
    '%QoQ Bus Eireann',
    '%QoQ Dub City Bus',
    '%QoQ Irish Rail',
    '%QoQ Luas',
]

# The helper edits df in place, so no reassignment is needed.
for pct_col in columns_:
  replace_percentages(df, pct_col)

In [None]:
# Show the frame after the '%' signs were removed.
df

Unnamed: 0,Quarter,Public Transport million trips,Public Transport million trips2,Bus Éireann,Dublin City Bus,Irish Rail,Luas,QoQ Public Transport,%YoY Public Transport,%QoQ Public Transport,%QoQ Bus Eireann,%QoQ Dub City Bus,%QoQ Irish Rail,%QoQ Luas
0,Q3 11,44.0,44.0,1.73,28.01,6.93,7.31,-2.03,-1.8,-4.4,-1.8,-6.1,-2.1,-0.2
1,Q4 11,43.7,41.4,1.57,26.08,6.59,7.19,-2.55,-7.0,-5.8,-9.5,-6.9,-4.8,-1.7
2,Q1 12,43.2,43.6,1.7,27.81,6.85,7.28,2.2,-1.8,5.3,8.5,6.6,3.9,1.3
3,Q2 12,43.2,41.4,1.66,25.71,6.75,7.25,-2.26,-10.1,-5.2,-2.5,-7.5,-1.5,-0.3
4,Q3 12,43.8,43.6,1.64,27.64,6.89,7.43,2.23,-0.9,5.4,-0.9,7.5,2.1,2.4
5,Q4 12,43.7,45.6,1.71,29.55,6.84,7.53,2.03,10.1,4.7,4.1,6.9,-0.7,1.4
6,Q1 13,43.7,43.3,1.6,27.66,6.46,7.57,-2.33,-0.8,-5.1,-6.3,-6.4,-5.5,0.4
7,Q2 13,43.7,43.9,1.6,28.22,6.5,7.56,0.57,6.0,1.3,-0.4,2.0,0.5,-0.1
8,Q3 13,43.6,43.7,1.61,27.84,6.57,7.72,-0.13,0.3,-0.3,0.9,-1.4,1.2,2.2
9,Q4 13,44.2,44.0,1.58,28.36,6.36,7.73,0.29,-3.5,0.7,-2.2,1.9,-3.2,0.1


In [None]:
# Re-check non-null counts and dtypes after the cleaning steps.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26 entries, 0 to 25
Data columns (total 14 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Quarter                          26 non-null     object 
 1   Public Transport million trips   26 non-null     float64
 2   Public Transport million trips2  26 non-null     float64
 3   Bus Éireann                      26 non-null     float64
 4   Dublin City Bus                  26 non-null     float64
 5   Irish Rail                       26 non-null     float64
 6   Luas                             26 non-null     float64
 7   QoQ Public Transport             26 non-null     float64
 8   %YoY Public Transport            26 non-null     object 
 9   %QoQ Public Transport            26 non-null     object 
 10  %QoQ Bus Eireann                 26 non-null     object 
 11  %QoQ Dub City Bus                26 non-null     object 
 12  %QoQ Irish Rail         

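There are two columns with almost the same name: 'Public Transport million trips' and 'Public Transport million trips2'. Both appear to hold the total number of trips across all four operators (Bus Éireann, Dublin City Bus, Irish Rail, Luas).

'Public Transport million trips2' matches the sum of the four operator columns more closely than the first column does (for Q4 11 the operators add up to about 41.4, which is the value in the second column, while the first column shows 43.7). So, we will use the second column from here on.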

# Data Visualisation

### Line Chart - Public Transport year wise

In [None]:
# Build a one-column frame holding each distinct year found in the data.
# 'Quarter' labels look like 'Q3 11': take the part after the space, prefix the
# century to get '2011', and cast to int so plots treat the axis as numeric.
# Duplicates are dropped (several quarters share a year) and the index restarts at 0.
df_public_transport = (
    ('20' + df['Quarter'].str.split(' ').str[-1])
    .astype(int)
    .drop_duplicates()
    .reset_index(drop=True)
    .to_frame(name='Year')
)

In [None]:
column_names = ['Public Transport million trips2', 'Bus Éireann', 'Dublin City Bus', 'Irish Rail', 'Luas']

# Calendar year of every quarterly row: 'Q3 11' -> 2011.
# Grouping on the parsed year (instead of a substring search for '11'..'17')
# needs no hard-coded range and cannot match digits elsewhere in the label.
quarter_years = ('20' + df['Quarter'].str.split(' ').str[-1]).astype(int)

# Mean of the quarterly figures per year, for all columns at once.
# The means stay as floats: casting to int truncated e.g. 1.73 to 1, which
# flattened the smaller series in the charts.
yearly_means = df[column_names].groupby(quarter_years).mean().round(2)

# Look each value up by 'Year' so it lands on the matching row of
# df_public_transport regardless of which years are present.
for col in column_names:
  df_public_transport[col] = df_public_transport['Year'].map(yearly_means[col])

In [None]:
# Wide-form line chart with one line per operator; column_names[0] (the overall
# total column) is left out. In wide-form mode Plotly Express labels the y axis
# 'value' and the legend 'variable', so both get readable labels here.
fig = px.line(
    df_public_transport,
    x='Year',
    y=column_names[1:],
    title='Trend of trips taken by different public transport in Ireland',
    labels={'value': 'Trips (millions)', 'variable': 'Public transport'},
)

# Pin one tick to each year present in the data
fig.update_xaxes(
    tickmode='array',  # take tick positions from tickvals
    tickvals=df_public_transport['Year'],  # tick positions (all years)
    ticktext=df_public_transport['Year'],  # tick labels (all years)
)

fig.show()

### Categorical Bar Chart - Public Transport year wise

In [None]:
years = df_public_transport['Year']

# Bar colours in the same order as column_names[1:]:
# Bus Éireann, Dublin City Bus, Irish Rail, Luas
bar_colors = [
    'rgb(7, 43, 186)',
    'rgb(186, 7, 7)',
    'rgb(7, 186, 31)',
    'rgb(61, 219, 211)',
]

fig = go.Figure()

# One bar trace per type of public transport
for mode, color in zip(column_names[1:], bar_colors):
  fig.add_trace(go.Bar(
      x=years,
      y=df_public_transport[mode],
      name=mode,
      marker_color=color,
  ))

fig.update_layout(
    title={'text': 'Trend of trips with each type of public transport'},
    xaxis={
        'tickmode': 'array',   # take tick positions from tickvals
        'tickvals': years,     # tick positions (years)
        'ticktext': years,     # tick labels (years)
        'tickfont_size': 12,
        'title': {'text': "Years", 'font': {'size': 16}},
    },
    yaxis={
        'title': {'text': "Trips", 'font': {'size': 16}},
    },
    # Transparent legend anchored at the top-left of the plotting area
    legend={
        'x': 0,
        'y': 1.0,
        'bgcolor': 'rgba(255, 255, 255, 0)',
        'bordercolor': 'rgba(255, 255, 255, 0)',
    },
    barmode='group',
    bargap=0.15,     # gap between bars of adjacent location coordinates.
    bargroupgap=0.1  # gap between bars of the same location coordinate.
)
fig.show()