# This notebook analyzes journey data from 2016-2019 to find out the most popular journey and the most popular departure station

In [1]:
import pandas as pd
import numpy as np

In [2]:
may2016 = pd.read_csv("Data/2016-05.csv")
may2016

Unnamed: 0,Departure,Return,Departure station id,Departure station name,Return station id,Return station name,Covered distance (m),Duration (sec.)
0,2016-05-31T23:58:00,2016-06-01T00:13:00,A27,Mannerheimintie,A35,Apollonkatu,2585,944
1,2016-05-31T23:58:00,2016-06-01T00:25:00,A01,Kaivopuisto,A12,Unioninkatu,2703,1627
2,2016-05-31T23:56:00,2016-06-01T00:08:00,A40,Lastenlehto,A06,Viiskulma,1194,705
3,2016-05-31T23:55:00,2016-06-01T00:06:00,B08,Sörnäisten metroasema,C02,Ooppera,2095,656
4,2016-05-31T23:55:00,2016-06-01T00:07:00,B08,Sörnäisten metroasema,C02,Ooppera,2111,715
...,...,...,...,...,...,...,...,...
79443,2016-05-02T14:30:00,2016-05-02T14:43:00,997,Workshop Helsinki,997,Workshop Helsinki,1737,791
79444,2016-05-02T14:02:00,2016-05-02T14:03:00,997,Workshop Helsinki,997,Workshop Helsinki,2,53
79445,2016-05-02T10:19:00,2016-05-02T10:19:00,A10,Erottaja,A10,Erottaja,0,14
79446,2016-05-02T09:58:00,2016-05-02T09:59:00,A10,Erottaja,A10,Erottaja,2,36


### First, let's find out the most popular departure station in may 2016

In [3]:
popular_may2016 = may2016["Departure station name"].value_counts()
popular_may2016.head(1)

Departure station name
Kampin metroasema    3268
Name: count, dtype: int64

### Let's then find out what journey was most popular in may 2016

In [4]:
popular_journey_may2016 = may2016[["Departure station name", "Return station name"]].value_counts()
popular_journey_may2016.head(1)

Departure station name  Return station name
Töölönlahdenkatu        Baana                  535
Name: count, dtype: int64

### Next, let's create a dataframe that contains all journeys data from 2016-2019

In [5]:
months2016_2017 = ["05", "06", "07", "08", "09","10"]
months2018_2019 = ["04","05", "06", "07", "08", "09","10"]
def generate_file_names(year, months):
    file_names = []
    for month in months:
        name = f'{year}-{month}.csv'
        file_names.append(name)
    return file_names

def create_df(files):
    all_dfs = []
    for file in files:
        path = f"Data/{file}"
        df = pd.read_csv(path, dtype={"Return station id":str})
        year_month = file.split(".")[0].split("-")
        year = year_month[0]
        month = year_month[1]
        df["Year"] = int(year)
        df["Month"] = int(month)
        all_dfs.append(df)
    return all_dfs

file_names2016 = generate_file_names("2016", months2016_2017)
file_names2017 = generate_file_names("2017", months2016_2017)
file_names2018 = generate_file_names("2018", months2018_2019)
file_names2019 = generate_file_names("2019", months2018_2019)

all_file_names = file_names2016+file_names2017+file_names2018+file_names2019
df = pd.concat(create_df(all_file_names), ignore_index = True)
df

Unnamed: 0,Departure,Return,Departure station id,Departure station name,Return station id,Return station name,Covered distance (m),Duration (sec.),Year,Month
0,2016-05-31T23:58:00,2016-06-01T00:13:00,A27,Mannerheimintie,A35,Apollonkatu,2585.0,944.0,2016,5
1,2016-05-31T23:58:00,2016-06-01T00:25:00,A01,Kaivopuisto,A12,Unioninkatu,2703.0,1627.0,2016,5
2,2016-05-31T23:56:00,2016-06-01T00:08:00,A40,Lastenlehto,A06,Viiskulma,1194.0,705.0,2016,5
3,2016-05-31T23:55:00,2016-06-01T00:06:00,B08,Sörnäisten metroasema,C02,Ooppera,2095.0,656.0,2016,5
4,2016-05-31T23:55:00,2016-06-01T00:07:00,B08,Sörnäisten metroasema,C02,Ooppera,2111.0,715.0,2016,5
...,...,...,...,...,...,...,...,...,...,...
9031375,2019-10-01T00:02:05,2019-10-01T00:14:47,291,Itäkeskus Metrovarikko,258,Abraham Wetterin tie,3168.0,761.0,2019,10
9031376,2019-10-01T00:01:12,2019-10-01T00:10:14,511,Sateentie,559,Pohjankulma,1852.0,538.0,2019,10
9031377,2019-10-01T00:01:08,2019-10-01T00:16:26,43,Karhupuisto,085,Jalavatie,3477.0,916.0,2019,10
9031378,2019-10-01T00:00:17,2019-10-01T00:12:58,41,Ympyrätalo,030,Itämerentori,3531.0,760.0,2019,10


### Now we can find out the most popular journey (from which departure station to which destination station the most journeys were made in 2016-2019)

In [6]:
most_popular_journey = df[["Departure station name", "Return station name"]].value_counts()
most_popular_journey.head(1)

Departure station name  Return station name                  
Jämeräntaival           Aalto-yliopisto (M), Korkeakouluaukio    22694
Name: count, dtype: int64

### Let's find out the most popular departure station in 2016-2019

In [7]:
most_popular_departure_station = df["Departure station name"].value_counts()
most_popular_departure_station.head(1)

Departure station name
Itämerentori    202137
Name: count, dtype: int64

### Finally, we can look at the most popular journey for each year

In [17]:
df2017 = df[df["Year"]==2017]
most_popular_journey2017 = df2017[["Departure station name", "Return station name"]].value_counts()

df2018 = df[df["Year"]==2018]
most_popular_journey2018 = df2018[["Departure station name", "Return station name"]].value_counts()

df2019 = df[df["Year"]==2019]
most_popular_journey2019 = df2019[["Departure station name", "Return station name"]].value_counts()

print("The most popular journey 2017:")
print(most_popular_journey2017.head(1))
print("The most popular journey 2018:")
print(most_popular_journey2018.head(1))
print("The most popular journey 2019:")
print(most_popular_journey2019.head(1))

The most popular journey 2017:
Departure station name  Return station name
Baana                   Töölönlahdenkatu       3398
Name: count, dtype: int64
The most popular journey 2018:
Departure station name  Return station name                  
Jämeräntaival           Aalto-yliopisto (M), Korkeakouluaukio    8834
Name: count, dtype: int64
The most popular journey 2019:
Departure station name                 Return station name
Aalto-yliopisto (M), Korkeakouluaukio  Jämeräntaival          14287
Name: count, dtype: int64
