In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, fname)
    for root, _, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for path in input_paths:
    print(path)

In [None]:
# Read the metro-systems CSV into a DataFrame and show its first rows.
csv_path = '/kaggle/input/metro-systems-worldwide/Metro-Systems-Worldwide.csv'
df = pd.read_csv(csv_path)
df.head()

# EDA

In [None]:
# Show the raw column labels as a plain Python list.
list(df.columns)

In [None]:
# Map the raw CSV headers (several contain an embedded carriage-return/newline)
# to short, underscore-separated column names.
column_renames = {
    'Country\r\nregion': 'Country',
    'Service\r\nopened': 'Opened_since',
    'Last\r\nexpanded': 'Last_expanded',
    'System length': 'System_length',
    'Annual ridership\r\n(millions)': 'Annual_ridership(millions)',
}
df.rename(columns=column_renames, inplace=True)

In [None]:
# Exclude every row whose Country text contains 'Egypt'.
# na=False treats a missing country as "no match", so the boolean mask never
# holds NaN (which `~` cannot invert). .copy() makes the result an independent
# frame, so the column assignments in later cells do not trigger
# SettingWithCopyWarning or write into a temporary slice.
df = df[~df['Country'].str.contains('Egypt', na=False)].copy()

In [None]:
def _first_token_as_float(text):
    """Return the first whitespace-separated token of ``text`` as a float."""
    return float(text.split()[0])


# Keep only the leading number of each System_length string.
df['System_length'] = df['System_length'].map(_first_token_as_float)

In [None]:
# Preview the first rows to check the renamed columns and the numeric System_length values.
df.head()

In [None]:
# (rows, columns) of the frame at this point in the notebook.
df.shape

In [None]:
# Count the missing cells across the whole frame.
nan_values = df.isna().sum().sum()

if nan_values == 0:
    print("The dataframe does not contain any NaN values.")
else:
    print("The dataframe contains NaN values.")
    # Discard every row that has at least one missing value.
    df.dropna(inplace=True)
    print("Rows with NaN values have been dropped.")

# Keep only the text that precedes the first "(" in each ridership entry,
# with surrounding whitespace trimmed; values stay strings at this stage.
ridership_col = 'Annual_ridership(millions)'
df[ridership_col] = df[ridership_col].map(lambda raw: str(raw).partition("(")[0].strip())

In [None]:
# Preview the first rows after the ridership text clean-up.
df.head()

In [None]:
df.info()

# Remove the "," thousands separators from the ridership strings and convert
# the column to float. The vectorised .str accessor does this in one pass and
# replaces the earlier row-by-row iterrows()/df.at loop; regex=False makes the
# comma a literal match.
df['Annual_ridership(millions)'] = (
    df['Annual_ridership(millions)']
    .str.replace(",", "", regex=False)
    .astype(float)
)
df.head()


In [None]:
# An en dash marks "no value" in Last_expanded: turn it into NaN, then cast
# the column to float and print the resulting dtypes.
last_expanded = df['Last_expanded'].replace({'–': np.nan})
df['Last_expanded'] = last_expanded.astype(float)
df.info()

In [None]:
# Global seaborn look used by the plots in this notebook.
sns.set_style("white")
sns.set_palette(["#7c459c", "#808080"])

# Histogram (15 bins) with a KDE overlay of the year each system opened.
sns.histplot(data=df, x='Opened_since', bins=15, kde=True)
plt.xlabel('Years')
plt.ylabel('Frequency')
plt.title('Distribution of Opening Years for Metro Systems')
# despine() only affects axes that already exist on the current figure, so it
# has to run after the plot is drawn (it was a no-op when called beforehand).
sns.despine()
plt.show()

In [None]:
# Histogram (15 bins) with a KDE overlay of the Last_expanded year.
expansion_ax = sns.histplot(
    data=df,
    x='Last_expanded',
    bins=15,
    kde=True,
    color='#e88504',
)
expansion_ax.set_xlabel('Years')
expansion_ax.set_ylabel('Frequency')
expansion_ax.set_title('Distribution of Year of Expansion for Subways')
plt.xticks(rotation=55)
plt.show()

#### We notice that 2020 was the year with the **most Openings** of subway services as well as **most Expansions** of subway services.

In [None]:
# Ten rows with the smallest and the largest Opened_since values;
# newest_operating is kept for the following cell.
oldest_operating = df.nsmallest(n=10, columns='Opened_since')
newest_operating = df.nlargest(n=10, columns='Opened_since')

# One pentagon marker per system: name on x, opening year on y.
oldest_ax = sns.scatterplot(
    data=oldest_operating,
    x='Name',
    y='Opened_since',
    color='grey',
    marker="p",
    s=400,
)
oldest_ax.set_xlabel('Subway Name')
oldest_ax.set_ylabel('Year of Operation Start')
oldest_ax.set_title('Top 10 subways that operated for the longest time')
plt.xticks(rotation=55)

plt.show()

In [None]:
# One triangle marker per system for the ten most recent opening years.
newest_ax = sns.scatterplot(
    data=newest_operating,
    x='Name',
    y='Opened_since',
    color='green',
    marker="v",
    s=400,
)
newest_ax.set_xlabel('Subway Name')
newest_ax.set_ylabel('Year of Operation Start')
newest_ax.set_title('Top 10 subways that operated for the fewest time')
plt.xticks(rotation=55)

plt.show()

#### We see that the **oldest** subway system is "London Underground", which opened in 1865, and the **newest** subway system is "Quito Metro", which opened in 2023.

In [None]:
# Five rows with the most and the fewest stations.
largest_subways = df.nlargest(5, 'Stations')
smallest_subways = df.nsmallest(5, 'Stations')

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))

# Largest subways.
# The bars are keyed by the 'City' column, so the x-axis is labelled 'City'
# (it previously said 'Subway Name', which did not match the plotted data).
# NOTE(review): barplot merges rows sharing the same x category into one bar
# (mean by default) -- confirm no two selected systems share a city.
sns.barplot(ax=axes[0], data=largest_subways, x='City', y='Stations', color="#7c459c")
axes[0].set_xlabel('City')
axes[0].set_ylabel('Number of Stations')
axes[0].set_title('Top 5 Largest Subways')


# Smallest subways (same layout as the left panel).
sns.barplot(ax=axes[1], data=smallest_subways, x='City', y='Stations', color = '#e88504')
axes[1].set_xlabel('City')
axes[1].set_ylabel('Number of Stations')
axes[1].set_title('Top 5 smallest Subways')

plt.tight_layout()

plt.show()

#### **Biggest** Subway is *New York's* with over 400 Stations and **Smallest** Subway is *Karaj's* with only 2 Stations

In [None]:
# Five rows with the largest and the smallest System_length.
longest_subways = df.nlargest(5, 'System_length')
shortest_subways = df.nsmallest(5, 'System_length')

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))

# Longest subways.
sns.scatterplot(ax=axes[0], data=longest_subways, x='Name', y='System_length', color="red", s =800, marker='p')
axes[0].set_xlabel('Subway Name')
axes[0].set_ylabel('Length in KM')
axes[0].set_title('Top 5 longest Subways')
# Rotate the tick labels on this specific axes. plt.xticks() targets the
# *current* axes, which after plt.subplots() is the right-hand panel, so the
# earlier plt.xticks(rotation=55) call never rotated the left panel's labels.
axes[0].tick_params(axis='x', labelrotation=55)

# Shortest subways.
sns.scatterplot(ax=axes[1], data=shortest_subways, x='Name', y='System_length', color = 'green', s=800, marker='o')
axes[1].set_xlabel('Subway Name')
axes[1].set_ylabel('Length in KM')
axes[1].set_title('Top 5 shortest Subways')
# Same rotation on the right panel so both sets of names are laid out alike.
axes[1].tick_params(axis='x', labelrotation=55)

plt.tight_layout()

plt.show()

#### The **longest** subway is Shanghai's Metro with up to 800 km of railway, while the **shortest** is Minatomirai's Line with only 4 km of railway.

### Average Ridership per Country

In [None]:
# Group the rows by country, then take the mean annual ridership within each group.
country_group = df.groupby(by='Country')
ridership_by_country = country_group['Annual_ridership(millions)']
average_ridership_per_country = ridership_by_country.mean()

In [None]:
plt.figure(figsize=(15, 6))

sns.set_style("white")
sns.set_palette(["#7c459c", "#808080"])

# One bar per country: the country label on x, its mean annual ridership as the height.
country_labels = average_ridership_per_country.index
plt.bar(country_labels, average_ridership_per_country)
plt.xticks(rotation=90)
plt.xlabel('Country')
plt.ylabel('Average Annual Ridership (millions)')
plt.title('Average Annual Ridership by Country')

# Drop the top and right spines now that the axes exist.
sns.despine()
plt.show()

In [None]:
# Calculate the correlation matrix
correlation_matrix = df.corr()


plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
plt.title('Correlation Matrix')
plt.show()

#### We notice that the number of Stations and the System Length are both highly correlated with the Annual ridership.
#### This suggests that systems with more stations and longer networks tend to have higher ridership (a correlation, not proof of causation).