#### To run this notebook, install lxml first (e.g. on Windows: `pip install lxml`).

In [1]:

import os, csv
import requests
import pandas as pd

# --- Input files -----------------------------------------------------------
# Everything is read relative to the notebook's parent directory ("../datas").
hospital_dir = os.path.join("..", "datas", "Hospital")
base_2015 = os.path.join("..", "datas", "mortality", "2015 mortality.csv")
quebec_path = os.path.join(hospital_dir, "hospital_quebec.csv")
ont_path = os.path.join(hospital_dir, "hospital_ont.csv")
nw_path = os.path.join(hospital_dir, "hospital_nw.csv")

# --- Hospital tables ---------------------------------------------------------
# Quebec: loaded as-is, with the row index renumbered from 0.
hosp_que = pd.read_csv(quebec_path)
hosp_que = hosp_que.reset_index(drop=True)

# Ontario: only the "City" and "Hospital" columns are kept, in that order.
hosp_on = pd.read_csv(ont_path).loc[:, ["City", "Hospital"]]

# North/West: loaded with every column the file has.
hosp_nw = pd.read_csv(nw_path)

# --- City reference table ------------------------------------------------------
# The 2015 mortality file minus its "Number of Mortality" column; later cells
# read its "Geography" column to decide which cities to keep.
city_df = pd.read_csv(base_2015).drop(columns=["Number of Mortality"])


# Quebec and Ontario are scraped together here; the North/West page has one
# more column and is handled separately.

# Page numbers substituted into the "hospital-histories{x}.htm" URL.
# Order matters: later cells read df_list[0] as Quebec and df_list[1] as Ontario.
a = [4, 5]

# One per-city hospital-count DataFrame per page, appended in the order of `a`.
df_list = []

for x in a:
    url = f"https://internatlibs.mcgill.ca/hospitals/hospital-histories{x}.htm"

    # Bound the wait and stop on an HTTP error status, so a hung server or an
    # error page is reported here rather than as a confusing parsing failure
    # further down.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    data = response.content

    # Parse every table found in the page into a list of DataFrames.
    query = pd.read_html(data)

    # The hospital listing is the table at list index 4; its column 2 (the
    # details text) is not needed.
    df = query[4].drop(columns=[2])

    # Give the two remaining positional columns meaningful names.
    df = df.rename(columns={0: "City", 1: "Hospital"})

    # Remove the first row (presumably the table's own header row - verify
    # against the page) and any exact duplicate City/Hospital rows.
    df = df.drop(df.index[0]).drop_duplicates(keep="first")

    # Count the hospitals per city, largest first. "City" becomes the index
    # here; the following cells call reset_index() to turn it back into a column.
    num_hos = df.groupby(["City"]).count().sort_values(by="Hospital", ascending=False)
    df_list.append(num_hos)

In [2]:
# Hospital counts per Quebec city (entry 0 of df_list).
# reset_index() moves "City" from the index back to an ordinary column, then
# the province name is appended to every city label, e.g. "<city>, Quebec".
num_hos_qb = df_list[0].reset_index().assign(
    City=lambda counts: counts["City"] + ", Quebec"
)

In [3]:

# Hospital counts per Ontario city (entry 1 of df_list).
# "City" is restored as a column and suffixed with the province name so the
# labels read "<city>, Ontario".
num_hos_on = df_list[1].reset_index().assign(
    City=lambda counts: counts["City"] + ", Ontario"
)

# Preview the first rows only.
display(num_hos_on.head())

Unnamed: 0,City,Hospital
0,"Toronto, Ontario",42
1,"Hamilton, Ontario",14
2,"London, Ontario",10
3,"Ottawa, Ontario",10
4,"Thunder Bay, Ontario",7


### West and North

In [4]:
# West and North listing page (plain string: there is nothing to interpolate).
url = "https://internatlibs.mcgill.ca/hospitals/hospital-histories6.htm"

# Bound the wait and stop on an HTTP error status instead of trying to parse
# an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.content

# Parse every table found in the page into a list of DataFrames.
query = pd.read_html(data)

# On this page the listing is the table at list index 5. Row 0 and column 3
# (the details text) are dropped.
df = query[5].drop(index=[0], columns=[3])

df = df.rename(columns={0: "Province", 1: "City", 2: "Hospital"})

# Title-case the province names (first letter of each word upper-case).
df["Province"] = df["Province"].str.title()

# Build "City, Province" labels (e.g. "Winnipeg, Manitoba") column-wise.
# Missing values become empty strings, so a missing part yields ", Province"
# or "City, " exactly as the row-by-row join did.
df["City"] = (
    df["City"].fillna("").astype(str)
    + ", "
    + df["Province"].fillna("").astype(str)
)

# Keep only the two columns of interest and drop exact duplicate rows.
df = df[["City", "Hospital"]].drop_duplicates(keep="first")

# Number of hospitals in each city, largest first, with "City" as a column.
num_hos_nw = (
    df.groupby(["City"])
    .count()
    .sort_values(by="Hospital", ascending=False)
    .reset_index()
)

display(num_hos_nw.head())

Unnamed: 0,City,Hospital
0,"Winnipeg, Manitoba",17
1,"Edmonton, Alberta",13
2,"Calgary, Alberta",11
3,"Vancouver, British Columbia",10
4,"Victoria, British Columbia",7


### Cleaning and joining hospital data

In [5]:
# Boolean mask over city_df: True where the "Geography" value ends with "Ontario".
on_cities = city_df["Geography"].str.endswith("Ontario")

# The matching "Geography" values as a plain Python list; the next cell uses
# it to decide which Ontario hospital rows to keep.
on_city_list = city_df.loc[on_cities, "Geography"].tolist()

In [6]:
# Keep only the Ontario hospital rows whose "City" appears in on_city_list.
# A single boolean mask replaces the row-by-row drop: it does not copy the
# frame once per removed row and does not depend on the index being exactly
# 0..n-1. The index is renumbered from 0 afterwards.
hosp_on = hosp_on[hosp_on["City"].isin(on_city_list)].reset_index(drop=True)

In [7]:
# Every "Geography" value in city_df, used as the set of cities to keep.
cities = city_df["Geography"].tolist()

# Keep only the North/West hospital rows whose "City" is one of those cities.
# A boolean mask replaces the row-by-row drop (no per-row frame copies, no
# reliance on a 0..n-1 index); the index is renumbered from 0 afterwards.
hosp_nw = hosp_nw[hosp_nw["City"].isin(cities)].reset_index(drop=True)

# Stack the three regional tables, order them alphabetically by city and
# renumber the rows. Note that hosp_que is concatenated as loaded: no city
# filter is applied to it in this cell.
hospital = (
    pd.concat([hosp_on, hosp_nw, hosp_que])
    .sort_values(by="City")
    .reset_index(drop=True)
)

In [8]:
# Show the combined hospital table (Ontario, North/West and Quebec rows, sorted by city).
hospital

Unnamed: 0,City,Hospital
0,"Abbotsford-Mission, British Columbia",1
1,"Brantford, Ontario",1
2,"Calgary, Alberta",11
3,"Edmonton, Alberta",13
4,"Guelph, Ontario",4
5,"Hamilton, Ontario",14
6,"Kelowna, British Columbia",1
7,"Kingston, Ontario",7
8,"London, Ontario",10
9,"Montreal, Quebec",59
