In [46]:
import numpy as np
import pandas as pd

pd.set_option("display.max_columns", 99)

Here, we create a dataset that includes:

* lat, long of the weather stations.
* year
* tmin
* tmax
* tavg
* extMax
* extMin
* prcp
* is_flag: whether there was an extreme weather event in that year.


In [47]:
# Load data
weather = pd.read_csv("/home/joosungm/projects/def-lelliott/joosungm/projects/ssc23-case-comp/data/climate_data/weather_Station_data.csv")
print(weather.columns)

# Extract columns
weather_columns = ["Station Name", "Longitude (x)", "Latitude (y)", "Year", "Month", 
"Mean Max Temp (°C)", 'Mean Min Temp (°C)', 'Mean Temp (°C)', 
'Extr Max Temp (°C)', 'Extr Max Temp Flag', 'Extr Min Temp (°C)', 'Extr Min Temp Flag',
'Total Precip (mm)', 'Total Precip Flag'
]

weather = weather.loc[:, weather_columns]
display(weather.head())

Index(['Longitude (x)', 'Latitude (y)', 'Station Name', 'Climate ID',
       'Date/Time', 'Year', 'Month', 'Mean Max Temp (°C)',
       'Mean Max Temp Flag', 'Mean Min Temp (°C)', 'Mean Min Temp Flag',
       'Mean Temp (°C)', 'Mean Temp Flag', 'Extr Max Temp (°C)',
       'Extr Max Temp Flag', 'Extr Min Temp (°C)', 'Extr Min Temp Flag',
       'Total Rain (mm)', 'Total Rain Flag', 'Total Snow (cm)',
       'Total Snow Flag', 'Total Precip (mm)', 'Total Precip Flag',
       'Snow Grnd Last Day (cm)', 'Snow Grnd Last Day Flag',
       'Dir of Max Gust (10's deg)', 'Dir of Max Gust Flag',
       'Spd of Max Gust (km/h)', 'Spd of Max Gust Flag'],
      dtype='object')


  weather = pd.read_csv("/home/joosungm/projects/def-lelliott/joosungm/projects/ssc23-case-comp/data/climate_data/weather_Station_data.csv")


Unnamed: 0,Station Name,Longitude (x),Latitude (y),Year,Month,Mean Max Temp (°C),Mean Min Temp (°C),Mean Temp (°C),Extr Max Temp (°C),Extr Max Temp Flag,Extr Min Temp (°C),Extr Min Temp Flag,Total Precip (mm),Total Precip Flag
0,NAIN,-61.68,56.55,2004,11,-0.2,-6.7,-3.4,5.1,,-13.3,,80.7,
1,NAIN,-61.68,56.55,2004,12,-9.7,-16.9,-13.3,-1.0,,-27.1,,88.0,
2,NAIN,-61.68,56.55,2005,1,-18.4,-25.4,-21.9,-9.7,,-30.8,,42.5,
3,NAIN,-61.68,56.55,2005,2,-8.9,-19.8,-14.4,3.8,,-29.2,,119.8,
4,NAIN,-61.68,56.55,2005,3,-5.2,-14.5,-9.9,4.0,,-30.4,,145.8,


In [48]:
weather2 = weather.rename(columns = {
    "Longitude (x)":"lon", "Latitude (y)":"lat", "Year":"year", "Month":"month",
    "Mean Max Temp (°C)":"tmax", "Mean Min Temp (°C)":"tmin", "Mean Temp (°C)":"tavg", 
    "Extr Max Temp (°C)":"tmax_ext", "Extr Min Temp (°C)":"tmin_ext",
    "Extr Max Temp Flag":"tmax_ext_flag", "Extr Min Temp Flag":"tmin_ext_flag",
    "Total Precip (mm)":"precip", "Total Precip Flag":"precip_flag"})
weather2.head()

Unnamed: 0,Station Name,lon,lat,year,month,tmax,tmin,tavg,tmax_ext,tmax_ext_flag,tmin_ext,tmin_ext_flag,precip,precip_flag
0,NAIN,-61.68,56.55,2004,11,-0.2,-6.7,-3.4,5.1,,-13.3,,80.7,
1,NAIN,-61.68,56.55,2004,12,-9.7,-16.9,-13.3,-1.0,,-27.1,,88.0,
2,NAIN,-61.68,56.55,2005,1,-18.4,-25.4,-21.9,-9.7,,-30.8,,42.5,
3,NAIN,-61.68,56.55,2005,2,-8.9,-19.8,-14.4,3.8,,-29.2,,119.8,
4,NAIN,-61.68,56.55,2005,3,-5.2,-14.5,-9.9,4.0,,-30.4,,145.8,


In [49]:
print(weather2.tmax_ext_flag.unique())
print(weather2.tmin_ext_flag.unique()) 
print(weather2.precip_flag.unique())
# I: incomplete
# S: more than 1 occurrence
# E: estimated
# B: more than 1 occurence & estimated
# M: missing
# T: trace; value is zero.


[nan 'I' 'S' 'E' 'B']
[nan 'S' 'I' 'E' 'B' 'M']
[nan 'E' 'I' 'M' 'T' 'TRUE']


In [50]:
# extreme_flag == 1 if one of tmax_ext_flag, tmin_ext_flag is either one of S, E, and B
weather2["extreme_flag"] = np.where((weather2.tmax_ext_flag.isin(["S", "E", "B"])) | (weather2.tmin_ext_flag.isin(["S", "E", "B"])), 1, 0)

# tmax_flag == 1 if tmax_ext_flag is either one of S, E, and B
weather2["tmax_flag"] = np.where(weather2.tmax_ext_flag.isin(["S", "E", "B"]), 1, 0)

# tmin_flag == 1 if tmin_ext_flag is either one of S, E, and B
weather2["tmin_flag"] = np.where(weather2.tmin_ext_flag.isin(["S", "E", "B"]), 1, 0)

# total_flag = sum of tmax_flag, tmin_flag, and precip_flag
weather2["total_flag"] = weather2.tmax_flag + weather2.tmin_flag

# group by Station Name, year, lat, long, and sum extreme_flag.
weather3 = weather2.groupby(["Station Name", "year", "lat", "lon"], as_index=False).agg({"extreme_flag":"sum", "tmax_flag":"sum", "tmin_flag":"sum", "total_flag":"sum"})
weather3.head()

Unnamed: 0,Station Name,year,lat,lon,extreme_flag,tmax_flag,tmin_flag,total_flag
0,100 MILE HOUSE 6NE,1998,51.68,-121.22,4,2,4,6
1,100 MILE HOUSE 6NE,1999,51.68,-121.22,4,3,4,7
2,100 MILE HOUSE 6NE,2000,51.68,-121.22,4,1,3,4
3,100 MILE HOUSE 6NE,2001,51.68,-121.22,6,4,4,8
4,100 MILE HOUSE 6NE,2002,51.68,-121.22,7,6,4,10


In [51]:
province_prod = pd.DataFrame()
provinces = ["AB", "BC", "MB", "NB", "NL", "NS", "ON", "PE", "QC", "PE", "SK"]
for pr in provinces:
    prod_temp_filename = "/home/joosungm/projects/def-lelliott/joosungm/projects/ssc23-case-comp/data/user_data/01_iv_analysis/" + pr + "/prod_temp.csv"

    prod_temp = pd.read_csv(prod_temp_filename)
    # Exclude "production_in_division_" from all column names
    prod_temp.columns = [col.replace("production_in_division_", "") for col in prod_temp.columns]
    prod_temp.rename(columns = {"lat":"prod_lat", "long":"prod_lon"}, inplace = True)
    prod_temp = prod_temp.loc[prod_temp.month == 12, :].drop(columns = ["Date", "month", "tavg", "tmin", "tmax", "census_year_ref", "max_lat", "min_lat", "max_long", "min_long"]).reset_index(drop = True)

    # concat prod_temp to province_prod by row
    province_prod = pd.concat([province_prod, prod_temp], axis = 0)

print(province_prod.shape)
province_prod.tail()

# drop rows that have NaN Dominant_NAICS
province_prod = province_prod.dropna(subset = ["Dominant_NAICS"])
province_prod.tail()

# save province_prod
province_prod.to_csv("/home/joosungm/projects/def-lelliott/joosungm/projects/ssc23-case-comp/data/user_data/02_counterfactual_analysis/province_prod.csv", index = False)

(129150, 22)


In [33]:
province_prod.columns

Index(['provincename', 'X22.Utilities', 'X23.Construction',
       'X31.33.Manufacturing', 'X48.49.Transportation.and.warehousing',
       'X61.Educational.services', 'X62.Health.care.and.social.assistance',
       'X72.Accommodation.and.food.services',
       'X81.Other.services..except.public.administration.',
       'X91.Public.administration',
       'X11.Agriculture.forestry.fishing.hunting.21.Mining.quarrying.and.oil.and.gas.extraction',
       'X41.Wholesale.trade.44.45.Retail.trade',
       'X52.Finance.and.insurance.53.Real.estate.and.rental.and.leasing',
       'X54.Professional..scientific.and.technical.services.55.56',
       'X51.Information.culture.and.recreation.71', 'Population', 'GeoUID',
       'Dominant_NAICS', 'colourval', 'year', 'prod_lat', 'prod_lon'],
      dtype='object')

In [52]:
# filter out unique set of lat, lon
weather_unique = weather3.drop_duplicates(subset = ["lat", "lon"]).reset_index(drop = True)[["Station Name", "lat", "lon"]]
display(weather_unique.head())
display(weather_unique.shape)

# filter out unique GeoUID and their prod_lat, prod_lon from province_prod, drop NaN
province_prod = pd.read_csv("/home/joosungm/projects/def-lelliott/joosungm/projects/ssc23-case-comp/data/user_data/02_counterfactual_analysis/province_prod.csv")
province_prod_unique = province_prod.drop_duplicates(subset = ["GeoUID", "prod_lat", "prod_lon"]).dropna().reset_index(drop = True)[["provincename", "GeoUID", "prod_lat", "prod_lon"]]


display(province_prod_unique.head())
display(province_prod_unique.shape)

Unnamed: 0,Station Name,lat,lon
0,100 MILE HOUSE 6NE,51.68,-121.22
1,ABEE AGDM,54.28,-112.97
2,ADDENBROKE ISLAND,51.6,-127.86
3,AGASSIZ CDA,49.24,-121.76
4,ALBERNI ROBERTSON CREEK,49.34,-124.98


(545, 3)

Unnamed: 0,provincename,GeoUID,prod_lat,prod_lon
0,Alberta,4801003,50.014229,-110.583589
1,Alberta,4801006,50.045816,-110.701488
2,Alberta,4801008,49.481525,-111.161699
3,Alberta,4801009,49.475704,-111.448178
4,Alberta,4801014,49.873328,-111.370904


(4475, 4)

In [53]:
# Measure distance between two lat/lon pairs
def distance(lat1, lon1, lat2, lon2):
    # Convert degrees to radians
    lat1 = np.radians(lat1)
    lon1 = np.radians(lon1)
    lat2 = np.radians(lat2)
    lon2 = np.radians(lon2)
    # Find the differences
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    # Apply the formula
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a)) # great circle distance in radians
    # Convert to kilometers
    km = 6371 * c
    return km


In [36]:
# For each GeoUID, find the closest station name and its lat, lon.
# - Add weather Station name, lat, lon, distance columns to a copy of province_prod_unique.

province_prod_unique2 = province_prod_unique.copy()
province_prod_unique2["weather_station_name"] = ""
province_prod_unique2["weather_station_lat"] = 0
province_prod_unique2["weather_station_lon"] = 0
province_prod_unique2["weather_station_distance"] = 0

for i in range(province_prod_unique2.shape[0]):
    # get GeoUID, prod_lat, prod_lon
    # i = 1
    geo_uid = province_prod_unique2.loc[i, "GeoUID"]
    prod_lat = province_prod_unique2.loc[i, "prod_lat"]
    prod_lon = province_prod_unique2.loc[i, "prod_lon"]
    
    # get weather station name, lat, lon
    weather_station_name = weather_unique.loc[weather_unique.index[0], "Station Name"]
    weather_station_lat = weather_unique.loc[weather_unique.index[0], "lat"]
    weather_station_lon = weather_unique.loc[weather_unique.index[0], "lon"]
    # get distance
    weather_station_distance = distance(prod_lat, prod_lon, weather_station_lat, weather_station_lon)
    # loop through weather_unique to find the closest station name, lat, lon
    for j in range(weather_unique.shape[0]):
        # j = 1
        # get weather station name, lat, lon
        weather_station_name_temp = weather_unique.loc[weather_unique.index[j], "Station Name"]
        weather_station_lat_temp = weather_unique.loc[weather_unique.index[j], "lat"]
        weather_station_lon_temp = weather_unique.loc[weather_unique.index[j], "lon"]
        # get distance
        weather_station_distance_temp = distance(prod_lat, prod_lon, weather_station_lat_temp, weather_station_lon_temp)
        # if distance is smaller, update weather_station_name, lat, lon, distance
        if weather_station_distance_temp < weather_station_distance:
            weather_station_name = weather_station_name_temp
            weather_station_lat = weather_station_lat_temp
            weather_station_lon = weather_station_lon_temp
            weather_station_distance = weather_station_distance_temp
    # update province_prod_unique2
    province_prod_unique2.loc[i, "weather_station_name"] = weather_station_name
    province_prod_unique2.loc[i, "weather_station_lat"] = weather_station_lat
    province_prod_unique2.loc[i, "weather_station_lon"] = weather_station_lon
    province_prod_unique2.loc[i, "weather_station_distance"] = weather_station_distance


province_prod_unique2 = province_prod_unique2.dropna().reset_index(drop = True)
province_prod_unique2.head()
province_prod_unique2.shape

Unnamed: 0,provincename,GeoUID,prod_lat,prod_lon,weather_station_name,weather_station_lat,weather_station_lon,weather_station_distance
0,Alberta,4801003,50.014229,-110.583589,BOW ISLAND,49.73,-111.45,69.671231
1,Alberta,4801006,50.045816,-110.701488,BOW ISLAND,49.73,-111.45,64.099292
2,Alberta,4801008,49.481525,-111.161699,BOW ISLAND,49.73,-111.45,34.568124
3,Alberta,4801009,49.475704,-111.448178,BOW ISLAND,49.73,-111.45,28.276784
4,Alberta,4801014,49.873328,-111.370904,BOW ISLAND,49.73,-111.45,16.918191


In [40]:
# - Extract weather_station_name and provincename
station_province = province_prod_unique2[["weather_station_name", "provincename"]].drop_duplicates().reset_index(drop = True)
station_province.shape

# - for each weather_station_name, compute the number of provincename
station_province["num_provincename"] = 0
for i in range(station_province.shape[0]):
    # i = 0
    weather_station_name = station_province.loc[i, "weather_station_name"]
    # weather_station_name
    num_provincename = station_province[station_province["weather_station_name"] == weather_station_name].shape[0]
    station_province.loc[i, "num_provincename"] = num_provincename

dup_provinces = station_province[station_province["num_provincename"] > 1]["weather_station_name"].tolist()

for station in dup_provinces:
    # station = dup_provinces[0]
    temp_dup = province_prod_unique2[province_prod_unique2["weather_station_name"] == station].groupby(["provincename"]).count().reset_index()
    dominant_prov = temp_dup[temp_dup["GeoUID"] == temp_dup["GeoUID"].max()]["provincename"].tolist()[0]

    # drop all the rows with weather_station_name == station and provincename != dominant_prov
    province_prod_unique2 = province_prod_unique2[~((province_prod_unique2["weather_station_name"] == station) & (province_prod_unique2["provincename"] != dominant_prov))].reset_index(drop = True)

Unnamed: 0,weather_station_name,year,X22.Utilities,X23.Construction,X31.33.Manufacturing,X48.49.Transportation.and.warehousing,X61.Educational.services,X62.Health.care.and.social.assistance,X72.Accommodation.and.food.services,X81.Other.services..except.public.administration.,X91.Public.administration,X11.Agriculture.forestry.fishing.hunting.21.Mining.quarrying.and.oil.and.gas.extraction,X41.Wholesale.trade.44.45.Retail.trade,X52.Finance.and.insurance.53.Real.estate.and.rental.and.leasing,X54.Professional..scientific.and.technical.services.55.56,X51.Information.culture.and.recreation.71,Population
0,100 MILE HOUSE 6NE,1997,9.173696,42.205906,54.016007,21.850102,13.280715,28.840283,14.813409,8.952717,22.839209,130.155562,39.430058,51.862133,23.543608,10.603754,16165.0
1,100 MILE HOUSE 6NE,1998,10.361439,32.093841,52.577916,23.83882,14.438922,31.085022,14.542048,9.372471,21.770213,115.502032,41.41993,51.718541,26.8459,9.855601,16165.0
2,100 MILE HOUSE 6NE,1999,9.741119,36.176122,69.344267,23.798871,16.292231,31.278678,13.178521,9.012962,25.875216,125.767547,40.637891,52.895577,26.4943,10.734781,16165.0
3,100 MILE HOUSE 6NE,2000,11.995937,33.469164,70.027871,24.85475,15.772039,29.993456,14.411332,9.632571,25.348531,119.212186,42.747333,57.466138,26.164915,11.525466,16165.0
4,100 MILE HOUSE 6NE,2001,7.206688,32.89136,60.547691,22.637911,15.045302,31.198847,14.364507,10.064814,24.859586,126.76058,44.845824,56.587092,26.220371,12.41662,16165.0


In [41]:
# - left join province_prod_unique2 to province_prod by GeoUID
province_prod.columns
province_prod_unique2.columns
province_prod2 = pd.merge(province_prod, province_prod_unique2[["GeoUID", "weather_station_name"]], on = "GeoUID", how = "left")
province_prod2_cols1 = ["weather_station_name", "year", "provincename"]
province_prod2_cols2 = province_prod.columns[1:16].tolist()  # production data
province_prod2_cols = province_prod2_cols1 + province_prod2_cols2

Unnamed: 0,weather_station_name,year,X22.Utilities,X23.Construction,X31.33.Manufacturing,X48.49.Transportation.and.warehousing,X61.Educational.services,X62.Health.care.and.social.assistance,X72.Accommodation.and.food.services,X81.Other.services..except.public.administration.,X91.Public.administration,X11.Agriculture.forestry.fishing.hunting.21.Mining.quarrying.and.oil.and.gas.extraction,X41.Wholesale.trade.44.45.Retail.trade,X52.Finance.and.insurance.53.Real.estate.and.rental.and.leasing,X54.Professional..scientific.and.technical.services.55.56,X51.Information.culture.and.recreation.71,Population,provincename
0,100 MILE HOUSE 6NE,1997,9.173696,42.205906,54.016007,21.850102,13.280715,28.840283,14.813409,8.952717,22.839209,130.155562,39.430058,51.862133,23.543608,10.603754,16165.0,British Columbia
1,100 MILE HOUSE 6NE,1997,9.173696,42.205906,54.016007,21.850102,13.280715,28.840283,14.813409,8.952717,22.839209,130.155562,39.430058,51.862133,23.543608,10.603754,16165.0,British Columbia
2,100 MILE HOUSE 6NE,1997,9.173696,42.205906,54.016007,21.850102,13.280715,28.840283,14.813409,8.952717,22.839209,130.155562,39.430058,51.862133,23.543608,10.603754,16165.0,British Columbia
3,100 MILE HOUSE 6NE,1997,9.173696,42.205906,54.016007,21.850102,13.280715,28.840283,14.813409,8.952717,22.839209,130.155562,39.430058,51.862133,23.543608,10.603754,16165.0,British Columbia
4,100 MILE HOUSE 6NE,1997,9.173696,42.205906,54.016007,21.850102,13.280715,28.840283,14.813409,8.952717,22.839209,130.155562,39.430058,51.862133,23.543608,10.603754,16165.0,British Columbia


In [43]:
# - group by weather_station_name and year, sum up all the columns in province_prod2_cols
province_prod3 = province_prod2[province_prod2_cols].dropna().groupby(["weather_station_name", "year", "provincename"]).sum().reset_index()
province_prod3.shape
province_prod3.columns
province_prod3.head()

Unnamed: 0,Station Name,year,lat,lon,extreme_flag,tmax_flag,tmin_flag,total_flag,weather_station_name,X22.Utilities,X23.Construction,X31.33.Manufacturing,X48.49.Transportation.and.warehousing,X61.Educational.services,X62.Health.care.and.social.assistance,X72.Accommodation.and.food.services,X81.Other.services..except.public.administration.,X91.Public.administration,X11.Agriculture.forestry.fishing.hunting.21.Mining.quarrying.and.oil.and.gas.extraction,X41.Wholesale.trade.44.45.Retail.trade,X52.Finance.and.insurance.53.Real.estate.and.rental.and.leasing,X54.Professional..scientific.and.technical.services.55.56,X51.Information.culture.and.recreation.71,Population,provincename
53262,YOHO PARK,2002,51.44,-116.34,4,1,3,4,YOHO PARK,0.0,0.0,0.0,0.0,0.0,0.0,0.314625,0.382686,0.885122,0.0,0.0,0.0,0.0,1.791579,1028.0,Alberta
53263,YOHO PARK,2003,51.44,-116.34,1,0,1,1,YOHO PARK,0.0,0.0,0.0,0.0,0.0,0.0,0.315156,0.430354,0.845187,0.0,0.0,0.0,0.0,1.677869,1028.0,Alberta
53264,YOHO PARK,2004,51.44,-116.34,0,0,0,0,YOHO PARK,0.0,0.0,0.0,0.0,0.0,0.0,0.313377,0.424476,0.869805,0.0,0.0,0.0,0.0,1.803226,1028.0,Alberta
53265,YOHO PARK,2005,51.44,-116.34,1,1,1,2,YOHO PARK,0.0,0.0,0.0,0.0,0.0,0.0,0.338469,0.510992,0.899264,0.0,0.0,0.0,0.0,1.761924,1028.0,Alberta
53266,YOHO PARK,2006,51.44,-116.34,2,2,0,2,YOHO PARK,0.0,0.0,0.0,0.0,0.0,0.0,0.353394,0.493145,0.885637,0.0,0.0,0.0,0.0,1.998985,1028.0,Alberta


In [44]:
# - merge province_prod4 to weather3 by weather_station_name and year
weather_prod_final = pd.merge(weather3, province_prod3, left_on = ["Station Name", "year"], right_on = ["weather_station_name", "year"], how = "left").dropna().reset_index()
weather_prod_final.shape
weather_prod_final.head()
weather_prod_final.tail()

array(['British Columbia', 'Alberta', 'Ontario', 'Quebec',
       'Newfoundland and Labrador', 'New Brunswick', 'Saskatchewan',
       'Manitoba', 'Nova Scotia', 'Prince Edward Island'], dtype=object)

In [45]:
weather_prod_final.provincename.unique()

In [None]:
weather_prod_final.to_csv("/home/joosungm/projects/def-lelliott/joosungm/projects/ssc23-case-comp/data/user_data/02_counterfactual_analysis/weather_prod_final.csv", index = False)