# Load Data

In [1]:
# Importing libraries used on this project

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import glob as glob
from pycaret.regression import *

In [2]:
# Optional pandas display settings, currently disabled.
# Uncomment to lift the limits on how many columns / rows are displayed.
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

In [3]:
# Collect the paths of all daily climate CSV files.
# glob returns matches in arbitrary, OS-dependent order, so the paths are
# sorted to make the row order of the concatenated DataFrame reproducible
# across machines and re-runs.

files = sorted(glob.glob('./data_StatsCan/en_climate_daily_NS_*.csv'))

# Fail early with a clear message when the data folder is missing or empty,
# instead of letting pd.concat fail on an empty list.
if not files:
    raise FileNotFoundError(
        "No files matched './data_StatsCan/en_climate_daily_NS_*.csv'"
    )

# Concatenate all CSV files into one DataFrame for analysis.
# ignore_index=True gives the combined frame a fresh 0..n-1 index.

df = pd.concat([pd.read_csv(f, low_memory=False) for f in files], ignore_index=True)
df.head()

Unnamed: 0,Longitude (x),Latitude (y),Station Name,Climate ID,Date/Time,Year,Month,Day,Data Quality,Max Temp (°C),...,Total Snow (cm),Total Snow Flag,Total Precip (mm),Total Precip Flag,Snow on Grnd (cm),Snow on Grnd Flag,Dir of Max Gust (10s deg),Dir of Max Gust Flag,Spd of Max Gust (km/h),Spd of Max Gust Flag
0,-63.5,44.88,HALIFAX STANFIELD INT'L A,8202250,1960-01-01,1960,1,1,,,...,,,,,,,,,,
1,-63.5,44.88,HALIFAX STANFIELD INT'L A,8202250,1960-01-02,1960,1,2,,,...,,,,,,,,,,
2,-63.5,44.88,HALIFAX STANFIELD INT'L A,8202250,1960-01-03,1960,1,3,,,...,,,,,,,,,,
3,-63.5,44.88,HALIFAX STANFIELD INT'L A,8202250,1960-01-04,1960,1,4,,,...,,,,,,,,,,
4,-63.5,44.88,HALIFAX STANFIELD INT'L A,8202250,1960-01-05,1960,1,5,,,...,,,,,,,,,,


This project focuses on a few parameters of the weather data from Halifax Stanfield Airport. The main parameter studied is the evolution of the mean temperature over the years and how the climate is changing. 

Another parameter of interest, which is related to climate change, is Heating Degree-Days (HDD). HDD is used to calculate heat demand and insulation requirements for buildings in the construction industry.

HDD = Heating degree-days for a given day are the number of degrees Celsius that the mean temperature is below 18 °C. If the temperature is equal to or greater than 18 °C, then the number will be zero. For example, a day with a mean temperature of 15.5 °C has 2.5 heating degree-days; a day with a mean temperature of 20.5 °C has zero heating degree-days. Heating degree-days are used primarily to estimate the heating requirements of buildings.

[Climate Weather Canada](https://climate.weather.gc.ca/glossary_e.html#hdd)

# EDA and Data Preparation

In [4]:
# Check the shape of the loaded data as (rows, columns)
df.shape

(23742, 31)

In [5]:
# Count the missing (NaN) values in each column of the raw data
df.isna().sum()

Longitude (x)                    0
Latitude (y)                     0
Station Name                     0
Climate ID                       0
Date/Time                        0
Year                             0
Month                            0
Day                              0
Data Quality                 23742
Max Temp (°C)                  935
Max Temp Flag                23657
Min Temp (°C)                  937
Min Temp Flag                23655
Mean Temp (°C)                 945
Mean Temp Flag               23655
Heat Deg Days (°C)             945
Heat Deg Days Flag           23655
Cool Deg Days (°C)             945
Cool Deg Days Flag           23655
Total Rain (mm)               1000
Total Rain Flag              20720
Total Snow (cm)                918
Total Snow Flag              21058
Total Precip (mm)              964
Total Precip Flag            19235
Snow on Grnd (cm)             3733
Snow on Grnd Flag            22052
Dir of Max Gust (10s deg)     8371
Dir of Max Gust Flag

In [6]:
# List the DataFrame columns to decide which ones to keep
df.columns

Index(['Longitude (x)', 'Latitude (y)', 'Station Name', 'Climate ID',
       'Date/Time', 'Year', 'Month', 'Day', 'Data Quality', 'Max Temp (°C)',
       'Max Temp Flag', 'Min Temp (°C)', 'Min Temp Flag', 'Mean Temp (°C)',
       'Mean Temp Flag', 'Heat Deg Days (°C)', 'Heat Deg Days Flag',
       'Cool Deg Days (°C)', 'Cool Deg Days Flag', 'Total Rain (mm)',
       'Total Rain Flag', 'Total Snow (cm)', 'Total Snow Flag',
       'Total Precip (mm)', 'Total Precip Flag', 'Snow on Grnd (cm)',
       'Snow on Grnd Flag', 'Dir of Max Gust (10s deg)',
       'Dir of Max Gust Flag', 'Spd of Max Gust (km/h)',
       'Spd of Max Gust Flag'],
      dtype='object')

In [7]:
# Keep only the columns used in this project: the date fields, the three
# temperature measurements and the heating degree-days.
# The columns to keep are selected explicitly instead of dropping the other
# 23 by name, so the cell can be re-run safely: a drop-by-name raises KeyError
# on a second run because the labels are no longer present. The column order
# is the same as in the loaded data.
df = df[['Date/Time', 'Year', 'Month', 'Day',
         'Max Temp (°C)', 'Min Temp (°C)', 'Mean Temp (°C)',
         'Heat Deg Days (°C)']]
df

Unnamed: 0,Date/Time,Year,Month,Day,Max Temp (°C),Min Temp (°C),Mean Temp (°C),Heat Deg Days (°C)
0,1960-01-01,1960,1,1,,,,
1,1960-01-02,1960,1,2,,,,
2,1960-01-03,1960,1,3,,,,
3,1960-01-04,1960,1,4,,,,
4,1960-01-05,1960,1,5,,,,
...,...,...,...,...,...,...,...,...
23737,2023-12-27,2023,12,27,,,,
23738,2023-12-28,2023,12,28,,,,
23739,2023-12-29,2023,12,29,,,,
23740,2023-12-30,2023,12,30,,,,


In [8]:
# Count the missing values in the remaining columns that will be used
df.isna().sum()

Date/Time               0
Year                    0
Month                   0
Day                     0
Max Temp (°C)         935
Min Temp (°C)         937
Mean Temp (°C)        945
Heat Deg Days (°C)    945
dtype: int64

In [9]:
# Remove every row that has a missing value in any of the remaining columns,
# then confirm that no nulls are left.
df = df.dropna(axis=0, how='any')
df.isna().sum()

Date/Time             0
Year                  0
Month                 0
Day                   0
Max Temp (°C)         0
Min Temp (°C)         0
Mean Temp (°C)        0
Heat Deg Days (°C)    0
dtype: int64

In [10]:
# Summary statistics of the mean temperature for each year.
# Note that 1960 has far fewer observations than the other years (only 184),
# and 2023 has only 92 rows because it is the current year.
(
    df.groupby('Year')[['Mean Temp (°C)']]
    .describe()
    .transpose()
)

Unnamed: 0,Year,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
Mean Temp (°C),count,184.0,365.0,365.0,365.0,366.0,365.0,365.0,365.0,366.0,365.0,...,365.0,365.0,363.0,359.0,357.0,360.0,349.0,340.0,363.0,92.0
Mean Temp (°C),mean,10.370652,6.421644,5.543836,5.694795,5.698087,5.341644,6.708767,5.780822,6.163934,6.977534,...,7.099178,6.476438,7.344077,7.529805,7.364706,6.755833,7.84957,8.467059,8.166942,-2.103261
Mean Temp (°C),std,8.648011,10.085186,9.343123,9.756776,8.707344,9.454167,8.72819,10.080103,9.975271,8.769505,...,9.701582,10.461173,9.33778,9.598454,9.660297,9.506765,9.286428,9.109338,9.569401,4.900178
Mean Temp (°C),min,-12.8,-19.5,-19.2,-17.5,-13.4,-19.7,-21.1,-20.0,-21.7,-14.8,...,-18.6,-16.6,-15.8,-14.9,-14.1,-12.6,-15.0,-10.6,-16.0,-20.8
Mean Temp (°C),25%,3.1,-1.1,-0.6,-1.7,-1.4,-2.2,-0.3,-2.3,-0.9,-0.6,...,0.0,-2.2,0.0,-0.3,0.1,-1.5,0.4,0.675,1.25,-4.525
Mean Temp (°C),50%,11.55,6.4,6.4,6.4,6.15,5.0,7.0,5.6,7.25,7.5,...,7.7,6.7,7.6,8.4,7.0,6.75,7.2,8.8,8.4,-1.3
Mean Temp (°C),75%,18.1,15.6,13.4,14.2,13.4,13.9,14.7,14.7,14.7,14.5,...,15.0,15.9,15.25,16.2,15.3,14.7,15.4,16.6,16.7,1.1
Mean Temp (°C),max,24.8,25.9,21.7,27.0,22.2,22.5,23.1,22.3,25.0,24.5,...,22.7,23.4,23.5,23.5,25.4,26.0,24.8,25.2,25.5,7.4


In [11]:
# 1960 and 2023 have far fewer observations than the other years, which would
# distort their yearly mean temperatures, so both years are excluded.
df = df[~df['Year'].isin([1960, 2023])]

In [12]:
# Final check that no missing values remain: the data is now clean and ready for evaluation
df.isna().sum()

Date/Time             0
Year                  0
Month                 0
Day                   0
Max Temp (°C)         0
Min Temp (°C)         0
Mean Temp (°C)        0
Heat Deg Days (°C)    0
dtype: int64

In [13]:
# Shape of the cleaned DataFrame as (rows, columns)
df.shape

(22521, 8)