-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape_weather.py
169 lines (150 loc) · 6.14 KB
/
scrape_weather.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
#################################################################
# Description: Project - Part 1 - Scraping
# Author: Kwok Keung Lai
# Date: Oct 15, 2023
# Usage: This module creates an HTML parser class to scrape
# Winnipeg weather data from the Environment Canada website
#################################################################
"""This module create a HTML Parser class to scrape Winnipeg weather
data from the Environment Canada website"""
import urllib.request
from html.parser import HTMLParser
from datetime import datetime
from dateutil.relativedelta import relativedelta
class WeatherScraper(HTMLParser):
    """
    A class to scrape Winnipeg weather data from the Environment Canada website.

    The parser watches for the ``<div id="dynamicDataTable">`` element.  Inside
    a ``<tbody>`` of that div, an ``<abbr>`` whose ``title`` is a date in
    ``'%B %d, %Y'`` form starts a daily record, and the first three ``<td>``
    cells that follow are stored as the "Max", "Min" and "Mean" values.

    Results accumulate in ``self.weather`` as
    ``{"YYYY-MM-DD": {"Max": float | None, "Min": ..., "Mean": ...}}``;
    a cell that is empty or not numeric is stored as ``None``.
    """

    # Page text that scrape_weather treats as "no data for this month".
    NO_DATA_MESSAGE = "We're sorry we were unable to satisfy your request."
    # Format of the date carried by the <abbr title="..."> of a daily row.
    DATE_TITLE_FORMAT = '%B %d, %Y'
    # Keys of a daily record, in the column order they are read.
    TEMP_KEYS = ("Max", "Min", "Mean")

    def __init__(self):
        super().__init__()
        # True while the cells of a dated row are being read.
        self.in_date = False
        # True while inside the 1st / 2nd / 3rd <td> of a dated row.
        self.in_max_temp = False
        self.in_min_temp = False
        self.in_mean_temp = False
        # True once <div id="dynamicDataTable"> has been seen on the page.
        self.in_dynamic_data_table = False
        # True while inside a <tbody> of that div.
        self.in_tbody = False
        # Raw title text and parsed date of the row being read.
        self.title_date = None
        self.date = None
        # Accumulated results, keyed by ISO date string.
        self.weather = {}
        self.base_url = "http://climate.weather.gc.ca/climate_data/daily_data_e.html"
        # Month in which scraping starts; scrape_weather does not modify it.
        self.current_date = datetime.now().date()
        # Number of <td> cells seen so far in the current dated row.
        self.col_count = 0

    def fetch_weather_data(self, url, timeout=30):
        """
        Fetch weather data HTML content from the given URL.

        Args:
            url: Address to download.
            timeout: Seconds to wait on the connection before giving up.

        Returns:
            The response body decoded as UTF-8 (undecodable bytes are
            replaced), or ``None`` when the request fails; the failure is
            reported on standard output.
        """
        try:
            with urllib.request.urlopen(url, timeout=timeout) as response:
                return response.read().decode('utf-8', errors='replace')
        except OSError as url_exception:
            # URLError/HTTPError and socket timeouts are all OSError subclasses.
            print(f"Error fetching data from {url}: {url_exception}")
            return None

    def handle_starttag(self, tag, attrs):
        """
        Handle the start of an HTML tag.

        Tracks entry into the data table, starts a daily record on a dated
        ``<abbr>`` and selects which temperature a ``<td>`` cell holds.
        """
        if tag == 'div':
            if any(name == 'id' and value == 'dynamicDataTable'
                   for name, value in attrs):
                self.in_dynamic_data_table = True
            return
        if not self.in_dynamic_data_table:
            return
        if tag == 'tbody':
            self.in_tbody = True
        elif tag == 'tr':
            # A new row never inherits state from the previous one.
            self.reset_flags()
        elif tag == 'abbr' and self.in_tbody:
            self._start_row(attrs)
        elif tag == 'td' and self.in_date:
            self._start_cell()

    def handle_endtag(self, tag):
        """
        Handle the end of an HTML tag.

        Closing a cell or row clears the matching flags so that an empty
        cell cannot pass its pending state on to the next cell.
        """
        if tag == 'td':
            row_finished = self.col_count >= len(self.TEMP_KEYS)
            self._clear_temp_flags()
            if row_finished:
                self.reset_flags()
        elif tag == 'tr':
            self.reset_flags()
        elif tag == 'tbody':
            self.reset_flags()
            self.in_tbody = False

    def handle_data(self, data):
        """
        Handle the data content within an HTML tag.

        Stores the text of the current temperature cell as a float, or as
        ``None`` when it is not numeric.  Only the first non-blank text of
        a cell is used.
        """
        if not (self.in_dynamic_data_table and self.in_date):
            return
        key = self._active_key()
        if key is None or self.date is None or not data.strip():
            return
        record = self.weather.setdefault(
            self.date.isoformat(), dict.fromkeys(self.TEMP_KEYS))
        record[key] = float(data) if self.is_float(data) else None
        self._clear_temp_flags()
        if key == self.TEMP_KEYS[-1]:
            # The last wanted column has been read; ignore the rest of the row.
            self.reset_flags()

    def reset_flags(self):
        """Reset the parser flags for temperature data."""
        self.in_date = False
        self.col_count = 0
        self._clear_temp_flags()

    def scrape_weather(self, latest_date_str=None):
        """
        Scrape weather data from the website and store it in the weather dictionary.

        Pages are requested one month at a time, starting with the month of
        ``self.current_date`` and walking backwards.

        Args:
            latest_date_str: ``"YYYY-MM-DD"`` date at which to stop going
                back; its month is the last one fetched.  Defaults to
                ``"1950-01-01"`` when omitted.

        Returns:
            ``self.weather``.  Nothing is fetched when the stop date is on or
            after ``self.current_date``.  Scraping ends early when a page
            cannot be downloaded or contains the "unable to satisfy your
            request" message.

        Raises:
            ValueError: If ``latest_date_str`` is not in ``YYYY-MM-DD`` form.
        """
        if latest_date_str is None:
            latest_date_str = "1950-01-01"
        latest_date = datetime.strptime(latest_date_str, "%Y-%m-%d").date()
        if latest_date >= self.current_date:
            return self.weather

        # Compare whole months so the month containing latest_date is still
        # fetched, whatever day of the month today is.
        month = self.current_date.replace(day=1)
        oldest_month = latest_date.replace(day=1)
        while month >= oldest_month:
            url = f"{self.base_url}?StationID=27174&timeframe=2&StartYear=1840" \
                  f"&EndYear={month.year}&Day=1" \
                  f"&Year={month.year}&Month={month.month}#"
            html_data = self.fetch_weather_data(url)
            if html_data is None or self.NO_DATA_MESSAGE in html_data:
                break
            self._begin_page()
            self.feed(html_data)
            self.close()
            month = self._previous_month(month)
        return self.weather

    def is_float(self, str_data):
        """Return True when ``str_data`` can be converted by ``float()``."""
        try:
            float(str_data)
        except (TypeError, ValueError):
            return False
        return True

    def is_valid_date(self, date_string):
        """Return True when ``date_string`` is a '%B %d, %Y' date string."""
        try:
            datetime.strptime(date_string, self.DATE_TITLE_FORMAT)
        except (TypeError, ValueError):
            return False
        return True

    def _start_row(self, attrs):
        """Begin a daily record if the <abbr> title holds a date."""
        title = next((value for name, value in attrs if name == 'title'), None)
        if not self.is_valid_date(title):
            # Titles that are not dates do not start a record.
            return
        self.reset_flags()
        self.in_date = True
        self.title_date = title
        self.date = datetime.strptime(title, self.DATE_TITLE_FORMAT).date()

    def _start_cell(self):
        """Count a <td> of the current row and flag which value it holds."""
        self.col_count += 1
        self.in_max_temp = self.col_count == 1
        self.in_min_temp = self.col_count == 2
        self.in_mean_temp = self.col_count == 3
        if self.col_count == 1 and self.date is not None:
            # Start a fresh record with every key present, so an empty cell
            # reads as None instead of leaving the key missing.
            self.weather[self.date.isoformat()] = dict.fromkeys(self.TEMP_KEYS)

    def _active_key(self):
        """Return the record key of the cell being read, or None."""
        if self.in_max_temp:
            return "Max"
        if self.in_min_temp:
            return "Min"
        if self.in_mean_temp:
            return "Mean"
        return None

    def _clear_temp_flags(self):
        """Mark that no temperature cell is currently being read."""
        self.in_max_temp = False
        self.in_min_temp = False
        self.in_mean_temp = False

    def _begin_page(self):
        """Forget all per-page parser state before a new document is fed."""
        self.reset()
        self.in_dynamic_data_table = False
        self.in_tbody = False
        self.reset_flags()

    @staticmethod
    def _previous_month(day):
        """Return the first day of the month before the one containing ``day``."""
        if day.month == 1:
            return day.replace(year=day.year - 1, month=12, day=1)
        return day.replace(month=day.month - 1, day=1)
def main():
    """Scrape the weather data and print one line per day."""
    scraper = WeatherScraper()
    weather = scraper.scrape_weather()
    if not weather:
        print("No weather data retrieved. Check for errors in the script.")
        # Show the raw (empty) result to help diagnose the failure.
        print(weather)
        return
    for day, daily_temps in weather.items():
        # .get() keeps a partially filled record from raising KeyError.
        print(f"{day}: "
              f"Max={daily_temps.get('Max')}, "
              f"Min={daily_temps.get('Min')}, "
              f"Mean={daily_temps.get('Mean')}")


if __name__ == '__main__':
    main()