In [19]:
import os
import re
import requests

import geopandas

import matplotlib.pyplot as plt
from matplotlib.cm import get_cmap
import seaborn as sns

In [27]:
# ---------------------------------------------------------------------------
# Notebook-wide configuration
# ---------------------------------------------------------------------------

# When True, echo() prints its progress messages; when False it stays silent.
DEBUG = False

# Weather-station data files (metoffice.gov.uk): local folder and remote base
# URL. Both end with "/" because the download helper joins them to a file name
# by plain string concatenation.
STATIONS_DATA_DIR = "./data/part1/"
STATIONS_DATA_URL = "https://www.metoffice.gov.uk/pub/data/weather/uk/climate/stationdata/"

# Personal well-being estimates spreadsheet (ons.gov.uk): local folder, local
# file name and the full download URL.
WELLBEING_DATA_DIR = "./data/part3/"
WELLBEING_DATA_FILENAME = "geographicbreakdownreferencetable_tcm77-417203.xls"
WELLBEING_DATA_URL = (
    "https://www.ons.gov.uk/file?uri="
    "%2fpeoplepopulationandcommunity%2fwellbeing%2fdatasets"
    "%2fpersonalwellbeingestimatesgeographicalbreakdown%2f201415/"
    "geographicbreakdownreferencetable_tcm77-417203.xls"
)

# Presumably the number of clusters used by the clustering step later in the
# notebook -- TODO confirm against the cell that consumes it.
N_CLUSTERS = 5

In [21]:
def echo(s):
    """Print ``s`` only while the module-level ``DEBUG`` flag is truthy.

    Acts as a minimal debug logger: with ``DEBUG`` falsy the call is a no-op.
    Always returns ``None``.
    """
    # DEBUG is only read here, so a plain global lookup is sufficient.
    if not DEBUG:
        return
    print(s)

In [22]:
def cleanNumericValues(f, type_cast=float):
    """Extract the first number embedded in a string and cast it.

    The first substring that looks like an optionally signed integer or
    decimal (e.g. ``"12"``, ``"-3.2"``, ``".5"``) is taken; any surrounding
    characters such as trailing marker symbols are ignored.

    Parameters
    ----------
    f : str
        Raw text to search.
    type_cast : callable, default ``float``
        Applied to the matched text to build the return value.

    Returns
    -------
    The result of ``type_cast(<matched text>)``, or ``None`` when ``f``
    contains no digits.
    """
    # Raw string: "\-", "\d" and "\." are not valid *Python* string escapes, so
    # the non-raw form triggers an invalid-escape-sequence warning on modern
    # interpreters. The regular expression itself is unchanged.
    number = re.search(r"(\-?(\d*\.)?\d+)", f)
    if number is None:
        return None
    return type_cast(number.group(1))

def cleanNumericValuesInt(f):
    """Integer flavour of ``cleanNumericValues``.

    Delegates to ``cleanNumericValues`` with ``int`` as the cast, so the
    result is an ``int`` or ``None`` when no number is found in ``f``.
    """
    return cleanNumericValues(f, int)

In [23]:
def downloadDataFiles(station_filename, stations_data_dir = STATIONS_DATA_DIR, stations_data_url = STATIONS_DATA_URL):
    """Download one station data file unless it is already present locally.

    Parameters
    ----------
    station_filename : str
        File name, appended to ``stations_data_url`` to form the download URL
        and used as the local file name inside ``stations_data_dir``.
    stations_data_dir : str
        Local folder for the downloaded files. Created if it does not exist.
    stations_data_url : str
        Remote base URL; expected to end with ``/`` because the file name is
        appended to it verbatim.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status. Nothing is written to
        disk in that case, so an error page is never cached as a data file.
    """
    # Make sure the target folder exists so a fresh checkout does not fail.
    os.makedirs(stations_data_dir, exist_ok=True)
    local_path = os.path.join(stations_data_dir, station_filename)

    if os.path.exists(local_path):
        echo(f"File {local_path} already exists")
        return

    download_url = f"{stations_data_url}{station_filename}"
    echo(f"Download URL: {download_url}")
    echo(f"Download Dir: {local_path}")
    echo(f"Downloading {station_filename}...")

    http_request = requests.get(download_url)

    # Retrieve HTTP meta-data (logged before validation so failures are visible)
    echo(f"Status code: {http_request.status_code}")
    echo(f"File content type: {http_request.headers.get('content-type')}")

    # Validate BEFORE writing: otherwise an error page would be saved under the
    # data file's name and skipped as "already exists" on the next run.
    http_request.raise_for_status()

    with open(local_path, 'wb') as f:
        f.write(http_request.content)

    echo(f"Finished downloading {station_filename}...")

In [24]:
def downloadWellbeingDataset():
    """Download the well-being spreadsheet unless it is already present locally.

    Uses the module-level ``WELLBEING_DATA_URL``, ``WELLBEING_DATA_DIR`` and
    ``WELLBEING_DATA_FILENAME`` constants. The target folder is created if it
    does not exist.

    Returns
    -------
    str
        Local path of the spreadsheet (folder joined with file name).

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status. Nothing is written to
        disk in that case, so an error page is never cached as the dataset.
    """
    # Make sure the target folder exists so a fresh checkout does not fail.
    os.makedirs(WELLBEING_DATA_DIR, exist_ok=True)
    local_path = os.path.join(WELLBEING_DATA_DIR, WELLBEING_DATA_FILENAME)

    # If dataset exists skip
    if os.path.exists(local_path):
        echo(f"File {local_path} already exists")
        return local_path

    # If dataset doesn't exist, download it
    echo(f"Download URL: {WELLBEING_DATA_URL}")
    echo(f"Download Dir: {local_path}")
    echo(f"Downloading {WELLBEING_DATA_FILENAME}...")

    http_request = requests.get(f"{WELLBEING_DATA_URL}")

    # Retrieve HTTP meta-data (logged before validation so failures are visible)
    echo(f"Status code: {http_request.status_code}")
    echo(f"File content type: {http_request.headers.get('content-type')}")

    # Validate BEFORE writing: otherwise an error page would be saved under the
    # dataset's name and skipped as "already exists" on the next run.
    http_request.raise_for_status()

    with open(local_path, 'wb') as f:
        f.write(http_request.content)

    echo(f"Finished downloading {WELLBEING_DATA_FILENAME}...")

    return local_path

In [36]:
def getNumberOfLinesToSkip(file_path, stop_string):
    """Count the lines that precede the first line containing ``stop_string``.

    Parameters
    ----------
    file_path : str
        Path of the text file to scan.
    stop_string : str
        Marker text. The first line that contains it is the first line to
        keep; every line before it is to be skipped.

    Returns
    -------
    int
        Number of leading lines without ``stop_string``. When the marker never
        appears this equals the total number of lines in the file.
    """
    skipped = 0

    with open(file_path, 'r') as handle:
        for current_line in handle:
            # Stop counting at the first line that carries the marker.
            if stop_string in current_line:
                break
            skipped += 1

    return skipped

In [37]:
def getLocationData(file_path):
    """Parse the location header at the top of a station data file.

    The first line of the file is taken as the area name. Up to five further
    lines are then inspected for a line containing "location" (any case),
    e.g.::

        Location 433900E 387200N, Lat 53.381 Lon -1.490, 131 metres amsl

    If that line does not contain all three pieces (grid reference, lat/lon,
    elevation), the following line(s) are searched as well, which covers
    headers that wrap the location over more than one line.

    Parameters
    ----------
    file_path : str
        Path of the station text file.

    Returns
    -------
    dict
        Keys ``area``, ``dms_e``, ``dms_n``, ``lat``, ``lon``, ``elevation``.
        All values are the matched text (strings, not numbers); entries that
        were not found stay ``None``.
    """
    # Raw strings: "\d", "\-" and "\." are not valid Python string escapes, so
    # the non-raw literals trigger invalid-escape-sequence warnings on modern
    # interpreters. The regular expressions themselves are unchanged.
    grid_pattern = r"(\d+E) *(\d+N)"
    lat_lon_pattern = r"Lat *(\-?\d+(\.\d+)?) Lon *(\-?\d+(\.\d+)?)"
    elevation_pattern = r"(\d+) *(m|metres)"

    with open(file_path, 'r') as f:

        location_data = {"area": None, "dms_e": None, "dms_n": None, "lat": None, "lon": None, "elevation": None}
        location_data["area"] = f.readline().strip()

        # True when the previously parsed line lacked at least one component,
        # meaning the current line must be parsed too.
        search_next_line = False

        for _ in range(5):

            line = f.readline()

            # Check if we reached the end of the file
            if line == '':
                break

            if "location" not in line.lower() and not search_next_line:
                continue

            grid = re.search(grid_pattern, line)
            if grid is not None:
                location_data["dms_e"] = grid.group(1)
                location_data["dms_n"] = grid.group(2)

            lat_lon = re.search(lat_lon_pattern, line)
            if lat_lon is not None:
                location_data["lat"] = lat_lon.group(1)
                location_data["lon"] = lat_lon.group(3)

            elevation = re.search(elevation_pattern, line)
            if elevation is not None:
                location_data["elevation"] = elevation.group(1)

            # Keep scanning if any component was missing from this line.
            search_next_line = grid is None or lat_lon is None or elevation is None

    return location_data

In [3]:
def _dominantClusterPerStation(observations):
    """Return, per station, the cluster(s) holding most of its observations, sorted by cluster."""
    # Group WITHOUT as_index=False: size() is then a Series on every pandas
    # version, so reset_index(name=...) is valid. With as_index=False, pandas
    # >= 1.1 returns a DataFrame whose reset_index() rejects ``name``.
    counts = observations.groupby(by=["station", "cluster"]).size().reset_index(name="count")
    best = counts.groupby(by=["station"], as_index=False)["count"].max()
    # NOTE: a station whose highest count is tied between clusters keeps one
    # row per tied cluster (same as the inner merge has always done).
    return best.merge(counts, on=["station", "count"], how="inner").sort_values(by="cluster")


def _drawClusterPanel(ax, uk_outline, clustered_stations, hide_legend, cluster_labels):
    """Draw the UK outline plus one labelled, annotated point layer per cluster on ``ax``."""
    uk_outline.plot(color='white', edgecolor='black', ax=ax)

    for cluster in clustered_stations.cluster.unique():
        members = station_locations.merge(clustered_stations[clustered_stations.cluster == cluster], on=["station"], how="inner")
        gdf = geopandas.GeoDataFrame(members, geometry=geopandas.points_from_xy(members.lon, members.lat))

        # Set the cluster label
        label = f"Cluster {cluster}"
        if cluster_labels is not None:
            label = f"{cluster_labels[cluster]}"
        gdf.plot(ax=ax, label=label)

        # Show station names on the map
        for x, y, station_name in zip(gdf.geometry.x, gdf.geometry.y, gdf.station):
            ax.annotate(station_name.capitalize(), xy=(x, y), xytext=(3, 3), textcoords="offset points", fontsize="x-small")

    # Show the legend (list of clusters)
    if not hide_legend:
        ax.legend()


def plotMap(X, X2=None, hide_legend=False, cluster_labels:dict=None):
    """Plot the weather stations on a UK map, coloured by their dominant cluster.

    Each station is assigned the cluster under which most of its observations
    were categorised. When ``X2`` is given, a second map is drawn next to the
    first one (shared axes) for that second clustering.

    Reads the module-level ``station_locations`` DataFrame, which must provide
    ``station``, ``lon`` and ``lat`` columns.

    Parameters
    ----------
    X : pandas.DataFrame
        Observations with at least ``station`` and ``cluster`` columns.
    X2 : pandas.DataFrame, optional
        Second set of observations with the same columns, drawn on the right.
    hide_legend : bool, default False
        Skip the cluster legend when True.
    cluster_labels : dict, optional
        Maps a cluster id to its legend text; without it "Cluster <id>" is used.

    Returns
    -------
    tuple
        ``(fig, ax1, ax2)``; ``ax2`` is ``None`` when ``X2`` is not given.
    """
    # Remove the default grid of Seaborn
    sns.set_style("white")

    clustered_stations = _dominantClusterPerStation(X)

    # pyplot.get_cmap is used because matplotlib.cm.get_cmap was removed in
    # Matplotlib 3.9.
    palette = plt.get_cmap("tab10").colors

    # Prepare the plot's canvas
    ax2 = None
    if X2 is not None:
        fig, (ax1, ax2) = plt.subplots(ncols=2, sharex=True, sharey=True, figsize=(15, 10))
        ax2.set_prop_cycle(color=palette)
    else:
        fig, ax1 = plt.subplots(ncols=1, figsize=(8, 10))
    ax1.set_prop_cycle(color=palette)

    # Base map, restricted to the UK.
    # NOTE(review): geopandas.datasets is deprecated in recent geopandas
    # releases -- confirm the pinned version still ships this dataset.
    world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))
    uk_outline = world[world.name == "United Kingdom"]

    _drawClusterPanel(ax1, uk_outline, clustered_stations, hide_legend, cluster_labels)

    if X2 is not None:
        _drawClusterPanel(ax2, uk_outline, _dominantClusterPerStation(X2), hide_legend, cluster_labels)

    return fig, ax1, ax2

In [30]:
def plotSingleMap(X, ax, hide_legend=False, show_station_names=False, cluster_labels=None):
    """Plot the weather stations on a UK map drawn on an existing axes.

    Each station is assigned the cluster under which most of its observations
    were categorised and is coloured accordingly.

    Reads the module-level ``station_locations`` DataFrame, which must provide
    ``station``, ``lon`` and ``lat`` columns.

    Parameters
    ----------
    X : pandas.DataFrame
        Observations with at least ``station`` and ``cluster`` columns.
    ax : matplotlib.axes.Axes
        Axes to draw on.
    hide_legend : bool, default False
        Skip the cluster legend when True.
    show_station_names : bool, default False
        Annotate every point with its capitalised station name when True.
    cluster_labels : mapping, optional
        Indexed by cluster id to obtain the legend text; without it
        "Cluster <id>" is used.

    Returns
    -------
    matplotlib.axes.Axes
        The same ``ax`` that was passed in.
    """
    # Remove the default grid of Seaborn
    sns.set_style("white")

    # Count observations per (station, cluster), then keep for every station
    # the cluster with the highest count.
    # Grouping WITHOUT as_index=False keeps size() a Series on every pandas
    # version, so reset_index(name=...) is valid; with as_index=False, pandas
    # >= 1.1 returns a DataFrame whose reset_index() rejects ``name``.
    station_cluster_counts = X.groupby(by=["station", "cluster"]).size().reset_index(name="count")
    top_counts = station_cluster_counts.groupby(by=["station"], as_index=False)["count"].max()
    # NOTE: a station whose highest count is tied between clusters keeps one
    # row per tied cluster.
    clustered_stations = top_counts.merge(station_cluster_counts, on=["station", "count"], how="inner")
    clustered_stations = clustered_stations.sort_values(by="cluster")

    # Colour cycle for the cluster layers. pyplot.get_cmap is used because
    # matplotlib.cm.get_cmap was removed in Matplotlib 3.9.
    ax.set_prop_cycle(color=plt.get_cmap("tab20").colors)

    # Base map, restricted to the UK.
    # NOTE(review): geopandas.datasets is deprecated in recent geopandas
    # releases -- confirm the pinned version still ships this dataset.
    world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))
    world[world.name == "United Kingdom"].plot(color='white', edgecolor='black', ax=ax)

    # Plot one point layer per cluster
    for cluster in clustered_stations.cluster.unique():
        members = station_locations.merge(clustered_stations[clustered_stations.cluster == cluster], on=["station"], how="inner")
        gdf = geopandas.GeoDataFrame(members, geometry=geopandas.points_from_xy(members.lon, members.lat))

        # Set the cluster label
        label = f"Cluster {cluster}"
        if cluster_labels is not None:
            label = f"{cluster_labels[cluster]}"
        gdf.plot(ax=ax, label=label)

        # Show station names on the map
        if show_station_names:
            for x, y, station_name in zip(gdf.geometry.x, gdf.geometry.y, gdf.station):
                ax.annotate(station_name.capitalize(), xy=(x, y), xytext=(3, 3), textcoords="offset points", fontsize="x-small")

    if not hide_legend:
        ax.legend()

    return ax