### This file maps the locations listed in forecast_weather.csv to their corresponding counties.

In [1]:
#### import libraries
import pandas as pd
from pathlib import Path
import json
# geopy is a Python library for locating the coordinates of addresses
# import Nominatim class from geopy library to use for geocoding 
# addresses into latitude and longitude.
from geopy.geocoders import Nominatim

In [2]:
#### load all CSV files
# directory that holds the competition data
folder_path = '/kaggle/input/predict-energy-behavior-of-prosumers'
# lazily enumerate every file in that directory whose name ends with ".csv"
csv_files = Path(folder_path).glob("*.csv")

# build a dictionary of DataFrames keyed by each file's stem, i.e. the
# file name without the directory part and without the ".csv" extension
files = {csv_file.stem: pd.read_csv(csv_file) for csv_file in csv_files}

# show which tables were loaded
print(files.keys())

dict_keys(['client', 'gas_prices', 'electricity_prices', 'weather_station_to_county_mapping', 'historical_weather', 'train', 'forecast_weather'])


In [3]:
#### load the JSON file
# The county names contain non-ASCII characters (e.g. 'JÄRVAMAA'), so the
# encoding is stated explicitly instead of relying on the platform default.
# The with statement closes the file as soon as its block is left.
with open('/kaggle/input/predict-energy-behavior-of-prosumers/county_id_to_name_map.json', encoding='utf-8') as f:
    # parse the JSON document into a Python dictionary
    # (county code as a string -> upper-case county name)
    county_codes = json.load(f)

print(county_codes)

{'0': 'HARJUMAA', '1': 'HIIUMAA', '2': 'IDA-VIRUMAA', '3': 'JÄRVAMAA', '4': 'JÕGEVAMAA', '5': 'LÄÄNE-VIRUMAA', '6': 'LÄÄNEMAA', '7': 'PÄRNUMAA', '8': 'PÕLVAMAA', '9': 'RAPLAMAA', '10': 'SAAREMAA', '11': 'TARTUMAA', '12': 'UNKNOWN', '13': 'VALGAMAA', '14': 'VILJANDIMAA', '15': 'VÕRUMAA'}


In [4]:
#### clean the JSON file
# Build the reverse lookup: shortened lower-case county name -> county code.
# NOTE: str.rstrip('maa') treats its argument as a set of characters, so it
# strips every trailing 'm' or 'a' rather than only the literal suffix "maa"
# (e.g. 'järvamaa' becomes 'järv', not 'järva'). The keys produced here are
# the ones the name_mapping dictionary further down translates to, so the
# stripping is kept exactly as it is.
parsed_counties = {
    name.lower().rstrip('maa'): code
    for code, name in county_codes.items()
}

print(parsed_counties)

{'harju': '0', 'hiiu': '1', 'ida-viru': '2', 'järv': '3', 'jõgev': '4', 'lääne-viru': '5', 'lääne': '6', 'pärnu': '7', 'põlv': '8', 'rapl': '9', 'saare': '10', 'tartu': '11', 'unknown': '12', 'valg': '13', 'viljandi': '14', 'võru': '15'}


In [5]:
#### map codes and locations
# a dictionary that maps full county names (as returned by the geocoder)
# to the shortened keys used in parsed_counties
name_mapping = {
    "valga": "valg",
    "põlva": "põlv",
    "jõgeva": "jõgev",
    "rapla": "rapl",
    "järva": "järv"
}

# initialize the geocoder
# NOTE(review): every loop iteration below issues one request to the Nominatim
# service; confirm this respects the service's usage policy (geopy provides a
# RateLimiter helper for throttling if needed).
geoLoc = Nominatim(user_agent='GetLoc')

# every distinct (latitude, longitude) pair present in the forecast data
unique_coords = files['forecast_weather'][['latitude', 'longitude']].drop_duplicates()

# loop through each unique coordinate pair
for lat, lon in unique_coords.itertuples(index=False):
    # use the geocoder to get the location information for the current coordinates
    locname = geoLoc.reverse((lat, lon))
    if not locname:
        # the geocoder found nothing for this point
        continue

    # raw is the geocoder's JSON response as a python dictionary;
    # get() avoids a KeyError when a key is absent
    address = locname.raw.get('address', {})
    if address.get('country') != "Eesti":
        # only points located in Estonia are of interest
        continue

    # the first word of the county field is used as the county name;
    # a missing or empty county would make [0] raise IndexError, so skip it
    county_words = address.get('county', '').split()
    if not county_words:
        print(f"skipping {(lat, lon)}: no county in geocoder response")
        continue
    county_name = county_words[0].lower()

    # use a name mapping dictionary or the original county name if not found
    county_name_mapped = name_mapping.get(county_name, county_name)

    # a county that is not a key of parsed_counties is reported, not a KeyError
    county_code = parsed_counties.get(county_name_mapped)
    if county_code is None:
        print(f"skipping {(lat, lon)}: unrecognised county '{county_name_mapped}'")
        continue

    # output the county information
    print(f"county: '{county_name_mapped}', county code:", county_code, (lat, lon))

county: 'saare', county code: 10 (57.6, 23.2)
county: 'võru', county code: 15 (57.6, 26.7)
county: 'võru', county code: 15 (57.6, 27.2)
county: 'saare', county code: 10 (57.9, 21.7)
county: 'saare', county code: 10 (57.9, 22.2)
county: 'saare', county code: 10 (57.9, 23.2)
county: 'pärnu', county code: 7 (57.9, 23.7)
county: 'pärnu', county code: 7 (57.9, 24.2)
county: 'valg', county code: 13 (57.9, 26.2)
county: 'võru', county code: 15 (57.9, 26.7)
county: 'võru', county code: 15 (57.9, 27.2)
county: 'võru', county code: 15 (57.9, 27.7)
county: 'saare', county code: 10 (58.2, 21.7)
county: 'saare', county code: 10 (58.2, 22.2)
county: 'saare', county code: 10 (58.2, 22.7)
county: 'saare', county code: 10 (58.2, 23.2)
county: 'pärnu', county code: 7 (58.2, 23.7)
county: 'pärnu', county code: 7 (58.2, 24.2)
county: 'pärnu', county code: 7 (58.2, 24.7)
county: 'pärnu', county code: 7 (58.2, 25.2)
county: 'viljandi', county code: 14 (58.2, 25.7)
county: 'tartu', county code: 11 (58.2, 26.

In [6]:
#### save the data
# create a new dictionary with the county codes (the values of
# parsed_counties) as keys and empty lists as values
county_data = {v: [] for v in parsed_counties.values()}

# every distinct (latitude, longitude) pair present in the forecast data
unique_coords = files['forecast_weather'][['latitude', 'longitude']].drop_duplicates()

# loop through each unique coordinate pair
# (this repeats the reverse-geocoding requests made by the printing loop above)
for lat, lon in unique_coords.itertuples(index=False):
    # use the geocoder to get the location information for the current coordinates
    locname = geoLoc.reverse((lat, lon))
    if not locname:
        # the geocoder found nothing for this point
        continue

    # raw is the geocoder's JSON response as a python dictionary;
    # get() avoids a KeyError when a key is absent
    address = locname.raw.get('address', {})
    if address.get('country') != "Eesti":
        # only points located in Estonia are kept
        continue

    # the first word of the county field is used as the county name;
    # a missing or empty county would make [0] raise IndexError, so skip it
    county_words = address.get('county', '').split()
    if not county_words:
        print(f"skipping {(lat, lon)}: no county in geocoder response")
        continue
    county_name = county_words[0].lower()

    # use a name mapping dictionary or the original county name if not found
    # get(key, default), get the value of key, otherwise using default value
    county_name = name_mapping.get(county_name, county_name)

    # a county that is not a key of parsed_counties is reported, not a KeyError
    county_code = parsed_counties.get(county_name)
    if county_code is None:
        print(f"skipping {(lat, lon)}: unrecognised county '{county_name}'")
        continue

    # record the coordinate pair under its county code
    county_data[county_code].append((lat, lon))

In [7]:
# show the mapping: county code -> list of (latitude, longitude) tuples
print(county_data)

# the key '0' has a value that is a list of 10 tuples,
# each tuple contains 2 elements (latitude, longitude).

{'0': [(59.1, 24.2), (59.1, 25.2), (59.4, 23.7), (59.4, 24.2), (59.4, 24.7), (59.4, 25.2), (59.4, 25.7), (59.7, 24.7), (59.7, 25.2), (59.7, 25.7)], '1': [(58.8, 21.7), (58.8, 22.2), (58.8, 22.7), (58.8, 23.2), (59.1, 22.2), (59.1, 22.7)], '2': [(59.1, 27.2), (59.1, 27.7), (59.4, 27.2), (59.4, 27.7)], '3': [(58.8, 25.2), (58.8, 25.7), (59.1, 25.7)], '4': [(58.5, 26.2), (58.8, 26.2), (58.8, 26.7), (58.8, 27.2)], '5': [(59.1, 26.2), (59.1, 26.7), (59.4, 26.2), (59.4, 26.7), (59.7, 26.2), (59.7, 26.7)], '6': [(58.8, 23.7), (59.1, 23.2), (59.1, 23.7), (59.4, 23.2)], '7': [(57.9, 23.7), (57.9, 24.2), (58.2, 23.7), (58.2, 24.2), (58.2, 24.7), (58.2, 25.2), (58.5, 23.7), (58.5, 24.2), (58.5, 24.7)], '8': [(58.2, 27.2)], '9': [(58.8, 24.2), (58.8, 24.7), (59.1, 24.7)], '10': [(57.6, 23.2), (57.9, 21.7), (57.9, 22.2), (57.9, 23.2), (58.2, 21.7), (58.2, 22.2), (58.2, 22.7), (58.2, 23.2), (58.5, 21.7), (58.5, 22.2), (58.5, 22.7), (58.5, 23.2)], '11': [(58.2, 26.2), (58.2, 26.7), (58.5, 26.7), (58.

In [8]:
# column-oriented container: three parallel lists, one entry per coordinate pair
df_data = {"county": [], "latitude": [], "longitude": []}

# flatten county_data (county code -> list of (latitude, longitude) tuples)
# into the three columns, keeping the counties and their points in order
for county_code, coordinates in county_data.items():
    for latitude, longitude in coordinates:
        # the county code is repeated once for every point that belongs to it
        df_data["county"].append(county_code)
        df_data["latitude"].append(latitude)
        df_data["longitude"].append(longitude)


In [9]:
# show the column-oriented dictionary ('county', 'latitude', 'longitude' lists)
print(df_data)

{'county': ['0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '1', '1', '1', '1', '1', '2', '2', '2', '2', '3', '3', '3', '4', '4', '4', '4', '5', '5', '5', '5', '5', '5', '6', '6', '6', '6', '7', '7', '7', '7', '7', '7', '7', '7', '7', '8', '9', '9', '9', '10', '10', '10', '10', '10', '10', '10', '10', '10', '10', '10', '10', '11', '11', '11', '11', '13', '14', '14', '14', '15', '15', '15', '15', '15'], 'latitude': [59.1, 59.1, 59.4, 59.4, 59.4, 59.4, 59.4, 59.7, 59.7, 59.7, 58.8, 58.8, 58.8, 58.8, 59.1, 59.1, 59.1, 59.1, 59.4, 59.4, 58.8, 58.8, 59.1, 58.5, 58.8, 58.8, 58.8, 59.1, 59.1, 59.4, 59.4, 59.7, 59.7, 58.8, 59.1, 59.1, 59.4, 57.9, 57.9, 58.2, 58.2, 58.2, 58.2, 58.5, 58.5, 58.5, 58.2, 58.8, 58.8, 59.1, 57.6, 57.9, 57.9, 57.9, 58.2, 58.2, 58.2, 58.2, 58.5, 58.5, 58.5, 58.5, 58.2, 58.2, 58.5, 58.5, 57.9, 58.2, 58.5, 58.5, 57.6, 57.6, 57.9, 57.9, 57.9], 'longitude': [24.2, 25.2, 23.7, 24.2, 24.7, 25.2, 25.7, 24.7, 25.2, 25.7, 21.7, 22.2, 22.7, 23.2, 22.2, 22.7, 27.2, 27.7, 2

In [10]:
# turn the column lists into a DataFrame and write it to the working directory;
# no `index` argument is passed, so pandas' default handling of the row index applies
county_lon_lats = pd.DataFrame(df_data)
county_lon_lats.to_csv("/kaggle/working/county_lon_lats.csv")