# Downloading and Processing Weather Data
In order to properly download the weather data, we need to know which of the ERA5 grid cells they fall into. In order to do this, I will create a modified version of the locations dataset, corresponding each voltage station to a netCDF grid cell. The lat and lon of which represents the center of its respective grid cell. Through experimentation, we know that these lie on the 0.1 marks of lat and lon, meaning the grid cells all span from +- 0.05 of that center, for example from lat 30.15 to 30.25, and lon 76.25 to 76.35.

In [1]:
import netCDF4 as nc
import pandas as pd
from datetime import datetime, timedelta
import os
from tqdm import tqdm
import numpy as np
import cdsapi

In [2]:
locations_path = "../ESMI_India_538_locations.csv"
locations = pd.read_csv(locations_path, dtype={"ESMI_ID" : str, "station_id" : int}, usecols=["station_id", "ESMI_ID", "Location name", "District", "State", "Lat", "Lon"])

In [3]:
uniform_dir = "../india_processing/india_uniform"

# For every station that we have collected the uniform data for, we list down the minimum and maximum time for that station, giving it a month on either side
for file in tqdm(os.listdir(uniform_dir)):
    station_id = int(file.split("_")[1].split(".")[0])
    station_df = pd.read_csv(os.path.join(uniform_dir, file), header=0, index_col=False, parse_dates=['time'])
    station_min = station_df['time'].min()
    station_max = station_df['time'].max()
    
    if station_min.month == 1:
        station_min = datetime(station_min.year - 1, 12, 1)
    else:
        station_min = datetime(station_min.year, station_min.month - 1, 1)

    if station_max.month == 12:
        station_max = datetime(station_max.year + 1, 1, 1)
    else:
        station_max = datetime(station_max.year, station_max.month + 1, 1)

    locations.loc[locations['station_id'] == station_id, 'era_min_time'] = station_min
    locations.loc[locations['station_id'] == station_id, 'era_max_time'] = station_max

  0%|          | 0/536 [00:00<?, ?it/s]

100%|██████████| 536/536 [08:40<00:00,  1.03it/s]


We drop all those locations that we didn't create uniform data for.

In [4]:
locations = locations.dropna(subset=['era_min_time', 'era_max_time'])

Then for each station, we consider which grid cell we need to download. We'll do this simply by calculating which grid center (multiple of 0.1) it is closest to in terms of both longitude and latitude.

In [5]:
for station_id in tqdm(locations['station_id']):
    lat = locations.loc[locations['station_id'] == station_id, 'Lat']
    lon = locations.loc[locations['station_id'] == station_id, 'Lon']

    era_lat = np.round(lat * 10) / 10
    era_lon = np.round(lon * 10) / 10

    locations.loc[locations['station_id'] == station_id, 'era_lat'] = era_lat
    locations.loc[locations['station_id'] == station_id, 'era_lon'] = era_lon

100%|██████████| 536/536 [00:00<00:00, 785.92it/s]


Below we can determine that some of the stations share the same ERA5 grid cell. Therefore, we'll download the ERA5 data and label it by its actual grid cell, that way we can reuse as needed.

In [6]:
weather = set()
for i in range(len(locations)):
    weather.add((locations.iloc[i]['era_lat'], locations.iloc[i]['era_lon']))

print(len(weather))

308


In [7]:
locations.to_csv("../era5_locations.csv", index=False)

We can now proceed to actual perform the downloading of the data, looping through each station and time, skipping the ones that have already been downloaded.

In [20]:
weather_dir = 'E:\Georgia Institute of Technology\He, Yiyi - 01_data\EAR5\\final_verification\ERA5'

if os.path.exists(weather_dir) == False:
    os.makedirs(weather_dir)

c = cdsapi.Client()

for i in range(len(locations)):
    lat = locations.iloc[i]['era_lat']
    lon = locations.iloc[i]['era_lon']
    min_time = locations.iloc[i]['era_min_time']
    max_time = locations.iloc[i]['era_max_time']

    year = min_time.year
    month = min_time.month
    curr_time = datetime(year, month, 1)

    while curr_time <= max_time:
        year_s = str(curr_time.year)
        month_s = str(curr_time.month).zfill(2)

        print(curr_time)
        print(year_s)
        print(month_s)

        filename = f'ERA5_{year_s}_{month_s}_{lat}_{lon}.nc'
        if os.path.exists(os.path.join(weather_dir, filename)) == False:
            r = c.retrieve(
            'reanalysis-era5-land', {
                    'variable'    : ['2m_temperature', '10m_u_component_of_wind', '10m_v_component_of_wind', 'total_precipitation'],
                    'year'        : year_s,
                    'month'       : month_s,
                    'day': [
                        '01', '02', '03',
                        '04', '05', '06',
                        '07', '08', '09',
                        '10', '11', '12',
                        '13', '14', '15',
                        '16', '17', '18',
                        '19', '20', '21',
                        '22', '23', '24',
                        '25', '26', '27',
                        '28', '29', '30',
                        '31',
                        ],
                    'time': [
                        '00:00', '01:00', '02:00',
                        '03:00', '04:00', '05:00',
                        '06:00', '07:00', '08:00',
                        '09:00', '10:00', '11:00',
                        '12:00', '13:00', '14:00',
                        '15:00', '16:00', '17:00',
                        '18:00', '19:00', '20:00',
                        '21:00', '22:00', '23:00',
                    ],
                    'area'          : [lat, lon, lat, lon],
                    'format'      : 'netcdf'
            })

            r.download(os.path.join(weather_dir, filename))
        else:
            print(f'File {filename} already exists')

        if curr_time.month == 12:
            curr_time = datetime(curr_time.year + 1, 1, 1)
        else:
            curr_time = datetime(curr_time.year, curr_time.month + 1, 1)
    
    print(f'{curr_time}')
    print(str(max_time))
    print()

2016-06-01 00:00:00
2016
06
File ERA5_2016_6_12.9_77.6.nc already exists
2016-07-01 00:00:00
2016
07
File ERA5_2016_6_12.9_77.6.nc already exists
2016-08-01 00:00:00
2016
08
File ERA5_2016_6_12.9_77.6.nc already exists
2016-09-01 00:00:00
2016
09
File ERA5_2016_6_12.9_77.6.nc already exists
2016-10-01 00:00:00
2016
10
File ERA5_2016_6_12.9_77.6.nc already exists
2016-11-01 00:00:00
2016
11
File ERA5_2016_6_12.9_77.6.nc already exists
2016-12-01 00:00:00
2016
12
File ERA5_2016_6_12.9_77.6.nc already exists
2017-01-01 00:00:00
2017
01
File ERA5_2016_6_12.9_77.6.nc already exists
2017-02-01 00:00:00
2017
02
File ERA5_2016_6_12.9_77.6.nc already exists
2017-03-01 00:00:00
2017
03
File ERA5_2016_6_12.9_77.6.nc already exists
2017-04-01 00:00:00
2017
04
File ERA5_2016_6_12.9_77.6.nc already exists
2017-05-01 00:00:00
2017
05
File ERA5_2016_6_12.9_77.6.nc already exists
2017-06-01 00:00:00
2017
06
File ERA5_2016_6_12.9_77.6.nc already exists
2017-07-01 00:00:00
2017
07
File ERA5_2016_6_12.9_7

In [22]:
year_s = str(curr_time.year)
month_s = str(curr_time.month).zfill(2)

for file in os.listdir(weather_dir):
    if file.endswith(".nc"):
        parts = file.split("_")
        parts[2] = parts[2].zfill(2)
        
        new_file_name = "".join(parts)
        print(new_file_name)
        #path = os.path.join(weather_dir, file)

Year: 2014, Month: 09
Year: 2014, Month: 09
Year: 2014, Month: 09
Year: 2014, Month: 09
Year: 2014, Month: 09
Year: 2014, Month: 10
Year: 2014, Month: 10
Year: 2014, Month: 10
Year: 2014, Month: 10
Year: 2014, Month: 10
Year: 2014, Month: 10
Year: 2014, Month: 10
Year: 2014, Month: 10
Year: 2014, Month: 10
Year: 2014, Month: 10
Year: 2014, Month: 10
Year: 2014, Month: 10
Year: 2014, Month: 10
Year: 2014, Month: 10
Year: 2014, Month: 10
Year: 2014, Month: 10
Year: 2014, Month: 11
Year: 2014, Month: 11
Year: 2014, Month: 11
Year: 2014, Month: 11
Year: 2014, Month: 11
Year: 2014, Month: 11
Year: 2014, Month: 11
Year: 2014, Month: 11
Year: 2014, Month: 11
Year: 2014, Month: 11
Year: 2014, Month: 11
Year: 2014, Month: 11
Year: 2014, Month: 12
Year: 2014, Month: 12
Year: 2014, Month: 12
Year: 2014, Month: 12
Year: 2014, Month: 12
Year: 2014, Month: 12
Year: 2014, Month: 12
Year: 2014, Month: 12
Year: 2014, Month: 12
Year: 2014, Month: 12
Year: 2014, Month: 12
Year: 2014, Month: 12
Year: 2014

In [12]:
nc_file = nc.Dataset(r'C:\Users\nytig\repos\Resilient_energy_network\weather_2017_01.nc', 'r')

print(nc_file.variables.keys())

dict_keys(['longitude', 'latitude', 'time', 't2m', 'u10', 'v10', 'tp'])


In [17]:
lon = nc_file.variables['longitude'][:]
time = nc_file.variables['time']
time = nc_file.variables[''][:]

In [55]:
lon = test.variables.keys()
lon

dict_keys(['longitude', 'latitude', 'time', 't2m', 'u10', 'v10', 'tp'])

In [74]:
tp = test.variables['tp'][:]
len(tp)

72

In [57]:
time = test.variables['time'][:]

In [59]:
time[0]

1025616

In [66]:
int(time[0])

1025616

In [44]:
test.close()

In [68]:
base = datetime(1900, 1, 1)
print(base + timedelta(hours = int(time[-1])))

2017-01-03 23:00:00
