# NetCDF4 to Pandas DataFrame
---
This code reads a NetCDF4 file (.nc) and saves the values into a DataFrame. Currently, the NetCDF4 file that this code reads includes the variables: latitude, longitude, time, the u component of wind, and the v component of wind.

In [1]:
import pandas as pd
import datetime
import netCDF4 as nc

#### Input NetCDF4 file is from [Copernicus EU](https://cds.climate.copernicus.eu/cdsapp#!/dataset/reanalysis-era5-single-levels-monthly-means?tab=overview) dated 2007 to 2020.

The downloaded NC file includes monthly observations from Jan 2007 to Dec 2020 (168 months) and must be trimmed to Jan 2008 – Dec 2020 (156 months) to match the Global Wind Atlas input map.

The latitude and longitude may be saved as is. The datetime must be derived from the time. Lastly, wind speed must be computed from its u and v components as $\sqrt{u^2 + v^2}$. Each DataFrame row represents one monthly observation at one grid point, which corresponds to a unique latitude-longitude-month combination (**71 by 45 by 156 = 498420 rows** after trimming).

In [2]:
# Location of the ERA5 monthly-mean 100 m wind file, relative to the notebook.
filename = "data/ERA5.wind_uv_100m_PH_2007-2020.nc"
# Open read-only ("r" is also netCDF4's default mode); the handle is reused by later cells.
ds = nc.Dataset(filename, mode="r")

In [3]:
# Summarize the file: data model, global attributes, dimension sizes and variables.
print(ds)

<class 'netCDF4._netCDF4.Dataset'>
root group (NETCDF3_64BIT_OFFSET data model, file format NETCDF3):
    Conventions: CF-1.6
    history: 2021-03-16 14:56:58 GMT by grib_to_netcdf-2.16.0: /opt/ecmwf/eccodes/bin/grib_to_netcdf -S param -o /cache/data8/adaptor.mars.internal-1615906569.1779313-4262-9-ab629e05-1c42-4c97-ad78-f865d852d412.nc /cache/tmp/ab629e05-1c42-4c97-ad78-f865d852d412-adaptor.mars.internal-1615906569.1784804-4262-3-tmp.grib
    dimensions(sizes): longitude(45), latitude(71), time(168)
    variables(dimensions): float32 longitude(longitude), float32 latitude(latitude), int32 time(time), int16 u100(time, latitude, longitude), int16 v100(time, latitude, longitude)
    groups: 


In [4]:
# Global attributes of the root group as a plain name -> value mapping.
print(ds.__dict__)

{'Conventions': 'CF-1.6', 'history': '2021-03-16 14:56:58 GMT by grib_to_netcdf-2.16.0: /opt/ecmwf/eccodes/bin/grib_to_netcdf -S param -o /cache/data8/adaptor.mars.internal-1615906569.1779313-4262-9-ab629e05-1c42-4c97-ad78-f865d852d412.nc /cache/tmp/ab629e05-1c42-4c97-ad78-f865d852d412-adaptor.mars.internal-1615906569.1784804-4262-3-tmp.grib'}


In [5]:
# Print every dimension (name and size), in the order the file declares them.
for dim_name in ds.dimensions:
    print(ds.dimensions[dim_name])

<class 'netCDF4._netCDF4.Dimension'>: name = 'longitude', size = 45
<class 'netCDF4._netCDF4.Dimension'>: name = 'latitude', size = 71
<class 'netCDF4._netCDF4.Dimension'>: name = 'time', size = 168


In [6]:
# Print the metadata of every variable (dtype, dimensions, attributes, shape).
for var_name in ds.variables:
    print(ds.variables[var_name])

<class 'netCDF4._netCDF4.Variable'>
float32 longitude(longitude)
    units: degrees_east
    long_name: longitude
unlimited dimensions: 
current shape = (45,)
filling on, default _FillValue of 9.969209968386869e+36 used
<class 'netCDF4._netCDF4.Variable'>
float32 latitude(latitude)
    units: degrees_north
    long_name: latitude
unlimited dimensions: 
current shape = (71,)
filling on, default _FillValue of 9.969209968386869e+36 used
<class 'netCDF4._netCDF4.Variable'>
int32 time(time)
    units: hours since 1900-01-01 00:00:00.0
    long_name: time
    calendar: gregorian
unlimited dimensions: 
current shape = (168,)
filling on, default _FillValue of -2147483647 used
<class 'netCDF4._netCDF4.Variable'>
int16 u100(time, latitude, longitude)
    scale_factor: 0.00035470298897597396
    add_offset: -2.656274763603863
    _FillValue: -32767
    missing_value: -32767
    units: m s**-1
    long_name: 100 metre U wind component
unlimited dimensions: 
current shape = (168, 71, 45)
filling on

In [7]:
# Metadata of the 100 m U wind component (packing parameters, units, shape).
print(ds.variables["u100"])

<class 'netCDF4._netCDF4.Variable'>
int16 u100(time, latitude, longitude)
    scale_factor: 0.00035470298897597396
    add_offset: -2.656274763603863
    _FillValue: -32767
    missing_value: -32767
    units: m s**-1
    long_name: 100 metre U wind component
unlimited dimensions: 
current shape = (168, 71, 45)
filling on


In [8]:
# Metadata of the 100 m V wind component (packing parameters, units, shape).
print(ds.variables["v100"])

<class 'netCDF4._netCDF4.Variable'>
int16 v100(time, latitude, longitude)
    scale_factor: 0.0003277946693250785
    add_offset: -1.400058134852973
    _FillValue: -32767
    missing_value: -32767
    units: m s**-1
    long_name: 100 metre V wind component
unlimited dimensions: 
current shape = (168, 71, 45)
filling on


In [9]:
# Load the whole U component into memory as a masked array indexed (time, latitude, longitude).
u = ds.variables["u100"][:]
u

masked_array(
  data=[[[-8.25703496e+00, -8.16232926e+00, -7.96937084e+00, ...,
          -6.57928982e+00, -6.57077695e+00, -6.55658883e+00],
         [-8.25384263e+00, -8.13927357e+00, -7.98852480e+00, ...,
          -6.74387201e+00, -6.73748735e+00, -6.69598711e+00],
         [-8.25774437e+00, -8.14885055e+00, -8.04705079e+00, ...,
          -6.86801805e+00, -6.87440271e+00, -6.83786830e+00],
         ...,
         [ 4.26602795e-02, -9.95756191e-02, -1.38947651e-01, ...,
          -1.64785417e+00, -1.76348734e+00, -1.84365022e+00],
         [ 7.10365186e-02,  8.06134993e-02, -5.55924484e-02, ...,
          -1.31975390e+00, -1.43503237e+00, -1.43716059e+00],
         [ 3.48568138e-02,  1.06152115e-01,  2.70533480e-02, ...,
          -1.01435463e+00, -1.07075240e+00, -1.06117542e+00]],

        [[-6.32567718e+00, -6.28524104e+00, -6.17705663e+00, ...,
          -5.04768232e+00, -4.99908801e+00, -4.97780583e+00],
         [-6.38562199e+00, -6.34447644e+00, -6.29694624e+00, ...,
        

In [10]:
# Load the whole V component into memory as a masked array indexed (time, latitude, longitude).
v = ds.variables["v100"][:]
v

masked_array(
  data=[[[-8.36372809e+00, -8.62924177e+00, -8.83739139e+00, ...,
          -4.48034464e+00, -4.50099571e+00, -4.54098666e+00],
         [-8.44075984e+00, -8.62006352e+00, -8.79543367e+00, ...,
          -4.51836882e+00, -4.55016491e+00, -4.56589905e+00],
         [-8.50074626e+00, -8.65022063e+00, -8.78986116e+00, ...,
          -4.54295342e+00, -4.55803198e+00, -4.55409844e+00],
         ...,
         [-9.88675825e-01, -1.13552784e+00, -1.13126651e+00, ...,
          -4.14304393e+00, -4.24990499e+00, -4.11649256e+00],
         [-8.50018680e-01, -8.21172749e-01, -7.13328303e-01, ...,
          -3.97259070e+00, -4.07945176e+00, -3.96144568e+00],
         [-5.61887165e-01, -3.90450553e-01, -1.69189152e-01, ...,
          -4.04667229e+00, -3.94636713e+00, -3.80639880e+00]],

        [[-3.07017198e+00, -3.22357988e+00, -3.29667809e+00, ...,
          -2.19332123e+00, -2.26346929e+00, -2.34181222e+00],
         [-2.94823236e+00, -3.04788194e+00, -3.14654813e+00, ...,
        

In [11]:
# Size of the leading (time) axis of both components.
u.shape[0], v.shape[0]

(168, 168)

In [12]:
# Size of the second (latitude) axis of both components.
u.shape[1], v.shape[1]

(71, 71)

In [13]:
# Size of the third (longitude) axis of both components.
u.shape[2], v.shape[2]

(45, 45)

In [14]:
# U values along longitude for the first time step and first latitude.
u[0, 0]

masked_array(data=[-8.25703496, -8.16232926, -7.96937084, -7.7512285 ,
                   -7.54088962, -7.29330694, -6.99606583, -6.68499131,
                   -6.33809179, -6.03659425, -5.75708829, -5.63684398,
                   -5.79220389, -6.30794204, -7.3358713 , -8.7248882 ,
                   -9.67052637, -9.87235237, -9.4608969 , -8.80114934,
                   -8.31094981, -7.89630202, -7.58345398, -7.32735843,
                   -7.12446832, -7.00138638, -6.96165964, -6.85737697,
                   -6.79778686, -6.70485468, -6.65058512, -6.59205913,
                   -6.5055116 , -6.51508858, -6.58673858, -6.63355938,
                   -6.62788413, -6.60270022, -6.59454205, -6.6122772 ,
                   -6.6122772 , -6.60873017, -6.57928982, -6.57077695,
                   -6.55658883],
             mask=False,
       fill_value=1e+20)

In [15]:
# V values along longitude for the first time step and first latitude.
v[0, 0]

masked_array(data=[-8.36372809, -8.62924177, -8.83739139, -8.98588237,
                   -9.12191716, -9.20353803, -9.21927218, -9.19534317,
                   -9.05865279, -8.81149561, -8.35815558, -7.80418259,
                   -7.1453153 , -6.40286038, -5.88723936, -6.05212008,
                   -6.77326835, -7.51801784, -7.91169924, -7.82942278,
                   -7.47704351, -7.20202378, -6.95027748, -6.62936649,
                   -6.25863072, -5.92133001, -5.67187827, -5.31884341,
                   -5.13363942, -5.00711068, -4.91500037, -4.83108494,
                   -4.73831905, -4.64686433, -4.66063171, -4.62391871,
                   -4.58589453, -4.54000327, -4.51443529, -4.5567208 ,
                   -4.51312411, -4.50394586, -4.48034464, -4.50099571,
                   -4.54098666],
             mask=False,
       fill_value=1e+20)

In [16]:
# Longitude coordinate values (degrees east) and how many there are.
longs = ds.variables["longitude"][:]
longs, longs.size

(masked_array(data=[116.5 , 116.75, 117.  , 117.25, 117.5 , 117.75, 118.  ,
                    118.25, 118.5 , 118.75, 119.  , 119.25, 119.5 , 119.75,
                    120.  , 120.25, 120.5 , 120.75, 121.  , 121.25, 121.5 ,
                    121.75, 122.  , 122.25, 122.5 , 122.75, 123.  , 123.25,
                    123.5 , 123.75, 124.  , 124.25, 124.5 , 124.75, 125.  ,
                    125.25, 125.5 , 125.75, 126.  , 126.25, 126.5 , 126.75,
                    127.  , 127.25, 127.5 ],
              mask=False,
        fill_value=1e+20,
             dtype=float32),
 45)

In [17]:
# Latitude coordinate values (degrees north) and how many there are.
lats = ds.variables["latitude"][:]
lats, lats.size

(masked_array(data=[21.5 , 21.25, 21.  , 20.75, 20.5 , 20.25, 20.  , 19.75,
                    19.5 , 19.25, 19.  , 18.75, 18.5 , 18.25, 18.  , 17.75,
                    17.5 , 17.25, 17.  , 16.75, 16.5 , 16.25, 16.  , 15.75,
                    15.5 , 15.25, 15.  , 14.75, 14.5 , 14.25, 14.  , 13.75,
                    13.5 , 13.25, 13.  , 12.75, 12.5 , 12.25, 12.  , 11.75,
                    11.5 , 11.25, 11.  , 10.75, 10.5 , 10.25, 10.  ,  9.75,
                     9.5 ,  9.25,  9.  ,  8.75,  8.5 ,  8.25,  8.  ,  7.75,
                     7.5 ,  7.25,  7.  ,  6.75,  6.5 ,  6.25,  6.  ,  5.75,
                     5.5 ,  5.25,  5.  ,  4.75,  4.5 ,  4.25,  4.  ],
              mask=False,
        fill_value=1e+20,
             dtype=float32),
 71)

In [18]:
# Raw time axis; the variable's units are "hours since 1900-01-01 00:00:00.0".
times = ds.variables["time"][:]
times

masked_array(data=[ 937944,  938688,  939360,  940104,  940824,  941568,
                    942288,  943032,  943776,  944496,  945240,  945960,
                    946704,  947448,  948144,  948888,  949608,  950352,
                    951072,  951816,  952560,  953280,  954024,  954744,
                    955488,  956232,  956904,  957648,  958368,  959112,
                    959832,  960576,  961320,  962040,  962784,  963504,
                    964248,  964992,  965664,  966408,  967128,  967872,
                    968592,  969336,  970080,  970800,  971544,  972264,
                    973008,  973752,  974424,  975168,  975888,  976632,
                    977352,  978096,  978840,  979560,  980304,  981024,
                    981768,  982512,  983208,  983952,  984672,  985416,
                    986136,  986880,  987624,  988344,  989088,  989808,
                    990552,  991296,  991968,  992712,  993432,  994176,
                    994896,  995640,  996384,  9971

In [19]:
# One month-start timestamp per time step in the file.
# NOTE(review): the start date is hard-coded rather than decoded from ds["time"];
# this assumes the first time step is January 2007 and the steps are consecutive months.
n_steps = len(ds["time"])
months = pd.date_range("2007-01-01", periods=n_steps, freq="MS")
months

DatetimeIndex(['2007-01-01', '2007-02-01', '2007-03-01', '2007-04-01',
               '2007-05-01', '2007-06-01', '2007-07-01', '2007-08-01',
               '2007-09-01', '2007-10-01',
               ...
               '2020-03-01', '2020-04-01', '2020-05-01', '2020-06-01',
               '2020-07-01', '2020-08-01', '2020-09-01', '2020-10-01',
               '2020-11-01', '2020-12-01'],
              dtype='datetime64[ns]', length=168, freq='MS')

In [20]:
# Flatten the (time, latitude, longitude) grids into one record per grid point per month.
# Iteration order is month -> longitude -> latitude, so latitude varies fastest.
df_vals = [
    {
        "date": months[t_idx],
        "lon": longs[lon_idx],
        "lat": lats[lat_idx],
        "u": u[t_idx][lat_idx][lon_idx],
        "v": v[t_idx][lat_idx][lon_idx],
    }
    for t_idx in range(len(months))    # 168
    for lon_idx in range(len(longs))   # 45
    for lat_idx in range(len(lats))    # 71
]
len(df_vals)

536760

In [31]:
# Turn the records into a table; `columns` fixes the column order.
df = pd.DataFrame(
    data=df_vals,
    columns=["date", "lat", "lon", "u", "v"],
)
df

Unnamed: 0,date,lat,lon,u,v
0,2007-01-01,21.50,116.5,-8.257035,-8.363728
1,2007-01-01,21.25,116.5,-8.253843,-8.440760
2,2007-01-01,21.00,116.5,-8.257744,-8.500746
3,2007-01-01,20.75,116.5,-8.290377,-8.566961
4,2007-01-01,20.50,116.5,-8.343582,-8.631209
...,...,...,...,...,...
536755,2020-12-01,5.00,127.5,-0.639788,-1.543960
536756,2020-12-01,4.75,127.5,-0.479817,-1.541665
536757,2020-12-01,4.50,127.5,-0.296790,-1.533798
536758,2020-12-01,4.25,127.5,-0.096029,-1.499708


In [32]:
# Keep only rows dated after December 2007 and renumber the rows from zero.
after_2007 = df["date"] > "2007-12-01"
df = df.loc[after_2007].reset_index(drop=True)
df

Unnamed: 0,date,lat,lon,u,v
0,2008-01-01,21.50,116.5,-8.495750,-7.432136
1,2008-01-01,21.25,116.5,-8.502489,-7.396406
2,2008-01-01,21.00,116.5,-8.523772,-7.323636
3,2008-01-01,20.75,116.5,-8.613157,-7.253815
4,2008-01-01,20.50,116.5,-8.735175,-7.220708
...,...,...,...,...,...
498415,2020-12-01,5.00,127.5,-0.639788,-1.543960
498416,2020-12-01,4.75,127.5,-0.479817,-1.541665
498417,2020-12-01,4.50,127.5,-0.296790,-1.533798
498418,2020-12-01,4.25,127.5,-0.096029,-1.499708


In [33]:
import math
def get_wind_speed(u, v):
    """Return the wind speed given its two horizontal components.

    Parameters
    ----------
    u, v : float
        Scalar wind components (same unit for both).

    Returns
    -------
    float
        The Euclidean magnitude ``sqrt(u**2 + v**2)``, in the unit of the inputs.
    """
    # math.hypot computes the magnitude without squaring the inputs first,
    # so very large or very small components do not overflow/underflow.
    return math.hypot(u, v)

In [34]:
# Confirm the column names before deriving the wind speed column.
df.columns

Index(['date', 'lat', 'lon', 'u', 'v'], dtype='object')

In [35]:
# Compute the wind speed for every row by walking the u and v columns in lockstep.
# This replaces DataFrame.apply(axis=1), which builds a Series object per row and is
# needlessly slow for a frame of this size; the resulting values are the same.
df["windvelo"] = list(map(get_wind_speed, df["u"], df["v"]))

In [36]:
# Preview the frame, now including the windvelo column.
df

Unnamed: 0,date,lat,lon,u,v,windvelo
0,2008-01-01,21.50,116.5,-8.495750,-7.432136,11.287799
1,2008-01-01,21.25,116.5,-8.502489,-7.396406,11.269390
2,2008-01-01,21.00,116.5,-8.523772,-7.323636,11.237897
3,2008-01-01,20.75,116.5,-8.613157,-7.253815,11.260742
4,2008-01-01,20.50,116.5,-8.735175,-7.220708,11.333221
...,...,...,...,...,...,...
498415,2020-12-01,5.00,127.5,-0.639788,-1.543960,1.671269
498416,2020-12-01,4.75,127.5,-0.479817,-1.541665,1.614607
498417,2020-12-01,4.50,127.5,-0.296790,-1.533798,1.562249
498418,2020-12-01,4.25,127.5,-0.096029,-1.499708,1.502779


In [37]:
# Inspect the column dtypes before downcasting.
df.dtypes

date        datetime64[ns]
lat                float64
lon                float64
u                  float64
v                  float64
windvelo           float64
dtype: object

In [38]:
# Discard the raw components now that the speed is computed, and store the
# remaining numeric columns as float32.
df = (
    df
    .drop(["u", "v"], axis=1)
    .astype(dict.fromkeys(["lat", "lon", "windvelo"], "float32"))
)
df

Unnamed: 0,date,lat,lon,windvelo
0,2008-01-01,21.50,116.5,11.287799
1,2008-01-01,21.25,116.5,11.269390
2,2008-01-01,21.00,116.5,11.237897
3,2008-01-01,20.75,116.5,11.260742
4,2008-01-01,20.50,116.5,11.333221
...,...,...,...,...
498415,2020-12-01,5.00,127.5,1.671269
498416,2020-12-01,4.75,127.5,1.614607
498417,2020-12-01,4.50,127.5,1.562249
498418,2020-12-01,4.25,127.5,1.502779


In [39]:
# Check the dtypes again after the float32 cast.
df.dtypes

date        datetime64[ns]
lat                float32
lon                float32
windvelo           float32
dtype: object

In [43]:
# Persist the final DataFrame as a pickle file.
output_path = "results/era0820_100m.pkl"
df.to_pickle(output_path)