In [5]:
!pip install netCDF4



In [2]:
DRIVE_PATH = "/content/drive/MyDrive/data606/"

# Set the location of this script in GDrive
SCRIPT_PATH = DRIVE_PATH + "src/"

# Root Path of the data on the cloud drive
DATA_PATH = DRIVE_PATH + "data/"

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**Meteorilogical Source Data**

This data is formatted in netCDF format using SCI scientific units such as degrees Kelvin.  This is highly technical and is collected in  many different types of datasets.  The data aggregated into working datasets such as can be found at Kaggle represents a summarization of the most relevant subsets of this data as what makes data analysis practical without becoming an expert in the field.

In [12]:
import netCDF4
import numpy as np
import pandas as pd

---

**Air Pressure, 2 meters elev.**

---

In [10]:
# Sample code to read netCDF data
# This is air pressure recorded at 2meters above surface elevation, using long term mean averaging
f = netCDF4.Dataset('air.2m.mon.ltm.nc')
f = netCDF4.Dataset(DATA_PATH+'air.mon.anom.nc')

print(f.variables.keys()) # get all variable names
air = f.variables['air']
print(type(air))
time = f.variables['time']
print(time)

dict_keys(['lat', 'lon', 'time', 'time_bnds', 'air'])
<class 'netCDF4._netCDF4.Variable'>
<class 'netCDF4._netCDF4.Variable'>
float64 time(time)
    units: days since 1800-1-1 00:00:0.0
    long_name: Time
    delta_t: 0000-01-00 00:00:00
    avg_period: 0000-01-00 00:00:00
    standard_name: time
    axis: T
    coordinate_defines: start
    bounds: time_bnds
    calendar: gregorian
    coverage_content_type: coordinate
    actual_range: [18262. 81692.]
unlimited dimensions: time
current shape = (2085,)
filling on, default _FillValue of 9.969209968386869e+36 used


which produces output
```
odict_keys(['lat', 'lon', 'time', 'climatology_bounds', 'air', 'valid_yr_count'])
<class 'netCDF4._netCDF4.Variable'>
float32 air(time, lat, lon)
    long_name: Long Term Mean Monthly Mean of Air Temperature
    valid_range: [150. 400.]
    units: degK
    add_offset: 0.0
    scale_factor: 1.0
    missing_value: -9.96921e+36
    precision: 2
    least_significant_digit: 1
    GRIB_id: 11
    GRIB_name: TMP
    var_desc: Air temperature
    level_desc: 2 m
    statistic: Long Term Mean
    parent_stat: Mean
    dataset: NCEP Reanalysis Derived Products
    actual_range: [199.70786 312.07498]
unlimited dimensions:
current shape = (12, 94, 192)
filling on, default _FillValue of 9.969209968386869e+36 used

<class 'netCDF4._netCDF4.Variable'>
float64 time(time)
    long_name: Time
    delta_t: 0000-01-00 00:00:00
    avg_period: 0030-00-00 00:00:00
    prev_avg_period: 0017-00-00 00:00:00
    standard_name: time
    axis: T
    units: hours since 1800-01-01 00:00:0.0
    climatology: climatology_bounds
    climo_period: 1991/01/01 - 2020/12/31
    actual_range: [-15769752. -15761736.]
    ltm_range: [1674264. 1936512.]
    interpreted_actual_range: 0001/01/01 00:00:00 - 0001/12/01 00:00:00
unlimited dimensions:
current shape = (12,)
filling on, default _FillValue of 9.969209968386869e+36 used
```

---

**Air Temp**

---

In [11]:
# Sample code to read netCDF data
# This is air temp above surface elevation, using long term mean averaging
f = netCDF4.Dataset(DATA_PATH+'air.mon.anom.nc')

print(f.variables.keys()) # get all variable names
air = f.variables['air']
print(type(air))
time = f.variables['time']
print(time)

dict_keys(['lat', 'lon', 'time', 'time_bnds', 'air'])
<class 'netCDF4._netCDF4.Variable'>
<class 'netCDF4._netCDF4.Variable'>
float64 time(time)
    units: days since 1800-1-1 00:00:0.0
    long_name: Time
    delta_t: 0000-01-00 00:00:00
    avg_period: 0000-01-00 00:00:00
    standard_name: time
    axis: T
    coordinate_defines: start
    bounds: time_bnds
    calendar: gregorian
    coverage_content_type: coordinate
    actual_range: [18262. 81692.]
unlimited dimensions: time
current shape = (2085,)
filling on, default _FillValue of 9.969209968386869e+36 used


---

**HadSST (??)**

---

In [None]:
data_file = DATA_PATH + "HadSST.4.0.1.0_median.nc"
f = netCDF4.Dataset(data_file)

In [None]:
#f = netCDF4.Dataset(DATA_PATH + 'air.2m.mon.ltm.nc')

In [None]:
# get all variable names
fields = f.variables.keys()
fields

dict_keys(['tos', 'time', 'time_bnds', 'latitude', 'latitude_bnds', 'longitude', 'longitude_bnds'])

In [None]:
#air = f.variables['air']
#air

In [None]:
lat = f.variables['latitude']
long = f.variables['longitude']

In [None]:
time = f.variables['time']
time

<class 'netCDF4._netCDF4.Variable'>
float64 time(time)
    axis: T
    bounds: time_bnds
    units: days since 1850-01-01T00:00:00Z
    standard_name: time
    long_name: time
    calendar: gregorian
unlimited dimensions: 
current shape = (2085,)
filling on, default _FillValue of 9.969209968386869e+36 used

In [None]:
lat.get_dims()
latitudes = f.variables[lat_dim.name][:]


(<class 'netCDF4._netCDF4.Dimension'>: name = 'latitude', size = 36,)

In [None]:
print(f)

<class 'netCDF4._netCDF4.Dataset'>
root group (NETCDF4 data model, file format HDF5):
    comment: 
    history: Converted to netcdf today
    institution: Met Office
    reference: Kennedy et al. (2019), https://www.metoffice.gov.uk/hadobs/hadsst4
    source: surface observation
    title: Ensemble-median sea-surface temperature anomalies from the HadSST.4.0.1.0 data set
    version: HadSST.4.0.1.0
    Conventions: CF-1.7
    dimensions(sizes): time(2085), latitude(36), longitude(72), bnds(2)
    variables(dimensions): float32 tos(time, latitude, longitude), float64 time(time), float64 time_bnds(time, bnds), float64 latitude(latitude), float64 latitude_bnds(latitude, bnds), float64 longitude(longitude), float64 longitude_bnds(longitude, bnds)
    groups: 


In [None]:
time_dim, lat_dim, long_dim = f.variables['tos'].get_dims()

In [None]:
latitudes = f.variables[lat_dim.name][:]
latitudes

masked_array(data=[-87.5, -82.5, -77.5, -72.5, -67.5, -62.5, -57.5, -52.5,
                   -47.5, -42.5, -37.5, -32.5, -27.5, -22.5, -17.5, -12.5,
                    -7.5,  -2.5,   2.5,   7.5,  12.5,  17.5,  22.5,  27.5,
                    32.5,  37.5,  42.5,  47.5,  52.5,  57.5,  62.5,  67.5,
                    72.5,  77.5,  82.5,  87.5],
             mask=False,
       fill_value=1e+20)

In [None]:
longitude = f.variables[long_dim.name][:]
longitude

masked_array(data=[-177.5, -172.5, -167.5, -162.5, -157.5, -152.5, -147.5,
                   -142.5, -137.5, -132.5, -127.5, -122.5, -117.5, -112.5,
                   -107.5, -102.5,  -97.5,  -92.5,  -87.5,  -82.5,  -77.5,
                    -72.5,  -67.5,  -62.5,  -57.5,  -52.5,  -47.5,  -42.5,
                    -37.5,  -32.5,  -27.5,  -22.5,  -17.5,  -12.5,   -7.5,
                     -2.5,    2.5,    7.5,   12.5,   17.5,   22.5,   27.5,
                     32.5,   37.5,   42.5,   47.5,   52.5,   57.5,   62.5,
                     67.5,   72.5,   77.5,   82.5,   87.5,   92.5,   97.5,
                    102.5,  107.5,  112.5,  117.5,  122.5,  127.5,  132.5,
                    137.5,  142.5,  147.5,  152.5,  157.5,  162.5,  167.5,
                    172.5,  177.5],
             mask=False,
       fill_value=1e+20)

---

**Load into DataFrame**

---

In [None]:
df = pd.DataFrame()

In [None]:
# from https://stackoverflow.com/questions/14035148/import-netcdf-file-to-pandas-dataframe
time_arr = netCDF4.num2date(time[:], time.units)

In [None]:
for f in fields:
  print(f)

lat
lon
time
climatology_bounds
air
valid_yr_count


In [None]:
lat

<class 'netCDF4._netCDF4.Variable'>
float64 latitude(latitude)
    axis: Y
    bounds: latitude_bnds
    units: degrees_north
    standard_name: latitude
    long_name: latitude
unlimited dimensions: 
current shape = (36,)
filling on, default _FillValue of 9.969209968386869e+36 used

In [None]:
ser = pd.Series(air[:,0],index=time_arr)


ValueError: ignored

In [None]:
lat = f.variables['lat']


In [None]:
ser = pd.Series(lat[:],index=time_arr)


ValueError: ignored