In [1]:
!pip install netCDF4

Collecting netCDF4
  Downloading netCDF4-1.6.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m29.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cftime (from netCDF4)
  Downloading cftime-1.6.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m69.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: cftime, netCDF4
Successfully installed cftime-1.6.3 netCDF4-1.6.5


In [2]:
DRIVE_PATH = "/content/drive/MyDrive/data606/"

# Set the location of this script in GDrive
SCRIPT_PATH = DRIVE_PATH + "src/"

# Root Path of the data on the cloud drive
DATA_PATH = DRIVE_PATH + "data/"

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


---

**Utils**

---

In [4]:
def get_data(grp, cols):
  vars = {}
  for col in cols:
    arr = np.asarray(grp.variables[col][:])
    vars[col] = arr
    print(f'## Loaded: {col} {arr.shape}')
  return vars

**Meteorilogical Source Data**

This data is formatted in netCDF format using SCI scientific units such as degrees Kelvin.  This is highly technical and is collected in  many different types of datasets.  The data aggregated into working datasets such as can be found at Kaggle represents a summarization of the most relevant subsets of this data as what makes data analysis practical without becoming an expert in the field.

In [5]:
import netCDF4 as nc
import numpy as np
import pandas as pd

---

**Sample Data Types**

---

showing fields from `rootgrp.variables`, a var type, and var descriptions.

```
odict_keys(['lat', 'lon', 'time', 'climatology_bounds', 'air', 'valid_yr_count'])
<class 'netCDF4._netCDF4.Variable'>
float32 air(time, lat, lon)
    long_name: Long Term Mean Monthly Mean of Air Temperature
    valid_range: [150. 400.]
    units: degK
    add_offset: 0.0
    scale_factor: 1.0
    missing_value: -9.96921e+36
    precision: 2
    least_significant_digit: 1
    GRIB_id: 11
    GRIB_name: TMP
    var_desc: Air temperature
    level_desc: 2 m
    statistic: Long Term Mean
    parent_stat: Mean
    dataset: NCEP Reanalysis Derived Products
    actual_range: [199.70786 312.07498]
unlimited dimensions:
current shape = (12, 94, 192)
filling on, default _FillValue of 9.969209968386869e+36 used

<class 'netCDF4._netCDF4.Variable'>
float64 time(time)
    long_name: Time
    delta_t: 0000-01-00 00:00:00
    avg_period: 0030-00-00 00:00:00
    prev_avg_period: 0017-00-00 00:00:00
    standard_name: time
    axis: T
    units: hours since 1800-01-01 00:00:0.0
    climatology: climatology_bounds
    climo_period: 1991/01/01 - 2020/12/31
    actual_range: [-15769752. -15761736.]
    ltm_range: [1674264. 1936512.]
    interpreted_actual_range: 0001/01/01 00:00:00 - 0001/12/01 00:00:00
unlimited dimensions:
current shape = (12,)
filling on, default _FillValue of 9.969209968386869e+36 used
```

---

**Air Temp**

---

In [6]:
data_file = DATA_PATH + "air.mon.anom.nc"
rootgrp = nc.Dataset(data_file)

In [7]:
# get all variable names
fields = rootgrp.variables.keys()
fields

dict_keys(['lat', 'lon', 'time', 'time_bnds', 'air'])

In [8]:
COLS = ['time', 'lat', 'lon', 'air']

In [9]:
vars = get_data(rootgrp, COLS)

## Loaded: time (2085,)
## Loaded: lat (36,)
## Loaded: lon (72,)
## Loaded: air (2085, 36, 72)


In [10]:
time_meta = rootgrp.variables['time']
times = nc.num2date(vars['time'], time_meta.units, time_meta.calendar, only_use_cftime_datetimes=False, only_use_python_datetimes=True)

In [11]:
times

array([real_datetime(1850, 1, 1, 0, 0), real_datetime(1850, 2, 1, 0, 0),
       real_datetime(1850, 3, 1, 0, 0), ...,
       real_datetime(2023, 7, 1, 0, 0), real_datetime(2023, 8, 1, 0, 0),
       real_datetime(2023, 9, 1, 0, 0)], dtype=object)

**Load into DataFrame**


This will be about understanding the normalization of the netCDF data and re-combining into a flat data layout.

In [12]:
airs = vars['air']

In [13]:
#airs = airs.reshape(airs.shape[0], -1)
airs.shape

(2085, 36, 72)

In [14]:
airs[0][0]

array([-0.19552313, -0.19552313, -0.19552313, -0.19552313, -0.19552313,
       -0.19552317, -0.1955232 , -0.1955232 , -0.19552322, -0.19552322,
       -0.19552322, -0.19552322, -0.19552322, -0.19552322, -0.19552322,
       -0.19552322, -0.19552322, -0.19552322, -0.19552322, -0.19552322,
       -0.19552322, -0.19552322, -0.19552322, -0.19552322, -0.19552322,
       -0.19552322, -0.19552322, -0.19552322, -0.19552322, -0.19552322,
       -0.19552322, -0.19552322, -0.1955232 , -0.1955232 , -0.19552322,
       -0.19170347, -0.1755955 , -0.14818609, -0.12665969, -0.12665974,
       -0.14818622, -0.17559573, -0.19170378, -0.19552355, -0.19552355,
       -0.19552355, -0.19552355, -0.19552352, -0.19552346, -0.1955234 ,
       -0.19552332, -0.19552328, -0.19552322, -0.1955232 , -0.19552317,
       -0.19552317, -0.19552317, -0.19552317, -0.19552317, -0.19552317,
       -0.19552313, -0.19552313, -0.19552313, -0.19552313, -0.19552313,
       -0.19552313, -0.19552313, -0.19552313, -0.19552313, -0.19

**Create lat/long lookup**

In [15]:
lat = vars['lat']
lon = vars['lon']
#np.cross(lat,lon)

In [16]:
lat

array([-87.5, -82.5, -77.5, -72.5, -67.5, -62.5, -57.5, -52.5, -47.5,
       -42.5, -37.5, -32.5, -27.5, -22.5, -17.5, -12.5,  -7.5,  -2.5,
         2.5,   7.5,  12.5,  17.5,  22.5,  27.5,  32.5,  37.5,  42.5,
        47.5,  52.5,  57.5,  62.5,  67.5,  72.5,  77.5,  82.5,  87.5],
      dtype=float32)

In [17]:
lon

array([  2.5,   7.5,  12.5,  17.5,  22.5,  27.5,  32.5,  37.5,  42.5,
        47.5,  52.5,  57.5,  62.5,  67.5,  72.5,  77.5,  82.5,  87.5,
        92.5,  97.5, 102.5, 107.5, 112.5, 117.5, 122.5, 127.5, 132.5,
       137.5, 142.5, 147.5, 152.5, 157.5, 162.5, 167.5, 172.5, 177.5,
       182.5, 187.5, 192.5, 197.5, 202.5, 207.5, 212.5, 217.5, 222.5,
       227.5, 232.5, 237.5, 242.5, 247.5, 252.5, 257.5, 262.5, 267.5,
       272.5, 277.5, 282.5, 287.5, 292.5, 297.5, 302.5, 307.5, 312.5,
       317.5, 322.5, 327.5, 332.5, 337.5, 342.5, 347.5, 352.5, 357.5],
      dtype=float32)

In [18]:
df_lat = pd.DataFrame(lat)
df_lon = pd.DataFrame(lon)

In [19]:
df_latlon = df_lat.merge(df_lon, how='cross')

In [20]:
df_latlon.rename(columns={'0_x':'lat','0_y':'long'}, inplace=True)

**Assemble full DF**

In [21]:
NUM_ROWS = airs.shape[0] * airs.shape[1]
df_all = pd.DataFrame()

In [22]:
df_times = pd.DataFrame({'date':times})

In [23]:
df_times['date'][0]

Timestamp('1850-01-01 00:00:00')

In [24]:
for i, arr in enumerate(airs):
  # next time step
  # start w/ the lat/longs we will align with
  df_step = df_latlon.copy(deep=True)
  df_step['air'] = arr
  df_step['date'] = df_times['date'][i]
  df_all = pd.concat([df_all, df_step])

ValueError: ignored

In [None]:
#df_all.rename(columns={'0_x':'lat','0_y':'long'}, inplace=True)

In [None]:
df_all.reset_index(inplace=True)

In [None]:
df_all.columns

In [None]:
df_lon.head()

In [None]:
df_lat.tail()

In [None]:
df_all['long'].unique()


In [None]:
# Filter Geographically -- range of Gulf of Mexico, Carribean
#df_all[(df_all['long'] < -70) & (df_all['long'] > -100) & (df_all['lat'] > 15) & (df_all['lat'] < 40)]
df_all[(df_all['long'] < -70)]

In [None]:
df_all

**Extras....**

In [None]:
# pct populated == num NON-NULL / TOTAL NUM
pcts = []
COLS = []
for col in df.columns:
  pct = df[col][df[col].isna()==False].size/df[col].size
  #print(f'\t{col} ::\t{pct}')
  COLS.append(col)
  pcts.append(pct)


In [None]:
df_stats = pd.DataFrame({'col':COLS,'pct':pcts})

In [None]:
df_stats[df_stats['pct'] > 0.74]

---

**HadSST (??)**

---

In [None]:
data_file = DATA_PATH + "HadSST.4.0.1.0_median.nc"
rootgrp = nc.Dataset(data_file)

In [None]:
# get all variable names
fields = rootgrp.variables.keys()
fields

In [None]:
COLS = ['time', 'latitude', 'longitude', 'tos']

In [None]:
vars = get_data(rootgrp, COLS)

In [None]:
time_meta = rootgrp.variables['time']
times = nc.num2date(vars['time'], time_meta.units, time_meta.calendar, only_use_cftime_datetimes=False, only_use_python_datetimes=True)

In [None]:
times

**Load into DataFrame**


This will be about understanding the normalization of the netCDF data and re-combining into a flat data layout.

In [None]:
df = pd.DataFrame(index=times, columns={'tos':vars['tos']})

In [None]:
df[df['tos'].isna()==False]