# Observational data preprocessing steps

Script to work out the preprocessing needed for observational datasets. The plan is ultimately to wrap these preprocessing steps in a class.  

Datasets included: HadISST, COBE, COBE2, Kaplan

## Setting up the workspace

In [1]:
%reset -f

In [123]:
import numpy as np
import xarray as xr
from matplotlib import pyplot as plt
import pandas as pd
import os
import pooch
from datetime import datetime

from GradientProjectFunctions import lat_lon_res_Eq

### Importing datasets

#### Hadley

In [203]:
# Set up a pooch downloader for the HadISST SST archive. Files are stored in
# the OS cache directory for 'HadISST' and checked against the sha256 hash
# listed in the registry.
# NOTE(review): the hash is pinned to one release of the file; if the upstream
# file is re-issued the fetch will presumably fail the hash check - confirm.
odie = pooch.create(
    path = pooch.os_cache('HadISST'),
    base_url = 'https://www.metoffice.gov.uk/hadobs/hadisst/data/',
    registry = {
        'HadISST_sst.nc.gz': 'sha256:b03d7c0adcdc29b1687ee2bb22c322a6019547aee3339f29af0a6dc505e7477f'
    },
)

# Path returned by pooch for the registered (gzipped) file.
file_path = odie.fetch('HadISST_sst.nc.gz')

In [274]:
# Open the HadISST file. This relies on file_path still holding the pooch
# download path: the other dataset cells overwrite the same variable, so run
# the HadISST fetch cell immediately before this one.
dsHad = xr.open_dataset(file_path)

#### COBE2

In [140]:
# Open the COBE2 monthly-mean SST file remotely through the PSL THREDDS
# (dodsC) endpoint. Note that this overwrites the shared file_path variable.
file_path = 'http://psl.noaa.gov/thredds/dodsC/Datasets/COBE2/sst.mon.mean.nc'
dsCOBE2 = xr.open_dataset(file_path)

#### COBE

In [141]:
# Open the COBE monthly-mean SST file remotely through the PSL THREDDS
# (dodsC) endpoint. Note that this overwrites the shared file_path variable.
file_path = 'http://psl.noaa.gov/thredds/dodsC/Datasets/COBE/sst.mon.mean.nc'
dsCOBE = xr.open_dataset(file_path)

#### Kaplan

In [142]:
# Open the Kaplan SST file remotely through the PSL THREDDS (dodsC) endpoint.
# Note that this overwrites the shared file_path variable.
# NOTE(review): the file name (sst.mean.anom) suggests this holds anomalies
# rather than absolute SST like the other datasets - confirm before comparing.
file_path = 'http://psl.noaa.gov/thredds/dodsC/Datasets/kaplan_sst/sst.mean.anom.nc'
dsKaplan = xr.open_dataset(file_path)

## Cleaning coordinates

### Cleaning coordinate names

In [112]:
dsObs = dsKaplan

# Rename the spatial dimensions to 'lat' / 'lon'. For each target name the
# aliases are tried in order and only the first one present is renamed; a
# dataset that already uses the target name is left untouched.
try:
    for wanted, candidates in (('lat', ('y', 'latitude')), ('lon', ('x', 'longitude'))):
        if wanted in dsObs.dims:
            continue
        found = next((name for name in candidates if name in dsObs.dims), None)
        if found is not None:
            dsObs = dsObs.rename({found: wanted})

except Exception as e:
    # Best effort: report the problem and carry on with whatever was renamed so far.
    print(f'Error checking and correcting coordinates: {e}')

### Rolling coordinates (lat and lon)

We want longitude to run from 0 to 360 and latitude to run from -90 to 90 (south to north).

In [232]:
# Only reorder an axis when it is not already in the target layout
# (lon running 0-360, lat running -90 to 90).

# longitude: treated as already 0-360 when it starts within 5 degrees of 0
# and ends at or beyond 350.
if not (bool(np.floor(dsObs.lon[0]) <= 5) and bool(np.ceil(dsObs.lon[-1]) >= 350)):
    # Wrap every longitude into [0, 360) and sort ascending. Sorting (rather
    # than rolling by half the axis length) gives a monotonic axis wherever
    # the wrap point happens to fall in the original array.
    dsObs = dsObs.assign_coords(lon = dsObs.lon % 360).sortby('lon')

# latitude: sort south-to-north unless the axis already runs from -90 to 90.
if not (bool(np.floor(dsObs.lat[0]) == -90) and bool(np.ceil(dsObs.lat[-1]) == 90)):
    dsObs = dsObs.sortby('lat')

### Making time the right format

In [114]:
# Rebuild the time coordinate as pandas datetimes by converting the existing
# time values to strings and parsing those strings.
convertedTime = pd.to_datetime(dsObs.time.values.astype(str))
# Replace the 'time' coordinate (along the 'time' dimension) with the parsed values.
dsObs['time'] = ('time', convertedTime)

### Checking the resolution

In [115]:
# Project helper from GradientProjectFunctions; prints the mean and SD of the
# latitude and longitude grid spacing (see the cell output).
lat_lon_res_Eq(dsObs)

Latitude: Mean: 5.00 and SD: 0.000
Longitude: Mean: 5.00 and SD: 0.000


### Checking the time span

In [139]:
# Format the first and last time stamps at month precision and report the span.
dateFirst, dateLast = (
    np.datetime_as_string(dsObs.time[idx].values, unit = 'M') for idx in (0, -1)
)

print(f'Dataset spans from {dateFirst} to {dateLast}')

Dataset spans from 1856-01 to 2023-01


## Creating a class for this

In [267]:
class PrepObsData:
    '''
    Cleans an observational dataset so that it follows a common layout.

    The input dataset is kept in self.dsObs and the cleaned result is stored in self.dsOut.
    Every step is best effort: if a step fails, the error is printed and the remaining
    steps still run on whatever self.dsOut holds at that point.
    '''

    # Accepted alternative dimension names, tried in order, for each standard name.
    _DIM_ALIASES = {
        'lat': ('y', 'latitude'),
        'lon': ('x', 'longitude'),
    }

    def __init__(self, dsObs):
        '''
        Takes the input of an observational dataset and cleans the data as follows:
        - standardises coordinate names to lat and lon
        - reorders coordinates so that lon goes 0-360 and lat goes -90 - 90
        - corrects the format of the time to be standard
        - checks the resolution of the dataset (via lat_lon_res_Eq)
        - outputs the time span of the dataset

        :param: dsObs: observational dataset
        '''

        self.dsObs = dsObs
        # Make sure dsOut exists before any step runs, even if the first step fails.
        self.dsOut = dsObs
        self.dsOut = self.ExecAllSteps()

    def CleanCoords(self):
        '''
        Renames the spatial dimensions of self.dsObs to lat and lon and stores the result in self.dsOut.

        Dimensions that already use the standard name are left alone; otherwise the first
        alias found (y / latitude, x / longitude) is renamed.

        :return: the dataset with standardised names, or None if an error was printed
        '''

        try:
            # Always start from the input so that dsOut is defined even when
            # nothing needs renaming (e.g. the dataset already uses lat / lon).
            self.dsOut = self.dsObs

            for target, aliases in self._DIM_ALIASES.items():
                if target in self.dsOut.dims:
                    continue
                for alias in aliases:
                    if alias in self.dsOut.dims:
                        self.dsOut = self.dsOut.rename({alias: target})
                        break

            return self.dsOut

        except Exception as e:

            print(f'Error checking and correcting coordinate names: {e}')

    def RollCoords(self):
        '''
        Reorders self.dsOut so that lon runs 0-360 and lat runs -90 - 90.

        Longitude is treated as already correct when it starts within 5 degrees of 0 and
        ends at or beyond 350; latitude when it starts at -90 and ends at 90 (after
        rounding outwards). Anything else is wrapped / sorted.

        :return: the reordered dataset, or None if an error was printed
        '''

        try:
            # longitude
            lon = self.dsOut.lon
            lonIsOk = bool(np.floor(lon[0]) <= 5) and bool(np.ceil(lon[-1]) >= 350)
            if not lonIsOk:
                # Wrap into [0, 360) and sort ascending. Sorting (rather than rolling by
                # half the axis length) stays monotonic wherever the wrap point falls.
                self.dsOut = self.dsOut.assign_coords(lon = lon % 360).sortby('lon')

            # latitude
            lat = self.dsOut.lat
            latIsOk = bool(np.floor(lat[0]) == -90) and bool(np.ceil(lat[-1]) == 90)
            if not latIsOk:
                self.dsOut = self.dsOut.sortby('lat')

            return self.dsOut

        except Exception as e:

            print(f'Error rolling coordinates: {e}')

    def CleanTime(self):
        '''
        Replaces the time coordinate of self.dsOut with pandas datetimes.

        The existing time values are converted to strings and parsed with pd.to_datetime.

        :return: the dataset with the converted time coordinate, or None if an error was printed
        '''

        try:
            convertedTime = pd.to_datetime(self.dsOut.time.values.astype(str))
            # assign_coords returns a new object, so the dataset passed in by the caller
            # is not modified even when no earlier step made a copy.
            self.dsOut = self.dsOut.assign_coords(time = ('time', convertedTime))

            return self.dsOut

        except Exception as e:

            print(f'Error cleaning the time coordinate: {e}')

    def CheckingResolution(self):
        '''
        Reports the grid resolution of self.dsOut by calling the project helper lat_lon_res_Eq.
        '''

        try:
            lat_lon_res_Eq(self.dsOut)

        except Exception as e:

            print(f'Error checking resolution: {e}')

    def CheckingTime(self):
        '''
        Prints the first and last time stamps of self.dsOut at month precision.
        '''

        try:
            dateFirst = np.datetime_as_string(self.dsOut.time[0].values, unit = 'M')
            dateLast = np.datetime_as_string(self.dsOut.time[-1].values, unit = 'M')

            print(f'Dataset spans from {dateFirst} to {dateLast}')

        except Exception as e:

            print(f'Error checking time: {e}')

    def ExecAllSteps(self):
        '''
        Runs every cleaning and reporting step in order.

        :return: the cleaned dataset (self.dsOut)
        '''
        self.CleanCoords()
        self.RollCoords()
        self.CleanTime()
        self.CheckingResolution()
        self.CheckingTime()
        return self.dsOut

In [275]:
# Run the full cleaning pipeline on the HadISST dataset; the constructor prints
# the resolution and time-span reports and the cleaned data is in test.dsOut.
test = PrepObsData(dsHad)

Latitude: Mean: 1.00 and SD: 0.000
Longitude: Mean: 1.00 and SD: 0.000
Dataset spans from 1870-01 to 2024-02


In [270]:
import sys

In [272]:
sys.path.append('/home/hbyrne/Research/Tools')

In [None]:
from Utils_Functions_HB