# Functions testing

## Research Project 2022

In [None]:
import xarray 
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error


In [None]:
def write_to_tif(data):
    """
    Placeholder for converting data to a tif file.

    Not implemented yet: the argument is ignored and nothing is written.
    Arguments:
        data: input that is meant to be converted (currently unused).
    Returns:
        None
    """
    # TODO: branch on the type of `data` once the conversion is implemented
    return None

In [None]:
def read_and_prep_parquet(path, purpose):
    """
    Read a parquet file and split it into model features (and labels).
    Arguments:
        path: path to file.
        purpose: {'train', 'test', 'validate', 'predict'} purpose of file.
    Returns:
        For 'train', 'test' and 'validate': tuple (X, y) of pandas DataFrames,
        where y holds the single column 'opt_value'.
        For 'predict': only the feature DataFrame X.
    Raises:
        ValueError: if purpose is not one of the accepted values.
    """
    valid = {'train', 'test', 'validate', 'predict'}
    if purpose not in valid:
        raise ValueError("Purpose must be one of %r." % valid)

    # v5 is duplicated, so it is left out of the feature set
    feature_columns = ['x', 'y', 'mw_value', 'col', 'row',
                       'v1', 'v2', 'v3', 'v4', 'v6', 'v7', 'v8', 'v9',
                       'mean', 'elevation_data']

    frame = pd.read_parquet(path)
    has_labels = purpose != 'predict'

    if has_labels:
        # remove mask (rows whose label is -1) before anything else
        frame = frame.loc[frame['opt_value'] != -1]

    # fill values to be able to train
    frame = frame.fillna(-1)
    features = frame[feature_columns]

    if not has_labels:
        return features
    return features, frame[['opt_value']]

In [None]:
def get_rmse(y_real, y_predicted):
    """Return the square root of mean_squared_error(y_real, y_predicted)."""
    mse = mean_squared_error(y_real, y_predicted)
    return np.sqrt(mse)


In [2]:
import xarray
# Location of the elevation GeoTIFF (file name suggests a 1 km GIMP DEM — TODO confirm).
path_elevation =  r"../Data/elevation_data/gimpdem_1km_compressed.tif"
# Open the raster as an xarray DataArray for inspection.
data_elevation = xarray.open_dataarray(path_elevation)

In [19]:
import rasterio

def information(path_to_file):
    """
    Print bounds, metadata, CRS and resolution of a raster file.

    Arguments:
        path_to_file: path to the raster file to inspect.
    Returns:
        None, the information is printed to stdout.
    """
    # Inspect the file that was passed in, not the notebook-global elevation path.
    with rasterio.open(path_to_file) as src:
        print('BOUNDS:')
        print(f'    {src.bounds}')
        print('METADATA:')
        print(f'    {src.meta}')
        #print(src.crs)

    # Context manager closes the underlying file handle once printing is done.
    with xarray.open_dataarray(path_to_file) as data:
        print('MORE CRS INFO:')
        print(f'    {data.spatial_ref.crs_wkt}')
        print('RESOLUTION:')
        # NOTE(review): the `.rio` accessor is presumably registered by rioxarray — confirm it is available.
        print(f'    {data.rio.resolution()}')

    return


# ----band data:----
# raster = rasterio.open(optical_path) #,masked=True)
#band_arr = raster.read(band_id) 

BOUNDS:
    BoundingBox(left=-637000.0, bottom=-3349000.0, right=850000.0, top=-662000.0)
METADATA:
    {'driver': 'GTiff', 'dtype': 'int16', 'nodata': None, 'width': 1487, 'height': 2687, 'count': 1, 'crs': CRS.from_epsg(3413), 'transform': Affine(1000.0, 0.0, -637000.0,
       0.0, -1000.0, -662000.0)}
MORE CRS INFO:
    PROJCS["WGS 84 / NSIDC Sea Ice Polar Stereographic North",GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0,AUTHORITY["EPSG","8901"]],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AUTHORITY["EPSG","4326"]],PROJECTION["Polar_Stereographic"],PARAMETER["latitude_of_origin",70],PARAMETER["central_meridian",-45],PARAMETER["false_easting",0],PARAMETER["false_northing",0],UNIT["metre",1,AUTHORITY["EPSG","9001"]],AXIS["Easting",SOUTH],AXIS["Northing",SOUTH],AUTHORITY["EPSG","3413"]]
RESOLUTION:
    (1000.0, -1000.0)


## Thesis 2023

In [1]:
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.tree import DecisionTreeRegressor

import numpy as np
from sklearn.metrics import mean_squared_error

from tqdm import tqdm

# Folder with the daily extended dataframes that import_data reads.
df_path = r"../Data/combined/dataframe_extended/"

# Date window used by the test cells; import_data treats both ends as inclusive.
date_from = '2019-07-01'
date_to = '2019-07-02'

In [12]:
def import_data(date_from:str, date_to:str, df_path:str) -> pd.DataFrame:
    """
    Imports data and merges into one dataframe.

    Days whose file cannot be read are skipped; a warning listing them is
    emitted so that skipped days do not go unnoticed.

    Args:
        date_from (format: 'yyyy-mm-dd'): Period starting date (included).

        date_to (format: 'yyyy-mm-dd'): Period end date (included).

        df_path: Path to folder with daily data files
            (named 'melt_<yyyy-mm-dd>_extended.parquet.gzip').

    Returns:
        pandas.DataFrame: Dataframe with all merged data.

    Raises:
        ValueError: If no daily file in the period could be read.
    """
    import os
    import warnings

    date_range = pd.date_range(date_from, date_to) # both ends included
    date_range = [str(day.date()) for day in date_range]
    df_list = []
    skipped = []

    for melt_date in tqdm(date_range):
        file_path = os.path.join(df_path, 'melt_' + melt_date + '_extended.parquet.gzip')
        try: # bc some days are empty
            # NOTE(review): `index` is not a pandas.read_parquet parameter; it is
            # forwarded to the parquet engine — confirm the engine in use accepts it.
            daily = pd.read_parquet(file_path, index=False)
        except Exception as err:
            # Best-effort skip, but remember which day failed and why.
            skipped.append(f"{melt_date} ({type(err).__name__})")
            continue
        df_list.append(daily) # list of df

    if skipped:
        warnings.warn(
            f"import_data skipped {len(skipped)} day(s): " + ", ".join(skipped),
            stacklevel=2,
        )

    if not df_list:
        raise ValueError(
            f"No daily files could be read from {df_path!r} "
            f"between {date_from} and {date_to}."
        )

    return pd.concat(df_list, axis=0) # concat all daily df to one

In [23]:
def data_prep(df, removeMaskedClouds = True):
    """
    Filters out rows that carry the -1 marker value.

    Args:
        df (pandas.DataFrame): Dataframe with columns 'mw_value' and 'opt_value'.

        removeMaskedClouds (bool): If true, rows with 'opt_value' equal to -1
            are removed as well.

    Returns:
        pandas.DataFrame: Filtered dataframe; the input dataframe is not modified.
    """
    # remove -1 from mw:
    df = df[df['mw_value'] != -1]

    if removeMaskedClouds:
        # remove nan/-1 from opt:
        df = df[df['opt_value'] != -1]

    # TODO: remove bare ice?
    # TODO: check if all aggregations are num (not nan)
    return df

In [14]:
def get_rmse(y_real, y_predicted):
    """
    Computes the root of the mean squared error.

    Args:
        y_real (): real target values.

        y_predicted (): model predicted target values.

    Returns:
        Square root of mean_squared_error(y_real, y_predicted).
    """

    mse = mean_squared_error(y_real, y_predicted)
    return np.sqrt(mse)

In [15]:
def cross_validation(df, columns, train_func, n_splits = 5, hyperparameters = None):
    """
    Cross-validation with TimeSeriesSplit.

    The dataframe is ordered by its 'date' column before splitting; the
    caller's dataframe is left untouched.

    Args:
        df (pandas.DataFrame): Full train/ test dataframe. Must contain the
                                columns 'date' and 'opt_value'.

        columns (list of strings): List of columns to be used in training.

        train_func (function): Custom defined function for training and evaluating model.
                                Called as train_func(X_train, y_train, X_test, y_test, hyperparameters)
                                and expected to return (train predictions, test predictions).
                                E.g.: model_decisionTree()

        n_splits (int): Number of cv splits.

        hyperparameters (dict, optional): Dictionary with hyperparameters for model.

    Returns:
        tuple: Two lists with <n_splits> RMSE scores for train and test data.
    """

    # Sort into a new frame instead of in place, so the argument is not mutated
    # (in-place sorting also triggers SettingWithCopyWarning on sliced frames).
    ordered = df.sort_values(by=['date'])
    X = ordered[columns]
    y = ordered[["opt_value"]]

    rmse_train_list = []
    rmse_test_list = []
    tscv = TimeSeriesSplit(n_splits = n_splits)

    # split() is a generator, so tell tqdm how many folds to expect.
    for train_index, test_index in tqdm(tscv.split(X), total=tscv.get_n_splits()):
        X_train = X.iloc[train_index]
        y_train = y.iloc[train_index]
        X_test  = X.iloc[test_index]
        y_test  = y.iloc[test_index]

        y_predicted_train, y_predicted_test = train_func(X_train, y_train, X_test, y_test, hyperparameters)

        rmse_train_list.append(get_rmse(y_train, y_predicted_train))
        rmse_test_list.append(get_rmse(y_test, y_predicted_test))

    return rmse_train_list, rmse_test_list

In [16]:
def model_decisionTree(X_train, y_train, X_test, y_test, hyperparameters = None):
    """
    Fits a DecisionTreeRegressor and predicts on both train and test data.

    Args:
        X_train (pandas.DataFrame): Dataframe with train data.

        y_train (pandas.DataFrame): Dataframe with train labels, one column.

        X_test (pandas.DataFrame): Dataframe with test data.

        y_test (pandas.DataFrame): Dataframe with test labels, one column.
                                    Not used by this function.

        hyperparameters (dict, optional): Dictionary with model parameters.
                                    When missing or empty, only random_state=0 is set.

    Returns:
        tuple: Predictions for the train set and for the test set, as returned
               by the regressor's predict().
    """

    # A missing or empty dict falls back to the fixed-seed default.
    tree_kwargs = hyperparameters if hyperparameters else {"random_state": 0}
    tree = DecisionTreeRegressor(**tree_kwargs)

    tree.fit(X_train, y_train)
    train_predictions = tree.predict(X_train)
    test_predictions = tree.predict(X_test)

    return train_predictions, test_predictions

In [None]:
def model_meanBenchmark(y_train, y_test):
    """
    Builds the mean benchmark: every test sample gets the train mean.

    Args:
        y_train (pandas.DataFrame): Dataframe with train labels, one column.

        y_test (pandas.DataFrame): Dataframe with test labels, one column.
                                    Only its length is used.

    Returns:
        numpy.ndarray: One-dimensional array of length len(y_test), filled
                       with the mean of y_train.
    """

    n_test = len(y_test)
    train_mean = y_train.mean()

    return np.full(n_test, train_mean)

In [None]:
def model_mwBenchmark(X_test):
    """
    Builds the microwave benchmark: the mw values themselves are used as predictions.

    Args:
        X_test (pandas.DataFrame): Dataframe with test data, containing column 'mw_value'.

    Returns:
        pandas.Series: The 'mw_value' column of X_test.
    """

    return X_test['mw_value']

In [18]:
def hyperparameter_tune():
    """
    Placeholder for hyperparameter tuning; not implemented yet.

    Returns:
        None
    """
    # Plan:
    #   - define grid (if grid)
    #   - do cv for each??? - maybe less splits?
    #   - define hyperparameters as a dictionary,
    #     eg: dt_params = {'max_depth':7, 'criterion': 'squared_error'}
    return None

In [None]:
def plot_results():
    """
    Placeholder for plotting results; not implemented yet.

    Returns:
        None
    """
    return None

### Test

In [21]:
# Load the daily files for the configured date window.
df = import_data(date_from, date_to, df_path)
# Keep only the columns used below: two features, the target and the sort key.
df=df[['mw_value', 'opt_value', 'date', 'mean']]

# 5-fold time-series CV of the decision tree on the two features;
# a / b receive the per-fold train / test RMSE lists.
a, b = cross_validation(df, ['mw_value', 'mean'], model_decisionTree, n_splits = 5, hyperparameters = None)


100%|██████████| 2/2 [00:01<00:00,  1.02it/s]


In [25]:
# Reload the full dataframe (all columns) and count missing values per column.
df = import_data(date_from, date_to, df_path)
df.isna().sum()

100%|██████████| 2/2 [00:01<00:00,  1.74it/s]


x                     0
y                     0
mw_value              0
opt_value             0
col                   0
row                   0
v1                14044
v2                 6648
v3                14036
v4                 7576
v5                    0
v6                 7576
v7                14036
v8                 6648
v9                14044
date                  0
mean                  0
elevation_data        0
dtype: int64

In [28]:
# Missing-value counts again; by execution count (28 vs. 27) this cell ran after the mw_value filter cell below.
df.isna().sum()

x                    0
y                    0
mw_value             0
opt_value            0
col                  0
row                  0
v1                1112
v2                 453
v3                1829
v4                 710
v5                   0
v6                1378
v7                2135
v8                1419
v9                2738
date                 0
mean                 0
elevation_data       0
dtype: int64

In [27]:
# Drop rows whose mw_value is -1 (the same filter data_prep applies first).
df = df[df['mw_value'] != -1]

In [None]:
def convert_to_tif(data, path_file_metadata, path_out):
    """
    Write data as band 1 of a new tif file, reusing the metadata of an existing file.
    Arguments:
        data: values to write (numpy array or xarray)
        path_file_metadata: tif file with metadata matching expected output tif file
        path_out: output tif file destination and name path
    Returns:
        None, the .tif file is written to path_out
    """
    # Borrow the metadata dictionary of the reference file.
    with rasterio.open(path_file_metadata) as reference:
        profile = reference.meta.copy()

    # Create the output with that metadata and fill its first band.
    with rasterio.open(path_out, "w", **profile) as target:
        target.write_band(1, data)

    return None

In [None]:
def save_prediction_tif(X_pred, y_predicted, path_out, shape=(2663, 1462),
                        path_metadata_reference=r"../Data/microwave-rs/mw_interpolated/2019-07-01_mw.tif"):
    """
    Function to write predictions to .tif.

    Predictions are placed into a NaN-filled matrix at the (row, col)
    position of each sample; cells without a prediction stay NaN.
    X_pred itself is not modified.

    Arguments:
        X_pred: data to be predicted on; must contain the columns 'row' and 'col'.
        y_predicted: predicted labels in array, or pandas series
            (a pandas series is aligned to X_pred on the index).
        path_out: path to save .tif file with file name.
        shape: (rows, cols) of the output matrix; defaults to the original matrix shape.
        path_metadata_reference: tif file whose metadata is reused for the output;
            defaults to the interpolated transformed file.
    Returns: No return, writes data to path.
    """
    # join prediction and coordinates (row, col) on a copy, so the caller's
    # dataframe does not silently gain a "prediction" column
    located = X_pred[["row", "col"]].assign(prediction=y_predicted)

    row_index = located["row"].astype(int).to_numpy()
    col_index = located["col"].astype(int).to_numpy()

    # One vectorised assignment instead of a Python loop over all rows; for
    # repeated (row, col) pairs the last value stays, as in a sequential loop.
    nan_matrix = np.full(shape, np.nan)
    nan_matrix[row_index, col_index] = located["prediction"].to_numpy()

    convert_to_tif(nan_matrix, path_metadata_reference, path_out)

    return
