# Functions testing

## Research Project 2022

In [None]:
import xarray 
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error


In [None]:
def write_to_tif(data):
    """
    Placeholder for converting data to a tif file.

    Not implemented yet: the function ignores its input and returns None.
    Arguments:
        data: the data that should eventually be written to a tif file.
    Returns:
        None (the intended result is a tif file).
    """
    # TODO: presumably branch on the type of `data` here
    return None

In [None]:
def read_and_prep_parquet(path, purpose):
    """
    Load a parquet file and shape it for training, evaluation or prediction.

    Arguments:
        path: path to the parquet file.
        purpose: one of {'train', 'test', 'validate', 'predict'}.
    Returns:
        For 'train', 'test' and 'validate': tuple (X, y) where X is a DataFrame
        with the feature columns and y is a one-column DataFrame ('opt_value').
        For 'predict': only the feature DataFrame X.
    Raises:
        ValueError: if purpose is not one of the accepted values.
    """
    valid = {'train', 'test', 'validate', 'predict'}
    if purpose not in valid:
        raise ValueError("Purpose must be one of %r." % valid)

    # v5 is duplicated, so it is left out of the features
    feature_columns = ['x', 'y', 'mw_value', 'col', 'row', 'v1', 'v2', 'v3', 'v4',
                       'v6', 'v7', 'v8', 'v9', 'mean', 'elevation_data']
    # every accepted purpose except 'predict' comes with labels
    has_labels = purpose != 'predict'

    frame = pd.read_parquet(path)
    if has_labels:
        # rows whose label equals -1 are the mask and are dropped
        frame = frame.loc[frame['opt_value'] != -1]
    # missing values are replaced by -1 so that a model can be trained
    frame = frame.fillna(-1)

    features = frame[feature_columns]
    if has_labels:
        return features, frame[['opt_value']]
    return features

In [None]:
def get_rmse(y_real, y_predicted):
    """
    Compute the root mean squared error between true and predicted values.

    Arguments:
        y_real: true values.
        y_predicted: predicted values.
    Returns:
        Square root of sklearn's mean_squared_error for the two inputs.
    """
    mse = mean_squared_error(y_real, y_predicted)
    return np.sqrt(mse)


In [2]:
import xarray
# Relative path to the elevation raster (.tif) used in the cells below.
path_elevation =  r"../Data/elevation_data/gimpdem_1km_compressed.tif"
# Open the raster as an xarray DataArray.
# NOTE(review): reading a .tif through xarray presumably relies on a raster backend
# (e.g. rioxarray) being installed - confirm in the environment.
data_elevation = xarray.open_dataarray(path_elevation)

In [19]:
import rasterio

def information(path_to_file):
    """
    Print bounds, metadata, CRS and resolution of a raster file.

    Arguments:
        path_to_file: path to the raster file to inspect.
    Returns:
        None, the information is printed.
    """
    # Open the file that was passed in (not a module-level path), so the
    # function reports on whatever raster the caller asks about.
    with rasterio.open(path_to_file) as src:
        print('BOUNDS:')
        print(f'    {src.bounds}')
        print('METADATA:')
        print(f'    {src.meta}')

    # NOTE(review): the `.rio` accessor below presumably comes from rioxarray
    # being installed/registered - confirm in the environment.
    data = xarray.open_dataarray(path_to_file)
    print('MORE CRS INFO:')
    print(f'    {data.spatial_ref.crs_wkt}')
    print('RESOLUTION:')
    print(f'    {data.rio.resolution()}')

    return


# ----band data:----
# raster = rasterio.open(optical_path) #,masked=True)
#band_arr = raster.read(band_id) 

BOUNDS:
    BoundingBox(left=-637000.0, bottom=-3349000.0, right=850000.0, top=-662000.0)
METADATA:
    {'driver': 'GTiff', 'dtype': 'int16', 'nodata': None, 'width': 1487, 'height': 2687, 'count': 1, 'crs': CRS.from_epsg(3413), 'transform': Affine(1000.0, 0.0, -637000.0,
       0.0, -1000.0, -662000.0)}
MORE CRS INFO:
    PROJCS["WGS 84 / NSIDC Sea Ice Polar Stereographic North",GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0,AUTHORITY["EPSG","8901"]],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AUTHORITY["EPSG","4326"]],PROJECTION["Polar_Stereographic"],PARAMETER["latitude_of_origin",70],PARAMETER["central_meridian",-45],PARAMETER["false_easting",0],PARAMETER["false_northing",0],UNIT["metre",1,AUTHORITY["EPSG","9001"]],AXIS["Easting",SOUTH],AXIS["Northing",SOUTH],AUTHORITY["EPSG","3413"]]
RESOLUTION:
    (1000.0, -1000.0)


## Thesis 2023

In [1]:
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.tree import DecisionTreeRegressor

import numpy as np
from sklearn.metrics import mean_squared_error

from tqdm import tqdm

# Directory with the daily parquet files; import_data appends
# 'melt_<date>_extended.parquet.gzip' to this prefix.
df_path = r"../Data/combined/dataframe_extended/"

# First and last day to load; import_data includes both ends of the range.
date_from = '2019-07-01'
date_to = '2019-07-02'

- imports
- paths
- import data
- join all files/days to one df
- prep data
- periods? (use predefined periods or not?)
- train-test split or do 5-fold cv - do time split:
    - train data for cv candidate models and compare and choose best (both best hyperparameters and best model)
    print train scores as well as test scores in cv to track overfitting. 
    - test set to have a test score. (train on all train data) 



In [3]:
def import_data(date_from, date_to):
    """
    Read the daily 'melt_<date>_extended.parquet.gzip' files and stack them.

    Arguments:
        date_from: first day to load (anything pd.date_range accepts, e.g. '2019-07-01').
        date_to: last day to load; it is included in the range.
    Returns:
        DataFrame with the rows of every day that could be read, concatenated
        in date order.
    Raises:
        ValueError: if not a single file in the range could be read.
    """
    # pd.date_range includes both ends
    date_range = [str(day.date()) for day in pd.date_range(date_from, date_to)]
    df_list = []
    last_error = None

    for melt_date in tqdm(date_range):
        file_path = df_path + 'melt_' + melt_date + '_extended.parquet.gzip'
        try:
            # NOTE(review): `index` is not a documented pd.read_parquet argument; it is
            # forwarded to the parquet engine - confirm the engine in use accepts it.
            df_list.append(pd.read_parquet(file_path, index=False))
        except Exception as error:
            # Best effort: some days are empty, so unreadable days are skipped.
            # Catching Exception (not a bare except) lets Ctrl-C still interrupt the loop.
            last_error = error

    if not df_list:
        # Surface the underlying cause instead of pandas' "No objects to concatenate".
        raise ValueError(
            f"No data could be read between {date_from} and {date_to}."
        ) from last_error

    # TODO: check if dates have proper values now (from engineering - not hardcoded)
    return pd.concat(df_list, axis=0)

In [4]:
# Load all readable daily files between date_from and date_to into one DataFrame.
df = import_data(date_from, date_to)

100%|██████████| 2/2 [00:01<00:00,  1.43it/s]


In [5]:
# Restrict the DataFrame to these four columns.
df=df[['mw_value', 'opt_value', 'date', 'mean']]

In [None]:
def remove_data():
    """
    Placeholder for the data-removal step; not implemented, returns None.

    Planned steps:
        - remove -1 from mw
        - remove nan from optical
        - remove bare ice
        - check if all aggregations are numeric (not nan)
    """
    return None

In [None]:
def data_prep():
    """Placeholder for data preparation (open question: is any needed?); returns None."""
    return None

In [11]:
def cross_validation(train_df, columns, train_func, n_splits = 5, hyperparameters = None):
    """
    Run time-ordered cross-validation and collect the RMSE of every fold.

    Arguments:
        train_df: DataFrame with a 'date' column, an 'opt_value' label column
            and the feature columns named in `columns`.
        columns: list of feature column names used as model input.
        train_func: callable(X_train, y_train, X_test, y_test, hyperparameters)
            that returns (train_rmse, test_rmse).
        n_splits: number of TimeSeriesSplit folds.
        hyperparameters: passed through unchanged to train_func.
    Returns:
        (train_rmse_list, test_rmse_list): two lists with one score per fold.
    """
    # Sort into a new frame: an in-place sort would reorder the caller's DataFrame
    # (and warns when train_df is a slice of another frame).
    ordered_df = train_df.sort_values(by=['date'])
    X = ordered_df[columns]
    y = ordered_df[["opt_value"]]

    train_rmse_list = []
    test_rmse_list = []
    tscv = TimeSeriesSplit(n_splits = n_splits)

    # split() is a generator, so tqdm needs the fold count to show real progress
    for train_index, test_index in tqdm(tscv.split(X), total = n_splits):
        X_train = X.iloc[train_index]
        y_train = y.iloc[train_index]
        X_test  = X.iloc[test_index]
        y_test  = y.iloc[test_index]

        # train and evaluate
        train_rmse, test_rmse = train_func(X_train, y_train, X_test, y_test, hyperparameters)
        train_rmse_list.append(train_rmse)
        test_rmse_list.append(test_rmse)

    return train_rmse_list, test_rmse_list

In [17]:
def train_decisionTree(X_train, y_train, X_test, y_test, hyperparameters = None):
    """
    Fit a DecisionTreeRegressor and report its train and test RMSE.

    Arguments:
        X_train, y_train: features and labels used for fitting.
        X_test, y_test: features and labels used for evaluation.
        hyperparameters: dict of DecisionTreeRegressor keyword arguments; when
            it is None or empty, a tree with random_state=0 is used instead.
    Returns:
        (rmse_train, rmse_test)
    """
    model = (
        DecisionTreeRegressor(**hyperparameters)
        if hyperparameters
        else DecisionTreeRegressor(random_state=0)
    )
    model.fit(X_train, y_train)

    # score on the data the tree has seen and on the held-out data
    rmse_train = get_rmse(y_train, model.predict(X_train))
    rmse_test = get_rmse(y_test, model.predict(X_test))
    return rmse_train, rmse_test



In [18]:
def hyperparameter_tune():
    """
    Placeholder for hyperparameter tuning; not implemented, returns None.

    Notes for the implementation:
        - define grid (if grid)
        - do cv for each??? - maybe less splits?
        - define hyperparameters as a dictionary, e.g.
          dt_params = {'max_depth':7, 'criterion': 'squared_error'}
    """
    return None

In [13]:
def get_rmse(y_real, y_predicted):
    """
    Return the root mean squared error of the predictions.

    Arguments:
        y_real: true values.
        y_predicted: predicted values.
    Returns:
        np.sqrt of sklearn's mean_squared_error(y_real, y_predicted).
    """
    squared_error = mean_squared_error(y_real, y_predicted)
    rmse = np.sqrt(squared_error)
    return rmse

In [None]:
def plot_results():
    """Placeholder for plotting results; not implemented, returns None."""
    return None

In [21]:
def calculate_sum(numbers):
    """
    Add up a list of numbers and return the result as a float.

    Args:
        numbers (list): Integers and/or floats to add.

    Returns:
        float: Sum of all elements (0.0 for an empty list).

    Raises:
        TypeError: As soon as an element that is neither int nor float is met.

    Examples:
        >>> calculate_sum([1, 2, 3])
        6.0
        >>> calculate_sum([1.5, 2.5, 3.5])
        7.5
        >>> calculate_sum([1, '2', 3])
        Traceback (most recent call last):
          ...
        TypeError: 'str' object is not a number
    """
    running = 0
    for item in numbers:
        # guard clause: reject anything that is not an int or float
        if not isinstance(item, (int, float)):
            raise TypeError("'{}' object is not a number".format(type(item).__name__))
        running += item
    return float(running)


In [None]:
def convert_to_tif(data, path_file_metadata, path_out):
    """
    Write data to a tif file, reusing the metadata of an existing tif.

    Arguments:
        data: values for band 1 of the new file (numpy array or xarray).
        path_file_metadata: tif file whose metadata matches the expected output tif file.
        path_out: destination path (including file name) of the output tif.
    Returns:
        None, the .tif file is written to path_out.
    """
    # take a copy of the reference metadata; the reference file is closed afterwards
    with rasterio.open(path_file_metadata) as reference:
        profile = dict(reference.meta)

    # create the output with that metadata and fill its first band
    with rasterio.open(path_out, "w", **profile) as target:
        target.write_band(1, data)
    return None

In [None]:
def save_prediction_tif(
    X_pred,
    y_predicted,
    path_out,
    shape=(2663, 1462),
    path_metadata_reference=r"../Data/microwave-rs/mw_interpolated/2019-07-01_mw.tif",
):
    """
    Function to write predictions to .tif.

    Arguments:
        X_pred: DataFrame that was predicted on; must contain 'row' and 'col'
            columns with the grid position of every sample. It is not modified.
        y_predicted: predicted labels in array, or pandas series.
        path_out: path to save .tif file with file name.
        shape: (rows, cols) of the output grid; defaults to the original matrix shape.
        path_metadata_reference: tif file the output metadata is copied from;
            defaults to the interpolated transformed file.
    Returns: No return, writes data to path.
    """
    # join prediction and coordinates (row, col); assign() works on a copy so the
    # caller's X_pred does not gain a 'prediction' column
    joined = X_pred.assign(prediction=y_predicted)

    row_index = joined["row"].to_numpy().astype(int)
    col_index = joined["col"].to_numpy().astype(int)

    # cells without a prediction stay NaN
    nan_matrix = np.full(shape, np.nan)
    # one vectorised assignment instead of a Python loop over every row
    nan_matrix[row_index, col_index] = joined["prediction"].to_numpy()

    convert_to_tif(nan_matrix, path_metadata_reference, path_out)

    return


In [None]:
def sum_list_items(_list):
    """
    Return the sum of the items in _list (0 for an empty list).

    The nested helper keeps its own accumulator and returns it. Adding to the
    enclosing function's variable from inside the helper (without `nonlocal`)
    raises UnboundLocalError, because the assignment makes the name local to
    the helper.
    """
    def do_the_sum(items):
        running_total = 0
        for i in items:
            running_total += i
        return running_total

    return do_the_sum(_list)

sum_list_items([1, 2, 3])

In [3]:
def sum_list_items(_list):
    """
    Return the sum of the items in _list (0 for an empty list).

    The adding is done by a nested helper that updates this function's
    `total` through a `nonlocal` declaration.
    """
    total = 0

    def do_the_sum(values):
        # `nonlocal` binds `total` to the variable of the enclosing function
        # instead of creating a new local one inside the helper.
        nonlocal total
        for value in values:
            total += value

    do_the_sum(_list)
    return total

sum_list_items([1, 2, 3])

6