# Functions testing

## Research Project 2022

In [None]:
import xarray 
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error


In [None]:
def write_to_tif(data):
    """
    Function to convert data to tif file.
    Arguments:
        data:
    Returns:
        tif file
    """
    #if type data
    return

In [None]:
def read_and_prep_parquet(path, purpose):
    """
    Function to read parquet and prepare as train or test data.
    Arguments:
        path: path to file.
        purpose: {'train', 'test', 'validate', 'predict'} purpose of file.
    Returns: train/test dataset and label array (specify output datatype!)
    """
    valid = {'train', 'test', 'validate', 'predict'}
    if purpose not in valid:
        raise ValueError("Purpose must be one of %r." % valid)

    df = pd.read_parquet(path)
    if purpose in ['train', 'test', 'validate']:
        df = df.loc[df['opt_value'] != -1] # remove mask
        df = df.fillna(-1) # fill values to be able to train
        X = df[['x', 'y', 'mw_value', 'col', 'row', 'v1', 'v2', 'v3', 'v4', 'v6', 'v7', 'v8', 'v9', 'mean', 'elevation_data']] # v5 is duplicated
        y = df[['opt_value']]
        return X, y
    else:
        df = df.fillna(-1) # fill values to be able to train
        X = df[['x', 'y', 'mw_value', 'col', 'row', 'v1', 'v2', 'v3', 'v4', 'v6', 'v7', 'v8', 'v9', 'mean', 'elevation_data']] # v5 is duplicated
        return X

In [None]:
def get_rmse(y_real, y_predicted):
    return np.sqrt(mean_squared_error(y_real, y_predicted))


In [2]:
import xarray
path_elevation =  r"../Data/elevation_data/gimpdem_1km_compressed.tif"
data_elevation = xarray.open_dataarray(path_elevation)

In [19]:
import rasterio

def information(path_to_file):
    with rasterio.open(path_elevation) as src:
        print('BOUNDS:')
        print(f'    {src.bounds}')
        print('METADATA:')
        print(f'    {src.meta}')
        #print(src.crs)

    data = xarray.open_dataarray(path_elevation)
    print('MORE CRS INFO:')
    print(f'    {data.spatial_ref.crs_wkt}')
    print('RESOLUTION:')
    print(f'    {data.rio.resolution()}')

    return


# ----band data:----
# raster = rasterio.open(optical_path) #,masked=True)
#band_arr = raster.read(band_id) 

BOUNDS:
    BoundingBox(left=-637000.0, bottom=-3349000.0, right=850000.0, top=-662000.0)
METADATA:
    {'driver': 'GTiff', 'dtype': 'int16', 'nodata': None, 'width': 1487, 'height': 2687, 'count': 1, 'crs': CRS.from_epsg(3413), 'transform': Affine(1000.0, 0.0, -637000.0,
       0.0, -1000.0, -662000.0)}
MORE CRS INFO:
    PROJCS["WGS 84 / NSIDC Sea Ice Polar Stereographic North",GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0,AUTHORITY["EPSG","8901"]],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AUTHORITY["EPSG","4326"]],PROJECTION["Polar_Stereographic"],PARAMETER["latitude_of_origin",70],PARAMETER["central_meridian",-45],PARAMETER["false_easting",0],PARAMETER["false_northing",0],UNIT["metre",1,AUTHORITY["EPSG","9001"]],AXIS["Easting",SOUTH],AXIS["Northing",SOUTH],AUTHORITY["EPSG","3413"]]
RESOLUTION:
    (1000.0, -1000.0)


## Old functions 

In [None]:
def cross_validation(df, columns, train_func, n_splits=5, hyperparameters=None):
    """
    Cross-validation with TimeSeriesSplit.

    Args:
        df (pandas.DataFrame): Full train/ test dataframe.

        columns (list of strings): List of columns to be used in training.

        train_func (function): Custom defined function for training and evaluating model.
                                E.g.: model_decisionTree()

        n_splits (int): Number of cv splits.

        hyperparameters (dict, optional): Dictionary with hyperparameters for model.

    Returns:
        list: Two list with <n_splits> RMSE scores for train and test data.
    """

    df.sort_values(by=["date"], inplace=True)  # sort df by time
    X = df[columns]
    y = df[["opt_value"]]

    rmse_train_list = []
    rmse_test_list = []
    tscv = TimeSeriesSplit(n_splits=n_splits)

    for train_index, test_index in tqdm(tscv.split(X)):
        X_train = X.iloc[train_index]
        y_train = y.iloc[train_index]
        X_test = X.iloc[test_index]
        y_test = y.iloc[test_index]

        y_predicted_train, y_predicted_test = train_func(
            X_train, y_train, X_test, y_test, hyperparameters
        )

        rmse_train = get_rmse(y_train, y_predicted_train)
        rmse_test = get_rmse(y_test, y_predicted_test)

        rmse_train_list.append(rmse_train)
        rmse_test_list.append(rmse_test)

    return rmse_train_list, rmse_test_list


def get_rmse(y_real, y_predicted):
    """
    Calculates RMSE score.

    Args:
        y_real (): real target values.

        y_predicted (): model predicted target values.

    Returns:
        float: RMSE score.
    """

    return np.sqrt(mean_squared_error(y_real, y_predicted))


def model_decisionTree(X_train, y_train, X_test, y_test, hyperparameters=None):
    """
    Trains model and predicts target values.

    Args:
        X_train (pandas.DataFrame): Dataframe with train data.

        y_train (pandas.DataFrame): Dataframe with train labels, one column.

        X_test (pandas.DataFrame): Dataframe with test data.

        y_test (pandas.DataFrame): Dataframe with test labels, one column.

        hyperparameters (dict, optional): Dictionary with model parameters.

    Returns:
        list: Two lists with predicted values for train and test set.
    """

    if hyperparameters:
        regressor = DecisionTreeRegressor(**hyperparameters)
    else:
        regressor = DecisionTreeRegressor(random_state=0)

    regressor.fit(X_train, y_train)
    y_predicted_train = regressor.predict(X_train)
    y_predicted_test = regressor.predict(X_test)

    return y_predicted_train, y_predicted_test


### Benchmark functions

In [4]:
def model_meanBenchmark(y_train, y_test):
    """
    Creates predictions for mean benchmark.

    Args:
        y_train (pandas.DataFrame): Dataframe with train labels, one column.

        y_test (pandas.DataFrame): Dataframe with test labels, one column.

    Returns:
        list: Lists with predicted values for test set.
    """

    y_predicted = np.full((1, len(y_test)), y_train.mean())[0]

    return y_predicted

In [5]:
def model_mwBenchmark(X_test):
    """
    Creates predictions for microwave benchmark by comparing the mw and opt datasets directly.

    Args:
        X_test (pandas.DataFrame): Dataframe with test data.

    Returns:
        list: Lists with predicted values for test set.
    """

    y_predicted = X_test['mw_value']

    return y_predicted

## Thesis 2023

In [1]:
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.tree import DecisionTreeRegressor

import numpy as np
from sklearn.metrics import mean_squared_error

from tqdm import tqdm

pd.options.mode.chained_assignment = None
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error, r2_score

import pickle
import matplotlib.pyplot as plt # to plot kmeans splits
import itertools

### Data processing functions

In [2]:
def import_data(date_from:str, date_to:str, df_path:str):
    """
    Imports data and merges into one dataframe.

    Args:
        date_from (format: 'yyyy-mm-dd'): Period starting date (included).

        date_to (format: 'yyyy-mm-dd'): Period end date (included).

        df_path: Path to folder with daily data files.

    Returns:
        pandas.DataFrame: Dataframe with all merged data.
    """

    date_range = pd.date_range(date_from, date_to) # both ends included
    date_range = [str(day.date()) for day in date_range]
    df_list = []

    for melt_date in tqdm(date_range):
        try: # bc some days are empty
            file = pd.read_parquet(df_path + 'melt_'+ melt_date + '_extended.parquet.gzip', index= False) 
            df_list.append(file) # list of df
        except:
            continue
        
    df = pd.concat(df_list, axis=0) # concat af df to one
    del df_list
    return df

In [3]:
def remove_data(df, removeMaskedClouds = True, removeNoMelt = True):
    # remove -1 from mw:
    df = df[df['mw_value'] != -1]
    
    if removeMaskedClouds == True:
        # remove nan/-1 from opt:
        df = df[df['opt_value'] != -1]

    if removeNoMelt == True:
        # open file
        # join with file
        melt = pd.read_parquet(r"../Data/split_indexes/noMelt_indexes.parquet", index= False)
        df = df.merge(melt, how = 'left', on = ["y",'x'])
        df = df[df['melt'] == 1]
    return df

In [25]:
class Model:
    """ 
    This class contains models. 
    After training it also contains performance scores and the hyperparameters used to train it.
    """

    def __init__(self, model):  
        self.model = model
        self.hyperparameters = [] # list of dictionaries with hyperparameters
    
    def create_hyperparameter_grid(self, hyperparameters):
        """
        Creates a grid with all possible combinations of hyperparameters.

        Args:
            hyperparameters (dict): Dictionary with hyperparameters.

        Returns:
            list: List with dictionaries with all possible combinations of hyperparameters.
        """
        hyperparameter_grid = []
        for i in range(len(hyperparameters)):
            hyperparameter_grid.append(list(hyperparameters.values())[i])
        hyperparameter_grid = list(itertools.product(*hyperparameter_grid))
        hyperparameter_grid = [dict(zip(hyperparameters.keys(), values)) for values in hyperparameter_grid]
        return hyperparameter_grid


    def __kmeans_split(self, df, loop, plot = False):
        """ 
        This function splits the data into 5 areas based on the kmeans algorithm.

        Args:
            df (pandas.DataFrame): Dataframe with data.

            loop (str): 'inner' or 'outer' loop.

            plot (bool): If True, plots the kmeans split.

        Returns:
            pandas.DataFrame: Dataframe with added column with kmeans split.
        
        """
        kmeans = KMeans(n_clusters=5, random_state=0, n_init="auto").fit(df[['x','y']])
        if loop == 'inner':
            df['inner_area'] = kmeans.labels_
        elif loop == 'outer':
            df['outer_area'] = kmeans.labels_

        if plot == True:
            print(df[loop+'_area'].value_counts())
            plt.scatter(df['x'], df['y'], c=df[loop+'_area'], edgecolor='none', s = 0.05)
            plt.show()
        return df
    
    def __train_test_split(self, df, columns, split_index):
        """ 
        This function splits the data into train and test set.

        Args:
            df (pandas.DataFrame): Dataframe with data.

            columns (list): List with column names to be used in the model.

            split_index (int): Index of the split (loop).

        Returns:
            pandas.DataFrame: Dataframe with added column with kmeans split.
        """
        inner_train = df[df['inner_area'] != split_index]
        inner_test  = df[df['inner_area'] == split_index]
        train_X = inner_train[columns]
        train_y = inner_train[["opt_value"]]
        test_X = inner_test[columns]
        test_y = inner_test[["opt_value"]] 
        return train_X, train_y, test_X, test_y

    def __inner_loop_tune_hyperparameters(self, df, columns):
        """ 
        This function performs hyperparameter tuning in (inner loop of nested cross-validation).
        
        Args:
            df (pandas.DataFrame): Dataframe with data.

            columns (list): List with column names to be used in the model.

        Returns:
            dict: Dictionary with best hyperparameters.
        """
        all_inner_loops_hyperparameter_scores= []
        for inner_split in df['inner_area'].unique():
            inner_train_X, inner_train_y, inner_test_X, inner_test_y = self.__train_test_split(df, columns, inner_split)                
            hyperparameter_scores = []
            if isinstance(self.hyperparameters, list):
                for hyperparams in self.hyperparameters:
                    regressor = self.model(random_state=0, **hyperparams).fit(inner_train_X, inner_train_y)
                    y_predicted_test = regressor.predict(inner_test_X)
                    hyperparameter_scores.append(mean_squared_error(inner_test_y, y_predicted_test, squared=False))
            else:
                print('hyperparameters must be a list')
            all_inner_loops_hyperparameter_scores.append(hyperparameter_scores)
        mean_hyperparameters = np.mean(all_inner_loops_hyperparameter_scores, axis=0)
        best_inner_hyperparameters = self.hyperparameters[np.argmin(mean_hyperparameters)] # not argmax because we want to minimize the error
        return best_inner_hyperparameters
    
    def spatial_cv(self, df, columns):
        """ 
        This function performs spatial cross-validation.
        
        Args:
            df (pandas.DataFrame): Dataframe with data.
            
            columns (list): List with column names to be used in the model.

        Returns:
            Nothing. But it assigns the RMSE and R2 scores for the train and test set to the model object.
                     It also assigns the best hyperparameters, predicted and real values of each outer split to the model object.
        """
        rmse_list_train = []
        rmse_list_test = []
        r2_list_train = []
        r2_list_test = []
        self.best_hyperparameters = []
        predictions_train = []
        predictions_test = []
        real_values_train = []
        real_values_test = []
        
        df = self.__kmeans_split(df, 'outer') #, plot = True #df = self.__cv_split_outer_loop(df)
        for outer_split in df['outer_area'].unique():
            #if outer_split == 1: # remove
            train = df[df['outer_area'] != outer_split]
            train = self.__kmeans_split(train, 'inner') #train = self.__cv_split_inner_loop(train)
            best_hyperparam= self.__inner_loop_tune_hyperparameters(train, columns)
            self.best_hyperparameters.append(best_hyperparam)
            
            train_X, train_y, test_X, test_y = self.__train_test_split(train, columns, outer_split)
            print(f'length train_X: {len(train_X)}, length train_y: {len(train_y)}, length test_X: {len(test_X)}, length test_y: {len(test_y)}')
            regressor = self.model(random_state=0, **best_hyperparam).fit(train_X, train_y)
            train_y_predicted = regressor.predict(train_X)
            test_y_predicted  = regressor.predict(test_X )
            predictions_train.append(train_y_predicted)
            predictions_test.append(test_y_predicted)
            real_values_train.append(train_y)
            real_values_test.append(test_y)

            rmse_list_train.append(mean_squared_error(train_y, train_y_predicted))
            rmse_list_test.append(mean_squared_error(test_y, test_y_predicted))
            r2_list_train.append(r2_score(train_y, train_y_predicted))
            r2_list_test.append(r2_score(test_y, test_y_predicted))

            # else: # tb removed
            #     continue
        # results:
        self.rmse_train = np.mean(rmse_list_train)
        self.rmse_std_train = np.std(rmse_list_train)
        self.rmse_test = np.mean(rmse_list_test)
        self.rmse_std_test = np.std(rmse_list_test)
        self.r2_train = np.mean(r2_list_train)
        self.r2_std_train = np.std(r2_list_train)
        self.r2_test = np.mean(r2_list_test)
        self.r2_std_test = np.std(r2_list_test)
        
        self.outer_loop_results = {'rmse_list_train': rmse_list_train,
                                   'rmse_list_test' : rmse_list_test,
                                   'r2_list_train'  : r2_list_train,
                                   'r2_list_test'   : r2_list_test}
        
        self.outer_loop_predictions = {'train_y_predicted': predictions_train,
                                       'test_y_predicted' : predictions_test}
        self.outer_loop_real_values = {'train_y': real_values_train,
                                        'test_y' : real_values_test}
        return

    def get_results(self):
        """ 
        This function prints the results of the model in a table.
        """
        results = pd.DataFrame({'Metric': ['RMSE', 'RMSE_std', 'R2', 'R2_std'],
                                'Train': [self.rmse_train, self.rmse_std_train ,self.r2_train, self.r2_std_train],
                                'Test': [self.rmse_test, self.rmse_std_test, self.r2_test, self.r2_std_test]})
        print(results)
        return 
    
    def get_attributes(self):
        """ 
        This function prints the attributes of the model.
        """
        for attribute, value in self.__dict__.items():
            print(attribute, '=', value)
        return


In [None]:
def save_object(obj, filename):
    """ 
    This function saves an object to a pickle file.

    Args:
        obj (object): Object to be saved.

        filename (str): Name of the file to be saved, with extension, without path unless a subfolder is desired.
    """
    filename = r'../Models/' + filename + '.pkl'
    with open(filename, 'wb') as outp:  # Overwrites any existing file.
        pickle.dump(obj, outp, pickle.HIGHEST_PROTOCOL)


def load_object(filename):
    """ 
    This function loads an object from a pickle file.
    
    Args:
        filename (str): Name of the file to be loaded, with extension, without path unless a subfolder is desired.
        
    Returns:
            obj (object): Loaded object.
    """
    filename = r'../Models/' + filename + '.pkl'
    with open(filename, 'rb') as inp:
        obj = pickle.load(inp)
    return obj

### Test

In [4]:
df_path = r"../Data/combined/dataframe_extended/"

date_from = '2019-07-01'
date_to = '2019-07-04'

In [5]:
data = import_data(date_from, date_to, df_path)
data=data[['x', 'y','mw_value', 'opt_value', 'date', 'mean_3']]
data = remove_data(data, removeMaskedClouds = True, removeNoMelt = True)

100%|██████████| 4/4 [00:05<00:00,  1.43s/it]


In [8]:
hyperparameters = [{'max_depth': 5, 'min_samples_leaf': 5, 'min_samples_split': 5}]
dt2 = trainedModel(model = DecisionTreeRegressor, hyperparameters = hyperparameters)

# hyp = [{'max_depth':7, 'criterion': 'squared_error'}, {'max_depth':4, 'criterion': 'squared_error'}]
# dt = trainedModel(model = DecisionTreeRegressor, hyperparameters = hyp )

In [32]:
#dt3= trainedModel(model = DecisionTreeRegressor)#
dt3.hyperparameters = [{'max_depth': 5, 'min_samples_leaf': 5, 'min_samples_split': 5}]

In [37]:
dt3.get_attributes()

model = <class 'sklearn.tree._classes.DecisionTreeRegressor'>
hyperparameters = [{'max_depth': 4, 'min_samples_leaf': 2}, {'max_depth': 4, 'min_samples_leaf': 5}, {'max_depth': 7, 'min_samples_leaf': 2}, {'max_depth': 7, 'min_samples_leaf': 5}]


In [36]:
dt3.hyperparameters = dt3.create_hyperparameter_grid({'max_depth':[4, 7], 'min_samples_leaf':[2, 5]})

In [9]:
columns = ['x', 'y','mw_value', 'mean_3']
dt2.spatial_cv(data, columns)

length train_X: 2111739, length train_y: 2111739, length test_X: 347960, length test_y: 347960
length train_X: 2159733, length train_y: 2159733, length test_X: 403216, length test_y: 403216
length train_X: 2514439, length train_y: 2514439, length test_X: 541316, length test_y: 541316
length train_X: 2389462, length train_y: 2389462, length test_X: 481919, length test_y: 481919
length train_X: 2590000, length train_y: 2590000, length test_X: 321068, length test_y: 321068


In [10]:
dt2.get_attributes()

model = <class 'sklearn.tree._classes.DecisionTreeRegressor'>
hyperparameters = [{'max_depth': 5, 'min_samples_leaf': 5, 'min_samples_split': 5}]
best_hyperparameters = [{'max_depth': 5, 'min_samples_leaf': 5, 'min_samples_split': 5}, {'max_depth': 5, 'min_samples_leaf': 5, 'min_samples_split': 5}, {'max_depth': 5, 'min_samples_leaf': 5, 'min_samples_split': 5}, {'max_depth': 5, 'min_samples_leaf': 5, 'min_samples_split': 5}, {'max_depth': 5, 'min_samples_leaf': 5, 'min_samples_split': 5}]
rmse_train = 0.07946593581442872
rmse_std_train = 0.007014737016824507
rmse_test = 0.07837360065180628
rmse_std_test = 0.010516675275927838
r2_train = 0.5319369440969419
r2_std_train = 0.035455626742688304
r2_test = -0.1907628432111141
r2_std_test = 0.7861320413930145
outer_loop_results = {'rmse_list_train': [0.07051953227967811, 0.07191474422878381, 0.08839363624136169, 0.08269475423806222, 0.08380701208425778], 'rmse_list_test': [0.06813612194207963, 0.09251454369839324, 0.08577965303519904, 0.0807

In [11]:
save_object(dt2, 'test_decision_tree_v1')

### Older testing

In [1]:


a, b = cross_validation(df, ['mw_value', 'mean'], model_decisionTree, n_splits = 5, hyperparameters = None)


NameError: name 'import_data' is not defined

In [25]:
df = import_data(date_from, date_to, df_path)
df.isna().sum()

100%|██████████| 2/2 [00:01<00:00,  1.74it/s]


x                     0
y                     0
mw_value              0
opt_value             0
col                   0
row                   0
v1                14044
v2                 6648
v3                14036
v4                 7576
v5                    0
v6                 7576
v7                14036
v8                 6648
v9                14044
date                  0
mean                  0
elevation_data        0
dtype: int64

In [27]:
df = df[df['mw_value'] != -1]

In [None]:
def convert_to_tif(data, path_file_metadata, path_out):
    """
    Function to convert data to tif file.
    Arguments:
        data: new file
        path_file_metadata: tif file with metadata matching expected output tif file
        path_out: output tif file destination and name path
    Returns:
        .tif file
    """
    with rasterio.open(path_file_metadata) as src:
        kwargs1 = src.meta.copy()

    with rasterio.open(path_out, "w", **kwargs1) as dst:
        dst.write_band(1, data)  # numpy array or xarray
    return

In [None]:
def save_prediction_tif(X_pred, y_predicted, path_out):
    """
    Function to write predictions to .tif.
    Arguments:
        X_pred: data to be predicted on.
        y_predicted: predicted labels in array, or pandas series.
        path_out: path to save .tif file with file name.
    Returns: No return, writes data to path.
    """
    # join prediction and coordinates (row, col)
    X_pred["prediction"] = y_predicted

    # original matrix shape:
    nan_matrix = np.full((2663, 1462), np.nan)

    for row in tqdm(X_pred.iterrows()):  # fix progress bar?
        row_index = int(row[1]["row"])
        col_index = int(row[1]["col"])
        pred_val = row[1]["prediction"]
        nan_matrix[row_index][col_index] = pred_val

    # file to take reference metadata from is interpolated transformed file
    path_metadata_reference = r"../Data/microwave-rs/mw_interpolated/2019-07-01_mw.tif"

    convert_to_tif(nan_matrix, path_metadata_reference, path_out)

    return
