## Merge all datasets
This notebook merges the Arctic dataframe with the dataframe from Waterhouse et al., as made in the `data_exploration/load_ds` notebook.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [2]:
import sys
import os
# Add the directory two levels above the working directory to the import path;
# presumably this is the repository root, so that the project-local `src`
# package imported on the next line can be resolved.
sys.path.append('../..')
from src.utils.directories import get_parent_directory

import warnings
# Suppress every RuntimeWarning for the remainder of the session.
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [3]:
# Base directory that every data path in this notebook is joined onto
# (returned by the project helper from src.utils.directories).
parent_dir = get_parent_directory()

In [4]:
# Echo the resolved base directory as the cell output, as a sanity check.
parent_dir

'/Users/Lisanne/Documents/AI4ER/Mres/ArcticTurbulence'

In [5]:
# Read the global microstructure dataframe and give its latitude and N2
# columns the names that are selected after concatenation further down.
global_pkl = os.path.join(parent_dir, "data/processed/Mashayek2022/input_microstructure.pkl")
global_df = pd.read_pickle(global_pkl).rename(
    columns={'lat': 'latitude', 'log_N2_sort': 'log_N2'}
)

# `waterhouse_df` is the same data with 'log_kappa' additionally renamed to
# 'LK'. 'lat' and 'log_N2_sort' no longer exist in `global_df` at this point,
# so they do not need to be listed again.
waterhouse_df = global_df.rename(columns={'log_kappa': 'LK'})

In [25]:
# Read the Arctic dataframe from the `ml_ready` folder under the project base
# directory.
# Alternative input kept for reference (absolute path, currently unused):
#   /Users/Lisanne/Documents/AI4ER/Mres/ArcticTurbulence/data/processed_data/ml_ready/arctic2_1805.pkl
arctic_pkl = os.path.join(parent_dir, "data/processed/ml_ready/1710_time.pkl")
arctic_df = pd.read_pickle(arctic_pkl)

## Concatenate dataframes

In [7]:
# Stack the Arctic rows and the Waterhouse rows into one dataframe. Columns
# that exist in only one of the inputs are filled with NaN for the other.
all_df = pd.concat([arctic_df, waterhouse_df])

# Extended selection that additionally keeps longitude and the time-related
# columns.
# NOTE(review): this rebinds `arctic_df` to a column subset of the *merged*
# frame, so it no longer contains only the Arctic input — confirm intended.
extended_columns = [
    'depth', 'profile', 'cruise', 'latitude', 'longitude',
    'S', 'T', 'log_eps', 'log_N2', 'dTdz', 'dSdz',
    'time', 'season', 'seasonal_sin',
]
arctic_df = all_df[extended_columns]

# Reduced selection without longitude and the time-related columns.
core_columns = [
    'depth', 'profile', 'cruise', 'latitude',
    'S', 'T', 'log_eps', 'log_N2', 'dTdz', 'dSdz',
]
all_df = all_df[core_columns]

In [13]:
# Thin the merged data: keep only rows whose integer depth is a multiple of
# ten, then cap every cruise at a fixed number of rows (presumably so that
# large cruises do not dominate the result).
DEPTH_STEP = 10
MAX_POINTS_PER_CRUISE = 1500
SAMPLE_SEED = 42  # fixed seed so the subsample is reproducible

# Floor the depth to an integer and take its absolute value, so the result is
# non-negative whatever the sign of `depth` (e.g. -10.5 -> -11 -> 11).
# Note: this adds a 'rounded_depth' column to `all_df` itself.
all_df['rounded_depth'] = np.abs(np.floor(all_df['depth']).astype(int))

# Keep only the rows that fall on the depth grid.
filtered_df = all_df[all_df['rounded_depth'] % DEPTH_STEP == 0]

# One group per cruise.
grouped_df = filtered_df.groupby('cruise')

# Collect one piece per cruise and concatenate once at the end. Growing a
# DataFrame with pd.concat inside the loop re-copies all accumulated rows on
# every iteration and depends on how pandas treats the initial empty frame.
cruise_samples = [
    # Cruises above the cap are randomly down-sampled; smaller ones are kept
    # whole and in their original row order.
    group.sample(n=MAX_POINTS_PER_CRUISE, random_state=SAMPLE_SEED)
    if len(group) > MAX_POINTS_PER_CRUISE
    else group
    for _, group in grouped_df
]

if cruise_samples:
    filtered_df_1500 = pd.concat(cruise_samples)
else:
    # pd.concat raises on an empty list; fall back to an empty frame that
    # still carries the expected columns.
    filtered_df_1500 = filtered_df.iloc[0:0].copy()

In [20]:
# Replace the original depth values of the subsample with the integer
# 'rounded_depth' values; the 'rounded_depth' column itself is kept.
filtered_df_1500["depth"] = filtered_df_1500["rounded_depth"]

## Save merged dataframe

In [21]:
# Output locations, all inside the `ml_ready` folder under the project base
# directory.
arctic_pkl_path = os.path.join(parent_dir, "data/processed/ml_ready/merged_arctic.pkl")
all_pkl_path = os.path.join(parent_dir, "data/processed/ml_ready/merged_all.pkl")
filtered_df_1500_path = os.path.join(parent_dir, "data/processed/ml_ready/all_1500.pkl")

# Write the three dataframes as pickles:
#   - arctic_df: the extended column selection of the merged data
#   - all_df: the reduced column selection (by now it also carries the
#     'rounded_depth' column added during subsampling)
#   - filtered_df_1500: the depth-gridded, per-cruise-capped subsample
arctic_df.to_pickle(arctic_pkl_path)
all_df.to_pickle(all_pkl_path)
filtered_df_1500.to_pickle(filtered_df_1500_path)