In [14]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import h5py
import os 
import math 
from scipy.spatial import cKDTree
import cartopy.crs as ccrs
import matplotlib.pyplot as plt
import cartopy.feature as cfeature
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split


In [15]:
def list_files(directory: str, ftype):
    """
    List files all file in given folder.

    Parameters:
        directory (str): Directory to search for files.
        
    Returns:
        dict: A dictionary where keys are week ranges and values are lists of matching files.
    """
    matching_files = []
    matching_files.extend(
        [directory+"/"+f for f in os.listdir(directory) if f.endswith(ftype)]
    )
    #files_by_week[f"{start} to {stop}"] = matching_files

    return matching_files

# Function to generate the grid points
def generate_grid_points(top_left_lat, top_left_lon, bottom_right_lat, bottom_right_lon, grid_size):
    grid_points = []

    # Calculate the distance between the top-left and bottom-right corners
    lat_distance = abs(top_left_lat - bottom_right_lat)
    lon_distance = abs(top_left_lon - bottom_right_lon)

    # Calculate the number of grids in latitude and longitude directions
    num_lat_grids = int(lat_distance * 111.32 / grid_size)  # 1 degree latitude ~ 111.32 km
    num_lon_grids = int(lon_distance * 111.32 * math.cos(math.radians(top_left_lat)) / grid_size)

    # Generate grid points
    for i in range(num_lat_grids + 1):
        for j in range(num_lon_grids + 1):
            lat = top_left_lat - (i * grid_size / 111.32)
            lon = top_left_lon + (j * grid_size / (111.32 * math.cos(math.radians(top_left_lat))))
            grid_points.append((lat, lon))

    return grid_points

def load_combined_file(file_path):
    """
    Load the combined HDF5 file and extract the data.
    """
    print(file_path)
    with h5py.File(file_path, 'r') as f:
        soil_moisture = f['soil_moisture'][:]
        latitude = f['latitude'][:]
        longitude = f['longitude'][:]
    
    #print(soil_moisture)
    
    """
    plot_on_map(
        latitude,
        longitude,
        soil_moisture,
        [4.5, 25.5, 95.5, 110.5],
        #p_name,
        title=""
    )
    """
    return soil_moisture, latitude, longitude

def ReadData(csv_file, lat, lon):
    # Determine the bounding box of the SMAP data
    lat_min, lat_max = min(lat), max(lat)
    lon_min, lon_max = min(lon), max(lon)
    
    # Load telemetry station data (CSV format assumed)
    #tele_data = pd.read_csv(csv_file, names=['code','latitude','longitude','val'])
    tele_data = pd.read_csv(csv_file)
    #print(tele_data)
    
    # Filter telemetry stations within SMAP bounding box
    filtered_stations = tele_data[
        (tele_data['latitude'] >= lat_min) & (tele_data['latitude'] <= lat_max) &
        (tele_data['longitude'] >= lon_min) & (tele_data['longitude'] <= lon_max)
    ]
    return filtered_stations

def genSMAP(filtered_stations, smap_locations, _smapDf, paraName):
    tele_locations = filtered_stations[['latitude', 'longitude']].to_numpy()
    tele_values = filtered_stations['val'].to_numpy()
    
    #print(tele_locations, tele_values)
    
    smap_tree = cKDTree(smap_locations)
    
    # Keep track of used locations in smapDf
    used_smap_indices = set()
    
    # Prepare a column to store results
    _smapDf[paraName] = np.nan  # New column for matched SMAP values
    
    
    # Iterate over each smap location and match it to the nearest tele location
    for idx, tele_loc in enumerate(tele_locations):
        # Query the nearest tele location
        #distance, tele_idx = tele_tree.query(tele_loc)
        distance, smap_idx = smap_tree.query(tele_loc)
        #print(distance, smap_idx,idx , tele_loc, smap_locations[smap_idx])
    
        if smap_idx not in used_smap_indices:
            _smapDf.loc[smap_idx, paraName] = tele_values[idx]
            #print(distance, smap_idx,idx , tele_loc, smap_locations[smap_idx],tele_values[idx], smapDf['matched_smap_val'][idx])
            used_smap_indices.add(smap_idx)  # Mark this SMAP index as used
    
    #print(smapDf)
    return _smapDf

# IDW Interpolation function
def inverse_distance_weighting(x, y, values, xi, yi, power=2):
    tree = cKDTree(np.c_[x, y])
    distances, indices = tree.query(np.c_[xi.ravel(), yi.ravel()], k=5)
    weights = 1 / distances ** power
    weighted_values = np.sum(weights * values[indices], axis=1) / np.sum(weights, axis=1)
    return weighted_values.reshape(xi.shape)


In [17]:
"""
# Sample data with missing values
data = {
    'Feature1': [1.0, 2.0, 3.0, 4.0, np.nan],
    'Feature2': [10.0, 15.0, np.nan, 20.0, 25.0],
    'Target':   [100.0, 200.0, 300.0, np.nan, 500.0]
}
df = pd.DataFrame(data)

# Choose the column you want to impute (e.g., 'Target')
target_col = 'Target'

# Split data into rows with and without missing values in target
df_train_ori = df[df[target_col].notnull()].copy()
df_missing = df[df[target_col].isnull()].copy()

print(df_train_ori)
print(df_missing)

# Define features (excluding the target column)
features = df.columns.drop(target_col)
print(features)

# Drop rows with missing values in the features from the training set
df_train = df_train_ori.dropna(subset=features).copy()
print(df_train)


# Train a RandomForestRegressor
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(df_train[features], df_train[target_col])

# Predict missing values
df_missing = df_missing.copy()  # to avoid SettingWithCopyWarning
df_missing[target_col] = model.predict(df_missing[features])

# Combine the imputed rows with the original data
df_imputed = pd.concat([df_train_ori, df_missing]).sort_index()

print(df_imputed)
"""



In [18]:
# Constants
EARTH_RADIUS = 6371  # Earth's radius in kilometers
grid_size = 1

# NorthEast
lat_range = [18.607933, 14.012681]  # Define the latitude range of interest
lon_range = [101.005346, 105.995516]  # Define the longitude range of interest

# Top Left
#lat_range = [18.607933, 16.310307]  # Define the latitude range of interest
#lon_range = [101.005346, 103.5004]  # Define the longitude range of interest

# Bottom Left
#lat_range = [16.310307, 14.012681]  # Define the latitude range of interest
#lon_range = [101.005346, 103.5004]  # Define the longitude range of interest

# Top Right
#lat_range = [18.607933, 16.310307]  # Define the latitude range of interest
#lon_range = [103.5004, 105.995516]  # Define the longitude range of interest

# Bottom Right
#lat_range = [16.310307, 14.012681]  # Define the latitude range of interest
#lon_range = [103.5004, 105.995516]  # Define the longitude range of interest

# Calculate the distance in kilometers
#distance_left_2_right = geodesic((lat_range[0], lon_range[0]), (lat_range[0], lon_range[1])).kilometers
#distance_top_2_down = geodesic((lat_range[0], lon_range[0]), (lat_range[1], lon_range[0])).kilometers

#print(distance_left_2_right)
#print(distance_top_2_down)

grid_points = generate_grid_points(lat_range[0], lon_range[0], lat_range[1], lon_range[1], grid_size)

points = pd.DataFrame(grid_points, columns=['latitude', 'longitude'])
print(points)


         latitude   longitude
0       18.607933  101.005346
1       18.607933  101.014825
2       18.607933  101.024303
3       18.607933  101.033782
4       18.607933  101.043260
...           ...         ...
269819  14.017563  105.953182
269820  14.017563  105.962661
269821  14.017563  105.972139
269822  14.017563  105.981618
269823  14.017563  105.991097

[269824 rows x 2 columns]


In [20]:
# Paths to your data files
smap_dir = "/Users/khaitao/Documents/GitHub/SMAP/Weekly/Thailand/"  # Replace with your .h5 file

h5_files = list_files(smap_dir,'.h5')
print(h5_files)


['/Users/khaitao/Documents/GitHub/SMAP/Weekly/Thailand//2024-01-22to2024-01-28.h5', '/Users/khaitao/Documents/GitHub/SMAP/Weekly/Thailand//2023-11-13to2023-11-19.h5', '/Users/khaitao/Documents/GitHub/SMAP/Weekly/Thailand//2023-04-17to2023-04-23.h5', '/Users/khaitao/Documents/GitHub/SMAP/Weekly/Thailand//2023-12-11to2023-12-17.h5', '/Users/khaitao/Documents/GitHub/SMAP/Weekly/Thailand//2023-04-24to2023-04-30.h5', '/Users/khaitao/Documents/GitHub/SMAP/Weekly/Thailand//2024-08-05to2024-08-11.h5', '/Users/khaitao/Documents/GitHub/SMAP/Weekly/Thailand//2023-07-10to2023-07-16.h5', '/Users/khaitao/Documents/GitHub/SMAP/Weekly/Thailand//2024-03-18to2024-03-24.h5', '/Users/khaitao/Documents/GitHub/SMAP/Weekly/Thailand//2023-01-02to2023-01-08.h5', '/Users/khaitao/Documents/GitHub/SMAP/Weekly/Thailand//2024-10-14to2024-10-20.h5', '/Users/khaitao/Documents/GitHub/SMAP/Weekly/Thailand//2023-10-02to2023-10-08.h5', '/Users/khaitao/Documents/GitHub/SMAP/Weekly/Thailand//2024-11-04to2024-11-10.h5', '/U