In [24]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import h5py
import os 
import math 
from scipy.spatial import cKDTree
import cartopy.crs as ccrs
import matplotlib.pyplot as plt
import cartopy.feature as cfeature
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split


In [21]:
def calculate_new_lat_lon(lat, lon, distance, bearing):
    lat_rad = math.radians(lat)
    lon_rad = math.radians(lon)
    angular_distance = distance / EARTH_RADIUS

    new_lat = math.asin(math.sin(lat_rad) * math.cos(angular_distance) +
                        math.cos(lat_rad) * math.sin(angular_distance) * math.cos(bearing))

    new_lon = lon_rad + math.atan2(math.sin(bearing) * math.sin(angular_distance) * math.cos(lat_rad),
                                   math.cos(angular_distance) - math.sin(lat_rad) * math.sin(new_lat))

    new_lat = math.degrees(new_lat)
    new_lon = math.degrees(new_lon)

    return new_lat, new_lon

# Function to generate the grid points
def generate_grid_points(top_left_lat, top_left_lon, bottom_right_lat, bottom_right_lon, grid_size):
    grid_points = []

    # Calculate the distance between the top-left and bottom-right corners
    lat_distance = abs(top_left_lat - bottom_right_lat)
    lon_distance = abs(top_left_lon - bottom_right_lon)

    # Calculate the number of grids in latitude and longitude directions
    num_lat_grids = int(lat_distance * 111.32 / grid_size)  # 1 degree latitude ~ 111.32 km
    num_lon_grids = int(lon_distance * 111.32 * math.cos(math.radians(top_left_lat)) / grid_size)

    # Generate grid points
    for i in range(num_lat_grids + 1):
        for j in range(num_lon_grids + 1):
            lat = top_left_lat - (i * grid_size / 111.32)
            lon = top_left_lon + (j * grid_size / (111.32 * math.cos(math.radians(top_left_lat))))
            grid_points.append((lat, lon))

    return grid_points

def ReadData(csv_file, lat, lon):
    # Determine the bounding box of the SMAP data
    lat_min, lat_max = min(lat), max(lat)
    lon_min, lon_max = min(lon), max(lon)

    # Load telemetry station data (CSV format assumed)
    #tele_data = pd.read_csv(csv_file, names=['code','latitude','longitude','val'])
    tele_data = pd.read_csv(csv_file)
    #print(tele_data)

    # Filter telemetry stations within SMAP bounding box
    filtered_stations = tele_data[
        (tele_data['latitude'] >= lat_min) & (tele_data['latitude'] <= lat_max) &
        (tele_data['longitude'] >= lon_min) & (tele_data['longitude'] <= lon_max)
    ]
    return filtered_stations

def genSMAP(filtered_stations, smap_locations, _smapDf, paraName):
    tele_locations = filtered_stations[['latitude', 'longitude']].to_numpy()
    tele_values = filtered_stations['val'].to_numpy()

    #print(tele_locations, tele_values)

    smap_tree = cKDTree(smap_locations)

    # Keep track of used locations in smapDf
    used_smap_indices = set()

    # Prepare a column to store results
    _smapDf[paraName] = np.nan  # New column for matched SMAP values


    # Iterate over each smap location and match it to the nearest tele location
    for idx, tele_loc in enumerate(tele_locations):
        # Query the nearest tele location
        #distance, tele_idx = tele_tree.query(tele_loc)
        distance, smap_idx = smap_tree.query(tele_loc)
        #print(distance, smap_idx,idx , tele_loc, smap_locations[smap_idx])

        if smap_idx not in used_smap_indices:
            _smapDf.loc[smap_idx, paraName] = tele_values[idx]
            #print(distance, smap_idx,idx , tele_loc, smap_locations[smap_idx],tele_values[idx], smapDf['matched_smap_val'][idx])
            used_smap_indices.add(smap_idx)  # Mark this SMAP index as used

    #print(smapDf)
    return _smapDf

# Function to interpolate missing values
def interpolate_feature(df, feature):
    known_points = df[['latitude', 'longitude']][df[feature].notna()].values
    known_values = df[feature].dropna().values
    grid_values = griddata(known_points, known_values, (grid_lat, grid_lon), method='cubic')
    return grid_values

def list_files(directory: str, ftype):
    """
    List files all file in given folder.

    Parameters:
        directory (str): Directory to search for files.

    Returns:
        dict: A dictionary where keys are week ranges and values are lists of matching files.
    """
    matching_files = []
    matching_files.extend(
        [directory+"/"+f for f in os.listdir(directory) if f.endswith(ftype)]
    )
    #files_by_week[f"{start} to {stop}"] = matching_files

    return matching_files

def load_combined_file(file_path):
    """
    Load the combined HDF5 file and extract the data.
    """
    print(file_path)
    with h5py.File(file_path, 'r') as f:
        soil_moisture = f['soil_moisture'][:]
        latitude = f['latitude'][:]
        longitude = f['longitude'][:]

    #print(soil_moisture)

    """
    plot_on_map(
        latitude,
        longitude,
        soil_moisture,
        [4.5, 25.5, 95.5, 110.5],
        #p_name,
        title=""
    )
    """
    return soil_moisture, latitude, longitude

def generate_points_in_boundary(start_lat, start_lon, end_lat, end_lon, interval_km=1):
    """
    Generate latitude and longitude points at every `interval_km` between two points.
    """
    # Define the starting and ending points
    start_point = Point(start_lat, start_lon)
    end_point = Point(end_lat, end_lon)

    # Calculate the total distance between the start and end points
    total_distance = geodesic(start_point, end_point).kilometers

    # Calculate the bearing (direction) from start to end
    bearing = calculate_bearing(start_point, end_point)

    # Generate points along the line at 1 km intervals
    pointsDf = pd.DataFrame(columns=['latitude','longitude'])
    for km in range(0, int(total_distance), interval_km):
        # Calculate the new point at the given distance and bearing
        new_point = distance(kilometers=km).destination(point=start_point, bearing=bearing)
        pointsDf = pointsDf._append({'latitude': new_point.latitude,
                       'longitude': new_point.longitude},
                      ignore_index=True)
        #points.append((new_point.latitude, new_point.longitude))

    return pointsDf

def calculate_bearing(start_point, end_point):
    """
    Calculate the bearing (direction) from start_point to end_point.
    """
    lat1 = math.radians(start_point.latitude)
    lon1 = math.radians(start_point.longitude)
    lat2 = math.radians(end_point.latitude)
    lon2 = math.radians(end_point.longitude)

    dlon = lon2 - lon1
    x = math.sin(dlon) * math.cos(lat2)
    y = math.cos(lat1) * math.sin(lat2) - math.sin(lat1) * math.cos(lat2) * math.cos(dlon)
    bearing = math.atan2(x, y)
    bearing = math.degrees(bearing)
    bearing = (bearing + 360) % 360  # Normalize to 0-360 degrees
    return bearing

def plot_on_map(latitude, longitude, soil_moisture, region_bounds, title):
    """
    Plot soil moisture data on a map using cartopy.
    """
    # Create the map projection
    projection = ccrs.PlateCarree()

    # Create the figure and axis
    fig, ax = plt.subplots(figsize=(12, 10), subplot_kw={'projection': projection})

    # Set the map extent to Thailand
    min_lat, max_lat, min_lon, max_lon = region_bounds
    ax.set_extent([min_lon, max_lon, min_lat, max_lat], crs=projection)

    # Add map features
    ax.add_feature(cfeature.LAND, edgecolor='black')
    ax.add_feature(cfeature.COASTLINE)
    ax.add_feature(cfeature.BORDERS, linestyle=':')


    sc = ax.scatter(longitude, latitude, c=soil_moisture, cmap='YlGnBu', marker='s', s=5, transform=projection)
    plt.colorbar(sc, ax=ax, orientation='vertical', label='Soil Moisture')

    # Add a title
    ax.set_title(title, fontsize=14)

    # Show the plot
    plt.show()


In [35]:

# Sample data with missing values
data = {
    'Feature1': [1.0, 2.0, 3.0, 4.0, np.nan],
    'Feature2': [10.0, 15.0, np.nan, 20.0, 25.0],
    'Target':   [100.0, 200.0, 300.0, np.nan, 500.0]
}
df = pd.DataFrame(data)

# Choose the column you want to impute (e.g., 'Target')
target_col = 'Target'

# Split data into rows with and without missing values in target
df_train_ori = df[df[target_col].notnull()].copy()
df_missing = df[df[target_col].isnull()].copy()

print(df_train_ori)
print(df_missing)

# Define features (excluding the target column)
features = df.columns.drop(target_col)
print(features)

# Drop rows with missing values in the features from the training set
df_train = df_train_ori.dropna(subset=features).copy()
print(df_train)


# Train a RandomForestRegressor
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(df_train[features], df_train[target_col])

# Predict missing values
df_missing = df_missing.copy()  # to avoid SettingWithCopyWarning
df_missing[target_col] = model.predict(df_missing[features])

# Combine the imputed rows with the original data
df_imputed = pd.concat([df_train_ori, df_missing]).sort_index()

print(df_imputed)


   Feature1  Feature2  Target
0       1.0      10.0   100.0
1       2.0      15.0   200.0
2       3.0       NaN   300.0
4       NaN      25.0   500.0
   Feature1  Feature2  Target
3       4.0      20.0     NaN
Index(['Feature1', 'Feature2'], dtype='object')
   Feature1  Feature2  Target
0       1.0      10.0   100.0
1       2.0      15.0   200.0
   Feature1  Feature2  Target
0       1.0      10.0   100.0
1       2.0      15.0   200.0
2       3.0       NaN   300.0
3       4.0      20.0   174.0
4       NaN      25.0   500.0


In [18]:
# Constants
EARTH_RADIUS = 6371  # Earth's radius in kilometers
grid_size = 1

# NorthEast
lat_range = [18.607933, 14.012681]  # Define the latitude range of interest
lon_range = [101.005346, 105.995516]  # Define the longitude range of interest

# Top Left
#lat_range = [18.607933, 16.310307]  # Define the latitude range of interest
#lon_range = [101.005346, 103.5004]  # Define the longitude range of interest

# Bottom Left
#lat_range = [16.310307, 14.012681]  # Define the latitude range of interest
#lon_range = [101.005346, 103.5004]  # Define the longitude range of interest

# Top Right
#lat_range = [18.607933, 16.310307]  # Define the latitude range of interest
#lon_range = [103.5004, 105.995516]  # Define the longitude range of interest

# Bottom Right
#lat_range = [16.310307, 14.012681]  # Define the latitude range of interest
#lon_range = [103.5004, 105.995516]  # Define the longitude range of interest

# Calculate the distance in kilometers
#distance_left_2_right = geodesic((lat_range[0], lon_range[0]), (lat_range[0], lon_range[1])).kilometers
#distance_top_2_down = geodesic((lat_range[0], lon_range[0]), (lat_range[1], lon_range[0])).kilometers

#print(distance_left_2_right)
#print(distance_top_2_down)

grid_points = generate_grid_points(lat_range[0], lon_range[0], lat_range[1], lon_range[1], grid_size)

points = pd.DataFrame(grid_points, columns=['latitude', 'longitude'])
print(points)


         latitude   longitude
0       18.607933  101.005346
1       18.607933  101.014825
2       18.607933  101.024303
3       18.607933  101.033782
4       18.607933  101.043260
...           ...         ...
269819  14.017563  105.953182
269820  14.017563  105.962661
269821  14.017563  105.972139
269822  14.017563  105.981618
269823  14.017563  105.991097

[269824 rows x 2 columns]


In [22]:
# Paths to your data files
smap_dir = "/Users/khaitao/Documents/GitHub/SMAP/Weekly/Thailand/"  # Replace with your .h5 file

tele_humid_dir = '/Users/khaitao/Documents/GitHub/SMAP/AvgTeleHumid/AvgTeleHumid_'  # Replace with your telemetry data file
tele_temp_dir = '/Users/khaitao/Documents/GitHub/SMAP/AvgTeleTemp/AvgTeleTemp_'  # Replace with your telemetry data file
tele_rain_dir = '/Users/khaitao/Documents/GitHub/SMAP/AvgTeleRain/AvgTeleRain_'  # Replace with your telemetry data file

h5_files = list_files(smap_dir,'.h5')
#print(h5_files)


h5_files = list_files(smap_dir,'.h5')
print(h5_files)


['/Users/khaitao/Documents/GitHub/SMAP/Weekly/Thailand//2024-01-22to2024-01-28.h5', '/Users/khaitao/Documents/GitHub/SMAP/Weekly/Thailand//2023-11-13to2023-11-19.h5', '/Users/khaitao/Documents/GitHub/SMAP/Weekly/Thailand//2023-04-17to2023-04-23.h5', '/Users/khaitao/Documents/GitHub/SMAP/Weekly/Thailand//2023-12-11to2023-12-17.h5', '/Users/khaitao/Documents/GitHub/SMAP/Weekly/Thailand//2023-04-24to2023-04-30.h5', '/Users/khaitao/Documents/GitHub/SMAP/Weekly/Thailand//2024-08-05to2024-08-11.h5', '/Users/khaitao/Documents/GitHub/SMAP/Weekly/Thailand//2023-07-10to2023-07-16.h5', '/Users/khaitao/Documents/GitHub/SMAP/Weekly/Thailand//2024-03-18to2024-03-24.h5', '/Users/khaitao/Documents/GitHub/SMAP/Weekly/Thailand//2023-01-02to2023-01-08.h5', '/Users/khaitao/Documents/GitHub/SMAP/Weekly/Thailand//2024-10-14to2024-10-20.h5', '/Users/khaitao/Documents/GitHub/SMAP/Weekly/Thailand//2023-10-02to2023-10-08.h5', '/Users/khaitao/Documents/GitHub/SMAP/Weekly/Thailand//2024-11-04to2024-11-10.h5', '/U

In [25]:
allDf = []
TrainList = []
TestList = []
cc = 0

# Build a KDTree for smapDf locations
smap_locations = points[['latitude', 'longitude']].to_numpy()
print(len(smap_locations))

#break

lenLat = len(points['latitude'].unique())
lenLon = len(points['longitude'].unique())

print(lenLat, lenLon)
#break

#print(grid_lat.shape, grid_lon.shape)

#break

min_max_scaler = MinMaxScaler()
first_round = True

for combined_file in h5_files:
    #print(combined_file)
    soil_moisture, latitude, longitude = load_combined_file(combined_file)
    #print(soil_moisture)

    #print(latitude, longitude)
    # Mask invalid soil moisture data
    soil_moisture = np.ma.masked_invalid(soil_moisture)
    p_name = combined_file.replace(smap_dir+"/","")
    p_name = p_name.replace(".h5","")
    p_name = p_name[:10].replace("-","")
    #print(p_name)

    humidDf= ReadData(tele_humid_dir+p_name+".csv", latitude, longitude)
    humidDf = humidDf.reset_index(drop=True)
    #print(humidDf)

    tempDf= ReadData(tele_temp_dir+p_name+".csv", latitude, longitude)
    tempDf = tempDf.reset_index(drop=True)
    #print(tempDf)

    rainDf= ReadData(tele_rain_dir+p_name+".csv", latitude, longitude)
    rainDf = rainDf.reset_index(drop=True)
    #print(rainDf)

    smapDf = pd.DataFrame(columns=['latitude','longitude','val'])
    smapDf['latitude'] = latitude
    smapDf['longitude'] = longitude
    smapDf['val'] = soil_moisture

    #print(smapDf[:10])

    smapDf = genSMAP(smapDf, smap_locations, points, "val")
    #print(smapDf[:10])

    #break
    smapDf = genSMAP(humidDf, smap_locations, smapDf, "humid")
    #print(smapDf[:10])
    smapDf = genSMAP(rainDf, smap_locations, smapDf, "rain")
    #print(smapDf[:10])
    smapDf = genSMAP(tempDf, smap_locations, smapDf, "temp")
    #print(smapDf[:10])
    #print(len(smapDf))

    if first_round:
        smapDf[['val','humid','rain','temp']] = min_max_scaler.fit_transform(smapDf[['val','humid','rain','temp']])
        first_round = False
    else:
        smapDf[['val','humid','rain','temp']] = min_max_scaler.transform(smapDf[['val','humid','rain','temp']])

    allDf.append(smapDf.copy())

    #print(f"{cc}->{allDf}")

    #if cc<=5:
    #    cc = cc+1
    #else:
    #   break

    break

#break

269824
512 527
/Users/khaitao/Documents/GitHub/SMAP/Weekly/Thailand//2024-01-22to2024-01-28.h5


In [33]:
print(allDf[0])


         latitude   longitude       val  humid  rain  temp
0       18.607933  101.005346  0.000000    NaN   NaN   NaN
1       18.607933  101.014825  0.168070    NaN   NaN   NaN
2       18.607933  101.024303  0.557160    NaN   NaN   NaN
3       18.607933  101.033782  0.494836    NaN   NaN   NaN
4       18.607933  101.043260  0.379273    NaN   NaN   NaN
...           ...         ...       ...    ...   ...   ...
269819  14.017563  105.953182  0.125216    NaN   NaN   NaN
269820  14.017563  105.962661  0.320070    NaN   NaN   NaN
269821  14.017563  105.972139  0.162713    NaN   NaN   NaN
269822  14.017563  105.981618  0.192176    NaN   NaN   NaN
269823  14.017563  105.991097  0.077004    NaN   NaN   NaN

[269824 rows x 6 columns]


In [34]:
df = allDf[0].copy()

# Choose the column you want to impute (e.g., 'Target')
target_col = 'val'

# Split data into rows with and without missing values in target
df_train_ori = df[df[target_col].notnull()].copy()
df_missing = df[df[target_col].isnull()].copy()

# Define features (excluding the target column)
features = ['humid','rain','temp']

# Drop rows with missing values in the features from the training set
df_train = df_train_ori.dropna(subset=features)


print(df_train_ori, df_missing)


         latitude   longitude       val  humid  rain  temp
0       18.607933  101.005346  0.000000    NaN   NaN   NaN
1       18.607933  101.014825  0.168070    NaN   NaN   NaN
2       18.607933  101.024303  0.557160    NaN   NaN   NaN
3       18.607933  101.033782  0.494836    NaN   NaN   NaN
4       18.607933  101.043260  0.379273    NaN   NaN   NaN
...           ...         ...       ...    ...   ...   ...
269819  14.017563  105.953182  0.125216    NaN   NaN   NaN
269820  14.017563  105.962661  0.320070    NaN   NaN   NaN
269821  14.017563  105.972139  0.162713    NaN   NaN   NaN
269822  14.017563  105.981618  0.192176    NaN   NaN   NaN
269823  14.017563  105.991097  0.077004    NaN   NaN   NaN

[200650 rows x 6 columns]          latitude   longitude  val  humid  rain  temp
11      18.607933  101.109611  NaN    NaN   NaN   NaN
22      18.607933  101.213875  NaN    NaN   NaN   NaN
34      18.607933  101.327619  NaN    NaN   NaN   NaN
45      18.607933  101.431884  NaN    NaN   NaN  

In [32]:
# Train a RandomForestRegressor
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(df_train[features], df_train[target_col])

# Predict missing values
df_missing = df_missing.copy()  # to avoid SettingWithCopyWarning
df_missing[target_col] = model.predict(df_missing[features])



ValueError: Input X contains NaN.
RandomForestRegressor does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html

In [None]:
# Combine the imputed rows with the original data
df_imputed = pd.concat([df_train_ori, df_missing]).sort_index()

print(df_imputed)