In [1]:
# Import libraries
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import os
import geopandas as gpd
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this is a blanket filter; it also hides deprecation and
# dtype warnings raised by the libraries used further down.
warnings.simplefilter('ignore')

# Notebook-wide presentation settings
plt.style.use('ggplot')                      # ggplot look for all figures
pd.set_option('display.max_columns', None)   # never truncate wide frames

In [2]:
# Read the competition files.
#
# Only the first PREVIEW_ROWS rows of the two Sentinel tables are read here
# (a quick preview, not the complete tables); Test and SampleSubmission are
# read in full.
PREVIEW_ROWS = 100000

test = pd.read_csv("data/Test.csv")
sample_submission = pd.read_csv("data/SampleSubmission.csv")

sentinel1 = pd.read_csv('data/Sentinel1.csv', nrows=PREVIEW_ROWS)
sentinel2 = pd.read_csv('data/Sentinel2.csv', nrows=PREVIEW_ROWS)

In [3]:
# Show the leading rows of every loaded table, each under its own heading.
# The sample submission is shortened to two rows; the others use the
# default head() length.
previews = (
    ("\nTest data head:", test.head()),
    ("\nSample submission head:", sample_submission.head(2)),
    ("\nSentinel1 sample head:", sentinel1.head()),
    ("\nSentinel2 sample head:", sentinel2.head()),
)
for heading, frame_head in previews:
    print(heading)
    display(frame_head)


Test data head:


Unnamed: 0,ID,location,translated_lat,translated_lon
0,ID_QAHFTR,Fergana,41.464538,71.767177
1,ID_XCKAFN,Fergana,41.081855,72.397795
2,ID_AFQOFP,Fergana,41.652106,72.14447
3,ID_VAUSIE,Fergana,41.36698,73.032185
4,ID_YICCWP,Fergana,41.334641,71.52059



Sample submission head:


Unnamed: 0,ID,Cropland
0,ID_ABQOQT,0
1,ID_ADDROF,0



Sentinel1 sample head:


Unnamed: 0,ID,VH,VV,date,orbit,polarization,rel_orbit,translated_lat,translated_lon
0,ID_AFQOFP,-21.479683,-16.633259,2022-07-01,DESCENDING,"[VV, VH]",78.0,41.652292,72.144256
1,ID_AFQOFP,-24.76911,-15.943674,2022-07-01,DESCENDING,"[VV, VH]",78.0,41.652289,72.144375
2,ID_AFQOFP,-25.370838,-15.185609,2022-07-01,DESCENDING,"[VV, VH]",78.0,41.652286,72.144495
3,ID_AFQOFP,-24.134005,-16.351102,2022-07-01,DESCENDING,"[VV, VH]",78.0,41.652283,72.144614
4,ID_AFQOFP,-20.654249,-16.792723,2022-07-01,DESCENDING,"[VV, VH]",78.0,41.65228,72.144733



Sentinel2 sample head:


Unnamed: 0,B11,B12,B2,B3,B4,B5,B6,B7,B8,B8A,ID,cloud_pct,date,solar_azimuth,solar_zenith,translated_lat,translated_lon
0,2169,1820,1328,1610,1670,1985,2446,2628,2598,2638,ID_ZHZRHO,6.980395,2021-07-05,139.093139,22.625533,40.935173,71.617062
1,2151,1770,1306,1586,1640,1961,2495,2691,2684,2732,ID_ZHZRHO,6.980395,2021-07-05,139.093139,22.625533,40.935171,71.61718
2,2169,1820,1456,1674,1808,1985,2446,2628,2486,2638,ID_ZHZRHO,6.980395,2021-07-05,139.093139,22.625533,40.935085,71.61694
3,2169,1820,1284,1604,1658,1985,2446,2628,2658,2638,ID_ZHZRHO,6.980395,2021-07-05,139.093139,22.625533,40.935083,71.617059
4,2151,1770,1242,1522,1564,1961,2495,2691,2696,2732,ID_ZHZRHO,6.980395,2021-07-05,139.093139,22.625533,40.935081,71.617177


In [5]:
import pandas as pd
import numpy as np
from sklearn.neighbors import BallTree
import matplotlib.pyplot as plt

# Read the three tables in full (no nrows limit on the Sentinel files)
print("Loading datasets...")
test = pd.read_csv('data/Test.csv')
sentinel1 = pd.read_csv('data/Sentinel1.csv')
sentinel2 = pd.read_csv('data/Sentinel2.csv')

# Report the (rows, columns) shape of each table
for label, table in (
    ("\nTest data shape:", test),
    ("Sentinel1 data shape:", sentinel1),
    ("Sentinel2 data shape:", sentinel2),
):
    print(label, table.shape)

# Distinct IDs per table, used to check whether test rows can be matched
# to the Sentinel tables directly on ID
test_ids = set(test['ID'].unique())
s1_ids = set(sentinel1['ID'].unique())
s2_ids = set(sentinel2['ID'].unique())

# IDs shared between the test table and each Sentinel table
test_in_s1 = test_ids & s1_ids
test_in_s2 = test_ids & s2_ids

print(f"\nTest IDs: {len(test_ids)}")
print(f"Sentinel1 unique IDs: {len(s1_ids)}")
print(f"Sentinel2 unique IDs: {len(s2_ids)}")
print(f"Test IDs found in Sentinel1: {len(test_in_s1)}")
print(f"Test IDs found in Sentinel2: {len(test_in_s2)}")

# Function to find nearest neighbors using BallTree with haversine distance
def find_nearest_neighbors(src_points, query_points, k=1, earth_radius_km=6371.0):
    """
    Find the k nearest source points for each query point using a BallTree
    with the haversine (great-circle) metric.

    Parameters:
    -----------
    src_points : array-like, shape (n_samples, 2)
        Source points as (latitude, longitude) pairs in degrees
    query_points : array-like, shape (n_queries, 2)
        Query points as (latitude, longitude) pairs in degrees
    k : int
        Number of nearest neighbors to find for every query point
    earth_radius_km : float, default 6371.0
        Sphere radius used to turn the angular distances (radians) returned
        by the tree into kilometres. The default is the mean Earth radius;
        pass another value to use a different sphere or unit.

    Returns:
    --------
    distances : array, shape (n_queries, k)
        Great-circle distances to the nearest neighbors, in the unit of
        ``earth_radius_km`` (km by default)
    indices : array, shape (n_queries, k)
        Positional indices into ``src_points`` of the nearest neighbors
    """
    # The haversine metric works on (lat, lon) pairs expressed in radians.
    # Coercing to a float array first lets lists and object-dtype input
    # through as well.
    src_points_rad = np.radians(np.asarray(src_points, dtype=float))
    query_points_rad = np.radians(np.asarray(query_points, dtype=float))

    # Build the spatial index over the source points
    tree = BallTree(src_points_rad, metric='haversine')

    # Angular distances (radians on the unit sphere) and source indices
    distances, indices = tree.query(query_points_rad, k=k)

    # Arc length = angle * radius
    distances = distances * earth_radius_km

    return distances, indices

# (lat, lon) coordinate matrices for the test points and both Sentinel tables
coord_cols = ['translated_lat', 'translated_lon']
test_coords = test[coord_cols].values
s1_coords = sentinel1[coord_cols].values
s2_coords = sentinel2[coord_cols].values

# Five closest Sentinel-1 rows for every test point
# NOTE(review): only column 0 (the closest neighbour) is consumed in this
# cell; the other four are computed but not used here.
print("\nFinding nearest Sentinel1 points for each test point...")
s1_distances, s1_indices = find_nearest_neighbors(s1_coords, test_coords, k=5)

# Same query against Sentinel-2
print("Finding nearest Sentinel2 points for each test point...")
s2_distances, s2_indices = find_nearest_neighbors(s2_coords, test_coords, k=5)

print("\nCreating combined test dataset...")

# Work on a copy so the original test table stays untouched, and record
# how far away the closest Sentinel row of each kind is
combined_test = test.copy()
combined_test['s1_distance_km'] = s1_distances[:, 0]
combined_test['s2_distance_km'] = s2_distances[:, 0]


def _attach_nearest(target, source, nearest_rows, prefix):
    """Copy the matched source rows onto `target` as `<prefix>_<column>` columns.

    The ID and coordinate columns of `source` are skipped. Returns the
    matched rows (re-indexed from 0) and the list of copied column names.
    """
    skipped = ['ID', 'translated_lat', 'translated_lon']
    matched = source.iloc[nearest_rows].reset_index(drop=True)
    feature_cols = matched.columns[~matched.columns.isin(skipped)].tolist()
    for name in feature_cols:
        # .values drops the index so assignment is purely positional
        target[f'{prefix}_{name}'] = matched[name].values
    return matched, feature_cols


# Attach the closest Sentinel-1 row, then the closest Sentinel-2 row
# NOTE(review): each test point receives ONE Sentinel row per sensor. If
# several rows sit at the same closest coordinates (presumably one per
# acquisition date - TODO confirm), which of them is picked is not
# controlled here.
s1_nearest, s1_cols = _attach_nearest(combined_test, sentinel1, s1_indices[:, 0], 's1')
s2_nearest, s2_cols = _attach_nearest(combined_test, sentinel2, s2_indices[:, 0], 's2')

# Persist the merged table
combined_test.to_csv('test_combined_spatial.csv', index=False)
print("\nCombined test set saved to 'test_combined_spatial.csv'")

Loading datasets...

Test data shape: (600, 4)
Sentinel1 data shape: (1752570, 9)
Sentinel2 data shape: (5610393, 17)

Test IDs: 600
Sentinel1 unique IDs: 600
Sentinel2 unique IDs: 600
Test IDs found in Sentinel1: 600
Test IDs found in Sentinel2: 600

Finding nearest Sentinel1 points for each test point...
Finding nearest Sentinel2 points for each test point...

Creating combined test dataset...

Combined test set saved to 'test_combined_spatial.csv'


In [6]:
# Preview the first five rows of the merged test table
combined_preview = combined_test.head(5)
print("\nCombined test set sample:")
display(combined_preview)


Combined test set sample:


Unnamed: 0,ID,location,translated_lat,translated_lon,s1_distance_km,s2_distance_km,s1_VH,s1_VV,s1_date,s1_orbit,s1_polarization,s1_rel_orbit,s2_B11,s2_B12,s2_B2,s2_B3,s2_B4,s2_B5,s2_B6,s2_B7,s2_B8,s2_B8A,s2_cloud_pct,s2_date,s2_solar_azimuth,s2_solar_zenith
0,ID_QAHFTR,Fergana,41.464538,71.767177,0.000186,0.001824,-16.391993,-9.285672,2022-05-26,DESCENDING,"[VV, VH]",78.0,2097,1250,350,550,446,1052,2671,3242,3326,3502,1.171564,2021-08-04,147.160594,27.128766
1,ID_XCKAFN,Fergana,41.081855,72.397795,0.00046,0.003308,-22.356335,-12.495313,2024-02-16,DESCENDING,"[VV, VH]",5.0,2369,2086,2108,2078,2120,2180,2225,2230,2286,2249,3.102715,2025-02-01,158.712271,60.749161
2,ID_AFQOFP,Fergana,41.652106,72.14447,0.000411,0.003238,-24.876382,-11.891748,2022-10-29,DESCENDING,"[VV, VH]",78.0,4221,3434,1520,2004,2556,3128,3370,3592,3574,3764,1.210448,2021-07-07,136.595029,23.482371
3,ID_VAUSIE,Fergana,41.36698,73.032185,0.000469,0.004242,-30.119432,-12.924009,2022-09-30,DESCENDING,"[VV, VH]",5.0,4117,3067,1542,2262,3210,3627,3587,3765,3780,3960,10.831764,2021-07-07,139.046555,22.880416
4,ID_YICCWP,Fergana,41.334641,71.52059,0.000267,0.005867,-24.002735,-14.260557,2022-12-28,DESCENDING,"[VV, VH]",78.0,2804,2060,383,783,709,1937,3286,3627,4284,3846,0.106552,2021-07-25,142.133433,25.573887
