In [2]:
# -----------------------------------------------------------------------------
# Load Data: Read CSV files and inspect data
# -----------------------------------------------------------------------------
import sys
sys.path.append('..')

from baseline.utilities import *

from pandarallel import pandarallel

pandarallel.initialize(progress_bar=False, nb_workers=8)


SENTINEL_TIFF_PATH = '../baseline/S2_sample.tiff' # './S2_sample_5res.tiff'
LANDSAT_TIFF_PATH = '../baseline/Landsat_LST.tiff'
MODE = 'submission'  # 'submission' 'train'

if MODE == 'train':
    ground_df = pd.read_csv("../baseline/Training_data_uhi_index.csv")
elif MODE == 'submission':
    ground_df = pd.read_csv("../baseline/Submission_template.csv")
else:
    raise ValueError("MODE should be either 'train' or 'submission")

display(ground_df[['Longitude', 'Latitude']].describe())

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


Unnamed: 0,Longitude,Latitude
count,1040.0,1040.0
mean,-73.934816,40.807991
std,0.028661,0.0232
min,-73.993163,40.758877
25%,-73.95703,40.790802
50%,-73.934618,40.809553
75%,-73.910655,40.823054
max,-73.879537,40.859243


In [3]:
# -----------------------------------------------------------------------------
# Feature Engineering: Explore the NY MESONET Weather data
# -----------------------------------------------------------------------------
ny_bronx_point = (40.87248, -73.89352)
ny_manhattan_point = (40.76754, -73.96449)

ny_mesonet_bronx_df = pd.read_excel('../baseline/NY_Mesonet_Weather.xlsx', sheet_name='Bronx')
ny_mesonet_manhattan_df = pd.read_excel('../baseline/NY_Mesonet_Weather.xlsx', sheet_name='Manhattan')

ny_mesonet_bronx_df['Date / Time'] = pd.to_datetime(ny_mesonet_bronx_df['Date / Time'])
ny_mesonet_manhattan_df['Date / Time'] = pd.to_datetime(ny_mesonet_manhattan_df['Date / Time'])

ny_mesonet_bronx_df = ny_mesonet_bronx_df.sort_values('Date / Time')
ny_mesonet_manhattan_df = ny_mesonet_manhattan_df.sort_values('Date / Time')

print(ny_mesonet_bronx_df.dtypes)
print(ny_mesonet_manhattan_df.dtypes)

AttributeError: 'NoneType' object has no attribute 'total_seconds'

In [None]:
# -----------------------------------------------------------------------------
# Filter Data: Select Weather Data from 3 PM to 4 PM
# -----------------------------------------------------------------------------

ny_mesonet_bronx_df = ny_mesonet_bronx_df[
    (ny_mesonet_bronx_df['Date / Time'].dt.hour == 15) |
    ((ny_mesonet_bronx_df['Date / Time'].dt.hour == 16) & (ny_mesonet_bronx_df['Date / Time'].dt.minute == 0))
].reset_index(drop=True)
display(ny_mesonet_bronx_df)

ny_mesonet_manhattan_df = ny_mesonet_manhattan_df[
    (ny_mesonet_manhattan_df['Date / Time'].dt.hour == 15) |
    ((ny_mesonet_manhattan_df['Date / Time'].dt.hour == 16) & (ny_mesonet_manhattan_df['Date / Time'].dt.minute == 0))
].reset_index(drop=True)
display(ny_mesonet_manhattan_df)

In [None]:
# -----------------------------------------------------------------------------
# Calculate Distances and Bearing: Distance to Bronx and Manhattan
# -----------------------------------------------------------------------------
ny_mesonet_features = ground_df[['Latitude', 'Longitude']].copy()
ny_mesonet_features.columns = ny_mesonet_features.columns.str.lower()

# Calculate the distance from each point to the Bronx and Manhattan
ny_mesonet_features['distance_bronx'] = ny_mesonet_features.parallel_apply(
    lambda x: distance_meters(x, ny_bronx_point),
    axis=1
)
ny_mesonet_features['distance_manhattan'] = ny_mesonet_features.parallel_apply(
    lambda x: distance_meters(x, ny_manhattan_point),
    axis=1
)

# Calculate the ratio of distances between Bronx and Manhattan
ny_mesonet_features['ratio_dist_bronx_manhattan'] = (
    ny_mesonet_features['distance_bronx'] / (
        ny_mesonet_features['distance_manhattan'] + ny_mesonet_features['distance_bronx']
    )
)
ny_mesonet_features['ratio_dist_manhattan_bronx'] = (
    ny_mesonet_features['distance_manhattan'] / (
        ny_mesonet_features['distance_manhattan'] + ny_mesonet_features['distance_bronx']
    )
)

In [None]:
# -----------------------------------------------------------------------------
# Weather Data Pivot: Process Wind and Speed Data
# -----------------------------------------------------------------------------
ny_mesonet_bm_df = pd.concat([ny_mesonet_bronx_df.assign(location='bronx'), ny_mesonet_manhattan_df.assign(location='manhattan')], axis=0)

ny_mesonet_bm_wind_dir_pivot = ny_mesonet_bm_df.pivot(
    index="location", 
    columns="Date / Time", 
    values="Wind Direction [degrees]"
)
ny_mesonet_bm_wind_dir_pivot.columns = [f"Wind Direction [degrees] {col}" for col in ny_mesonet_bm_wind_dir_pivot.columns]
ny_mesonet_bm_wind_dir_pivot = ny_mesonet_bm_wind_dir_pivot.reset_index(drop=False)

nymesonet_bm_avg_wind_speed_pivot = ny_mesonet_bm_df.pivot(
    index="location",
    columns="Date / Time",
    values="Avg Wind Speed [m/s]"
)
nymesonet_bm_avg_wind_speed_pivot.columns = [f"Avg Wind Speed [m/s] {col}" for col in nymesonet_bm_avg_wind_speed_pivot.columns]
nymesonet_bm_avg_wind_speed_pivot = nymesonet_bm_avg_wind_speed_pivot.reset_index(drop=False)

ny_mesonet_bm_df_pivot = pd.merge(
    ny_mesonet_bm_wind_dir_pivot, nymesonet_bm_avg_wind_speed_pivot, on="location"
)

display(ny_mesonet_bm_df_pivot) # .to_dict(orient='tight')

In [None]:
# -----------------------------------------------------------------------------
# Weather Influence: Calculate Wind Influence Based on Bearing
# -----------------------------------------------------------------------------
ny_mesonet_bm_dict = {
    "location": {
        row["location"]: {
            col : row[col]  # f"Wind Direction {col.split()[-1]}"
            for col in ny_mesonet_bm_df_pivot.columns if col != "location"
        }
        for _, row in ny_mesonet_bm_df_pivot.iterrows()
    }
}
print(json.dumps(ny_mesonet_bm_dict, indent=2))

for loc in ny_mesonet_bm_dict['location']:
    print(loc)
    for k, v in ny_mesonet_bm_dict['location'][loc].items():
        print(f"{k}: {v}")
        if k.startswith('Wind Direction'):    
            ny_mesonet_features[f"Wind Influence {k.split()[-1]} {loc}"] = ny_mesonet_features[f'bearing_{loc}'].parallel_apply(
                lambda x: np.cos(np.radians(v - x))
            )

In [None]:
# -----------------------------------------------------------------------------
# Final Adjustments: Clean Up and Save Data
# -----------------------------------------------------------------------------
# Clean up the feature columns and drop unnecessary columns
ny_mesonet_features.columns = [
    col.replace(' ', '_').lower() for col in ny_mesonet_features.columns
]

display(ny_mesonet_features)

# Drop columns that are no longer needed
ny_mesonet_features = ny_mesonet_features.drop(
    columns=['latitude', 'longitude', 'distance_bronx', 'distance_manhattan', 'ratio_dist_bronx_manhattan', 'ratio_dist_manhattan_bronx']
)

# Save the processed data to a parquet file
ny_mesonet_features.to_parquet(f'../pipeline/data/processed/{MODE}/ny_mesonet_features.parquet')