In [1]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from shapely import wkt
import numpy as np

# Loading Ecoregion data

In [2]:
ecoregions_path = r"C:\Users\matta\Desktop\Documents\Python\Geolocation\climate_data\working_data\ecoregions3.csv"

In [3]:
NA3 = pd.read_csv(ecoregions_path)
NA3["geometry"] = NA3["geometry"].apply(wkt.loads)
NA3 = gpd.GeoDataFrame(NA3, geometry="geometry")

# Testing ecoregion coordinant join

In [4]:
#Create 10 points for to create dataframe for testing function
points = [[-112.0870977, 33.4942405, 'Phoenix'], [-111.5719556, 35.1842613, 'Flagstaff'], [-109.6450123, 33.958546, 'Greer'], 
[-111.6771445, 35.3498522, 'Mt. Humphrey'], [-105.5777349, 42.8788752, 'E. Iowa'], [-106.0264983, 39.6419848, 'Silverthorne'], 
[-90.3325451, 38.4984679, 'Eureka'], [-101.9310323, 34.8903525, 'Canyon'], [-91.6049041, 43.637534, 'S. Minn'], 
[-117.5877772, 33.6247486, 'S. California'], [10.1000, 33.333, 'error']]

#Make 11 points into pandas dataframe
testdf = pd.DataFrame(points, columns = ['long', 'lat', 'Location'])
testdf.head()

Unnamed: 0,long,lat,Location
0,-112.087098,33.49424,Phoenix
1,-111.571956,35.184261,Flagstaff
2,-109.645012,33.958546,Greer
3,-111.677144,35.349852,Mt. Humphrey
4,-105.577735,42.878875,E. Iowa


In [5]:
# Convert lat/long to a GeoDataFrame of points
points_gdf = gpd.GeoDataFrame(
    testdf,
    geometry=gpd.points_from_xy(testdf["long"], testdf["lat"]),
    crs=NA3.crs)   # use same CRS as NA3

# Spatial join: assigns polygon attributes to each point
joined = gpd.sjoin(points_gdf, NA3[["NA_L1KEY", "NA_L2KEY", "NA_L3KEY", "geometry"]], how="left", predicate="within")

# Now joined has Level 1–3 keys (NaN if no match)
testdf[["Level_1", "Level_2", "Level_3"]] = joined[["NA_L1KEY", "NA_L2KEY", "NA_L3KEY"]]

In [6]:
testdf

Unnamed: 0,long,lat,Location,Level_1,Level_2,Level_3
0,-112.087098,33.49424,Phoenix,10 NORTH AMERICAN DESERTS,10.2 WARM DESERTS,10.2.2 Sonoran Desert
1,-111.571956,35.184261,Flagstaff,13 TEMPERATE SIERRAS,13.1 UPPER GILA MOUNTAINS,13.1.1 Arizona/New Mexico Mountains
2,-109.645012,33.958546,Greer,13 TEMPERATE SIERRAS,13.1 UPPER GILA MOUNTAINS,13.1.1 Arizona/New Mexico Mountains
3,-111.677144,35.349852,Mt. Humphrey,13 TEMPERATE SIERRAS,13.1 UPPER GILA MOUNTAINS,13.1.1 Arizona/New Mexico Mountains
4,-105.577735,42.878875,E. Iowa,9 GREAT PLAINS,9.3 WEST-CENTRAL SEMIARID PRAIRIES,9.3.3 Northwestern Great Plains
5,-106.026498,39.641985,Silverthorne,6 NORTHWESTERN FORESTED MOUNTAINS,6.2 WESTERN CORDILLERA,6.2.14 Southern Rockies
6,-90.332545,38.498468,Eureka,8 EASTERN TEMPERATE FORESTS,8.3 SOUTHEASTERN USA PLAINS,8.3.2 Interior River Valleys and Hills
7,-101.931032,34.890352,Canyon,9 GREAT PLAINS,9.4 SOUTH CENTRAL SEMIARID PRAIRIES,9.4.1 High Plains
8,-91.604904,43.637534,S. Minn,8 EASTERN TEMPERATE FORESTS,8.1 MIXED WOOD PLAINS,8.1.5 Driftless Area
9,-117.587777,33.624749,S. California,11 MEDITERRANEAN CALIFORNIA,11.1 MEDITERRANEAN CALIFORNIA,"11.1.1 California Coastal Sage, Chaparral, an..."


# Labeling data 

In [7]:
data_path = r"C:\Users\matta\Desktop\Documents\Python\Geolocation\climate_data\working_data\climate_data.pkl"
df = pd.read_pickle(data_path)
df.shape

(481631, 119)

In [8]:
df[['latitude', 'longitude']].head()

Unnamed: 0,latitude,longitude
0,49.416667,-95.125
1,49.375,-95.166667
2,49.375,-95.125
3,49.375,-95.083333
4,49.375,-95.041667


In [9]:
# Convert lat/long to a GeoDataFrame of points
points_gdf = gpd.GeoDataFrame(
    df,
    geometry=gpd.points_from_xy(df["longitude"], df["latitude"]),
    crs=NA3.crs)   # use same CRS as NA3

# Spatial join: assigns polygon attributes to each point
joined = gpd.sjoin(points_gdf, NA3[["NA_L1KEY", "NA_L2KEY", "NA_L3KEY", "geometry"]], how="left", predicate="within")

df[["Level_1", "Level_2", "Level_3"]] = joined[["NA_L1KEY", "NA_L2KEY", "NA_L3KEY"]]

df.shape

(481631, 122)

In [10]:
df[["latitude", "longitude", "Level_1", "Level_2", "Level_3"]].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 481631 entries, 0 to 481630
Data columns (total 5 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   latitude   481631 non-null  float64
 1   longitude  481631 non-null  float64
 2   Level_1    475432 non-null  object 
 3   Level_2    475432 non-null  object 
 4   Level_3    475432 non-null  object 
dtypes: float64(2), object(3)
memory usage: 18.4+ MB


In [11]:
df[["latitude", "longitude", "Level_1", "Level_2", "Level_3"]].sample()

Unnamed: 0,latitude,longitude,Level_1,Level_2,Level_3
317693,36.833333,-107.833333,10 NORTH AMERICAN DESERTS,10.1 COLD DESERTS,10.1.6 Colorado Plateaus


In [12]:
df.to_pickle(r"C:\Users\matta\Desktop\Documents\Python\Geolocation\climate_data\working_data\labeled_climate_data.pkl")