In [1]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from shapely import wkt, wkb
import numpy as np

# Loading Ecoregion data

In [2]:
# Ecoregion polygons stored as a CSV whose "geometry" column holds WKT text.
ecoregions_path = r"C:\Users\matta\Desktop\Documents\Python\Geolocation\climate_data\working_data\ecoregions3.csv"

# Read the table, parse each WKT string into a shapely geometry, and wrap the result
# as a GeoDataFrame in one expression.
# NOTE(review): no CRS is passed here, so NA3.crs is None and later spatial joins cannot
# detect a coordinate-system mismatch - confirm the CSV's CRS and set it explicitly.
NA3 = gpd.GeoDataFrame(
    pd.read_csv(ecoregions_path).assign(
        geometry=lambda table: table["geometry"].apply(wkt.loads)
    ),
    geometry="geometry",
)

# Testing ecoregion coordinate join

In [3]:
# Eleven hand-picked [longitude, latitude, label] test points for checking the join.
# The last one ('error') has a positive longitude, presumably to exercise the
# no-match case - TODO confirm.
# NOTE(review): in the recorded output the 'E. Iowa' point joins to
# "9.3.3 Northwestern Great Plains", which suggests its label may be off - verify.
points = [
    [-112.0870977, 33.4942405, 'Phoenix'],
    [-111.5719556, 35.1842613, 'Flagstaff'],
    [-109.6450123, 33.958546, 'Greer'],
    [-111.6771445, 35.3498522, 'Mt. Humphrey'],
    [-105.5777349, 42.8788752, 'E. Iowa'],
    [-106.0264983, 39.6419848, 'Silverthorne'],
    [-90.3325451, 38.4984679, 'Eureka'],
    [-101.9310323, 34.8903525, 'Canyon'],
    [-91.6049041, 43.637534, 'S. Minn'],
    [-117.5877772, 33.6247486, 'S. California'],
    [10.1000, 33.333, 'error'],
]

# Tabulate the points with one column per field.
testdf = pd.DataFrame(
    points,
    columns=['long', 'lat', 'Location'],
)
testdf.head()

Unnamed: 0,long,lat,Location
0,-112.087098,33.49424,Phoenix
1,-111.571956,35.184261,Flagstaff
2,-109.645012,33.958546,Greer
3,-111.677144,35.349852,Mt. Humphrey
4,-105.577735,42.878875,E. Iowa


In [4]:
# Build the point layer from the coordinate columns only; the join result is read back
# by index, so no other testdf column needs to travel through it.
points_gdf = gpd.GeoDataFrame(
    testdf[["long", "lat"]],
    geometry=gpd.points_from_xy(testdf["long"], testdf["lat"]),
    crs=NA3.crs,  # tag the points with whatever CRS NA3 carries so both layers agree
)

# Left spatial join: every point is kept and picks up the keys of the polygon it lies
# within (NaN when it lies within none).
joined = gpd.sjoin(
    points_gdf,
    NA3[["NA_L1KEY", "NA_L2KEY", "NA_L3KEY", "geometry"]],
    how="left",
    predicate="within",
)

# sjoin returns one row per point/polygon match, so a point inside overlapping polygons
# would repeat its index label and the index-aligned assignment below could not line the
# rows up. Keep the first match per point.
joined = joined[~joined.index.duplicated(keep="first")]

# Copy the Level 1-3 keys back onto the test table (NaN if no match).
testdf[["Level_1", "Level_2", "Level_3"]] = joined[["NA_L1KEY", "NA_L2KEY", "NA_L3KEY"]]

In [5]:
# Show the first test rows together with the Level_1..Level_3 labels assigned by the join.
testdf.head()

Unnamed: 0,long,lat,Location,Level_1,Level_2,Level_3
0,-112.087098,33.49424,Phoenix,10 NORTH AMERICAN DESERTS,10.2 WARM DESERTS,10.2.2 Sonoran Desert
1,-111.571956,35.184261,Flagstaff,13 TEMPERATE SIERRAS,13.1 UPPER GILA MOUNTAINS,13.1.1 Arizona/New Mexico Mountains
2,-109.645012,33.958546,Greer,13 TEMPERATE SIERRAS,13.1 UPPER GILA MOUNTAINS,13.1.1 Arizona/New Mexico Mountains
3,-111.677144,35.349852,Mt. Humphrey,13 TEMPERATE SIERRAS,13.1 UPPER GILA MOUNTAINS,13.1.1 Arizona/New Mexico Mountains
4,-105.577735,42.878875,E. Iowa,9 GREAT PLAINS,9.3 WEST-CENTRAL SEMIARID PRAIRIES,9.3.3 Northwestern Great Plains


# Labeling data 

In [6]:
# Load the climate table that the rest of the notebook labels.
# NOTE(review): unpickling can execute arbitrary code - only load pickle files from a
# trusted source.
data_path = r"C:\Users\matta\Desktop\Documents\Python\Geolocation\climate_data\working_data\climate_data.pkl"
df = pd.read_pickle(data_path)
# Display (rows, columns) as a quick sanity check of what was loaded.
df.shape

(481631, 119)

In [7]:
# Peek at the two coordinate columns that the spatial joins are built from.
df[['latitude', 'longitude']].head()

Unnamed: 0,latitude,longitude
0,49.416667,-95.125
1,49.375,-95.166667
2,49.375,-95.125
3,49.375,-95.083333
4,49.375,-95.041667


In [8]:
# Build the point layer from the coordinate columns only. Only the matched keys are read
# back (by index), so carrying every df column through the join just copies data.
points_gdf = gpd.GeoDataFrame(
    df[["longitude", "latitude"]],
    geometry=gpd.points_from_xy(df["longitude"], df["latitude"]),
    crs=NA3.crs,  # tag the points with whatever CRS NA3 carries so both layers agree
)

# Left spatial join: every point is kept and picks up the keys of the polygon it lies
# within (NaN when it lies within none).
joined = gpd.sjoin(
    points_gdf,
    NA3[["NA_L1KEY", "NA_L2KEY", "NA_L3KEY", "geometry"]],
    how="left",
    predicate="within",
)

# sjoin returns one row per point/polygon match, so a point inside overlapping polygons
# would repeat its index label and the index-aligned assignment below could not line the
# rows up. Keep the first match per point.
joined = joined[~joined.index.duplicated(keep="first")]

# Write the matched keys back onto df under the notebook's Level_* column names.
df[["Level_1", "Level_2", "Level_3"]] = joined[["NA_L1KEY", "NA_L2KEY", "NA_L3KEY"]]

df.shape

(481631, 122)

In [9]:
# Non-null counts show how many points received Level 1-3 labels from the join.
df[["latitude", "longitude", "Level_1", "Level_2", "Level_3"]].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 481631 entries, 0 to 481630
Data columns (total 5 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   latitude   481631 non-null  float64
 1   longitude  481631 non-null  float64
 2   Level_1    475432 non-null  object 
 3   Level_2    475432 non-null  object 
 4   Level_3    475432 non-null  object 
dtypes: float64(2), object(3)
memory usage: 18.4+ MB


# Level 4 ecoregions

In [10]:
# Level 4 ecoregion polygons stored as Parquet, with geometries held as raw WKB values.
ecoregions4_path = r"C:\Users\matta\Desktop\Documents\Python\Geolocation\climate_data\working_data\ecoregions4.parquet"

# Read the table, decode each WKB value into a shapely geometry, and wrap the result as
# a GeoDataFrame in one expression.
# NOTE(review): no CRS is passed here, so NA4.crs is None - confirm the file's CRS and
# set it explicitly.
NA4 = gpd.GeoDataFrame(
    pd.read_parquet(ecoregions4_path, engine="pyarrow").assign(
        geometry=lambda table: table["geometry"].apply(
            lambda raw: wkb.loads(bytes(raw), hex=False)
        )
    ),
    geometry="geometry",
)
NA4.head(1)

Unnamed: 0,L1_KEY,L2_KEY,L3_KEY,L4_KEY,geometry
0,10 NORTH AMERICAN DESERTS,10.1 COLD DESERTS,10 Columbia Plateau,10a Channeled Scablands,"POLYGON ((-118.96463 47.97667, -118.9681 47.97..."


In [11]:
# Build the point layer from the coordinate columns only. Only the matched keys are read
# back (by index), so carrying every df column through the join just copies data.
points_gdf = gpd.GeoDataFrame(
    df[["longitude", "latitude"]],
    geometry=gpd.points_from_xy(df["longitude"], df["latitude"]),
    crs=NA4.crs,  # take the CRS from NA4, the layer actually being joined here
)

# Left spatial join: every point is kept and picks up the keys of the Level 4 polygon it
# lies within (NaN when it lies within none).
joined = gpd.sjoin(
    points_gdf,
    NA4[["L1_KEY", "L2_KEY", "L3_KEY", "L4_KEY", "geometry"]],
    how="left",
    predicate="within",
)

# sjoin returns one row per point/polygon match, so a point inside overlapping polygons
# would repeat its index label and the index-aligned assignment below could not line the
# rows up. Keep the first match per point.
joined = joined[~joined.index.duplicated(keep="first")]

# L1/L2/L3 are this layer's own upper-level keys, kept separate from the Level_1..Level_3
# columns that came from NA3.
df[["L1", "L2", "L3", "Level_4"]] = joined[["L1_KEY", "L2_KEY", "L3_KEY", "L4_KEY"]]

df.shape

(481631, 126)

In [12]:
# Spot-check one labelled row; the fixed seed keeps the displayed row the same on every run.
df[["latitude", "longitude", "Level_1", "Level_2", "Level_3", "L1", "L2", "L3", "Level_4"]].sample(random_state=0)

Unnamed: 0,latitude,longitude,Level_1,Level_2,Level_3,L1,L2,L3,Level_4
377926,34.541667,-100.333333,9 GREAT PLAINS,9.4 SOUTH CENTRAL SEMIARID PRAIRIES,9.4.3 Southwestern Tablelands,9 GREAT PLAINS,9.4 SOUTH CENTRAL SEMI-ARID PRAIRIES,26 Southwestern Tablelands,"26c Caprock Canyons, Badlands, and Breaks"


In [13]:
# Compare non-null counts between the NA3-derived labels and the NA4-derived labels.
df[["latitude", "longitude", "Level_1", "Level_2", "Level_3", "L1", "L2", "L3", "Level_4"]].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 481631 entries, 0 to 481630
Data columns (total 9 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   latitude   481631 non-null  float64
 1   longitude  481631 non-null  float64
 2   Level_1    475432 non-null  object 
 3   Level_2    475432 non-null  object 
 4   Level_3    475432 non-null  object 
 5   L1         471201 non-null  object 
 6   L2         471201 non-null  object 
 7   L3         471201 non-null  object 
 8   Level_4    471201 non-null  object 
dtypes: float64(2), object(7)
memory usage: 33.1+ MB


# WWF ecoregions

In [14]:
def load_geometry(val):
    """Parse one raw geometry value into a shapely geometry.

    ``str`` values are read as WKT text; ``bytes`` and ``memoryview`` values are
    converted to ``bytes`` and read as binary (non-hex) WKB. A value of any other
    type yields ``None``.
    """
    if isinstance(val, str):
        # Text-based WKT
        return wkt.loads(val)
    if isinstance(val, (bytes, memoryview)):
        # Binary WKB
        return wkb.loads(bytes(val), hex=False)
    # Anything else is passed through as "no geometry".
    return None

In [15]:
# WWF ecoregion table stored as CSV with a "geometry" column.
wwf_path = r"C:\Users\matta\Desktop\Documents\Python\Geolocation\climate_data\working_data\wwfecos.csv"

# Parse the geometry column while loading; load_geometry returns None for any value that
# is neither text nor bytes, so such rows end up without a geometry.
wwf = pd.read_csv(wwf_path).assign(
    geometry=lambda table: table["geometry"].apply(load_geometry)
)

# Wrap as a GeoDataFrame, declaring the coordinates as EPSG:4326.
gdf = gpd.GeoDataFrame(
    wwf,
    geometry="geometry",
    crs="EPSG:4326",
)
gdf.head(1)

Unnamed: 0.1,Unnamed: 0,OBJECTID,AREA,PERIMETER,ECO_NAME,REALM,BIOME,ECO_NUM,ECO_ID,ECO_SYM,...,G200_BIOME,G200_STAT,Shape_Leng,Shape_Area,area_km2,eco_code,PER_area,PER_area_1,PER_area_2,geometry
0,0,1,29.802942,0.219,Northern Mesoamerican Pacific mangroves,NT,14.0,4.0,61404.0,119.0,...,0.0,0.0,0.219475,0.002769,8174,NT1404,0.0,0.0,0.0,"POLYGON ((-112.26972 29.32648, -112.28809 29.3..."


In [16]:
# Subset of WWF columns to inspect: ecoregion attributes plus the geometry column.
wwf_cols = [
    'ECO_NAME',
    'REALM',
    'BIOME',
    'ECO_NUM',
    'ECO_ID',
    'ECO_SYM',
    'GBL_STAT',
    'G200_REGIO',
    'G200_NUM',
    'G200_BIOME',
    'G200_STAT',
    'eco_code',
    'geometry',
]

In [17]:
# Spot-check one WWF row; the fixed seed keeps the displayed row the same on every run.
gdf[wwf_cols].sample(random_state=0)

Unnamed: 0,ECO_NAME,REALM,BIOME,ECO_NUM,ECO_ID,ECO_SYM,GBL_STAT,G200_REGIO,G200_NUM,G200_BIOME,G200_STAT,eco_code,geometry
4598,Kalaallit Nunaat high arctic tundra,,11.0,12.0,51112.0,42.0,3.0,,0.0,0.0,0.0,NA1112,"POLYGON ((-22.5 72.44028, -22.54861 72.44486, ..."


In [18]:
# Keep only the ecoregion name and the geometry for the join.
# NOTE(review): this rebinds gdf, so any cell that indexes gdf with wwf_cols will fail
# if it is run after this one (most of those columns are gone).
gdf = gdf[['ECO_NAME', 'geometry']]

In [19]:
# Build the point layer from the coordinate columns only. Passing all of df would put its
# own ECO_NAME column into the join once this cell has run, so a re-run would get
# suffixed column names and the lookup of 'ECO_NAME' below would fail.
points_gdf = gpd.GeoDataFrame(
    df[["longitude", "latitude"]],
    geometry=gpd.points_from_xy(df["longitude"], df["latitude"]),
    crs="EPSG:4326",  # same CRS that was declared for the WWF layer
)

# Left spatial join: every point is kept and picks up the name of the WWF polygon it lies
# within (NaN when it lies within none).
joined = gpd.sjoin(points_gdf, gdf, how="left", predicate="within")

# sjoin returns one row per point/polygon match, so a point inside overlapping polygons
# would repeat its index label and the index-aligned assignment below could not line the
# rows up. Keep the first match per point.
joined = joined[~joined.index.duplicated(keep="first")]

df['ECO_NAME'] = joined['ECO_NAME']

df.shape

(481631, 127)

In [20]:
# Non-null counts per label column after adding the WWF ecoregion name.
df[["latitude", "longitude", "Level_1", "Level_2", "Level_3", "L1", "L2", "L3", "Level_4", "ECO_NAME"]].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 481631 entries, 0 to 481630
Data columns (total 10 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   latitude   481631 non-null  float64
 1   longitude  481631 non-null  float64
 2   Level_1    475432 non-null  object 
 3   Level_2    475432 non-null  object 
 4   Level_3    475432 non-null  object 
 5   L1         471201 non-null  object 
 6   L2         471201 non-null  object 
 7   L3         471201 non-null  object 
 8   Level_4    471201 non-null  object 
 9   ECO_NAME   475247 non-null  object 
dtypes: float64(2), object(8)
memory usage: 36.7+ MB


In [21]:
# Count the rows that have every one of these labels (rows with any missing label are
# dropped from this view only; df itself is not modified).
df[["latitude", "longitude", "Level_1", "Level_2", "Level_3", "L1", "L2", "L3", "Level_4", "ECO_NAME"]].dropna().info()

<class 'pandas.core.frame.DataFrame'>
Index: 470531 entries, 11 to 481597
Data columns (total 10 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   latitude   470531 non-null  float64
 1   longitude  470531 non-null  float64
 2   Level_1    470531 non-null  object 
 3   Level_2    470531 non-null  object 
 4   Level_3    470531 non-null  object 
 5   L1         470531 non-null  object 
 6   L2         470531 non-null  object 
 7   L3         470531 non-null  object 
 8   Level_4    470531 non-null  object 
 9   ECO_NAME   470531 non-null  object 
dtypes: float64(2), object(8)
memory usage: 39.5+ MB


# Köppen-Geiger climate classes

In [22]:
# Climate-class polygons stored as CSV with a "geometry" column.
kg_path = r"C:\Users\matta\Desktop\Documents\Python\Geolocation\climate_data\working_data\koppen_geiger.csv"

# Parse the geometry column while loading; load_geometry returns None for any value that
# is neither text nor bytes, so such rows end up without a geometry.
kg = pd.read_csv(kg_path).assign(
    geometry=lambda table: table["geometry"].apply(load_geometry)
)

# Wrap as a GeoDataFrame, declaring the coordinates as EPSG:4326.
kgdf = gpd.GeoDataFrame(
    kg,
    geometry="geometry",
    crs="EPSG:4326",
)
kgdf.head(1)

Unnamed: 0.1,Unnamed: 0,dn,climates_f,geometry
0,0,31.0,ET,"POLYGON ((-37 83.75, -36.75 83.75, -36.75 83.5..."


In [23]:
# Build the point layer from the coordinate columns only. Passing all of df would put its
# own climates_f/dn columns into the join once this cell has run, so a re-run would get
# suffixed column names and the lookup of those columns below would fail.
points_gdf = gpd.GeoDataFrame(
    df[["longitude", "latitude"]],
    geometry=gpd.points_from_xy(df["longitude"], df["latitude"]),
    crs="EPSG:4326",  # same CRS that was declared for the climate-class layer
)

# Left spatial join against just the columns that are read back: every point is kept and
# picks up the class of the polygon it lies within (NaN when it lies within none).
joined = gpd.sjoin(
    points_gdf,
    kgdf[["climates_f", "dn", "geometry"]],
    how="left",
    predicate="within",
)

# sjoin returns one row per point/polygon match, so a point inside overlapping polygons
# would repeat its index label and the index-aligned assignment below could not line the
# rows up. Keep the first match per point.
joined = joined[~joined.index.duplicated(keep="first")]

df[['climates_f', 'dn']] = joined[['climates_f', 'dn']]

df.shape

(481631, 129)

In [24]:
# Non-null counts per label column after adding the climate class.
# NOTE(review): in the recorded run climates_f has fewer non-null values than dn, so some
# matched polygons appear to carry a dn code without a climates_f label - verify the source.
df[["latitude", "longitude", "Level_1", "Level_2", "Level_3", "L1", "L2", "L3", "Level_4", "ECO_NAME", "climates_f", "dn"]].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 481631 entries, 0 to 481630
Data columns (total 12 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   latitude    481631 non-null  float64
 1   longitude   481631 non-null  float64
 2   Level_1     475432 non-null  object 
 3   Level_2     475432 non-null  object 
 4   Level_3     475432 non-null  object 
 5   L1          471201 non-null  object 
 6   L2          471201 non-null  object 
 7   L3          471201 non-null  object 
 8   Level_4     471201 non-null  object 
 9   ECO_NAME    475247 non-null  object 
 10  climates_f  480352 non-null  object 
 11  dn          481476 non-null  float64
dtypes: float64(3), object(9)
memory usage: 44.1+ MB


In [25]:
# Count the rows that have every one of these labels (rows with any missing label are
# dropped from this view only; df itself is not modified).
df[["latitude", "longitude", "Level_1", "Level_2", "Level_3", "L1", "L2", "L3", "Level_4", "ECO_NAME", "climates_f", "dn"]].dropna().info()

<class 'pandas.core.frame.DataFrame'>
Index: 470342 entries, 11 to 481565
Data columns (total 12 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   latitude    470342 non-null  float64
 1   longitude   470342 non-null  float64
 2   Level_1     470342 non-null  object 
 3   Level_2     470342 non-null  object 
 4   Level_3     470342 non-null  object 
 5   L1          470342 non-null  object 
 6   L2          470342 non-null  object 
 7   L3          470342 non-null  object 
 8   Level_4     470342 non-null  object 
 9   ECO_NAME    470342 non-null  object 
 10  climates_f  470342 non-null  object 
 11  dn          470342 non-null  float64
dtypes: float64(3), object(9)
memory usage: 46.6+ MB


In [28]:
# Shape of df after dropping rows with a missing value in ANY column (not just the
# label columns); the result is only displayed, df is left unchanged.
df.dropna().shape

(470342, 129)

# Saving dataframe

In [26]:
# Persist the labelled table. No dropna result is ever assigned back to df in this
# notebook, so rows with missing labels are written out too.
# NOTE(review): the execution counts show this cell ran before the df.dropna().shape
# check above it - re-run the notebook top to bottom before relying on the saved file.
df.to_pickle(r"C:\Users\matta\Desktop\Documents\Python\Geolocation\climate_data\working_data\labeled_climate_data.pkl")