In [4]:
import gc
import geopandas as gpd
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd 
import pickle
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from xgboost import XGBClassifier

In [5]:
# Root folders for the competition input data and for anything this notebook writes.
data_path = './kaggle/input/geolifeclef-2024'
output_path = './kaggle/working'

# All environmental raster folders share the same (doubled) parent directory,
# so build that prefix once and derive each raster family's folder from it.
environmental_raster_root = f'{data_path}/EnvironmentalRasters/EnvironmentalRasters'

climate_raster_path = f'{environmental_raster_root}/Climate'
elevation_raster_path = f'{environmental_raster_root}/Elevation'
human_raster_path = f'{environmental_raster_root}/Human Footprint'
land_cover_raster_path = f'{environmental_raster_root}/LandCover'
soil_grid_raster_path = f'{environmental_raster_root}/SoilGrids'

In [6]:
# Read the PA test survey metadata table.
pa_metadata_test_csv_filename = f"{data_path}/GLC24_PA_metadata_test.csv"
pa_metadata_test_df = pd.read_csv(pa_metadata_test_csv_filename)

# One-hot encode the 'country' and 'region' columns as integer indicator columns
# and attach them to the metadata row-by-row (both frames share the same index).
# The original 'country' and 'region' columns are kept alongside the indicators.
location_dummies_df = pd.get_dummies(pa_metadata_test_df[['country', 'region']], dtype='int')
pa_metadata_test_df = pa_metadata_test_df.merge(location_dummies_df,
                                                how='left',
                                                left_index=True,
                                                right_index=True)

In [7]:
# Load the pre-extracted environmental raster values for the PA test surveys.
# Each CSV is read into its own dataframe; they are combined in a later cell.

# Climate: long-term averages and the monthly series.
average_climate_raster_test_csv_filename = (
    f'{climate_raster_path}/Average 1981-2010/GLC24-PA-test-bioclimatic.csv'
)
average_climate_raster_test_df = pd.read_csv(average_climate_raster_test_csv_filename)

monthly_climate_raster_test_csv_filename = (
    f'{climate_raster_path}/Monthly/GLC24-PA-test-bioclimatic_monthly.csv'
)
monthly_climate_raster_test_df = pd.read_csv(monthly_climate_raster_test_csv_filename)

# Elevation.
pa_elevation_test_csv_filename = f'{elevation_raster_path}/GLC24-PA-test-elevation.csv'
pa_elevation_test_df = pd.read_csv(pa_elevation_test_csv_filename)

# Human footprint.
pa_human_test_csv_filename = f'{human_raster_path}/GLC24-PA-test-human_footprint.csv'
pa_human_test_df = pd.read_csv(pa_human_test_csv_filename)

# Land cover.
pa_land_cover_test_csv_filename = f'{land_cover_raster_path}/GLC24-PA-test-landcover.csv'
pa_land_cover_test_df = pd.read_csv(pa_land_cover_test_csv_filename)

# Soil grids.
pa_soil_grid_test_csv_filename = f'{soil_grid_raster_path}/GLC24-PA-test-soilgrids.csv'
pa_soil_grid_test_df = pd.read_csv(pa_soil_grid_test_csv_filename)

In [8]:
# Build one wide test dataframe by left-joining every raster table onto the
# survey metadata, keyed on 'surveyId'.

# Only the first `monthly_climate_end_idx` columns of the monthly climate table
# are kept. Per the original author's note this cuts the series off at the end
# of 2016 to avoid potential data leakage.
# NOTE(review): the cut is positional, so it relies on the CSV's column order.
monthly_climate_end_idx = 817

pa_test_df = pd.merge(pa_metadata_test_df.drop(columns=['areaInM2', 'geoUncertaintyInM']),
                      monthly_climate_raster_test_df.iloc[:, :monthly_climate_end_idx],
                      how='left',
                      on='surveyId')

# The remaining tables are joined in full, in this fixed order.
for raster_df in (average_climate_raster_test_df,
                  pa_elevation_test_df,
                  pa_human_test_df,
                  pa_land_cover_test_df,
                  pa_soil_grid_test_df):
    pa_test_df = pd.merge(pa_test_df,
                          raster_df,
                          how='left',
                          on='surveyId')

In [9]:
# Read the Landsat time-series CSVs for the test surveys, one file per band.
# They are reshaped and merged into the test dataframe in later cells.
landsat_test_path = f'{data_path}/PA-test-landsat_time_series'

blue_landsat_test_csv_filename = f'{landsat_test_path}/GLC24-PA-test-landsat_time_series-blue.csv'
blue_landsat_test_df = pd.read_csv(blue_landsat_test_csv_filename)

green_landsat_test_csv_filename = f'{landsat_test_path}/GLC24-PA-test-landsat_time_series-green.csv'
green_landsat_test_df = pd.read_csv(green_landsat_test_csv_filename)

red_landsat_test_csv_filename = f'{landsat_test_path}/GLC24-PA-test-landsat_time_series-red.csv'
red_landsat_test_df = pd.read_csv(red_landsat_test_csv_filename)

nir_landsat_test_csv_filename = f'{landsat_test_path}/GLC24-PA-test-landsat_time_series-nir.csv'
nir_landsat_test_df = pd.read_csv(nir_landsat_test_csv_filename)

swir1_landsat_test_csv_filename = f'{landsat_test_path}/GLC24-PA-test-landsat_time_series-swir1.csv'
swir1_landsat_test_df = pd.read_csv(swir1_landsat_test_csv_filename)

swir2_landsat_test_csv_filename = f'{landsat_test_path}/GLC24-PA-test-landsat_time_series-swir2.csv'
swir2_landsat_test_df = pd.read_csv(swir2_landsat_test_csv_filename)

In [10]:
def reindex_landsat_data(row, band, year_df, base_year=2017, steps_per_year=4, n_lags=68):
    """Cut a fixed-length, year-aligned window out of one Landsat row.

    The window start depends on the survey's year, so that after relabelling
    every survey exposes the same ``{band}_{lag}`` columns regardless of when
    it was recorded.

    Parameters
    ----------
    row : pandas.Series
        One row of a per-band Landsat table. It must expose ``surveyId`` as an
        attribute; the observations are addressed purely by position.
        NOTE(review): the ``+ 1`` offset presumably skips a leading id
        column -- confirm against the CSV layout.
    band : str
        Prefix used for the output labels (e.g. ``'red'`` -> ``'red_68'``).
    year_df : pandas.DataFrame
        Table with ``surveyId`` and ``year`` columns used to look up the
        survey's year. The first matching row is used.
    base_year : int, default 2017
        Year whose window starts at position 1.
    steps_per_year : int, default 4
        Number of positions the window shifts per year after ``base_year``
        (presumably quarterly observations -- confirm).
    n_lags : int, default 68
        Window length, i.e. the number of observations returned.

    Returns
    -------
    pandas.Series
        ``n_lags`` values labelled ``{band}_{n_lags}`` down to ``{band}_1``,
        in the same order as they appear in ``row``.

    Raises
    ------
    IndexError
        If ``row.surveyId`` is not present in ``year_df``.
    ValueError
        If the computed window does not lie fully inside ``row``.
    """
    year = year_df.loc[year_df.surveyId == row.surveyId, "year"].values[0]
    start_idx = (year - base_year) * steps_per_year + 1
    end_idx = start_idx + n_lags

    # A negative start would wrap around from the end of the row and can
    # still produce a slice of the right length, silently returning the wrong
    # observations. Fail loudly instead of relabelling misaligned data.
    if start_idx < 0 or end_idx > len(row):
        raise ValueError(
            f"Landsat window [{start_idx}:{end_idx}] for surveyId {row.surveyId} "
            f"(year {year}) falls outside the row of length {len(row)}"
        )

    # Explicit positional slicing; copy so relabelling never touches `row`.
    obs = row.iloc[start_idx:end_idx].copy()
    obs.index = [f"{band}_{lag}" for lag in range(n_lags, 0, -1)]
    return obs

In [11]:
# Band names and their dataframes, kept in matching order so they can be zipped.
landsat_bands = ['blue', 'green', 'red', 'nir', 'swir1', 'swir2']

landsat_df_list = [blue_landsat_test_df,
                   green_landsat_test_df,
                   red_landsat_test_df,
                   nir_landsat_test_df,
                   swir1_landsat_test_df,
                   swir2_landsat_test_df]

# For every band: re-align each survey's observations into '{band}_{lag}'
# columns, re-attach the survey id, and left-join onto the test dataframe.
for band, band_df in zip(landsat_bands, landsat_df_list):
    band_features_df = band_df.apply(reindex_landsat_data,
                                     axis=1,
                                     band=band,
                                     year_df=pa_metadata_test_df)

    # The re-aligned frame no longer carries the id column, so add it back
    # (aligned on the shared row index) to have a merge key.
    band_features_df = band_features_df.assign(surveyId=band_df['surveyId'])

    pa_test_df = pa_test_df.merge(band_features_df,
                                  how='left',
                                  on='surveyId')

In [13]:
# Restore the trained model from its pickle file in the working directory.
# Unpickling can execute arbitrary code, so only load files you trust.
with open("xgb_model.pkl", "rb") as model_file:
    model = pickle.load(model_file)

In [56]:
# Restore the companion model used to predict how many species to report per survey.
# Unpickling can execute arbitrary code, so only load files you trust.
with open("n_species_model.pkl", "rb") as n_species_model_file:
    n_species_model = pickle.load(n_species_model_file)

In [57]:
# Start from every column except the survey id and the raw categorical columns.
X_test = pa_test_df.drop(columns=['surveyId', 'region', 'country'])

# Align the columns with what the main model's booster expects: any expected
# feature absent from the test data is added as an all-zero column, then the
# frame is restricted to, and ordered by, the booster's feature names.
model_feature_names = model.get_booster().feature_names
missing_cols = list(set(model_feature_names).difference(X_test.columns))
X_test[missing_cols] = 0
X_test = X_test[model_feature_names]

# The species-count model takes its own subset/order of those same columns.
X_test_species = X_test[n_species_model.get_booster().feature_names]

In [43]:
# Score the aligned test features with the main model. The `predict_proba`
# output is wrapped into a DataFrame (one row per surveyId, one column per
# class) in a later cell.
y_test = model.predict_proba(X_test)

In [78]:
# Run the species-count model on its own feature subset. The raw output is
# scaled, rounded and cast to int in the next cell before being used as the
# per-survey number of species to keep.
n_species_pred = n_species_model.predict(X_test_species)

In [88]:
# Inflate the predicted species counts by 10% and round to whole numbers, then
# store them as an integer 'n_species' column indexed by surveyId.
# NOTE(review): nothing here enforces a minimum of 1, so a survey whose scaled
# prediction rounds to 0 will end up with an empty prediction string -- confirm
# that is intended.
n_species_scaled = (n_species_pred * 1.1).round()
n_species_df = pd.DataFrame(n_species_scaled,
                            index=pa_test_df.surveyId,
                            columns=['n_species']).astype(int)

In [44]:
# Wrap the probability matrix in a DataFrame: rows are labelled by surveyId and
# columns are numbered 1..n_classes.
# NOTE(review): this assumes output column i corresponds to species id i + 1 --
# verify against how the training labels were encoded.
n_classes = y_test.shape[1]
y_test_columns = list(range(1, n_classes + 1))
y_test = pd.DataFrame(y_test,
                      index=pa_test_df.surveyId,
                      columns=y_test_columns)

In [89]:
# Per-survey number of species to keep, addressable by surveyId label.
n_species_by_survey = n_species_df['n_species']


def _top_species_ids(survey_probs):
    """Return the space-separated ids of one survey's highest-scoring classes.

    Parameters
    ----------
    survey_probs : pandas.Series
        One row of `y_test`: values are scores, the index holds the column
        labels, and `.name` is the row's surveyId.

    Returns
    -------
    str
        The labels of the `n` largest scores, sorted ascending and joined by
        single spaces, where `n` is that survey's entry in
        `n_species_by_survey`. Empty string when `n` is 0.
    """
    # Label-based lookup. The previous `n_species_df.loc[id][0]` relied on an
    # integer key being treated as a position on a string-labelled Series,
    # which pandas has deprecated and removes in newer releases.
    n_keep = n_species_by_survey.loc[survey_probs.name]
    top_ids = survey_probs.nlargest(n_keep).index.sort_values()
    return ' '.join(top_ids.astype(str))


# One row per survey: the surveyId and its space-separated predicted ids.
prediction_df = (y_test.apply(_top_species_ids, axis=1)
                 .reset_index()
                 .rename({0: 'predictions'}, axis=1))

In [90]:
# Save the surveyId/predictions table as CSV in the current working directory;
# index=False leaves the DataFrame's row index out of the file.
# NOTE(review): `output_path` is defined near the top of the notebook but is
# not used here -- confirm the intended destination.
prediction_df.to_csv('test_predictions_n_species_model_adj.csv', index=False)

In [38]:
# prediction_df = y_test.stack()[y_test.stack() == 1]
# prediction_df = prediction_df.index.to_frame(index=False).rename({1: 'predictions'}, axis=1)
# prediction_df['predictions'] = prediction_df.predictions.astype(str)

# df = prediction_df.groupby('surveyId')['predictions'].transform(lambda x: ' '.join(x)).drop_duplicates()

# df = pd.merge(df,
#               prediction_df.drop('predictions', axis=1),
#               how='left',
#               left_index=True,
#               right_index=True)

# df = pd.merge(pa_test_df['surveyId'],
#               df,
#               how='left',
#               on='surveyId').fillna('')

# df.to_csv('test_predictions.csv', index=False)