Preparation of geospatial data and matching to waterbody and fish data
准备地理空间数据，并匹配到水体和鱼类数据中

In [None]:
import sys
sys.path.append('..')
import pandas as pd
import numpy as np
import myfunction as mf
# Input locations on the local OneDrive copy.
# path_data_raw holds the variable metadata table; path_one_spdb holds the
# id lookup tables (posname.csv, class.csv, ...) read further down.
path_data_raw = 'C:/Users/dell/OneDrive/file/'
path_country_nc = 'C:/Users/dell/OneDrive/file/nc'
path_one_spdb = "C:/Users/dell/OneDrive/file/SPDB/"
# Drive prefix shared by all working directories below.
drive_letter = "E:"

# Working directories: pre-treated tables, matched outputs, raw NetCDF
# variables, and the per-year global geo tables.
path_pre = f"{drive_letter}/wyy/code_project/running_outcome/final_data/SPDB/part0_treat/pretreatment/"
path_match = f"{drive_letter}/wyy/code_project/running_outcome/final_data/SPDB/part0_treat/match/"
path_var_data = f"{drive_letter}/wyy/SPDB_database/data/raw/"
path_match_geo = f"{drive_letter}/wyy/code_project/running_outcome/final_data/SPDB/part0_treat/match/geo/"

# Metadata file name and the suffix of its 'var_select<mark_num>' column.
meta_name = 'meta_data.csv'
mark_num = '25'
list_color = [
    "#ee877c",
    "#8bd0e3",
    "#6abeae",
    "#808eaf",
    "#f7bba8",
    "#acb4cc",
    "#b5e0d5",
    "#e86462",
    "#a89687",
]

## GEO-DATA-PREPARATION

In [None]:
# Generate the required data tables
# 生成需要的数据表格

import pandas as pd
import numpy as np


# Integer axes: latitude -90..89, longitude -180..179, years 2000..2020.
lat_range = np.arange(-90, 90, 1)
lon_range = np.arange(-180, 180, 1)
year_range = np.arange(2000, 2021, 1)

# Full lat x lon x year product. With 'ij' indexing and a C-order ravel the
# latitude varies slowest and the year fastest.
lat_3d, lon_3d, year_3d = np.meshgrid(lat_range, lon_range, year_range, indexing='ij')
df1 = pd.DataFrame({
    'lat_grid': lat_3d.ravel(),
    'lon_grid': lon_3d.ravel(),
    'year': year_3d.ravel(),
})

# Same product without the year axis.
lat_2d, lon_2d = np.meshgrid(lat_range, lon_range, indexing='ij')
df2 = pd.DataFrame({
    'lat_grid': lat_2d.ravel(),
    'lon_grid': lon_2d.ravel(),
})

for grid_table in (df1, df2):
    print(grid_table.shape)

df1.to_csv(path_match + "lat_lon_year.csv", index=False)
df2.to_csv(path_match + "lat_lon.csv", index=False)


In [None]:
# Pre-processing of geospatial variables
# 预处理地理空间变量
import os
import pandas as pd
import numpy as np
import xarray as xr


# ——————————————————————————————————————————————————————————————————
# Run settings: grid size used in directory and file names, and the data flavour.
int_grid = 1
data_type = "raw"

# NetCDF inputs are split into a per-year folder and a time-invariant folder.
grid_root = path_var_data + str(int_grid)
nc_dir_year = grid_root + "\\year"
nc_dir_one = grid_root + "\\one"

# Keep the metadata rows that are NetCDF variables and are flagged in the
# 'var_select<mark_num>' column, then turn their names into file names.
df_meta = pd.read_csv(path_data_raw + meta_name, encoding="utf-8")
is_nc = df_meta['file_type'] == 'nc'
is_selected = df_meta['var_select' + mark_num] == 1
nc_suffix = "_" + str(int_grid) + "x" + str(int_grid) + ".nc"
list_var = [var + nc_suffix for var in df_meta.loc[is_nc & is_selected, 'var_name'].tolist()]


def dataset_to_dataframe(ds, var_name, index_vars):
    """
    Flatten one variable of an xarray Dataset into an indexed pandas DataFrame.

    :param ds: xarray Dataset that holds the variable
    :param var_name: name of the variable to pull out of ``ds``
    :param index_vars: list of column names that become the index of the result
    :return: pandas DataFrame indexed by ``index_vars``
    """
    flat = ds[var_name].to_dataframe()
    # Move the original index levels into columns so any of them can be
    # chosen as the new index.
    flat = flat.reset_index()
    return flat.set_index(index_vars)

def process_nc_dir_one(nc_dir_one, list_var, df_geo, int_grid):
    """
    Left-merge the selected NetCDF variables found in ``nc_dir_one`` onto ``df_geo``.

    Every file in ``nc_dir_one`` whose name is listed in ``list_var`` is opened,
    its first data variable is flattened with ``dataset_to_dataframe`` and merged
    onto ``df_geo`` by matching (lon_grid, lat_grid) against the file's (lon, lat).
    The added column is named after the file, with the ``_<g>x<g>.nc`` suffix
    (``g`` = ``int_grid``) removed.

    :param nc_dir_one: directory containing the NetCDF files
    :param list_var: file names to process
    :param df_geo: DataFrame with 'lon_grid' and 'lat_grid' columns
    :param int_grid: grid size used in the file-name suffix
    :return: ``df_geo`` with the columns of each processed file merged in
    """
    suffix = "_" + str(int_grid) + "x" + str(int_grid) + ".nc"
    for filename in os.listdir(nc_dir_one):
        if filename not in list_var:
            continue
        with xr.open_dataset(os.path.join(nc_dir_one, filename)) as ds:
            # Only the first data variable of the file is used.
            var_name = [var for var in ds.data_vars][0]
            column_name = filename.replace(suffix, "")
            # Convert the Dataset to a DataFrame
            df_nc = dataset_to_dataframe(ds, var_name, ['lon', 'lat'])
            # Rename on the incoming table, before the merge. Renaming afterwards
            # fails silently when df_geo already has a column called var_name:
            # the merge then emits '<var>_x'/'<var>_y' and the rename matches nothing.
            df_nc = df_nc.rename(columns={var_name: column_name})
            # Merge the DataFrame with df_geo
            df_geo = df_geo.merge(df_nc, left_on=['lon_grid', 'lat_grid'], right_index=True, how='left')
    return df_geo

def process_nc_dir_year(nc_dir_year, list_var, df_geo, int_grid):
    """
    Left-merge the selected yearly NetCDF variables found in ``nc_dir_year`` onto ``df_geo``.

    Every file in ``nc_dir_year`` whose name is listed in ``list_var`` is opened,
    sorted by lon, lat and year, and its first data variable is flattened with
    ``dataset_to_dataframe``. The result is merged onto ``df_geo`` by matching
    (lon_grid, lat_grid, year) against the file's (lon, lat, year). The added
    column is named after the file, with the ``_<g>x<g>.nc`` suffix
    (``g`` = ``int_grid``) removed.

    :param nc_dir_year: directory containing the NetCDF files
    :param list_var: file names to process
    :param df_geo: DataFrame with 'lon_grid', 'lat_grid' and 'year' columns
    :param int_grid: grid size used in the file-name suffix
    :return: ``df_geo`` with the columns of each processed file merged in
    """
    suffix = "_" + str(int_grid) + "x" + str(int_grid) + ".nc"
    for filename in os.listdir(nc_dir_year):
        if filename not in list_var:
            continue
        with xr.open_dataset(os.path.join(nc_dir_year, filename)) as ds:
            ds = ds.sortby('lon')
            ds = ds.sortby('lat')
            ds = ds.sortby('year')
            # Only the first data variable of the file is used.
            var_name = [var for var in ds.data_vars][0]
            column_name = filename.replace(suffix, "")
            # Convert the Dataset to a DataFrame
            df_nc = dataset_to_dataframe(ds, var_name, ['lon', 'lat', 'year'])
            # Rename on the incoming table, before the merge. Renaming afterwards
            # fails silently when df_geo already has a column called var_name:
            # the merge then emits '<var>_x'/'<var>_y' and the rename matches nothing.
            df_nc = df_nc.rename(columns={var_name: column_name})
            # Merge the DataFrame with df_geo
            df_geo = df_geo.merge(df_nc, left_on=['lon_grid', 'lat_grid', 'year'], right_index=True, how='left')
    return df_geo


def replace_nan_in_array(array):
    """Return ``array`` with NaN entries replaced by 0 (used cell by cell below)."""
    return np.where(np.isnan(array), 0, array)


# The grid and the time-invariant variables do not depend on the year, so they
# are read and merged once here instead of once per loop iteration.
df_grid = pd.read_csv(path_match + "lat_lon.csv")
df_grid = df_grid.drop_duplicates(subset=['lat_grid', 'lon_grid'])
n_grid_cols = df_grid.shape[1]
df_static = process_nc_dir_one(nc_dir_one, list_var, df_grid, int_grid)

for year in range(2000, 2021):
    print(year)
    df_geo = df_static.copy()
    # Put 'year' directly after the grid columns, i.e. where it sat when it was
    # assigned before the static merge, so the CSV column order is unchanged.
    df_geo.insert(n_grid_cols, 'year', year)
    # Diagnostic output; the static variable columns are already present here.
    print(df_geo.columns)
    print(df_geo.shape)

    df_geo = process_nc_dir_year(nc_dir_year, list_var, df_geo, int_grid)
    if data_type == "normalization" or data_type == "standardization":
        for column in df_geo.columns:
            if column not in ['lat_grid', 'lon_grid', 'year', 'country_id']:
                # Build the mask once and reuse it for the test and the assignment.
                out_of_range = df_geo[column].lt(0) | df_geo[column].gt(1) | df_geo[column].lt(1e-20)
                if out_of_range.any():
                    print(f'Column {column} has values out of range [0, 1] or less than 1e-20. Replacing these values with 0.')
                    df_geo.loc[out_of_range, column] = 0
    df_geo.fillna(0, inplace=True)

    # NOTE(review): DataFrame.applymap is deprecated since pandas 2.1 in favour
    # of DataFrame.map; kept because no pandas version is pinned in this file.
    df_geo2 = df_geo.applymap(replace_nan_in_array)
    df_geo2 = df_geo2.fillna(0)
    df_geo2 = df_geo2.replace(["nan", "NaN", "NAN", "", None], 0)
    # If values of type numpy.ndarray survive the two replacements above, the
    # cells most likely hold arrays rather than scalars.
    # Diagnostic only: list the columns whose mean is above 1 or exactly 0.
    column_means = df_geo2.mean()
    filtered_columns = column_means[(column_means > 1) | (column_means == 0)]
    columns_list = filtered_columns.index.tolist()
    print(columns_list)
    df_geo2.to_csv(path_match_geo + 'geo_global_'+str(year)+'.csv', encoding='utf-8-sig', index=False)



In [None]:
# file merging
# 文件合并
import os
import pandas as pd


# Read the per-year tables from the directory the export loop writes them to
# (path_match_geo) instead of a second, hard-coded copy of that path, so a
# change of drive_letter cannot make the two cells disagree.
folder_path = path_match_geo

# os.listdir returns names in arbitrary order; sort them so the row order of
# the merged table is reproducible.
file_names = sorted(file for file in os.listdir(folder_path) if file.endswith('.csv'))

dfs = [pd.read_csv(os.path.join(folder_path, file)) for file in file_names]
merged_df = pd.concat(dfs, ignore_index=True)

# The merged table goes one level up, next to the other matched outputs, so it
# is not picked up as an input when this cell is run again.
output_file = path_match + 'geo_global.csv'

merged_df.to_csv(output_file, index=False)




## GEO-DATA

In [None]:
# Match the data related to the sampled coordinates first, for quick use later.
# 先把与采样坐标相关的数据匹配出来，方便后续快速使用
import os
import pandas as pd
import numpy as np
import xarray as xr

# 读取csv文件

# Columns that identify one sampled grid cell in one year.
key_cols = ['lat_grid', 'lon_grid', 'year']

df = pd.read_csv(path_pre + "lr_grid.csv")
df2 = pd.read_csv(path_pre + "sw_grid.csv")

# Run settings: grid size used in directory and file names, and the data flavour.
int_grid = 1
data_type = "raw"

# NOTE(review): only lr_grid.csv gets the 'time_year' -> 'year' rename;
# sw_grid.csv is assumed to already carry a 'year' column - confirm.
df = df.rename(columns={'time_year': 'year'})

df = df[key_cols].drop_duplicates(subset=key_cols)
df2 = df2[key_cols].drop_duplicates(subset=key_cols)

# Union of both coordinate sets, de-duplicated again across the two sources.
df_geo = pd.concat([df, df2], axis=0).drop_duplicates(subset=key_cols)

print(df_geo.columns)
print(df_geo.shape)


grid_root = path_var_data + str(int_grid)
nc_dir_year = grid_root + "\\year"
nc_dir_one = grid_root + "\\one"
# File-name suffix that is stripped to obtain the output column name.
nc_suffix = "_" + str(int_grid) + "x" + str(int_grid) + ".nc"

# Handling of nc files in the one folder: nearest-neighbour lookup of the
# first data variable at every sampled (lon, lat).
for filename in os.listdir(nc_dir_one):
    if not filename.endswith(".nc"):
        continue
    with xr.open_dataset(os.path.join(nc_dir_one, filename)) as ds:
        var_name = [var for var in ds.data_vars][0]
        column_name = filename.replace(nc_suffix, "")
        df_geo[column_name] = [
            ds[var_name].sel(lon=row.lon_grid, lat=row.lat_grid, method='nearest').values
            for row in df_geo.itertuples()
        ]

# Handling of nc files in the year folder: same lookup, additionally matched
# on the sampling year.
for filename in os.listdir(nc_dir_year):
    if not filename.endswith(".nc"):
        continue
    print(filename)
    with xr.open_dataset(os.path.join(nc_dir_year, filename)) as ds:
        ds = ds.sortby('lon').sortby('lat').sortby('year')
        var_name = [var for var in ds.data_vars][0]
        column_name = filename.replace(nc_suffix, "")
        df_geo[column_name] = [
            ds[var_name].sel(lon=row.lon_grid, lat=row.lat_grid, year=row.year, method='nearest').values
            for row in df_geo.itertuples()
        ]


df_geo.fillna(0, inplace=True)


def replace_nan_in_array(array):
    """Return ``array`` with NaN entries replaced by 0."""
    nan_mask = np.isnan(array)
    return np.where(nan_mask, 0, array)


# Cell-wise NaN replacement, followed by a second pass for missing values
# and for textual NaN markers / empty strings / None.
df_geo2 = (
    df_geo.applymap(replace_nan_in_array)
    .fillna(0)
    .replace(["nan", "NaN", "NAN", "", None], 0)
)

# Columns whose mean is above 1 or exactly 0 (computed but not used further here).
column_means = df_geo2.mean()
suspicious = column_means.gt(1) | column_means.eq(0)
filtered_columns = column_means[suspicious]
columns_list = filtered_columns.index.tolist()

df_geo2.to_csv(path_match + 'geo_all.csv', encoding='utf-8-sig', index=False)



## FISH-DATA

In [None]:
# Matching geospatial variables to fish data
# 为鱼类数据匹配地理空间变量
import os
import pandas as pd
import numpy as np
import xarray as xr
import myfunction as mf
from scipy import stats
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler


df = pd.read_csv(path_pre + "lr_grid_avg.csv")

# Run settings: grid size and data flavour.
int_grid = 1
data_type = "raw"

df = df.rename(columns={'time_year': 'year'})
# Rows without a measured value are dropped.
df = df.dropna(subset=['value'])

# mf.append_inf is a project helper; presumably it appends the listed fields
# ('poid' from the 'pon' table) to df - verify against myfunction.
df = mf.append_inf(df, {'pon': ['poid']})

# Swap the textual 'posname' for the 'id' given in the lookup table.
posname_df = pd.read_csv(path_one_spdb + 'posname.csv').drop(columns=["50%"])
df = df.merge(posname_df, how='left', on='posname')
df = df.drop('posname', axis=1).rename(columns={'id': 'posname'})

keep_cols = ['lat_grid', 'lon_grid', 'habitat', 'organ', 'year', 'poid', "spid", "value", "posname"]
df = df[keep_cols]

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
print(df.shape)

# Fields requested from the 'po' and 'sp' tables in the second append_inf call.
dict_inf1 = {
    "po": [
        "po_carbon", "po_f_carbon", "po_classification", "po_chain", "po_m_w", "po_xlogp",
        "log_Koc", "log_Kow", "log_Kaw", "log_Koa_dry", "log_Koa_wet", "log_KHxd_air",
        "log_Koil_w", "log_Koil_air", "density", "melting_point", "boiling_point", "solubility",
    ],
    "sp": ["length_last", "weight_last", "troph_last", "class", "order", "family", "genus"],
}
df["spid"] = df["spid"].astype(int)
df_imputer = mf.append_inf(df, dict_inf1)
print(df_imputer.shape)

# 'poid' is dropped once the second append_inf call has run.
df_imputer = df_imputer.drop(columns=["poid"])
# ----------------------------------------------
if data_type == "raw":
    # (taxonomic column, lookup file) pairs in the order the merges are applied.
    # Each lookup has an 'id' column that replaces the textual value, and a
    # '50%' column that is discarded.
    taxon_lookups = [
        ('class', 'class.csv'),
        ('family', 'family.csv'),
        ('order', 'order.csv'),
        ('genus', 'genus.csv'),
    ]
    for taxon, lookup_file in taxon_lookups:
        lookup = pd.read_csv(path_one_spdb + lookup_file).drop(columns=["50%"])
        df_imputer = df_imputer.merge(lookup, how='left', on=taxon)
        df_imputer = df_imputer.drop(taxon, axis=1).rename(columns={'id': taxon})

    # Report how many rows lack each of the three species traits.
    print(df_imputer[['length_last', 'weight_last', 'troph_last']].isnull().sum())
    df_imputer.to_csv(path_match + "lr_all_avg.csv", index=False)

df_all = pd.read_csv(path_match + "lr_all_avg.csv")
print(df_all.shape)

# Class id 0 - presumably the fish class, given the output file name; verify
# against class.csv. Only liver, muscle and whole-body samples are kept.
list_organ = ['liver', 'muscle', 'whole']
is_class_zero = df_all['class'] == 0
df_fish = df_all[is_class_zero]
df_fish = df_fish[df_fish['organ'].isin(list_organ)]
print(df_fish.shape)
df_fish.to_csv(path_match + "lr_fish_avg.csv", index=False)


## WATER-DATA

In [None]:
# Matching geospatial variables to water data
# 为水体数据匹配地理空间变量
import os
import pandas as pd
import numpy as np
import xarray as xr
import myfunction as mf
from scipy import stats
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
# 读取csv文件

df = pd.read_csv(path_pre + "sws7_grid_1.csv")

# mf.append_inf is a project helper; presumably it appends the listed fields
# ('poid' from the 'pon' table) to df - verify against myfunction.
df = mf.append_inf(df, {'pon': ['poid']})

# Swap the textual 'posname' for the 'id' given in the lookup table.
posname_df = pd.read_csv(path_one_spdb + 'posname.csv').drop(columns=["50%"])
df = df.merge(posname_df, how='left', on='posname')
df = df.drop('posname', axis=1).rename(columns={'id': 'posname'})

keep_cols = ['lat_grid', 'lon_grid', 'year', 'poid', "value", "posname"]
df = df[keep_cols]

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
print(df.shape)

# Fields requested from the 'po' table in the second append_inf call.
dict_inf1 = {
    "po": [
        "po_carbon", "po_f_carbon", "po_classification", "po_chain", "po_m_w", "po_xlogp",
        "log_Koc", "log_Kow", "log_Kaw", "log_Koa_dry", "log_Koa_wet", "log_KHxd_air",
        "log_Koil_w", "log_Koil_air", "density", "melting_point", "boiling_point", "solubility",
    ],
}

df_imputer = mf.append_inf(df, dict_inf1)
# 'poid' is dropped once the second append_inf call has run.
df_imputer = df_imputer.drop(columns=["poid"])

df_imputer.to_csv(path_match + "sw_s7_avg.csv", index=False)


## FISH-WATER

In [None]:
# Obtaining spatial and temporal overlap data for water and fish
# 获取水和鱼时空重叠数据
df_sp = pd.read_csv(path_match + "lr_fish_avg.csv")
df_sw = pd.read_csv(path_match + "sw_s7_avg.csv")
print("sp-shape", df_sp.shape)
print("sw-shape", df_sw.shape)

# A fish record and a water record overlap when they share grid cell,
# 'posname' id and year.
match_keys = ['lat_grid', 'lon_grid', 'posname', 'year']

# Rename the water value so it does not clash with the fish 'value' column.
df_sw = df_sw.rename(columns={'value': 'sw_value'})
water_values = df_sw[match_keys + ['sw_value']]

df_merged = pd.merge(df_sp, water_values, on=match_keys, how='left')

# Keep only the fish rows for which a water value was found.
has_water_value = df_merged["sw_value"].notna()
df_merged = df_merged[has_water_value]
print("sw-sp", df_merged.shape)
print(mf.col_describe(df_merged,'posname'))

df_merged.to_csv(path_match + "lr_sws7_fish_avg.csv", index=False)
