# Confesor_DataCleanupMerge (2023)
# Part of OEAS 895 Capstone Project
## This Jupyter Notebook takes available K.brevis HAB data from the WFS (FWRI & CMEMs), indexes/subests the data, combines them (replace NaN salinity/temperature values and adds nutrient columns), and then saves this combined data to a separate .csv file.

### Import Packages - Using Python Version 3.11.2 
#### (make sure ConfesorCapstoneEv is set as current kernel & all the files are in the same directory!)

In [1]:
#Python script with all of the necessary functions
import Confesor_Functions as CF

#For data editing
import pandas as pd
import numpy as np
import string
import csv

#Opening NETCDF (.nc) files
import xarray as xr
import os

#If the current working directory is not where you have saved all the necessary files, change the directory.
os.chdir('C:\\Users\\krist\\OneDrive\\Desktop\\Files\\ODU\\Spring2023\\OEAS895\\MLP_Classifier_KbrevisHABs')
os.getcwd()

'C:\\Users\\krist\\OneDrive\\Desktop\\Files\\ODU\\Spring2023\\OEAS895\\MLP_Classifier_KbrevisHABs'

### Clean & Index FWRI Data

In [2]:
# Download CSV & organize & clean data
Kbrevis_df=CF.DownloadCSV('Kbrevis_Abundance.csv','Kbrevis_df')
Kbrevis_df['BloomID'] = Kbrevis_df.apply(CF.BloomClassification, axis=1)
Kbrevis_df1= Kbrevis_df[Kbrevis_df['Depth_m']<0.51]
Kbrevis_df2= Kbrevis_df1[Kbrevis_df1['Date']<'2021-01-01']

# Save indexed Kbrevis_df data into a new csv file & open it for later
CF.SaveCSV(Kbrevis_df2,'Kb_Indexed.csv')
Kb_df=CF.DownloadCSV('Kb_Indexed.csv','Kb_df')

# Save Date as datetime & remove unnamed column
Kb_df[['Date']] = Kb_df[['Date']].apply(pd.to_datetime)
Kb_df=Kb_df.drop(columns= ['Unnamed: 0'])

Kb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25867 entries, 0 to 25866
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Date       25867 non-null  datetime64[ns]
 1   Depth_m    25867 non-null  float64       
 2   Latitude   25867 non-null  float64       
 3   Longitude  25867 non-null  float64       
 4   Kbrevis    25867 non-null  int64         
 5   Temp_C     16158 non-null  float64       
 6   Salinity   15047 non-null  float64       
 7   BloomID    25867 non-null  int64         
dtypes: datetime64[ns](1), float64(5), int64(2)
memory usage: 1.6 MB


In [3]:
#Time, Lat, Lon, and Depth are from Indexed K.brevis in situ data- matching up with CMEMS
Lat=CF.VariableCSV(Kb_df,'Latitude','Lat')
Lon=CF.VariableCSV(Kb_df,'Longitude','Lon')
Time=CF.VariableCSV(Kb_df,'Date','Time')
Depth=CF.VariableCSV(Kb_df,'Depth_m','Depth')
Kbrevis=CF.VariableCSV(Kb_df,'Kbrevis','Kbrevis')

#Turn into Arrays for matchup to work
LatArray=CF.Array('LatArray',Lat)
LonArray=CF.Array('LonArray',Lon)
TimeArray=CF.Array('TimeArray',Time)
DepthArray=CF.Array('DepthArray',Depth)

### Download & Matchup CMEM NETCDF Data to FWRI Data

In [4]:
#Download CMEM NETCDF Files, then match up with insitu time, lat,lon, and depth.

SalTemp_df=CF.DownloadNETCDF('SalTempTotal.nc','SalTemp_df')
SalTemp=CF.Matchup('SalTemp',SalTemp_df,LonArray,LatArray,TimeArray,DepthArray)
#SalTemp

Nutrient_df=CF.DownloadNETCDF('NutrientsTotal.nc','SalTemp_df')
Nutrient=CF.Matchup('Nutrient',Nutrient_df,LonArray,LatArray,TimeArray,DepthArray)
#Nutrient

#Assign the variables, for indexed NETCDF data
thetao = CF.VariableNETCDF(SalTemp,'thetao','thetao')
so = CF.VariableNETCDF(SalTemp,'so','so')
no3= CF.VariableNETCDF(Nutrient,'no3','no3')
po4 = CF.VariableNETCDF(Nutrient,'po4','po4')
si = CF.VariableNETCDF(Nutrient,'si','si')

#Marked data below is not needed for this particular model, but still is important to have on hand
#o2 = VariableNETCDF(Nutrient,'o2','o2')
#chl = VariableNETCDF(Nutrient,'chl','chl')

In [5]:
#Get # of samples (rows)
Count=Kbrevis.count()

#Grab only the variables of interest from indexed NETCDF files and put them into a dataframe
CMEMthetao=CF.NETCDFtoDataframe('Thetao',Count,thetao,'Thetao_df','Thetao','Type','arrthetao','CMEMthetao')
CMEMso=CF.NETCDFtoDataframe('So',Count,so,'so_df','So','Type','arrthetao','CMEMso')
CMEMno3=CF.NETCDFtoDataframe('NO3',Count,no3,'no3_df','NO3','Type','arrno3','CMEMno3')
CMEMpo4=CF.NETCDFtoDataframe('PO4',Count,po4,'po4_df','PO4','Type','arrpo4','CMEMpo4')
CMEMsi=CF.NETCDFtoDataframe('Si',Count,si,'si_df','Si','Type','arrsi','CMEMsi')

#Marked data below is not needed for this particular model, but still is important to have on hand
#CMEMo2=CF.NETCDFtoDataframe('O2',Count,o2,'o2_df','O2','Type','arro2','CMEMo2')
#CMEMchl=CF.NETCDFtoDataframe('Chl',Count,chl,'chl_df','chl','Type','arrchl','CMEMchl')

In [6]:
#Replace NaN values in insitu data with CMEM data as float variables
Temp=CF.Replace(Kb_df,'Temp_C',CMEMthetao,'Thetao')
Sal=CF.Replace(Kb_df,'Salinity',CMEMso,'So')

#Add Nutrients
NO3=CF.AddNutrient(Kb_df,'NO3',CMEMno3)
PO4=CF.AddNutrient(Kb_df,'PO4',CMEMpo4)
si=CF.AddNutrient(Kb_df,'Si',CMEMsi)

#Marked data below is not needed for this particular model, but still is important to have on hand
#O2=CF.AddNutrient(Kb_df,'O2',CMEMo2)
#chl=CF.AddNutrient(Kb_df,'chl',CMEMchl)

#Remove NaN in the entire file, then save to new CSV file
CF.DropNaN('FinalKb_df',Kb_df,'Kb_final.csv')

#What we will be using to train our MLP Model!
Kb_final=CF.DownloadCSV('Kb_final.csv','Kb_final')
Kb_final

Unnamed: 0.1,Unnamed: 0,Date,Depth_m,Latitude,Longitude,Kbrevis,Temp_C,Salinity,BloomID,NO3,PO4,Si
0,2,2015-01-19,0.5,27.613300,-82.739100,0,17.300000,34.240000,0,1.959385,0.002489,11.138027
1,3,2015-01-19,0.5,27.605500,-82.650800,0,17.200000,30.500000,0,1.959385,0.002489,11.138027
2,5,2015-01-19,0.5,27.495900,-82.652200,0,17.600000,32.720000,0,1.959385,0.002489,11.138027
3,6,2015-01-19,0.5,27.446020,-82.687980,0,17.400000,34.770000,0,1.959385,0.002489,11.138027
4,7,2015-01-19,0.5,27.538500,-82.740200,0,17.100000,34.680000,0,1.959385,0.002489,11.138027
...,...,...,...,...,...,...,...,...,...,...,...,...
10059,25861,2023-02-20,0.5,28.350340,-80.612980,0,20.500000,1.550000,0,0.020753,0.004036,3.892709
10060,25862,2023-02-20,0.5,26.416100,-82.410400,26000,21.280000,36.400000,1,0.982610,0.002495,14.416743
10061,25863,2023-02-20,0.5,26.330000,-82.228100,1891391,21.830000,35.720000,1,0.866491,0.002416,14.053338
10062,25865,2023-02-20,0.5,29.592800,-83.447100,0,13.708517,34.681232,0,2.500659,0.003386,14.317658
