# Notebook for exploring the EEL data at specific depth levels


In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Show floating-point values with 4 decimal places when pandas objects are displayed
pd.options.display.precision = 4

## 1. Load data into Dataframe

In [36]:
# Folder containing the raw CSV data files
pathdir = '../data/raw'

# Print every CSV file found in that folder, one path per line
import glob
listfiles = glob.glob(f"{pathdir}/*.csv")
print("\n".join(listfiles))


../data/raw/EELCTDandLADCP_refpos_origCTD.csv
../data/raw/EELCTDandLADCP_refdate.csv
../data/raw/EELCTDandLADCP_2Dfield.csv
../data/raw/EELCTDandLADCP_refpos_gvel.csv
../data/raw/EELCTDandLADCP_3Dfield.csv


#### Import 3D Fields

In [37]:
# 3D gridded fields: one row per (cruise, station, depth) grid point
file1 = f"{pathdir}/EELCTDandLADCP_3Dfield.csv"
df3D = pd.read_csv(file1, sep=',', header=0, index_col=None)
df3D


Unnamed: 0,CruiseID,Staname,Refdist,Depth,PTMP,PSAL,Sigma0,Vrel,Vladcp,Vabs,Vladcpalong
0,d22396,13G+,1186.1527,25,12.9947,35.1611,26.5186,7.1146e-02,-0.0274,0.0317,-0.1860
1,d22396,13G+,1186.1527,35,12.9717,35.1606,26.5229,7.1017e-02,-0.0061,0.0316,-0.1952
2,d22396,13G+,1186.1527,45,12.7560,35.1493,26.5563,7.1887e-02,-0.0021,0.0324,-0.1987
3,d22396,13G+,1186.1527,55,12.3461,35.1448,26.6325,7.0745e-02,0.0071,0.0313,-0.2105
4,d22396,13G+,1186.1527,65,11.9363,35.1403,26.7087,6.4738e-02,0.0279,0.0253,-0.2432
...,...,...,...,...,...,...,...,...,...,...,...
63894,dy078,IB22S+,6.7210,65,7.8667,35.0407,27.3261,-6.5029e-06,0.0393,0.0065,-0.0091
63895,dy078,IB22S+,6.7210,75,7.7710,35.0435,27.3424,2.5855e-04,0.0483,0.0067,-0.0345
63896,dy078,IB22S+,6.7210,85,7.7483,35.0504,27.3512,-3.9223e-05,0.0504,0.0064,-0.0517
63897,dy078,IB22S+,6.7210,95,7.7317,35.0519,27.3548,-1.2993e-04,0.0437,0.0063,-0.0619


#### Import Metadata

In [38]:
# Cruise metadata: year and month of each cruise
file2 = f"{pathdir}/EELCTDandLADCP_refdate.csv"
dfdate = pd.read_csv(file2, sep=',', header=0, index_col=None)
print(dfdate)

   CruiseID  Year  Month
0    d22396  1996     10
1    d23097  1997      9
2    d23398  1998      5
3    d24299  1999      9
4    d24500  2000      2
5    d25301  2001     12
6   cd17605  2005     10
7    d31206  2006     10
8    d32107  2007      8
9    d34009  2009      6
10   d35110  2010      5
11   d36511  2011      5
12    jc086  2013      5
13    jr302  2014      7
14    dy031  2015      6
15    dy052  2016      6
16    dy078  2017      5


In [39]:
# Station metadata: location of each EEL station along the section
file3 = f"{pathdir}/EELCTDandLADCP_refpos_gvel.csv"
dfloc = pd.read_csv(file3, sep=',', header=0, index_col=None)

# Order the stations by increasing distance along the section (column 'Refdist')
sdfloc = dfloc.sort_values(by='Refdist')
print(f"\n {sdfloc.iloc[:,:2]}")


    Staname    Refdist
67  IB22S+     6.7210
66  IB21S+    19.4086
65  IB20S+    40.4960
64  IB19S+    69.8213
63  IB18S+   103.0651
..     ...        ...
4      5G+  1272.9178
3      4G+  1282.0609
2      3G+  1289.5191
1      2G+  1295.3205
0      1G+  1302.8994

[68 rows x 2 columns]


<br><br>
## 2. Create Pivot Tables of Absolute geostrophic velocities at specific depth

#### Create several pivot tables for different depths and store them in a dictionary

In [30]:
# Depth levels at which a horizontal slice of the section is extracted
zlist = [
    105,
    505,
    805,
    2505,
]

In [31]:
# One dictionary per depth level:
#   'Depth' -> the level as a string
#   'Data'  -> CruiseID x Staname table of Vabs at that level, rounded to 3 decimals
Vabsbyz = [
    {
        'Depth': str(depth),
        'Data': (
            df3D.loc[df3D['Depth'] == depth]
            .pivot(index="CruiseID", columns="Staname", values="Vabs")
            .round(3)
        ),
    }
    for depth in zlist
]

print(f" Dataframe for {Vabsbyz[0]['Depth']}m depth:")
Vabsbyz[0]['Data']

 Dataframe for 105m depth:


Staname,13G+,14G+,15G+,8G+,9G+,A+,B+,C+,D+,E+,...,L+,M+,N+,O+,P+,Q+,Q1+,R+,S+,T+
CruiseID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
cd17605,-0.038,-0.034,,0.054,,,,-0.046,-0.022,0.088,...,0.125,0.107,0.108,-0.112,0.044,0.207,0.086,0.258,,
d22396,,,,,,,,,0.096,0.055,...,-0.069,-0.154,-0.025,0.087,0.063,0.186,0.124,,,
d23097,,,,,,,,,,0.025,...,,,-0.067,-0.017,0.037,0.025,-0.014,,,
d23398,,,,,,,,,,-0.113,...,,,-0.059,-0.083,0.038,0.269,0.205,,,
d24299,,,,,,-0.027,-0.003,0.038,0.271,0.103,...,0.085,-0.052,-0.223,-0.032,0.07,0.167,0.173,,,
d24500,,,,,,,,,,,...,0.013,-0.001,-0.226,-0.116,0.156,0.104,0.161,,,
d31206,,,-0.169,,,-0.076,0.076,-0.009,-0.118,-0.069,...,0.065,0.022,0.059,0.018,0.028,0.146,0.109,0.046,-0.12,-0.073
d34009,,,-0.223,,,,,0.004,-0.085,-0.388,...,-0.139,-0.17,-0.097,-0.084,0.034,0.176,0.089,0.095,-0.135,-0.159
d35110,,,-0.083,,0.086,,,0.173,0.083,-0.117,...,-0.367,-0.357,-0.198,0.23,0.126,0.107,0.094,0.045,-0.06,-0.064
d36511,,,,,,,,,-0.306,,...,-0.299,-0.366,0.093,0.31,0.173,0.129,0.07,0.099,-0.024,


\
##### Add the table *df_DA* calculated from the depth average to the list:

In [32]:
# Depth-averaged table: pivot_table aggregates (mean by default) every Vabs value
# sharing the same (CruiseID, Staname) pair, i.e. all depth levels
df_DA = df3D.pivot_table(index="CruiseID", columns="Staname", values="Vabs")

# Store it as an extra entry labelled 'Mean'
Vabsbyz.append(dict(Depth='Mean', Data=df_DA))
print(len(Vabsbyz))

5


<br><br>  
## 3. Sort the dataframe 
### 3.a. Sort the rows according to year of the cruise
##### Merge each table with the dataframe *dfdate*, which links each cruise ID to a year and a month

In [33]:
# Replace the CruiseID index of every table by the cruise year.
# The rows end up in the order of dfdate; cruises without data at a given
# level become all-NaN rows because of the left merge.
for k, entry in enumerate(Vabsbyz):
    df = entry['Data']
    if df.index.name != 'CruiseID':
        # This table was already re-indexed by a previous run of the cell.
        # Merging it again would fail because 'CruiseID' no longer exists.
        continue
    dfnew = (
        pd.merge(dfdate[['CruiseID', 'Year']], df, how='left', on='CruiseID')
        .drop(columns='CruiseID')
        .set_index('Year', drop=True)
    )
    entry['Data'] = dfnew




# # Visual check
# print(f"First 3 columns of original dataframe for {Vabsbyz[k]['Depth']} depth :\n {df.iloc[:,:3]}")
# print(f"\n and for the new merged dataframe:\n {dfnew.iloc[:,:6]}")

<br><br>
### 3.b. Sort the columns according to the location of the stations on the reference section (*list2*)

In [34]:
# "copy" provides deepcopy
import copy

# Station names ordered by distance along the section
list2 = sdfloc['Staname'].tolist()

# Deep copy, so that reordering the new list leaves Vabsbyz unchanged
# (a plain assignment would only create a second reference to the same objects)
Vabsbyz_sort = copy.deepcopy(Vabsbyz)

# Reorder the station columns of every table so that they follow list2
for entry in Vabsbyz_sort:
    table = entry['Data']
    names = list(table.columns)

    # Positions of the table columns, listed in the order their names appear in list2.
    # Columns whose name is not in list2 are left out.
    order = [pos for ref in list2 for pos, name in enumerate(names) if name == ref]

    entry['Data'] = table.iloc[:, order]
    
# print(Vabsbyz[k]['Data'])
# print(Vabsbyz_sort[k]['Data'])

\
Display first columns of original and new list:

In [35]:
# First five columns of the first table, before and after the column sort
unsorted_head = Vabsbyz[0]['Data'].iloc[:, :5]
sorted_head = Vabsbyz_sort[0]['Data'].iloc[:, :5]
print(unsorted_head, "\n", sorted_head, sep="\n")

       13G+   14G+   15G+    8G+    9G+
Year                                   
1996    NaN    NaN    NaN    NaN    NaN
1997    NaN    NaN    NaN    NaN    NaN
1998    NaN    NaN    NaN    NaN    NaN
1999    NaN    NaN    NaN    NaN    NaN
2000    NaN    NaN    NaN    NaN    NaN
2001    NaN    NaN    NaN    NaN    NaN
2005 -0.038 -0.034    NaN  0.054    NaN
2006    NaN    NaN -0.169    NaN    NaN
2007    NaN    NaN    NaN    NaN    NaN
2009    NaN    NaN -0.223    NaN    NaN
2010    NaN    NaN -0.083    NaN  0.086
2011    NaN    NaN    NaN    NaN    NaN
2013    NaN    NaN    NaN    NaN    NaN
2014    NaN    NaN    NaN    NaN    NaN
2015    NaN    NaN  0.150  0.028  0.107
2016    NaN    NaN  0.023 -0.124  0.038
2017    NaN    NaN  0.199  0.127 -0.011


      IB22S+  IB21S+  IB20S+  IB19S+  IB18S+
Year                                        
1996     NaN     NaN     NaN     NaN     NaN
1997     NaN     NaN     NaN     NaN     NaN
1998     NaN     NaN     NaN     NaN     NaN
1999     NaN 

\
### Save temporary data

In [36]:
import pickle

# Write the sorted Vabs tables to the interim data folder
pathdata = "../data/interim/"
filename = f"{pathdata}Zlvl_Vabs"
with open(filename, mode='wb') as f:
    pickle.dump(Vabsbyz_sort, f)

## 4. Repeat the process for other variables

#### 4.1 Define a function which "slices" a 3D section (Depth x Lon x Time) at specific depths and stores the generated 2D tables (Lon x Time) in a list of dictionaries.

The function reads a CSV file (generated in Matlab) where each row of the file corresponds to an element (i,j,k) of a 3D grid. The total number of rows is the product of the 3 dimensions (Depth x Lon x Time). The number of columns of the CSV file is the sum of the number of dimensions (3 in the case of a 3D grid), the number of 3D variables extracted at each grid point (e.g. temperature, salinity, velocity, ...) and the number of other informative variables (e.g. distance along the section).

In [31]:
def extract2Ddata_bydepth(Vartoextract, pathcsv='../data/raw/EELCTDandLADCP_3Dfield.csv', zdepths=(105, 505, 805, 2505)):
    """Extract one variable of the 3D gridded dataset at specific depths.

    The function reads a CSV file in which each row is one grid point and
    pivots the selected variable into 2D tables (CruiseID x Staname).
    According to the notebook description, the file is generated in Matlab
    from a 3D grid (Depth x Lon x Time) and holds the grid dimensions, the
    3D variables (e.g. temperature, salinity, velocity) and other
    informative columns (e.g. distance along the section).

    Parameters:
    Vartoextract : name of the column to extract (e.g. 'Vabs').
    pathcsv      : path of the CSV file to read; it must contain the columns
                   'CruiseID', 'Staname', 'Depth' and Vartoextract.
    zdepths      : iterable of the values of the 'Depth' column at which a
                   table is extracted.

    Returns:
    A list of dictionaries with one element per requested depth, followed
    by one element for the depth average. Each dictionary has two keys:
    'Depth' is a string: the depth followed by 'm', or 'Mean' for the
                         depth average.
    'Data'  is the 2D dataframe (CruiseID x Staname). The per-depth tables
            are rounded to 3 decimals; the 'Mean' table is not rounded.

    Raises:
    KeyError if Vartoextract is not a column of the CSV file.
    """

    # Load csv file (the path comes from the argument, not from a notebook global)
    df3D = pd.read_csv(pathcsv, sep=',', index_col=None, header=0)

    if Vartoextract not in df3D.columns:
        raise KeyError(f"Column '{Vartoextract}' not found in {pathcsv}")

    # One table per requested depth level
    ListDict = []
    for depth in zdepths:
        at_depth = df3D[df3D['Depth'] == depth]
        table = at_depth.pivot(values=Vartoextract, index="CruiseID", columns="Staname")
        ListDict.append({'Depth': str(depth) + 'm', 'Data': table.round(decimals=3)})

    # Depth average: pivot_table averages all rows sharing the same (CruiseID, Staname)
    df_DA = df3D.pivot_table(values=Vartoextract, index="CruiseID", columns="Staname", aggfunc="mean")
    ListDict.append({'Depth': 'Mean', 'Data': df_DA})

    return ListDict


#### 4.2 Define a function which sorts the rows and columns of a 2D dataframe according to other metadata

In our case the dimensions of the 2D dataframe (pivot table) are CruiseID x Station_Name. By default in Pandas the rows and columns of a pivot table are sorted. Here the alphabetical order doesn't match the chronological order for the Cruise ID, nor the distance along the section for the Station_Name. We need to reorder the rows and columns accordingly.

In [43]:
def sortPivotTable(dftosort, rowdf=None, columndf=None):
    """Reorder the rows and columns of a pivot table using metadata dataframes.

    By default in Pandas, the rows and columns of a pivot table are in
    ascending order. In our case the alphabetical order doesn't match the
    chronological order for the Cruise ID, nor the section distance for the
    station names, so both axes are reordered from metadata.

    Parameters:
    dftosort : 2D dataframe whose index is named 'CruiseID' and whose columns
               are station names. It is not modified.
    rowdf    : optional dataframe with the columns 'CruiseID' and 'Year'.
               When given, the result has one row per row of rowdf, in the
               order of rowdf, indexed by 'Year' instead of 'CruiseID'.
               Cruises absent from dftosort give all-NaN rows.
    columndf : optional dataframe with a column 'Staname'. When given, the
               columns are reordered to follow the order of columndf.Staname;
               columns whose name is not listed there are dropped.

    Omitting an argument (or passing None, an empty list or an empty
    dataframe) skips the corresponding step. If both are skipped a message
    is printed and dftosort is returned unchanged.

    Returns:
    The reordered dataframe.
    """
    df = dftosort
    use_rows = _metadata_given(rowdf, 'rowdf')
    use_columns = _metadata_given(columndf, 'columndf')

    if use_rows:
        # Left merge on the cruise ID: the row order of rowdf drives the row order of the result
        dfnew = pd.merge(rowdf[['CruiseID', 'Year']],
                         df,
                         how='left',
                         on='CruiseID')
        # Drop the Cruise ID and use the year as the new index
        dfnew = dfnew.drop(columns='CruiseID')
        dfnew = dfnew.set_index('Year', drop=True)
        df = dfnew

    if use_columns:
        # Map each column name of the table to its position(s)
        positions = {}
        for pos, name in enumerate(df.columns):
            positions.setdefault(name, []).append(pos)
        # Walk through the reference station order and collect the matching positions
        isort = [pos
                 for xref in columndf['Staname']
                 if xref in positions
                 for pos in positions[xref]]
        df = df.iloc[:, isort]

    if not use_rows and not use_columns:
        print("\nNeed to reference dataframe")

    return df


def _metadata_given(meta, argname):
    """Return True when `meta` is a non-empty dataframe, False when it is None or empty."""
    if meta is None:
        return False
    if isinstance(meta, pd.DataFrame):
        return not meta.empty
    if len(meta) == 0:
        # Legacy placeholder such as [] meaning "no metadata"
        return False
    raise TypeError(f"{argname} must be a pandas DataFrame, got {type(meta).__name__}")


In [53]:
import copy
import pickle

pathdata = "../data/interim/"

# Columns of the 3D dataframe that can be extracted:
# PTMP, PSAL, Sigma0, Vrel, Vladcp, Vabs, Vladcpalong
for vartoextract in ('Vabs', 'Vladcp', 'Vladcpalong'):
    Varbyz = extract2Ddata_bydepth(vartoextract)

    # Build the sorted list from new dictionaries, so the entries of Varbyz
    # keep their original (unsorted) tables
    Varbyz_sort = [
        {**item, 'Data': sortPivotTable(item['Data'], rowdf=dfdate, columndf=sdfloc)}
        for item in Varbyz
    ]

    # One pickle file per variable
    filename = f"{pathdata}Zlvl_{vartoextract}"
    with open(filename, mode='wb') as f:
        pickle.dump(Varbyz_sort, f)
    
    

In [62]:
# Test load: read back the pickled Vladcp tables and preview the first one
filename = f"{pathdata}Zlvl_Vladcp"
with open(filename, mode='rb') as f:
    Vladcp = pickle.load(f)
Vladcp[0]['Data'].head()

Unnamed: 0_level_0,IB22S+,IB21S+,IB20S+,IB19S+,IB18S+,IB17+,IB16A+,IB16+,IB15+,IB14+,...,Q1+,Q+,R+,S+,15G+,T+,14G+,13G+,9G+,8G+
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1996,,,,,,,,,,,...,0.138,0.222,,,,,,,,
1997,,,,,,,,,,,...,-0.035,0.01,,,,,,,,
1998,,,,,,,0.067,0.009,-0.049,-0.108,...,0.247,0.312,,,,,,,,
1999,,,,,,,,,,,...,0.201,0.188,,,,,,,,
2000,,,,,,,,,,,...,0.176,0.134,,,,,,,,


In [59]:
# Test load: read back the pickled Vladcpalong tables and preview the first one
filename = f"{pathdata}Zlvl_Vladcpalong"
with open(filename, mode='rb') as f:
    Vladcpalong = pickle.load(f)
Vladcpalong[0]['Data'].head()

Unnamed: 0_level_0,IB22S+,IB21S+,IB20S+,IB19S+,IB18S+,IB17+,IB16A+,IB16+,IB15+,IB14+,...,Q1+,Q+,R+,S+,15G+,T+,14G+,13G+,9G+,8G+
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1996,,,,,,,,,,,...,-0.097,-0.068,,,,,,,,
1997,,,,,,,,,,,...,-0.11,-0.155,,,,,,,,
1998,,,,,,,-0.146,-0.104,-0.038,0.054,...,-0.131,-0.102,,,,,,,,
1999,,,,,,,,,,,...,-0.026,0.005,,,,,,,,
2000,,,,,,,,,,,...,-0.166,-0.199,,,,,,,,
