In [3]:
import pandas as pd 
import numpy as np 
import xarray as xr

In [4]:
def decode_single_column(column):
    decoded_rows = []
    for row in column:
        row = row.decode("utf-8")
        row = float(row) if row != "" else np.nan
        decoded_rows.append(row)
    return decoded_rows


def decode_all_columns(ocean_data):
    columns_to_decode = [
        "Phosphate",
        "Nitrite_Nitrate",
        "Temperature",
        "Prochlorococcus",
        "Pico_eukaryotes",
    ]
    for i in columns_to_decode:
        ocean_data[i] = decode_single_column(ocean_data[i])
    return ocean_data

In [7]:

def drop_erroneous(ocean_measurements):
    ocean_measurements = ocean_measurements.query("Year <= 2.008e+03")
    ocean_measurements = ocean_measurements.query("Day <= 9.96e+30")
    return ocean_measurements

In [84]:
def years():
    return [i for i in np.arange(1987.0, 2009.0, 1)]


def months():
    return pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])


In [81]:
data = xr.open_dataset("/Users/leebardon/Dropbox/Development/stats_biogeo_2021/data/raw/ocean_observations.netcdf")

In [82]:
data = data.to_dataframe()
data = decode_all_columns(data)
ocean_data = drop_erroneous(data)

In [70]:
day = ocean_data["Day"]
year = 2005
ocean_data["Month"] = 0

In [71]:
_year = ocean_data["Year"] == year
_jan = ocean_data["Day"].between(1.0, 31.0) 
mask  = _year & _jan



In [95]:
def get_month_masks():
    masks = [
        ocean_data["Day"].between(1.0, 32.0, inclusive=True),
        ocean_data["Day"].between(32.0, 60.0, inclusive=True),
        ocean_data["Day"].between(61.0, 91.0, inclusive=True),
        ocean_data["Day"].between(92.0, 121.0, inclusive=True),
        ocean_data["Day"].between(122.0, 152.0, inclusive=True),
        ocean_data["Day"].between(153.0, 182.0, inclusive=True),
        ocean_data["Day"].between(183.0, 213.0, inclusive=True),
        ocean_data["Day"].between(214.0, 244.0, inclusive=True),
        ocean_data["Day"].between(245.0, 274.0, inclusive=True),
        ocean_data["Day"].between(275.0, 305.0, inclusive=True), 
        ocean_data["Day"].between(306.0, 335.0, inclusive=True),
        ocean_data["Day"].between(336.0, 366.0, inclusive=True),
    ]
    return masks

def get_season_masks():
    masks = [
        ocean_data["Day"].between(336.0, 366.0, inclusive=True) | ocean_data["Day"].between(1.0, 60.0, inclusive=True),
        ocean_data["Day"].between(61.0, 152.0, inclusive=True),
        ocean_data["Day"].between(153.0, 244.0, inclusive=True),
        ocean_data["Day"].between(245.0, 335.0, inclusive=True),
    ]
    return masks

In [96]:
def assign_months(year, months_list, ocean_data):
    _year = ocean_data["Year"] == year
    month_masks = get_month_masks()
    for i in range(0, len(months_list)):
        _month = month_masks[i]
        _mask = _year & _month
        ocean_data["Month"] = ocean_data["Month"].where(~_mask, other=months_list[i])

In [104]:
def assign_seasons(year, seasons_list, ocean_data):
    _year = ocean_data["Year"] == year
    season_masks = get_season_masks()
    for i in range(0, len(seasons_list)):
        _season = season_masks[i]
        _mask = _year & _season
        ocean_data["Season"] = ocean_data["Season"].where(~_mask, other=seasons_list[i])

In [102]:
def create_seasons_column(ocean_data):
    ocean_data["Season"] = None
    years_list = years() 
    seasons_list = ["winter", "spring", "summer", "autumn"]
    for year in years_list:
        assign_seasons(year, seasons_list, ocean_data)
    return ocean_data


In [93]:
def create_months_column(ocean_data):
    ocean_data["Month"] = 0
    years_list = years() 
    months_list = months()
    for year in years_list:
        assign_months(year, months_list, ocean_data)
        months_list = months_list + 12
    return ocean_data

In [99]:
create_months_column(ocean_data)
ocean_data["Month"].describe()

count    58851.000000
mean       149.353911
std         54.804807
min          9.000000
25%        105.000000
50%        154.000000
75%        196.000000
max        263.000000
Name: Month, dtype: float64

In [105]:
create_seasons_column(ocean_data)

Unnamed: 0_level_0,Year,Day,Latitude,Longitude,Nitrite_Nitrate,Phosphate,Temperature,Depth,Prochlorococcus,Synechococcus,Pico_eukaryotes,Month,Season
unlimited,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,1997.0,8.0,16.370000,119.950000,,,,0.5,,3.977000e+04,,121,winter
1,1997.0,9.0,16.348000,119.930000,,,,0.5,,1.500000e+04,,121,winter
2,1997.0,14.0,16.460000,119.920000,,,,0.5,,8.800000e+02,,121,winter
3,1997.0,15.0,16.380000,119.910000,,,,0.5,,9.500000e+02,,121,winter
4,1997.0,24.0,16.348000,119.930000,,,,0.5,,2.730000e+03,,121,winter
...,...,...,...,...,...,...,...,...,...,...,...,...,...
59549,1994.0,178.0,43.830556,297.180556,8.58,0.87,3.54,42.0,,9.969210e+36,4040.0,90,summer
59550,1994.0,178.0,43.830556,297.180556,20.09,1.24,6.92,63.0,,9.969210e+36,274.9,90,summer
59551,1994.0,178.0,43.830556,297.180556,,,8.38,75.0,,9.969210e+36,,90,summer
59552,1994.0,178.0,43.830556,297.180556,27.32,1.25,9.80,100.0,,9.969210e+36,113900.0,90,summer


In [4]:
ecosys_interim = pd.read_pickle("/Users/leebardon/Dropbox/Development/stats_biogeo_2021/data/interim/darwin_interim_data/present/ecosys_interim_p.pkl")

In [133]:
def darwin_season_masks(darwin):
    masks = [
        darwin["Month"][3::12] | darwin["Month"][4::12] | darwin["Month"][5::12],
        darwin["Month"][6::12] | darwin["Month"][7::12] | darwin["Month"][8::12],
        darwin["Month"][9::12] | darwin["Month"][10::12] | darwin["Month"][11::12],
        darwin["Month"][12::12] | darwin["Month"][13::12] | darwin["Month"][14::12],
    ]
    return masks

In [135]:
def darwin_seasons_col(darwin_data):
    darwin_data["Season"] = None
    seasons_list = ["spring", "summer", "autumn", "winter"]
    return darwin_assign_seasons(seasons_list, darwin_data)


def darwin_assign_seasons(seasons_list, darwin_data):
    season_masks = darwin_season_masks(darwin_data)
    for i in range(0, len(seasons_list)):
        _mask = season_masks[i]
        darwin_data["Season"] = darwin_data["Season"].where(
            ~_mask, other=seasons_list[i]
        )
    return darwin_data

In [5]:
spring = ecosys_interim["Month"][3::12] | ecosys_interim["Month"][4::12] | ecosys_interim["Month"][5::12]

In [6]:
ecosys_interim["Season"] = None
ecosys_interim["Season"] = ecosys_interim["Season"].where(
            ~spring, other="spring"
        )

In [42]:
import numpy
x = range(24)
l = numpy.array_split(numpy.array(x),5)

In [43]:
l

[array([0, 1, 2, 3, 4]),
 array([5, 6, 7, 8, 9]),
 array([10, 11, 12, 13, 14]),
 array([15, 16, 17, 18, 19]),
 array([20, 21, 22, 23])]

In [30]:
li = np.arange(0, 120)
x, y, z = li[1::12], li[2::12], li[3::12]
s = li[[1:3]:12] | li[2::12] | li[3::12]

In [31]:
s

array([  3,  15,  27,  39,  51,  63,  75,  87,  99, 111])

In [23]:
print(x,y,z)

[  1  13  25  37  49  61  73  85  97 109] [  2  14  26  38  50  62  74  86  98 110] [  3  15  27  39  51  63  75  87  99 111]


In [8]:
ecosys_interim["Season"].describe()

count     1852571
unique          1
top        spring
freq      1852571
Name: Season, dtype: object

In [117]:
ecosys_interim.describe()

Unnamed: 0,diag_levels,iter,TRAC01,TRAC02,TRAC03,TRAC04,TRAC05,TRAC06,TRAC07,TRAC08,...,TRAC71,PP,Nfix,Denit,pH,pCO2,X,Y,Z,Month
count,2223085.0,2223085.0,2223085.0,2223085.0,2223085.0,2223085.0,2223085.0,2223085.0,2223085.0,2223085.0,...,2223085.0,2223085.0,2223085.0,2223085.0,2223085.0,2223085.0,2223085.0,2223085.0,2223085.0,2223085.0
mean,1.0,855540.0,1666.57,0.445955,0.235635,17.24202,0.7579823,12.96144,0.0004902011,5.632965,...,0.0009997496,4.570229e-06,2.82392e-08,0.0,8.070178,0.0002824468,193.1319,-1.183931,1.0,133.0
std,0.0,13769.71,237.382,0.8679441,0.2248783,9.017628,0.7021748,22.16176,0.0004130322,5.651853,...,0.006221793,5.346257e-06,6.529651e-08,0.0,0.05705402,2.619033e-05,98.59461,47.85651,0.0,76.49838
min,1.0,831780.0,-1424.879,9.836914e-11,4.605408e-10,1.053179e-08,5.960977e-05,3.658294e-09,1.619316e-07,3.757966e-07,...,-5.960355e-27,0.0,0.0,0.0,6.809876,5.711331e-05,1.25,-78.0,1.0,1.0
25%,1.0,843660.0,1482.376,0.008862907,0.07813289,8.413961,0.1384026,0.003971213,3.460578e-06,1.311421,...,5.91209e-24,4.82375e-07,1.388662e-11,0.0,8.029513,0.0002653396,116.25,-42.0,1.0,67.0
50%,1.0,855540.0,1602.283,0.04857397,0.2082712,17.45069,0.5217834,0.3782859,0.0005201417,3.992435,...,1.18086e-14,2.23018e-06,1.470064e-09,0.0,8.075103,0.0002804835,196.25,-8.0,1.0,133.0
75%,1.0,867420.0,1771.923,0.4422804,0.335925,24.84288,1.115567,9.6924,0.0008960076,8.353237,...,1.45754e-08,7.42767e-06,1.614191e-08,0.0,8.115204,0.000296027,273.75,34.0,1.0,199.0
max,1.0,879300.0,2201.397,20.84249,18.16567,33.06812,2.189375,74.18663,0.001052969,52.0285,...,0.2174307,5.765267e-05,1.001402e-06,0.0,8.334769,0.001291977,358.75,88.5,1.0,265.0


In [136]:
darwin_seasons_col(ecosys_interim)

Unnamed: 0,diag_levels,iter,TRAC01,TRAC02,TRAC03,TRAC04,TRAC05,TRAC06,TRAC07,TRAC08,...,PP,Nfix,Denit,pH,pCO2,X,Y,Z,Month,Season
10,1.0,833220,2149.222412,1.648718e-06,0.000035,32.415684,2.137044,67.373428,0.000937,0.002703,...,9.165449e-14,9.778816e-17,0.0,8.124066,0.000300,1.25,-70.0,1,9,winter
11,1.0,833220,2147.434082,8.242774e-06,0.000177,32.385727,2.135199,67.092377,0.000941,0.013594,...,3.981707e-13,3.036517e-16,0.0,8.124928,0.000299,1.25,-68.0,1,9,winter
12,1.0,833220,2150.374268,7.787668e-05,0.001622,32.361332,2.133392,67.212936,0.000938,0.127367,...,1.154114e-11,1.868019e-14,0.0,8.125634,0.000299,1.25,-66.0,1,9,winter
13,1.0,833220,2165.442871,5.970550e-04,0.011919,32.208023,2.123554,67.915428,0.000915,0.772793,...,3.899572e-09,7.348246e-12,0.0,8.131565,0.000296,1.25,-64.0,1,9,winter
14,1.0,833220,2172.339355,1.959288e-03,0.035316,31.784969,2.098763,68.678925,0.000880,2.110428,...,1.733191e-08,3.270573e-11,0.0,8.138751,0.000292,1.25,-62.0,1,9,winter
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3434395,1.0,832320,1760.266113,7.981590e-04,0.013508,26.005625,0.843346,3.074813,0.001048,0.811644,...,2.345686e-10,7.043241e-12,0.0,8.116233,0.000251,358.75,80.0,1,4,winter
3434396,1.0,832320,1767.850952,2.707174e-04,0.005684,25.506079,0.869203,3.572733,0.001052,0.399762,...,2.959652e-11,1.112098e-12,0.0,8.114406,0.000253,358.75,82.0,1,4,winter
3434397,1.0,832320,1775.604248,8.718633e-05,0.001873,24.715918,0.906506,4.381703,0.001052,0.134918,...,4.039041e-12,1.513587e-13,0.0,8.113550,0.000255,358.75,84.0,1,4,winter
3434398,1.0,832320,1775.774902,7.570384e-06,0.000161,23.865545,0.944657,5.142447,0.001052,0.012236,...,0.000000e+00,0.000000e+00,0.0,8.117414,0.000253,358.75,86.0,1,4,winter


In [130]:
ecosys_interim["Season"].describe()

count     2223085
unique          2
top        winter
freq      1852573
Name: Season, dtype: object