In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap
import matplotlib.ticker as mticker
import geopy.distance

# Particle-tracking workbook; the sheets named '2015' and '2016' are read separately.
xl_file = "/Volumes/KeithSSD/CB_V4/otu_data/mixing_data/Particle_Data.xlsx"
assert os.path.exists(xl_file)

unformatted_df1, unformatted_df2 = (
    pd.read_excel(xl_file, sheet_name=year_sheet) for year_sheet in ('2015', '2016')
)

# Tab-separated sample metadata; only station name and coordinates are used here.
env_data_file = "/Volumes/KeithSSD/CB_V4/otu_data/WaterQualityData/matched_cleaned_data/all_mdata_with_habitat.txt"
env_data = pd.read_csv(env_data_file, sep="\t")

# One row per station: mean latitude/longitude rounded to 3 decimals.
station_coords = env_data.loc[:, ['StationName', 'Latitude', 'Longitude']]
stat_latlon = station_coords.groupby('StationName').mean().round(3)
print(stat_latlon.shape)
stat_latlon.head()

(20, 2)


Unnamed: 0_level_0,Latitude,Longitude
StationName,Unnamed: 1_level_1,Unnamed: 2_level_1
CB22,39.349,-76.176
CB31,39.25,-76.24
CB32,39.164,-76.306
CB33C,38.998,-76.359
CB41C,38.826,-76.399


In [2]:
def prep_particle_chunk(sample_pt_i, sample_pts, stat_latlon, particle_data, index_range_i):
    """Attach the originating sample's metadata to each of its particle rows.

    Parameters
    ----------
    sample_pt_i : label
        Index label of the sample row in ``sample_pts``.
    sample_pts : pandas.DataFrame
        Sample metadata; must contain ``StationName`` and ``DepthName``.
    stat_latlon : pandas.DataFrame
        Indexed by station name with ``Latitude`` and ``Longitude`` columns.
    particle_data : pandas.DataFrame
        Particle positions; rows are selected by the labels in ``index_range_i``.
    index_range_i : sequence of labels
        Index labels of the particle rows that belong to this sample.

    Returns
    -------
    pandas.DataFrame
        The selected particle rows followed by the sample metadata columns
        (sorted by name), plus ``StatLat`` / ``StatLon`` (station coordinates)
        and ``DepthName`` multiplied by -1. ``sample_pts`` is not modified.

    Raises
    ------
    ValueError
        If a metadata column name collides with a particle column name
        (``verify_integrity=True``).
    """
    one_sample = sample_pts.loc[sample_pt_i, :].to_dict()
    station = one_sample['StationName']
    one_sample['StatLat'] = stat_latlon.loc[station, 'Latitude']
    one_sample['StatLon'] = stat_latlon.loc[station, 'Longitude']
    # the sign of the sample depth is flipped before it is attached to the particles
    one_sample['DepthName'] *= -1

    # Broadcast the scalar metadata over the particle index. Building the frame
    # this way (instead of transposing a dict-of-dicts) keeps numeric columns
    # numeric rather than turning every column into object dtype.
    origin_cols = pd.DataFrame(one_sample, index=list(index_range_i))
    # fixed, version-independent column order
    origin_cols = origin_cols[sorted(origin_cols.columns)]

    one_section = particle_data.loc[index_range_i, :].copy()
    # `sort` must be a real bool: recent pandas rejects integer flags here.
    return pd.concat((one_section, origin_cols), axis=1, sort=True, verify_integrity=True)


In [3]:

unformatted_df = unformatted_df2.copy()

# The first six columns carry the per-sample (station) metadata; incomplete rows are dropped.
sample_pts = unformatted_df.iloc[:, :6].dropna()

# The remaining columns carry the particle positions; give them usable names.
particle_data = unformatted_df.iloc[1:, 6:].dropna()
particle_data.columns = ['y_km', 'x_km', 'lat', 'lon', 'depth']
particle_data = particle_data.apply(pd.to_numeric, axis=1)

# Longitudes are shifted by 360 degrees in the workbook; shift them back.
particle_data['lon'] = particle_data['lon'].sub(360)

# For every station keep the sample rows taken at its largest DepthName,
# together with the number of distinct dates among those rows.
lowest_points, subset_to_use = {}, set()
for station in sample_pts.StationName.unique():
    station_rows = sample_pts.loc[sample_pts.StationName == station, :]
    at_max_depth = station_rows.DepthName == station_rows.DepthName.max()
    deepest_rows = station_rows.loc[at_max_depth, :]
    deepest_ixs = list(deepest_rows.index)
    lowest_points[station] = (deepest_ixs, len(deepest_rows.DateMMDDYY.unique()))
    subset_to_use.update(deepest_ixs)

# Each sample row is followed by 300 particle rows.
index_ranges = {i: list(range(i+1, i+301)) for i in sample_pts.index}

assert {j for rows in index_ranges.values() for j in rows} == set(particle_data.index)

particle_chunks = {
    s_p: prep_particle_chunk(s_p, sample_pts, stat_latlon, particle_data, ir_i)
    for s_p, ir_i in index_ranges.items()
}

print(len(index_ranges), len(subset_to_use))
next(iter(particle_chunks.values())).head()


75 37


Unnamed: 0,y_km,x_km,lat,lon,depth,CollectionAgency,DateMMDDYY,DepthName,Samples,StatLat,StatLon,StationName,Time
1,907451.69,4334940.5,39.071159,-76.29114,-9.697164,Preheim,62716,-1,SB062716TAWCSCB33CD1BR2TR1I80,38.998,-76.359,CB33C,09:13:00
2,911187.13,4331585.0,39.039276,-76.25012,-0.102223,Preheim,62716,-1,SB062716TAWCSCB33CD1BR2TR1I80,38.998,-76.359,CB33C,09:13:00
3,896647.69,4324071.0,38.978493,-76.42194,-2.09832,Preheim,62716,-1,SB062716TAWCSCB33CD1BR2TR1I80,38.998,-76.359,CB33C,09:13:00
4,909035.63,4327793.5,39.006233,-76.27716,-4.127914,Preheim,62716,-1,SB062716TAWCSCB33CD1BR2TR1I80,38.998,-76.359,CB33C,09:13:00
5,897784.5,4315252.0,38.898766,-76.41397,-6.03072,Preheim,62716,-1,SB062716TAWCSCB33CD1BR2TR1I80,38.998,-76.359,CB33C,09:13:00


In [4]:
# Rows: station the particles were released from; columns: station the
# distance is measured to. Each cell is filled with a mean distance in km.
ptcle_stat_distmat = pd.DataFrame(index=list(lowest_points.keys()), 
                                  columns=list(lowest_points.keys()))

# x = (lat_a, lon_a, lat_b, lon_b) -> distance in km between a and b
horiz_dist = lambda x: geopy.distance.distance((x[0], x[1]), (x[2], x[3])).km

# The particle positions of a release station do not depend on the target
# station, so they are collected once here instead of once per matrix cell.
particle_locs_by_station = {}
for p_stat, (chunk_ixs, _n_dates) in lowest_points.items():
    particle_locs_by_station[p_stat] = [
        (lat, lon)
        for chunk_ix in chunk_ixs
        for lat, lon in zip(particle_chunks[chunk_ix]['lat'], particle_chunks[chunk_ix]['lon'])
    ]

for s_stat in ptcle_stat_distmat.columns:
    # location of the target station
    s_lat = stat_latlon.loc[s_stat, 'Latitude']
    s_lon = stat_latlon.loc[s_stat, 'Longitude']
    for p_stat in ptcle_stat_distmat.index:
        # one (station, particle) coordinate quadruple per particle released at p_stat
        arg_set = [(s_lat, s_lon, lat, lon) for lat, lon in particle_locs_by_station[p_stat]]

        dist_set = np.array(list(map(horiz_dist, arg_set)))
        ptcle_stat_distmat.loc[p_stat, s_stat] = dist_set.mean()
        print("{} {} {:.2f}".format(s_stat, p_stat, dist_set.mean()))





CB33C CB33C 16.60
CB33C CB22 41.55
CB33C CB43C 54.71
CB33C CB44 84.60
CB33C CB51 78.38
CB33C CB52 123.88
CB33C CB53 147.56
CB33C CB54 153.12
CB33C CB71 161.92
CB33C CB62 181.91
CB33C CB63 182.55
CB33C CB72 196.63
CB33C CB64 209.63
CB33C CB74 215.27
CB33C CB31 31.54
CB33C CB32 13.33
CB33C CB41C 38.21
CB33C CB42C 64.74
CB33C CB61 170.47
CB33C CB73 220.11
CB22 CB33C 56.42
CB22 CB22 4.66
CB22 CB43C 95.11
CB22 CB44 123.65
CB22 CB51 117.36
CB22 CB52 161.92
CB22 CB53 185.23
CB22 CB54 190.65
CB22 CB71 198.76
CB22 CB62 219.95
CB22 CB63 220.78
CB22 CB72 234.23
CB22 CB64 247.86
CB22 CB74 251.33
CB22 CB31 10.98
CB22 CB32 29.19
CB22 CB41C 79.77
CB22 CB42C 105.05
CB22 CB61 208.51
CB22 CB73 257.49
CB43C CB33C 35.55
CB43C CB22 90.29
CB43C CB43C 11.36
CB43C CB44 37.55
CB43C CB51 32.13
CB43C CB52 76.80
CB43C CB53 100.47
CB43C CB54 106.11
CB43C CB71 115.54
CB43C CB62 134.08
CB43C CB63 134.55
CB43C CB72 149.11
CB43C CB64 161.48
CB43C CB74 169.11
CB43C CB31 80.61
CB43C CB32 62.43
CB43C CB41C 12.88
CB43C CB

In [5]:
from itertools import combinations

# Enumerate every station group of size 1-4 as a name-sorted tuple.
grps_by_size = {}
all_grps = set()
station_names = list(ptcle_stat_distmat.index)
for grp_size in range(1, 5):
    gps_of_srt_i = [tuple(sorted(combo)) for combo in combinations(station_names, grp_size)]
    # sorting must not collapse two combinations into the same tuple
    assert len(set(gps_of_srt_i)) == len(gps_of_srt_i)
    print("Groups of size {}: {} {}".format(grp_size, len(gps_of_srt_i), gps_of_srt_i[:3]))
    grps_by_size[grp_size] = gps_of_srt_i
    all_grps |= set(gps_of_srt_i)

print(len(all_grps))

# One (initially empty) row per candidate group.
# Q = group size, A = group tuple, B = 0, C = summed pairwise distance.
other_cols = list('FGHIJKLMNO')
solver_sheet = pd.DataFrame(index=range(1, len(all_grps)+1), columns=list('QABCDE') + other_cols)

row_labels = iter(solver_sheet.index)
for gs_, gs_i in grps_by_size.items():
    print(gs_)
    for g_i in gs_i:
        i = next(row_labels)
        solver_sheet.loc[i, 'Q'] = gs_
        solver_sheet.loc[i, 'A'] = g_i
        solver_sheet.loc[i, 'B'] = 0
        # sum over every ordered (particle station, target station) pair in the group,
        # including each station paired with itself
        solver_sheet.loc[i, 'C'] = sum(
            ptcle_stat_distmat.loc[p_stat, s_stat] for p_stat in g_i for s_stat in g_i
        )


Groups of size 1: 20 [('CB33C',), ('CB22',), ('CB43C',)]
Groups of size 2: 190 [('CB22', 'CB33C'), ('CB33C', 'CB43C'), ('CB33C', 'CB44')]
Groups of size 3: 1140 [('CB22', 'CB33C', 'CB43C'), ('CB22', 'CB33C', 'CB44'), ('CB22', 'CB33C', 'CB51')]
Groups of size 4: 4845 [('CB22', 'CB33C', 'CB43C', 'CB44'), ('CB22', 'CB33C', 'CB43C', 'CB51'), ('CB22', 'CB33C', 'CB43C', 'CB52')]
6195
1
2
3
4


In [31]:
# Greedy pre-selection of candidate station groups.
# For each group size, repeatedly build a "coherent" set of mutually disjoint
# groups (always taking the available group with the smallest column-C value)
# until at least `size_lim` groups have been collected for that size.
# keys: group size, values: minimum number of groups to collect for that size
limits_by_size = {2:50, 3:60, 4:60}

# every single-station group (Q == 1) is always kept
best_indexes = set()
best_indexes.update(list(solver_sheet.loc[solver_sheet.Q == 1, 'A']))
print(len(best_indexes), 'best_indexes')

for grp_size, size_lim in limits_by_size.items():
    subsolver = solver_sheet[solver_sheet.Q == grp_size]
    # groups of this size that have not been selected in an earlier pass
    remaining_groups = set(subsolver.A)
    added_groups  = set()
    # NOTE(review): if `remaining_groups` were exhausted before `size_lim` is
    # reached, `best_avail.iloc[0, 1]` below would raise IndexError.
    while len(added_groups) < size_lim:
        # rows of this size that are still unselected
        avail_bool = subsolver.A.isin(remaining_groups)
        # working set of candidate groups for this pass
        remaining_size_i = set(subsolver[avail_bool].A)
        print(len(remaining_size_i), "1x")
        # best single group = smallest C; column position 1 is 'A' (the group tuple)
        best_avail = subsolver[avail_bool].sort_values(by='C')
        best_grp = best_avail.iloc[0, 1]
        # stations not yet covered by a group chosen in this pass
        coherency_ = set(ptcle_stat_distmat.index) - set(list(best_grp))
        # groups chosen in this pass
        coherent_group = set([best_grp])
        # make the best group unavailable, then keep only candidates made up
        # entirely of still-uncovered stations
        remaining_size_i.remove(best_grp)
        remaining_size_i = list(filter(lambda x: len(set(x) - coherency_) == 0, remaining_size_i))
        print(len(remaining_size_i), len(coherent_group), "2x")
        while len(remaining_size_i) > 0:
            # candidates here never share a station with the groups chosen so far
            print(len(remaining_size_i), "3x")
            # re-sort the surviving candidates and pull the best one
            avail_bool = subsolver.A.isin(remaining_size_i)
            best_avail = subsolver[avail_bool].sort_values(by='C')
            best_remaining = best_avail.iloc[0, 1]
            # add it to this pass's selection
            coherent_group.update([best_remaining])
            # its stations are now covered
            for br in best_remaining:
                coherency_.remove(br)
            remaining_size_i.remove(best_remaining)
            # drop every candidate that overlaps the newly covered stations
            remaining_size_i = list(filter(lambda x: len(set(x) - coherency_) == 0, remaining_size_i))
            print(len(remaining_size_i), len(coherent_group), "4x")
        
        # stations left uncovered are bundled into one extra group; its size
        # can differ from grp_size
        if len(coherency_) > 0:
            coherent_group.update([tuple(sorted(coherency_))])
        print(len(remaining_size_i), len(coherent_group), "5x")  
        
        added_groups.update(coherent_group)
        print(len(added_groups), 'added_groups out of', size_lim)
        # groups chosen in this pass cannot be chosen again in a later pass
        for cg in coherent_group:
            if cg in remaining_groups:
                remaining_groups.remove(cg)
        print(len(remaining_groups), 'remaining_groups')
    
    best_indexes.update(added_groups)
        
    
    

20 best_indexes
190 1x
153 1 2x
153 3x
120 2 4x
120 3x
91 3 4x
91 3x
66 4 4x
66 3x
45 5 4x
45 3x
28 6 4x
28 3x
15 7 4x
15 3x
6 8 4x
6 3x
1 9 4x
1 3x
0 10 4x
0 10 5x
10 added_groups out of 50
180 remaining_groups
180 1x
145 1 2x
145 3x
114 2 4x
114 3x
86 3 4x
86 3x
62 4 4x
62 3x
43 5 4x
43 3x
27 6 4x
27 3x
15 7 4x
15 3x
6 8 4x
6 3x
1 9 4x
1 3x
0 10 4x
0 10 5x
20 added_groups out of 50
170 remaining_groups
170 1x
137 1 2x
137 3x
108 2 4x
108 3x
80 3 4x
80 3x
59 4 4x
59 3x
40 5 4x
40 3x
25 6 4x
25 3x
15 7 4x
15 3x
6 8 4x
6 3x
1 9 4x
1 3x
0 10 4x
0 10 5x
30 added_groups out of 50
160 remaining_groups
160 1x
129 1 2x
129 3x
101 2 4x
101 3x
74 3 4x
74 3x
55 4 4x
55 3x
36 5 4x
36 3x
22 6 4x
22 3x
10 7 4x
10 3x
5 8 4x
5 3x
0 9 4x
0 10 5x
39 added_groups out of 50
151 remaining_groups
151 1x
122 1 2x
122 3x
95 2 4x
95 3x
70 3 4x
70 3x
47 4 4x
47 3x
33 5 4x
33 3x
21 6 4x
21 3x
12 7 4x
12 3x
5 8 4x
5 3x
0 9 4x
0 10 5x
48 added_groups out of 50
142 remaining_groups
142 1x
115 1 2x
115 3x
89 2 4x
8

In [46]:
# How many selected groups there are of each size, and whether any selected
# group is missing from the full solver sheet (expected: empty set).
print(np.unique([len(grp) for grp in best_indexes], return_counts=True))
print(best_indexes.difference(solver_sheet.A))

is_selected = solver_sheet.A.isin(best_indexes)
solver_sheet2 = solver_sheet.loc[is_selected, :].copy()
print(len(solver_sheet2))

# Order by size then group, and renumber rows 1..n so that the index equals
# the spreadsheet row number (the CSV is written without a header row).
solver_sheet2 = solver_sheet2.sort_values(by=['Q', 'A'])
solver_sheet2.index = range(1, len(solver_sheet2) + 1)
solver_sheet2 = solver_sheet2.drop(columns='Q')

# Fixed spreadsheet cells: the objective is SUMPRODUCT of columns B and C,
# followed by the limit on the number of groups created.
fixed_cells = [
    (1, 'D', 'Objective Function'),
    (2, 'D', "=SUMPRODUCT(B:B,C:C)"),
    (3, 'D', 'Group Creation Limit'),
    (4, 'D', '=SUM(B:B)'),
    (4, 'E', '=='),
    (4, 'F', '6'),
    (5, 'D', 'Group Constraints'),
]
for sheet_row, sheet_col, cell_text in fixed_cells:
    solver_sheet2.loc[sheet_row, sheet_col] = cell_text

# One constraint row per station: the sum of the B cells of every group containing it.
for st_ix, stat in enumerate(sorted(ptcle_stat_distmat.index)):
    constraint_row = 6 + st_ix
    solver_sheet2.loc[constraint_row, 'D'] = stat
    bcells = [str(idx) for idx, grp in solver_sheet2['A'].items() if stat in grp]
    solver_sheet2.loc[constraint_row, 'E'] = "=B" + "+B".join(bcells)


# Columns that never received a value are left out of the export.
empty_cols = solver_sheet2.columns[solver_sheet2.isnull().all()]
print(empty_cols)
solver_sheet3 = solver_sheet2.drop(empty_cols, axis=1)
solver_sheet3.to_csv("/Volumes/KeithSSD/CB_V4/otu_data/mixing_data/station_group_opto.csv",
                     header=False, index=False)

(array([1, 2, 3, 4]), array([20, 61, 54, 62]))
set()
197
Index(['G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O'], dtype='object')


In [None]:

    



    # NOTE(review): orphaned fragment. These indented lines have no enclosing
    # statement in this cell, and `stat_grps` is not defined in any cell shown
    # in this notebook, so the cell cannot run as written.
    # What the lines do: write the station name to column D of row 6+st_ix,
    # split the station's "B<row>" terms across the `other_cols` columns, and
    # put a SUM over columns F..O of that row into column E.
    solver_sheet.loc[6+st_ix, 'D'] = stat
    bcells = [str(s+1) for s in range(len(stat_grps)) if stat in stat_grps[s]]
    # number of B-cell terms placed in each helper column
    per_cell = (len(bcells) // len(other_cols)) + 1
    for sc in range(len(other_cols)):
        sub_formula = "=B" + "+B".join(bcells[(sc*per_cell):((sc+1)*per_cell)])
        solver_sheet.loc[6+st_ix, other_cols[sc]] = sub_formula
    solver_sheet.loc[6+st_ix, 'E'] = "=SUM(F{}:O{})".format((6+st_ix), (6+st_ix))

# NOTE(review): this writes to the same path as the export of `solver_sheet3`
# in the earlier cell and would overwrite that file.
solver_sheet.to_csv("/Volumes/KeithSSD/CB_V4/otu_data/mixing_data/station_group_opto.csv", header=False, 
                    index=False)

In [None]:
def horiz_dist(x):
    """Return the distance in km between (x[0], x[1]) and (x[2], x[3]).

    ``x`` is a positional sequence ``(lat_a, lon_a, lat_b, lon_b)``; the
    distance is computed with ``geopy.distance.distance``.
    """
    return geopy.distance.distance((x[0], x[1]), (x[2], x[3])).km


def summarize_chunk(particle_sub):
    """Summarise how far one sample's particles are from their station.

    Parameters
    ----------
    particle_sub : pandas.DataFrame
        One chunk as returned by ``prep_particle_chunk``; needs the columns
        ``lat``, ``lon``, ``depth``, ``StatLat``, ``StatLon`` and ``DepthName``.
        Two columns, ``horiz_dist`` and ``vert_dist``, are added to it in place.

    Returns
    -------
    dict
        ``sum_horiz_dist``  - sum of particle-to-station distances (km)
        ``sum_vert_dist``   - sum of ``|DepthName - depth|``
        ``sum_downwelling`` - sum of the negative ``DepthName - depth`` values
        ``sum_upwelling``   - sum of the positive ``DepthName - depth`` values
        ``north_lim`` / ``south_lim`` - 75th / 25th percentile of ``lat`` minus station latitude
        ``east_lim`` / ``west_lim``   - 75th / 25th percentile of ``lon`` minus station longitude
        ``up_lim`` / ``down_lim``     - 75th / 25th percentile of ``depth`` minus ``DepthName``
    """
    pos_cols = ['lat', 'lon', 'StatLat', 'StatLon']
    # Hand `horiz_dist` plain tuples: indexing a label-indexed row Series with
    # x[0] relies on pandas' deprecated positional fallback.
    particle_sub['horiz_dist'] = [
        horiz_dist(pos) for pos in particle_sub[pos_cols].itertuples(index=False, name=None)
    ]
    particle_sub['vert_dist'] = particle_sub['DepthName'] - particle_sub['depth']

    # station position/depth are constant within a chunk; take the first value
    stat_lat = particle_sub.StatLat.unique()[0]
    stat_lon = particle_sub.StatLon.unique()[0]
    stat_depth = particle_sub.DepthName.unique()[0]

    sub_summary = {}
    sub_summary['sum_horiz_dist'] = particle_sub['horiz_dist'].sum()
    sub_summary['sum_vert_dist'] = abs(particle_sub['vert_dist']).sum()
    sub_summary['sum_downwelling'] = particle_sub.loc[particle_sub.vert_dist < 0, 'vert_dist'].sum()
    sub_summary['sum_upwelling'] = particle_sub.loc[particle_sub.vert_dist > 0, 'vert_dist'].sum()
    sub_summary['north_lim'] = np.percentile(particle_sub.lat, 75) - stat_lat
    sub_summary['south_lim'] = np.percentile(particle_sub.lat, 25) - stat_lat
    sub_summary['east_lim'] = np.percentile(particle_sub.lon, 75) - stat_lon
    sub_summary['west_lim'] = np.percentile(particle_sub.lon, 25) - stat_lon
    # NOTE(review): these two use depth - DepthName, the opposite sign
    # convention to `vert_dist` above -- confirm this is intended.
    sub_summary['up_lim'] = np.percentile(particle_sub.depth, 75) - stat_depth
    sub_summary['down_lim'] = np.percentile(particle_sub.depth, 25) - stat_depth
    return sub_summary


In [None]:
def summarize_particle_year(unformatted_df):
    """Split one year's sheet into samples and particles and summarise each sample.

    Parameters
    ----------
    unformatted_df : pandas.DataFrame
        Raw sheet: the first six columns hold sample metadata (including
        ``StationName``), the remaining five hold particle positions. Each
        sample row must be followed by exactly 300 particle rows.

    Returns
    -------
    (pandas.DataFrame, dict)
        The sample metadata joined column-wise with the per-sample summary from
        ``summarize_chunk``, and a dict mapping each sample's index label to
        its particle chunk from ``prep_particle_chunk``.

    Relies on the notebook-level ``stat_latlon`` table for station coordinates.
    """
    # the first six columns are the sample metadata (date, time, depth, station, agency, ...)
    sample_pts = unformatted_df.iloc[:, :6].dropna()
    print(sample_pts.StationName.unique())
    print(sample_pts.head())
    # pull out the particle data and add the right column names
    particle_data = unformatted_df.iloc[1:, 6:].dropna()
    particle_data.columns = ['y_km', 'x_km', 'lat', 'lon', 'depth']
    # column-wise conversion: same values as the row-wise form, far fewer calls
    particle_data = particle_data.apply(pd.to_numeric)
    # longitudes are stored shifted by 360 degrees
    particle_data['lon'] = particle_data['lon'] - 360
    print(particle_data.head())
    # the 300 rows following each sample row are its particle locations
    index_ranges = {i: list(range(i+1, i+301)) for i in sample_pts.index}

    assert {j for rows in index_ranges.values() for j in rows} == set(particle_data.index)

    particle_chunks = {}
    for s_p, ir_i in index_ranges.items():
        particle_chunks[s_p] = prep_particle_chunk(s_p, sample_pts, stat_latlon, particle_data, ir_i)

    # preview the last chunk built (skipped when the sheet has no samples)
    if particle_chunks:
        print(list(particle_chunks.values())[-1].head())
    print(len(particle_chunks), sum([len(i) for i in particle_chunks.values()]))

    p_summary = {}
    for sp_i, pcdf in particle_chunks.items():
        p_summary[sp_i] = summarize_chunk(pcdf)

    column_order = ['sum_horiz_dist', 'sum_vert_dist', 'sum_downwelling', 'sum_upwelling',
                    'north_lim', 'south_lim', 'east_lim', 'west_lim', 'up_lim', 'down_lim']
    # orient='index' gives one row per sample without the object-dtype
    # transpose; the summaries are all numeric, so store them as floats
    psum_df = pd.DataFrame.from_dict(p_summary, orient='index').loc[:, column_order].astype(float)
    # number of distinct column pairs with correlation in (0.6, 1.0)
    corr_mat = psum_df.corr()
    print(((corr_mat > 0.6) & (corr_mat < 1.0)).sum().sum() / 2)

    # `sort` / `verify_integrity` must be real booleans for current pandas
    samples_summarized = pd.concat((sample_pts, psum_df), axis=1, sort=True, verify_integrity=True)
    return samples_summarized.copy(), particle_chunks

In [None]:
# Leave out the sample at row 15050 together with its 300 particle rows
# before summarising the 2016 sheet.
rows_to_skip = list(range(15050, 15050 + 301))
unformatted_df2_x = unformatted_df2.drop(rows_to_skip, axis=0)

print(unformatted_df2_x.shape)
particles2016, particle_chunks16 = summarize_particle_year(unformatted_df2_x.copy())
print(particles2016.shape)

In [None]:
#particles2016.sort_values(by='DateMMDDYY').tail(50)
# Inspect the ten rows with the smallest `depth` values in the particle chunk keyed 8127.
particle_chunks16[8127].sort_values('depth').head(10)

In [None]:
# Summarise the 2015 sheet; a copy is passed so `unformatted_df1` itself is not handed to the function.
particles2015, particle_chunks15 = summarize_particle_year(unformatted_df1.copy())

In [None]:
# Sample rows that make up the 2016 depth profiles; consecutive samples are
# 301 rows apart (one sample row plus 300 particle rows).
depth_index_june16 = list(range(0, 3011, 301))
depth_index_july16 = list(range(3612, 6322, 301))
depth_index_aug16 = list(range(12040, 14148, 301))
# the last sample of each profile also stays in the transect set
depths_to_add_back = [3010, 6321, 14147]
depth_prof_idxs = depth_index_june16 + depth_index_july16 + depth_index_aug16

# transect = everything except the depth-profile samples (minus the three kept above)
transect_set = [prof_ix for prof_ix in depth_prof_idxs if prof_ix not in depths_to_add_back]
particles_transect_2016 = particles2016.drop(index=transect_set)
depth_profiles_2016 = particles2016.loc[depth_prof_idxs, :]
print(particles_transect_2016.shape)
print(depth_profiles_2016.shape)

In [None]:
# Stack the 2015 samples and the 2016 depth profiles, ordered by date then depth.
# `sort` / `ignore_index` are passed as real booleans (integer flags are rejected
# for `sort` by current pandas).
depth_profiles_1516 = pd.concat((particles2015, depth_profiles_2016), axis=0, sort=True, ignore_index=True)
particles1516srt = depth_profiles_1516.sort_values(by=['DateMMDDYY', 'DepthName'])

#particles2015srt.loc[particles2015srt.index[:20], ['DateMMDDYY', 'DepthName']]

y1 = particles1516srt['sum_upwelling'].values
y2 = particles1516srt['sum_downwelling'].values
# tick label "<date>-<depth>"; .iloc is used because x[0] on a label-indexed
# row relies on deprecated positional fallback
xlab = particles1516srt[['DateMMDDYY', 'DepthName']].apply(
    lambda x: "-".join([str(int(x.iloc[0])), str(int(x.iloc[1]))]), axis=1)

x12 = list(range(1, len(y1)+1))

fig = plt.figure(figsize=(12, 8))
ax = fig.add_axes([0, 0, 1, 1])

# both series are sign-flipped before plotting
ax.bar(x12, y2*-1, 0.5, label='Downwelling', bottom=[0]*len(y2), color='r')
ax.bar(x12, y1*-1, 0.5, label='Upwelling', bottom=[0]*len(y2), color='b')
ax.set_ylabel('Sum Vertical Distance Traveled by Particles')
ax.set_xticks(x12)
ax.set_xticklabels(list(xlab), rotation=90)
#ax.set_yticks(np.arange(0, 81, 10))
ax.legend()
plt.show()


In [None]:

# up_lim / down_lim are the 75th / 25th percentile of particle depth minus the sample depth.
y1 = particles1516srt['up_lim'].values
y2 = particles1516srt['down_lim'].values
# tick label "<date>-<depth>"; .iloc is used because x[0] on a label-indexed
# row relies on deprecated positional fallback
xlab = particles1516srt[['DateMMDDYY', 'DepthName']].apply(
    lambda x: "-".join([str(int(x.iloc[0])), str(int(x.iloc[1]))]), axis=1)

x12 = list(range(1, len(y1)+1))

fig = plt.figure(figsize=(12, 8))
ax = fig.add_axes([0, 0, 1, 1])

# NOTE(review): `up_lim` (75th percentile) is labelled "Q1" and `down_lim`
# (25th percentile) "Q3", and the y-label still says "Sum ..." -- confirm the
# intended wording.
ax.bar(x12, y1, 0.5, label='Q1 of Vertical Travel', bottom=[0]*len(y1), color='r')
ax.bar(x12, y2, 0.5, label='Q3 of Vertical Travel', bottom=[0]*len(y2), color='b')
ax.set_ylabel('Sum Vertical Distance Traveled by Particles')
ax.set_xticks(x12)
ax.set_xticklabels(list(xlab), rotation=90)
#ax.set_yticks(np.arange(0, 81, 10))
ax.legend()
plt.show()


In [None]:
# Quartile offsets of particle latitude / longitude relative to the station.
y1 = particles1516srt['south_lim'].values
y2 = particles1516srt['north_lim'].values
y3 = particles1516srt['east_lim'].values
y4 = particles1516srt['west_lim'].values

lim_columns = [col for col in particles1516srt.columns if 'lim' in col]
print(particles1516srt[lim_columns].head())

x12 = list(range(1, len(y1)+1))

fig = plt.figure(figsize=(11, 6))
ax = fig.add_axes([0, 0, 1, 1])

# (heights, legend label, colour, bar baseline); the east/west bars are drawn
# from a baseline of 0.4, the north/south bars from the default baseline.
bar_specs = [
    (y1, 'SouthRange', 'r', None),
    (y2, 'NorthRange', 'b', None),
    (y3, 'EastRange', 'cyan', 0.4),
    (y4, 'WestRange', 'magenta', 0.4),
]
for heights, series_label, colour, baseline in bar_specs:
    baseline_kw = {} if baseline is None else {'bottom': [baseline] * len(heights)}
    ax.bar(x12, heights, 0.5, label=series_label, color=colour, **baseline_kw)

ax.set_ylabel('Long/Lat Range for 95% of Particles')
ax.set_xticks(x12)
# tick labels come from `xlab`, which is defined in an earlier cell
ax.set_xticklabels(list(xlab), rotation=90)
#ax.set_yticks(np.arange(0, 81, 10))
ax.legend()
plt.show()
                      

In [None]:
# Per-station mean and standard deviation of the vertical summaries for the 2016 transect.
particles2016srt = particles_transect_2016.sort_values(by=['StationName', 'DateMMDDYY', 'DepthName'])
quant_cols = ['sum_upwelling', 'sum_downwelling', 'up_lim', 'down_lim']

# String aggregation names keep pandas' own mean/std; passing np.mean / np.std
# is the deprecated callable form. The columns are cast to float first so the
# aggregation does not depend on how the summary frame was assembled.
updown_df = (
    particles2016srt.loc[:, quant_cols + ['StationName']]
    .astype({col: float for col in quant_cols})
    .groupby('StationName')
    .agg(['mean', 'std'])
)
#updown_df = updown_df.fillna(0)
print(updown_df.head())

y1 = updown_df[('sum_upwelling', 'mean')].values
y2 = updown_df[('sum_downwelling', 'mean')].values
y1s = updown_df[('sum_upwelling', 'std')].values
y2s = updown_df[('sum_downwelling', 'std')].values

y3 = updown_df[('up_lim', 'mean')].values
y4 = updown_df[('down_lim', 'mean')].values
y3s = updown_df[('up_lim', 'std')].values
y4s = updown_df[('down_lim', 'std')].values

xlab = list(updown_df.index)
x12 = np.array(list(range(1, len(y1)+1)))


fig = plt.figure(figsize=(6, 6), dpi=300)
ax1, ax2 = fig.subplots(2, 1, sharex=True)

# top panel: summed vertical travel (downwelling is sign-flipped before plotting)
ax1.bar(x12-0.2, y2*-1, 0.4, label='Downwelling', bottom=[0]*len(y2), color='white', edgecolor='grey',
        yerr=y2s, ecolor='black', capsize=2)
ax1.bar(x12+0.2, y1, 0.4, label='Upwelling', bottom=[0]*len(y2), color='white', edgecolor='grey', hatch='xxx',
        yerr=y1s, ecolor='black', capsize=2)
ax1.set_ylabel('Sum distance travelled')
ax1.spines['right'].set_visible(False)
ax1.spines['top'].set_visible(False)
ax1.set_xticks(x12)
ax1.set_xticklabels(list(xlab), rotation=45)
#ax1.tick_params(bottom=False, top=False, left=True, right=False, 
#                labelbottom=False, labeltop=False, labelleft=True, labelright=False)
ax1.legend()
# bottom panel: quartile offsets of particle depth relative to the sample depth
# NOTE(review): `up_lim` is the 75th percentile but is labelled 'Q1' (and
# `down_lim`, the 25th, 'Q3') -- confirm the intended wording.
ax2.bar(x12-0.2, y3, 0.4, label='Q1', bottom=[0]*len(y3), color='white', edgecolor='grey',
        yerr=y3s, ecolor='black', capsize=2)
ax2.bar(x12+0.2, y4, 0.4, label='Q3', bottom=[0]*len(y4), color='white', edgecolor='grey', hatch='xxx',
        yerr=y4s, ecolor='black', capsize=2)
ax2.set_ylabel('Sinking Distance')
ax2.set_xticks(x12)
ax2.set_xticklabels(list(xlab), rotation=45)
ax2.spines['right'].set_visible(False)
ax2.spines['top'].set_visible(False)
ax2.legend()
plt.show()


In [None]:
# Per-station mean and standard deviation of the horizontal quartile offsets.
quant_cols = ['south_lim', 'north_lim', 'east_lim', 'west_lim']
# String aggregation names keep pandas' own mean/std; passing np.mean / np.std
# is the deprecated callable form. Columns are cast to float first so the
# aggregation does not depend on how the summary frame was assembled.
allround_df = (
    particles2016srt.loc[:, quant_cols + ['StationName']]
    .astype({col: float for col in quant_cols})
    .groupby('StationName')
    .agg(['mean', 'std'])
)
print(allround_df.head())

y1 = allround_df[('south_lim', 'mean')].values
y2 = allround_df[('north_lim', 'mean')].values
y1s = allround_df[('south_lim', 'std')].values
y2s = allround_df[('north_lim', 'std')].values

y3 = allround_df[('east_lim', 'mean')].values
y4 = allround_df[('west_lim', 'mean')].values
y3s = allround_df[('east_lim', 'std')].values
y4s = allround_df[('west_lim', 'std')].values

xlab = list(allround_df.index)
x12 = np.array(list(range(1, len(y1)+1)))


fig = plt.figure(figsize=(6, 6), dpi=300)
ax1, ax2 = fig.subplots(2, 1, sharex=True)

# top panel: north/south limits, which are latitude quartiles minus station latitude
ax1.bar(x12-0.2, y2, 0.4, label='North', bottom=[0]*len(y2), color='white', edgecolor='grey',
        yerr=y2s, ecolor='black', capsize=2)
ax1.bar(x12+0.2, y1, 0.4, label='South', bottom=[0]*len(y2), color='white', edgecolor='grey', hatch='xxx',
        yerr=y1s, ecolor='black', capsize=2)
# the axis label previously said Lon here; north/south offsets are latitudes
ax1.set_ylabel(r'$Lat_{quartile} - Lat_{stat}$')
ax1.spines['right'].set_visible(False)
ax1.spines['top'].set_visible(False)
ax1.set_xticks(x12)
ax1.set_xticklabels(list(xlab), rotation=45)
#ax1.tick_params(bottom=False, top=False, left=True, right=False, 
#                labelbottom=False, labeltop=False, labelleft=True, labelright=False)
ax1.legend()
# bottom panel: east/west limits, which are longitude quartiles minus station longitude
ax2.bar(x12-0.2, y3, 0.4, label='East', bottom=[0]*len(y3), color='white', edgecolor='grey',
        yerr=y3s, ecolor='black', capsize=2)
ax2.bar(x12+0.2, y4, 0.4, label='West', bottom=[0]*len(y4), color='white', edgecolor='grey', hatch='xxx',
        yerr=y4s, ecolor='black', capsize=2)
# the axis label previously said Lat here; east/west offsets are longitudes
ax2.set_ylabel(r'$Lon_{quartile} - Lon_{stat}$')
ax2.set_xticks(x12)
ax2.set_xticklabels(list(xlab), rotation=45)
ax2.spines['right'].set_visible(False)
ax2.spines['top'].set_visible(False)
ax2.legend()
plt.show()


In [None]:
# Station groups 1-6.
statg1 = ['CB22', 'CB31', 'CB32']
statg2 = ['CB33C', 'CB41C']
statg3 = ['CB42C', 'CB43C', 'CB44', 'CB51']
statg4 = ['CB52', 'CB53', 'CB54', 'CB71']
statg5 = ['CB61', 'CB62', 'CB63', 'CB64']
statg6 = ['CB72', 'CB73', 'CB74']

# One lookup replaces the empty-Series column plus six .loc assignments;
# stations not listed above get NaN, as before. The result is a numeric
# column, so the groupby mean below works on it directly.
station_to_group = {
    station: group_no
    for group_no, members in enumerate([statg1, statg2, statg3, statg4, statg5, statg6], start=1)
    for station in members
}
env_data['StatName'] = env_data['StationName'].map(station_to_group)

# Per station: mean and count of StatName, Latitude and Longitude. Only the
# group number, its count and the two coordinate means are kept; 'x' and 'y'
# are the latitude/longitude counts.
statdat = env_data[['StationName', 'StatName', 'Latitude', 'Longitude']].groupby('StationName').agg(['mean', 'count'])
statdat.columns = ['StationGroup', 'NumSamples', 'Lat', 'x', 'Lon', 'y']
statdat = statdat.drop(columns=['x', 'y'])
statdat

In [None]:
all_particles = pd.concat(list(particle_chunks16.values()), axis=0, sort=True, verify_integrity=True)
#ap_m = all_particles[['lat', 'lon']].mean()
#ap_sd = all_particles[['x_km', 'y_km']].std()


# Collect, per station group, the particle positions of all its transect
# samples plus the coordinates of the group's stations.
part_data = {}
for stat_group_i in statdat.StationGroup.unique():
    stat_group_names = list(statdat[statdat.StationGroup == stat_group_i].index)
    stat_group_bool = particles_transect_2016.StationName.isin(stat_group_names)
    stat_group_ixs = list(particles_transect_2016[stat_group_bool].index)
    print("Group", int(stat_group_i), ":", stat_group_names)
    print("Row #s:", stat_group_ixs)

    stat_grp_particles = pd.concat([particle_chunks16[i] for i in stat_group_ixs], 
                                    axis=0, sort=True, verify_integrity=True)

    print("N particles:", len(stat_grp_particles))
    x_pos = stat_grp_particles['lat']# - ap_m['lat']) #/ap_sd['x_km']
    y_pos = stat_grp_particles['lon']# - ap_m['lon']) #/ap_sd['y_km']
    # Keep particles whose sqrt(lat^2 + lon^2) lies strictly between its 5th
    # and 95th percentile.
    # NOTE(review): with the centring commented out this is a distance from
    # (0, 0) in degree space -- confirm that is the intended trimming.
    xy_vec = np.sqrt(x_pos**2 + y_pos**2)
    dist_lims = np.percentile(xy_vec, [5, 95])
    in_range = (xy_vec > dist_lims[0]) & (xy_vec < dist_lims[1])
    x_pos = x_pos[in_range]
    y_pos = y_pos[in_range]
    part_data[int(stat_group_i)] = {'x': x_pos, 'y': y_pos, 
                               'sx': statdat.loc[stat_group_names, 'Lat'],
                               'sy': statdat.loc[stat_group_names, 'Lon'],
                               'l': "Group{}".format(int(stat_group_i))}

import seaborn as sns
np.random.seed(10)
sns.set_style("whitegrid")

#sns.set(color_codes=True, font_scale=1)
kde_f, kde_ax = plt.subplots(figsize=(6,8), dpi=150)
#kde_ax.tick_params(axis='both', which='major', labelsize=22)

# (group number, density colour map, station marker colour).
# Each group is drawn exactly once; group 5 used to be drawn twice by a
# copy-pasted block, which layered its density on top of itself.
group_styles = [
    (1, "Reds", 'red'),
    (2, "Greens", 'green'),
    (3, "Blues", 'blue'),
    (4, "Purples", 'purple'),
    (5, "Greys", 'k'),
    (6, "Oranges", 'orange'),
]
for group_no, density_cmap, marker_colour in group_styles:
    group_pts = part_data[group_no]
    # longitude on the horizontal axis, latitude on the vertical axis
    sns.kdeplot(group_pts['y'], group_pts['x'], cmap=density_cmap, shade=True, shade_lowest=False, 
                alpha=0.8, ax=kde_ax)
    kde_ax.scatter(x=group_pts['sy'], y=group_pts['sx'], c=marker_colour, edgecolors='k')

#kde_ax.spines['right'].set_visible(False)
#kde_ax.spines['top'].set_visible(False)
kde_ax.set_xlim((-77, -75))
kde_ax.set_ylim((36.75, 39.75))


# coastline overlay drawn on the same axes
m2 = Basemap(llcrnrlat=36.75,urcrnrlat=39.75,llcrnrlon=-77,urcrnrlon=-75, resolution='f', ax=kde_ax)
m2.drawcoastlines(linewidth=0.5)
m2.fillcontinents(color='lightgrey',lake_color='white')
m2.drawparallels(np.arange(36.5,40,0.5), labels=[True,True,False,False], dashes=[2,2])
m2.drawmeridians(np.arange(-78.,-74,0.5), labels=[False,False,False,True], dashes=[2,2])
m2.drawmapboundary(fill_color='white')

plt.show()


In [None]:
# Pairwise beta-diversity table; the "_wu" suffix is stripped from the column names.
f = "~/Google Drive/SiYi_Xiaotong_Materials/bray_curtis_betadiversity.txt"
bd_df = pd.read_csv(f, sep="\t", index_col=0)
bd_df.columns = [col.replace("_wu", "") for col in bd_df.columns]

cluster_ids = list(range(1, 7))
sn_df = pd.DataFrame(index=cluster_ids, columns=cluster_ids)

# env_data row labels belonging to each station group (StatName 1-6)
cluster_members = {
    cluster_id: env_data[env_data.StatName == cluster_id].index for cluster_id in sn_df.index
}

# mean dissimilarity between every pair of groups (mean of the column means)
for i in sn_df.index:
    for j in sn_df.index:
        sn_df.loc[i, j] = bd_df.loc[cluster_members[i], cluster_members[j]].mean().mean()

# per group: within-group mean divided by the smallest of the other values in its row
score_c = []
for i in sn_df.index:
    within = sn_df.iloc[i-1, i-1]
    others = list(sn_df.iloc[i-1, :])
    others.remove(within)
    ratio = within / min(others)
    print(i, round(ratio, 3))
    score_c.append(ratio)

print(sum(score_c))

In [None]:
# ['CB72', 'CB73', 'CB74'] 5.739730923390547
#['CB63', 'CB73', 'CB74'] : 5.811574898837427
#73 and 74 alone : 5.713039267650133

In [None]:
import matplotlib as mpl
print(particles2016.DateMMDDYY.unique())

# Compare dates as digit strings regardless of how DateMMDDYY is stored:
# `isin` with string literals silently matches nothing when the column is
# numeric (other cells convert it with int(...)).
transect_dates = particles_transect_2016.DateMMDDYY.astype(int).astype(str)

# NOTE(review): every station of the transect is in `station_set_3`, so the
# station filters below currently keep all rows.
station_set_3 = list(particles_transect_2016.StationName.unique())

# July cruise
date_range = ['71316', '71116', '71216']
sub2016_1 = particles_transect_2016[transect_dates.isin(date_range)]
sub2016_jul = sub2016_1[sub2016_1.StationName.isin(station_set_3)]

# August cruise
date_range_2 = ['81016', '81216', '80816', '80916']
sub2016_2 = particles_transect_2016[transect_dates.isin(date_range_2)]
sub2016_aug = sub2016_2[sub2016_2.StationName.isin(station_set_3)]

# default colour cycle repeated five times, indexed by sample position when plotting
mlp_colors = list(mpl.rcParams['axes.prop_cycle']) * 5



In [None]:
def _draw_transect_map(ax, sample_subset):
    """Draw the base map on `ax` and overlay each sample's particles and station.

    Every row of `sample_subset` gets its own colour from `mlp_colors`: small
    dots for its particles and a diamond (labelled with the station name) at
    the station position. Returns the Basemap instance.
    """
    bay_map = Basemap(llcrnrlat=36.75,urcrnrlat=39.75,llcrnrlon=-77,urcrnrlon=-75, resolution='h', ax=ax)
    bay_map.drawcoastlines(linewidth=0.5)
    bay_map.fillcontinents(color='tan',lake_color='lightblue')
    # parallels and meridians every half degree
    bay_map.drawparallels(np.arange(36.5,40,0.5), labels=[True,True,False,False], dashes=[2,2])
    bay_map.drawmeridians(np.arange(-78.,-74,0.5), labels=[False,False,False,True], dashes=[2,2])
    bay_map.drawmapboundary(fill_color='lightblue')
    bay_map.drawcountries(linewidth=2, linestyle='solid', color='k' ) 
    bay_map.drawstates(linewidth=0.5, linestyle='solid', color='k')
    bay_map.drawrivers(linewidth=0.5, linestyle='solid', color='blue')

    for position, idx_no in enumerate(sample_subset.index):
        this_col = mlp_colors[position]['color']
        statname = sample_subset.loc[idx_no, 'StationName']
        chunk = particle_chunks16[idx_no]
        bay_map.scatter(x=chunk['lon'], y=chunk['lat'], s=1, c=this_col, label=None)
        bay_map.scatter(x=stat_latlon.loc[statname, 'Longitude'],
                        y=stat_latlon.loc[statname, 'Latitude'],
                        s=35, marker='d', edgecolor='k', linewidths=0.7, c=this_col, label=statname)
    return bay_map


# left panel: July samples; right panel: August samples
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 10), dpi=150)
m = _draw_transect_map(axes[0], sub2016_jul)
m2 = _draw_transect_map(axes[1], sub2016_aug)

axes[0].legend()
axes[1].legend()

In [None]:
# Write the 2016 transect summary table as tab-separated text.
outp = '/Volumes/KeithSSD/CB_V4/otu_data/mixing_data'
outf = "2016_transect_mixing_data_products.txt"
transect_out_path = os.path.join(outp, outf)
particles_transect_2016.to_csv(transect_out_path, sep="\t")


In [None]:
# Reset matplotlib styling to its defaults, then use a 10 pt font.
mpl.rcParams.update(mpl.rcParamsDefault)

font = {'size': 10}

mpl.rc('font', **font)

fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(9, 8), dpi=300)
m = Basemap(llcrnrlat=36.75,urcrnrlat=39.75,llcrnrlon=-77.5,urcrnrlon=-74.5, resolution='i', ax=axes,
           projection='merc')
m.drawcoastlines(linewidth=0.5)
m.fillcontinents(color='lightgrey',lake_color='white')
m.drawparallels(np.arange(36.5,40,0.5),labels=[True,False,False,False], dashes=[2,2])
m.drawmeridians(np.arange(-78.,-74,0.5), labels=[False,False,False,True], dashes=[2,2])
m.drawmapboundary(fill_color='white')
#m.drawcountries(linewidth=1.5, linestyle='solid', color='k' ) 
#m.drawstates(linewidth=1.5, linestyle='solid', color='k')
#m.drawrivers(linewidth=2, linestyle='solid', color='white')
m.drawmapscale(lon=-74.75, lat=36.875, lon0=-75.25, lat0=36.875, length=50)

# one marker shape per station group, all drawn in black
marker_list = ['s', 'o', 'p', 'x', 'd', '*']
for idx_ in statdat.StationGroup.unique():
    substat = statdat[statdat.StationGroup == idx_]
    subx, suby = m(substat['Lon'].values, substat['Lat'].values)
    m.scatter(x=subx, y=suby, s=20, c='k', label="Group"+str(int(idx_)),
              marker=marker_list[int(idx_)-1], zorder=10)

# Label offsets in degrees (d_lon, d_lat), hand-tuned so the boxes do not
# overlap; stations not listed use `default_offset`. 'CB32' used to appear in
# two branches of the if/elif chain -- only the first one was ever reached, and
# that is the value kept here.
label_offsets = {
    'CB63': (-0.35, 0),
    'CB22': (-0.35, 0.03), 'CB32': (-0.35, 0.03), 'CB54': (-0.35, 0.03), 'CB61': (-0.35, 0.03),
    'CB44': (-0.35, -0.01),
    'CB42C': (-0.39, 0.03),
    'CB31': (0.1, -0.02), 'CB43C': (0.1, -0.02), 'CB51': (0.1, -0.02),
    'CB71': (0.1, 0.01),
    'CB72': (0.09, -0.02),
    'CB62': (0.09, 0.03),
}
default_offset = (0.1, 0)

bbox_args = dict(boxstyle="round", fc="0.7")
for row in statdat.index:
    # display name with a dot after the third character
    rownew = row[:3] + '.' + row[3:]
    realx, realy = statdat.loc[row, 'Lon'], statdat.loc[row, 'Lat']
    d_lon, d_lat = label_offsets.get(row, default_offset)
    subx, suby = m(realx + d_lon, realy + d_lat)
    axes.annotate(rownew, (subx, suby), bbox=bbox_args, size=10)

axes.legend()
plt.show()

In [None]:
import numpy as np

import pandas
import matplotlib.pyplot as plt
from matplotlib import rcParams



# Scratch cell: sets the default font to Arial, draws two random samples and
# saves a figure that only has axis labels.
rcParams['font.family'] = 'sans-serif'
rcParams['font.sans-serif'] = ['Arial']

# turn interactive plotting off
plt.ioff()


# NOTE(review): both `cov` matrices below are non-symmetric as written
# (off-diagonals 1 vs .5, and 2 vs 7) -- confirm the intended covariances.
mean, cov = [0, 2], [(2, 1), (.5, 1)]
x1, y1 = np.random.multivariate_normal(mean, cov, size=50).T

mean, cov = [5, 7], [(3, 2), (7, 1)]
x2, y2 = np.random.multivariate_normal(mean, cov, size=50).T



# NOTE(review): x1/y1/x2/y2 are never plotted before the figure is saved.
plt.xlabel("foo", fontsize=22)
plt.ylabel("bar", fontsize=22)
plt.savefig("foo_vs_bar.png")