In [1]:
import pandas as pd
import numpy as np

# Review of previous data

## Notes

After the end of the IQAASL project, the data was stored separately and other surveys were conducted. The results of those surveys were stored in separate places, and the codes used for the different projects were not all the same.

Here we standardize the records, identify locations or records that cannot be verified, and define the set of codes used since 2015.

### Eliminated survey locations

The following locations were either duplicated or the length of the shoreline could not be verified.

```python

not_these = [
    'sihlsee_einsiedeln_schilligerllacherl',
    'schiffenensee_duedingen_hirschij',
    'lac-leman-hammerdirt',
    'thur_schoenenberg_schaera',
    'katzenbach_zuerich_sanesim',
    'inn_pradella_kohlt',
    'emme_luterbach_huggenbergerk',
    'lotschebach_bern_scheurerk',
    'mammern-swisslitterreport',
    'berlingen-swisslitterreport'    
]
```

### Eliminated codes

The codes G909, G910, G911 and G912 were eliminated. The recorded value was placed under the parent code:

1. G909, G910 => G74
2. G911 => G81
3. G912 => G82

### quantity

In [None]:
# `test_this` and `t_t` are loaded a few cells further down, once `c_cols` and
# `agg_this` have been defined. Running the same two statements here raised a
# NameError on a fresh kernel (Restart & Run All), so they are not repeated.


In [2]:
def collect_vitals(data):
    """Summarise a survey-results frame as a tuple of key figures.

    `data` must provide the columns quantity, pcs_m, loc_date, code,
    location, water_name_slug and date.

    Returns, in this order: the sum of quantity, the median of pcs_m, the
    number of distinct loc_date values, distinct codes, distinct locations,
    distinct water_name_slug values, the largest date and the smallest date.
    """
    dates = data["date"]
    return (
        data["quantity"].sum(),
        data["pcs_m"].median(),
        data["loc_date"].nunique(),
        data["code"].nunique(),
        data["location"].nunique(),
        data["water_name_slug"].nunique(),
        dates.max(),
        dates.min(),
    )

def find_missing(more_than, less_than):
    """Return the values found in `more_than` that do not occur in `less_than`.

    Thin wrapper around numpy.setdiff1d, so the result is a sorted array of
    unique values.
    """
    only_in_first = np.setdiff1d(more_than, less_than)
    return only_in_first
def find_missing_loc_dates(done, dtwo):
    """Return the loc_date values present in `done` but absent from `dtwo`.

    Both arguments are frames with a `loc_date` column; the comparison is
    delegated to find_missing.
    """
    return find_missing(done["loc_date"].unique(), dtwo["loc_date"].unique())

def aggregate_gcaps_gfoams_gfrags(data, codes, columns=("Gfoams", "Gfrags", "Gcaps")):
    """Relabel detail codes with the aggregate code they belong to.

    For each aggregate name in `columns`, every row of `data` whose `code` is
    listed in `codes` with that name as its `parent_code` gets its `code`
    replaced by the aggregate name. Rows are only relabelled; no quantities
    are summed here.

    Parameters
    ----------
    data : DataFrame
        Must have a `code` column. It is modified in place, so pass a copy
        when the original labels are still needed.
    codes : DataFrame
        Indexed by code, with a `parent_code` column.
    columns : iterable of str
        Aggregate code names to collapse into. The default is an immutable
        tuple so it cannot be altered between calls.

    Returns
    -------
    DataFrame
        The same object that was passed as `data`.
    """
    for parent in columns:
        children = codes.loc[codes["parent_code"] == parent].index
        data.loc[data["code"].isin(children), "code"] = parent

    return data
# Code reference table, indexed by code so that the properties of a code can
# be looked up with dfCodes.loc[<code>].
dfCodes = pd.read_csv("data/end_process/codes.csv")
code_cols = ['material', 'description', 'source', 'parent_code', 'single_use', 'groupname']
dfCodes = dfCodes.set_index("code", drop=True)

# Survey locations left out of every data set below (see the
# "Eliminated survey locations" note at the top of the notebook).
not_these = [
    'sihlsee_einsiedeln_schilligerllacherl',
    'schiffenensee_duedingen_hirschij',
    'lac-leman-hammerdirt',
    'thur_schoenenberg_schaera',
    'katzenbach_zuerich_sanesim',
    'inn_pradella_kohlt',
    'emme_luterbach_huggenbergerk',
    'lotschebach_bern_scheurerk',
    'mammern-swisslitterreport',
    'berlingen-swisslitterreport',
]

# Columns kept from each source file. Defined once; the original cell
# assigned this identical list twice.
a_cols = [
    'loc_date',
    'date',
    'water_name_slug',
    'location',
    'code',
    'pcs_m',
    'quantity',
    'river_bassin',
    'length',
    'groupname',
    'city',
]

# Grouping keys used when summing records: everything in a_cols except the
# two value columns (pcs_m and quantity).
c_cols = [
    'loc_date',
    'location',
    'date',
    'water_name_slug',
    'river_bassin',
    'length',
    'groupname',
    'city',
    'code',
]

# Aggregation applied to the value columns of each group.
agg_this = {
    "quantity": "sum",
    "pcs_m": "sum",
}

# Aggregated export: drop the excluded locations, keep the shared columns.
a = pd.read_csv("data/from_prev/agg_results_with_land_use_2015.csv")
a = a[~a.location.isin(not_these)].copy()

a_c = a[a_cols].copy()
# Duplicates are removed before the recoding below, so two different retired
# codes with the same quantity are not mistaken for one record.
a_c = a_c.drop_duplicates(["loc_date", "location", "code", "quantity"])

# Fold the eliminated codes into their parent codes
# (G909, G910 -> G74; G911 -> G81; G912 -> G82).
a_c.loc[a_c.code.isin(["G909", "G910"]), "code"] = "G74"
a_c.loc[a_c.code == "G911", "code"] = "G81"
a_c.loc[a_c.code == "G912", "code"] = "G82"

loc_dates_a = a_c.loc_date.unique()

In [3]:
# Load data/end_process/new_all.csv (the same file this notebook writes back
# further down) and sum quantity and pcs_m for each combination of the
# c_cols grouping keys.
test_this = pd.read_csv("data/end_process/new_all.csv")
t_t = test_this.groupby(c_cols, as_index=False).agg(agg_this)
# Displayed tuple: (total quantity, median pcs_m, n loc_dates, n codes,
# n locations, n water bodies, latest date, earliest date).
collect_vitals(t_t)

(196842, 0.0, 1352, 239, 245, 59, '2022-10-06', '2015-11-23')

In [4]:
a_c_vitals = collect_vitals(a_c)
a_c_vitals

(194965, 0.0, 1348, 184, 245, 59, '2021-08-28', '2015-11-23')

In [5]:
find_missing_loc_dates(t_t, a_c)

array(["('parc-des-pierrettes', '2022-10-06')",
       "('plage-de-st-sulpice', '2021-10-07')",
       "('plage-de-st-sulpice', '2022-10-06')",
       "('tiger-duck-beach', '2021-10-07')",
       "('villa-barton', '2021-11-14')"], dtype=object)

In [6]:
find_missing_loc_dates(a_c, t_t)

array(["('maladaire', '2021-06-02')"], dtype=object)

In [7]:
# Non-aggregated export: exclude the eliminated locations, keep the shared
# columns and drop exact repeats of (loc_date, location, code, quantity).
c = pd.read_csv("data/from_prev/all_the_data_eos_not_aggregated.csv")
c = c[~c.location.isin(not_these)].copy()

c_c = c[a_cols].copy().drop_duplicates(["loc_date", "location", "code", "quantity"])

# Fold the eliminated codes into their parent codes.
c_c["code"] = c_c["code"].replace(
    {"G909": "G74", "G910": "G74", "G911": "G81", "G912": "G82"}
)

loc_dates_c = c_c["loc_date"].unique()
collect_vitals(c_c)

(193826, 0.0, 1341, 235, 238, 57, '2021-08-28', '2015-11-23')

In [8]:
find_missing_loc_dates(t_t, c_c)

array(["('clean-up-tour-airolo', '2021-08-05')",
       "('clean-up-tour-charmey', '2021-05-08')",
       "('clean-up-tour-grindelwald', '2021-05-29')",
       "('clean-up-tour-les-diablerets', '2021-05-15')",
       "('clean-up-tour-san-bernardino', '2021-08-06')",
       "('clean-up-tour-val-calanca', '2021-08-07')",
       "('monte-generoso', '2021-05-02')",
       "('parc-des-pierrettes', '2022-10-06')",
       "('plage-de-st-sulpice', '2021-10-07')",
       "('plage-de-st-sulpice', '2022-10-06')",
       "('tiger-duck-beach', '2021-10-07')",
       "('villa-barton', '2021-11-14')"], dtype=object)

In [9]:
# Checked alpine survey data; exact repeats of
# (loc_date, location, code, quantity) are dropped.
alp_c = pd.read_csv("data/from_prev/checked_alpes_survey_data.csv")
alp_c = alp_c.drop_duplicates(["loc_date", "location", "code", "quantity"])
# Number of distinct values in the location column.
len(alp_c.location.unique())

20

In [10]:
# loc_dates present in the aggregated export but absent from the
# non-aggregated one.
tgh = find_missing(loc_dates_a, loc_dates_c)

# Work on a copy of the alpine data and fold the eliminated codes into their
# parent codes. Each recode is applied once; the original cell repeated the
# G909/G910 line, which had no further effect.
m_alpes = alp_c.copy()
m_alpes.loc[m_alpes.code.isin(["G909", "G910"]), "code"] = "G74"
m_alpes.loc[m_alpes.code == "G911", "code"] = "G81"
m_alpes.loc[m_alpes.code == "G912", "code"] = "G82"
collect_vitals(m_alpes)

(7776, 0.0, 20, 230, 20, 7, '2021-08-28', '2021-04-24')

In [11]:
find_missing_loc_dates(m_alpes, t_t)

array([], dtype=object)

In [12]:
find_missing_loc_dates(t_t, m_alpes)

array(["('aabach', '2020-10-22')", "('aare-limmatspitz', '2020-07-13')",
       "('aare-port', '2021-04-23')", ...,
       "('zurichsee_wollishofen_langendorfm', '2020-12-10')",
       "('zurichsee_wollishofen_langendorfm', '2021-01-10')",
       "('zurichsee_wollishofen_langendorfm', '2021-02-12')"],
      dtype=object)

In [13]:
# Stack the checked alpine surveys on top of every c_c record whose
# river_bassin is not "les-alpes".
mm = pd.concat([m_alpes[a_cols], c_c[c_c.river_bassin != "les-alpes"][a_cols]])
# NOTE(review): both inputs have already been recoded (G909/G910 -> G74, ...),
# so two originally different codes with the same quantity in one survey now
# share all four key columns and one of them is dropped here — confirm that
# this is intended.
mm =mm.drop_duplicates(["loc_date", "location", "code", "quantity"])
collect_vitals(mm)

(194915, 0.0, 1348, 238, 245, 59, '2021-08-28', '2015-11-23')

In [14]:
find_missing_loc_dates(t_t, mm)

array(["('parc-des-pierrettes', '2022-10-06')",
       "('plage-de-st-sulpice', '2021-10-07')",
       "('plage-de-st-sulpice', '2022-10-06')",
       "('tiger-duck-beach', '2021-10-07')",
       "('villa-barton', '2021-11-14')"], dtype=object)

In [15]:
# Earlier version of the code table, indexed by code.
# NOTE(review): dfCodesx is not referenced by any other cell visible in this
# notebook, and code_cols is re-assigned here with the value it already has.
dfCodesx = pd.read_csv("data/from_prev/codes_with_group_names_2015 (1).csv")
code_cols = ['material', 'description', 'source', 'parent_code', 'single_use', 'groupname']
# dfCodes = dfCodes[columns].copy()
dfCodesx = dfCodesx.set_index("code", drop=True)

In [16]:
mmlocd = mm.loc_date.unique()
find_missing(loc_dates_a, mmlocd)

array([], dtype=object)

In [17]:
find_missing(mmlocd, loc_dates_c)

array(["('clean-up-tour-airolo', '2021-08-05')",
       "('clean-up-tour-charmey', '2021-05-08')",
       "('clean-up-tour-grindelwald', '2021-05-29')",
       "('clean-up-tour-les-diablerets', '2021-05-15')",
       "('clean-up-tour-san-bernardino', '2021-08-06')",
       "('clean-up-tour-val-calanca', '2021-08-07')",
       "('monte-generoso', '2021-05-02')"], dtype=object)

In [18]:
# Third export of the same surveys: exclude the eliminated locations and drop
# exact repeats of (loc_date, location, code, quantity).
# The original cell also filtered `loc_date` against `not_these`, but that
# list holds location slugs while loc_date holds "('<location>', '<date>')"
# strings, so the filter could never match and has been removed.
b = pd.read_csv("data/from_prev/u_all_data.csv")
b = b[~b.location.isin(not_these)].copy()
b = b.drop_duplicates(["loc_date", "location", "code", "quantity"])
b_vitals = collect_vitals(b)
b_vitals

(188521, 0.0, 1331, 228, 225, 52, '2021-11-14', '2015-11-23')

In [19]:
# Differences between the loc_dates of `b` and those of the other data sets.
# Each result holds the values of the first argument that are absent from the
# second.
missingx = find_missing(alp_c.loc_date.unique(), b.loc_date.unique())   # alpine surveys not in b
missingxb = find_missing(loc_dates_a, b.loc_date.unique())              # aggregated export not in b
missingxc = find_missing(mm.loc_date.unique(), b.loc_date.unique())     # combined set not in b
missingbm = find_missing(b.loc_date.unique(), mm.loc_date.unique())     # b not in combined set

In [20]:
find_missing_loc_dates(t_t, b)

array(["('clean-up-tour-airolo', '2021-08-05')",
       "('clean-up-tour-andermatt', '2021-06-19')",
       "('clean-up-tour-cabanes-des-diablerets', '2021-08-28')",
       "('clean-up-tour-charmey', '2021-05-08')",
       "('clean-up-tour-crans-montana', '2021-06-12')",
       "('clean-up-tour-grindelwald', '2021-05-29')",
       "('clean-up-tour-la-berra', '2021-06-05')",
       "('clean-up-tour-la-robella', '2021-05-15')",
       "('clean-up-tour-la-tzoumaz', '2021-05-22')",
       "('clean-up-tour-les-crosets', '2021-06-06')",
       "('clean-up-tour-les-diablerets', '2021-05-15')",
       "('clean-up-tour-les-paccots', '2021-04-24')",
       "('clean-up-tour-morgins', '2021-06-05')",
       "('clean-up-tour-nendaz', '2021-07-04')",
       "('clean-up-tour-san-bernardino', '2021-08-06')",
       "('clean-up-tour-val-calanca', '2021-08-07')",
       "('clean-up-tour-verbier', '2021-06-13')",
       "('clean-up-tour-veysonnaz', '2021-07-03')",
       "('clean-up-tour-villars', '2021-

In [21]:
find_missing_loc_dates(b, t_t)

array(["('maladaire', '2021-06-02')"], dtype=object)

### codes

In [22]:
dfCodes.head()

Unnamed: 0_level_0,material,description,source,parent_code,single_use,groupname
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
G708,Metal,Batons de ski,Usagers,G199,False,recreation
G712,Cloth,Gants de ski,Usagers,G135,False,recreation
G902,Cloth,"Mask medical, cloth",Personal hygiene,G145,False,personal items
G917,Glass,Terracotta balls,Utility items,G210,False,unclassified
G921,Glass,Ceramic tile and pieces,Construction,G204,False,infrastructure


### survey areas

### after August 2021

In [23]:
dfCodes.loc["Gcaps"]

material                                           plastic
description    plastic caps, lid rings: G21, G22, G23, G24
source                                             Usagers
parent_code                                          Gcaps
single_use                                           False
groupname                               packaging non food
Name: Gcaps, dtype: object

In [24]:
# Surveys from the supplementary export: normalise the "pcs/m" column name to
# "pcs_m" and drop exact repeats of (loc_date, location, code, quantity).
f = (
    pd.read_csv("data/from_prev/u_sup_after.csv")
    .rename(columns={"pcs/m": "pcs_m"})
    .drop_duplicates(["loc_date", "location", "code", "quantity"])
)
after_locs = f["loc_date"].unique()

In [25]:
# Rescale pcs_m for one survey, ('plage-de-st-sulpice', '2022-10-06').
# The Gfrags rows are set aside and left untouched; pcs_m of every other row
# of that survey is multiplied by 100.
# NOTE(review): presumably pcs_m was recorded on a different scale for this
# survey — confirm the factor of 100.
# NOTE(review): `f` is rebuilt from itself at the end of the cell, so running
# this cell a second time multiplies the same rows by 100 again.
fpp = f[f.loc_date == "('plage-de-st-sulpice', '2022-10-06')"].copy()
fpgf = fpp[fpp.code == "Gfrags"].copy()
fpp = fpp[fpp.code != "Gfrags"].copy()

fpp["pcs_m"] = fpp.pcs_m*100
# For rows with pcs_m > 0: pcs_m / quantity, then the mean of its reciprocal
# (i.e. the mean of quantity / pcs_m), printed as a check value.
vals = fpp[fpp.pcs_m >0].pcs_m/fpp[fpp.pcs_m > 0].quantity
vmean = np.mean(1/vals)
print(vmean)
# Put the untouched Gfrags rows back with the rescaled rows.
fpp = pd.concat([fpp, fpgf])
# fpp["length"] = 1/(fpp.pcs_m/fpp.quantity)
# av_len = np.mean([x for x in fpp.length.values if isinstance(x, float)])
# fpp["length"] = vmean.astype(int)
# Replace the survey's rows in f with the rescaled version.
f = pd.concat([f[f.loc_date != "('plage-de-st-sulpice', '2022-10-06')"].copy(), fpp])

42.50992063492071


In [26]:
# Derive one length per loc_date from the rows with pcs_m > 0.01:
# 1/(pcs_m/quantity), i.e. quantity / pcs_m, per row.
# NOTE(review): this assumes pcs_m was computed as quantity / length — confirm.
fif = f[f.pcs_m > 0.01][["loc_date","quantity", "pcs_m"]].copy()
fif["length"] = 1/(fif.pcs_m/fif.quantity)

# fif.set_index("loc_date", inplace=True, drop=True)
# Mean of the per-row values for each loc_date, cast to int (the fractional
# part is discarded).
fill_map = fif.groupby(["loc_date"]).length.mean().astype("int")
fill_map

loc_date
('maladaire', '2021-06-02')              58
('parc-des-pierrettes', '2022-10-06')    40
('plage-de-st-sulpice', '2021-10-07')    49
('plage-de-st-sulpice', '2022-10-06')    42
('tiger-duck-beach', '2021-10-07')       30
('villa-barton', '2021-11-14')           58
Name: length, dtype: int64

In [27]:
# Assign each row the length computed for its loc_date in fill_map.
# The .loc lookup raises KeyError for a loc_date that has no entry in fill_map.
f['length'] =f.loc_date.apply(lambda x: fill_map.loc[x])
# Rename "Gfoam" to "Gfoams" — presumably the spelling used in dfCodes; verify.
f.loc[f.code == "Gfoam",  "code"] = "Gfoams"
# Look up the group name of every code in the code table (KeyError if a code
# is not in dfCodes).
f["groupname"] = f.code.apply(lambda x: dfCodes["groupname"].loc[x])

In [28]:
# fm = f[(f.loc_date == "('maladaire', '2021-06-02')")].copy()
# fm = fm.drop_duplicates(["loc_date", "location", "code"])
# # fmx = fmx[~fmx.code.isin(["Gcaps", "Gfoam", "Gcaps"])]
# fnm = f[f.loc_date != "('maladaire', '2021-06-02')"].copy()
# f = pd.concat([fnm, fm])
f.head()

Unnamed: 0,loc_date,location,date,doy,water_name_slug,water,river_bassin,city,code,quantity,pcs_m,slug,Project,length,groupname
0,"('maladaire', '2021-06-02')",maladaire,2021-06-02,153,lac-leman,l,rhone,La Tour-de-Peilz,G1,0,0.0,maladaire,Testing,58,food and drink
1,"('maladaire', '2021-06-02')",maladaire,2021-06-02,153,lac-leman,l,rhone,La Tour-de-Peilz,G10,7,0.11,maladaire,Testing,58,food and drink
2,"('maladaire', '2021-06-02')",maladaire,2021-06-02,153,lac-leman,l,rhone,La Tour-de-Peilz,G100,2,0.03,maladaire,Testing,58,waste water
3,"('maladaire', '2021-06-02')",maladaire,2021-06-02,153,lac-leman,l,rhone,La Tour-de-Peilz,G101,0,0.0,maladaire,Testing,58,personal items
4,"('maladaire', '2021-06-02')",maladaire,2021-06-02,153,lac-leman,l,rhone,La Tour-de-Peilz,G102,0,0.0,maladaire,Testing,58,personal items


In [29]:
add_to_m = find_missing(f.loc_date.unique(), mm.loc_date.unique())
add_to_m

array(["('parc-des-pierrettes', '2022-10-06')",
       "('plage-de-st-sulpice', '2021-10-07')",
       "('plage-de-st-sulpice', '2022-10-06')",
       "('tiger-duck-beach', '2021-10-07')",
       "('villa-barton', '2021-11-14')"], dtype=object)

In [30]:
# Remove the ('maladaire', '2021-06-02') survey from mm. It is not part of
# add_to_m either, so it will be absent from the combined frame built next.
# NOTE(review): mm is rebound to a filtered view of itself; the cell that
# built mm has to be re-run to get the survey back.
mm = mm[mm.loc_date != "('maladaire', '2021-06-02')"]
mm.head()

Unnamed: 0,loc_date,date,water_name_slug,location,code,pcs_m,quantity,river_bassin,length,groupname,city
0,"('clean-up-tour-cabanes-des-diablerets', '2021...",2021-08-28,alpes-vaudoises,cabanes-des-diablerets,G200,1.33,16,les-alpes,12.0,food and drink,Ormont-Dessus
1,"('clean-up-tour-cabanes-des-diablerets', '2021...",2021-08-28,alpes-vaudoises,cabanes-des-diablerets,G48,0.42,5,les-alpes,12.0,recreation,Ormont-Dessus
2,"('clean-up-tour-cabanes-des-diablerets', '2021...",2021-08-28,alpes-vaudoises,cabanes-des-diablerets,G170,0.83,10,les-alpes,12.0,agriculture,Ormont-Dessus
3,"('clean-up-tour-cabanes-des-diablerets', '2021...",2021-08-28,alpes-vaudoises,cabanes-des-diablerets,G27,0.75,9,les-alpes,12.0,tobacco,Ormont-Dessus
4,"('clean-up-tour-cabanes-des-diablerets', '2021...",2021-08-28,alpes-vaudoises,cabanes-des-diablerets,G709,0.17,2,les-alpes,12.0,recreation,Ormont-Dessus


In [31]:
# Add the surveys that exist only in `f` to the combined set.
mmx = pd.concat([f[f.loc_date.isin(add_to_m)][a_cols], mm[a_cols]])

# drop_duplicates returns a new frame; the original cell discarded the result,
# so the de-duplication never took effect. Keep it.
mmx = mmx.drop_duplicates(["loc_date", "location", "code", "quantity"])

# Fold the eliminated codes into their parent codes and align the spelling of
# the foam aggregate.
mmx.loc[mmx.code.isin(["G909", "G910"]), "code"] = "G74"
mmx.loc[mmx.code == "G911", "code"] = "G81"
mmx.loc[mmx.code == "G912", "code"] = "G82"
mmx.loc[mmx.code == "Gfoam", "code"] = "Gfoams"

# Number of distinct codes reported per survey (inspected in later cells).
mmx_ncodes = mmx.groupby("loc_date").code.nunique()
collect_vitals(mmx)

(196842, 0.0, 1352, 239, 245, 59, '2022-10-06', '2015-11-23')

In [32]:
collected = mmx.groupby(c_cols, as_index=False).agg({"quantity":"sum", "pcs_m":"sum"})
collect_vitals(collected)

(196842, 0.04, 1352, 239, 245, 59, '2022-10-06', '2015-11-23')

In [33]:
collected[(collected.loc_date == "('aabach', '2020-10-22')")&(collected.code =="G81")]

Unnamed: 0,loc_date,location,date,water_name_slug,river_bassin,length,groupname,city,code,quantity,pcs_m
6,"('aabach', '2020-10-22')",aabach,2020-10-22,zurichsee,linth,33.333333,infrastructure,Schmerikon,G81,1,0.03


In [34]:
cl_ncodes = collected.groupby("loc_date").code.nunique()
mmx[(mmx.loc_date == "('aabach', '2020-10-22')")&(mmx.code.isin(["G81"]))].groupby(c_cols, as_index=False).agg({"quantity":"sum", "pcs_m":"sum"})

Unnamed: 0,loc_date,location,date,water_name_slug,river_bassin,length,groupname,city,code,quantity,pcs_m
0,"('aabach', '2020-10-22')",aabach,2020-10-22,zurichsee,linth,33.333333,infrastructure,Schmerikon,G81,1,0.03


In [35]:
samp_one = b.loc_date.unique()
samps = list(set([*samp_one, *after_locs, *a_c.loc_date.unique()]))

In [36]:
find_missing(mmx.loc_date.unique(), samps)

array([], dtype=object)

In [37]:
mmx[mmx.river_bassin == 'les-alpes'].location.unique()

array(['cabanes-des-diablerets', 'val-calanca', 'san-bernardino',
       'airolo', 'nendaz', 'veysonnaz', 'andermatt', 'verbier',
       'crans-montana', 'villars', 'les-crosets', 'morgins', 'la-berra',
       'grindelwald', 'la-tzoumaz', 'la-robella', 'les-diablerets',
       'charmey', 'monte-generoso', 'les-paccots'], dtype=object)

In [38]:
mmx_ncodes = mmx.groupby("loc_date").code.nunique()
mmx_ncodes.unique()

array([235, 230, 228, 215,  22])

In [39]:
mmx_ncodes[mmx_ncodes == 235][:1]

loc_date
('aabach', '2020-10-22')    235
Name: code, dtype: int64

In [40]:
mmx_ncodes[mmx_ncodes == 230][:1]

loc_date
('clean-up-tour-airolo', '2021-08-05')    230
Name: code, dtype: int64

In [41]:
mmx_ncodes[mmx_ncodes == 228][:1]

loc_date
('parc-des-pierrettes', '2022-10-06')    228
Name: code, dtype: int64

In [42]:
mmx_ncodes[mmx_ncodes == 215][:1]

loc_date
('plage-de-st-sulpice', '2021-10-07')    215
Name: code, dtype: int64

In [43]:
mmx_ncodes[mmx_ncodes == 22][:1]

loc_date
('plage-de-st-sulpice', '2022-10-06')    22
Name: code, dtype: int64

In [44]:
mmx_ncodes[mmx_ncodes == 235][:1]

loc_date
('aabach', '2020-10-22')    235
Name: code, dtype: int64

In [45]:
two_three_five = mmx[mmx.loc_date =="('aabach', '2020-10-22')"].code.unique()
two_three = mmx[mmx.loc_date =="('clean-up-tour-airolo', '2021-08-05')"].code.unique()
two_two_eight = mmx[mmx.loc_date == "('parc-des-pierrettes', '2022-10-06')"].code.unique()
two_one_five = mmx[mmx.loc_date == "('plage-de-st-sulpice', '2021-10-07')"].code.unique()
find_missing(two_three_five, two_three)

array(['G75', 'G76', 'G78', 'G79', 'G80', 'G81', 'G82', 'G83'],
      dtype=object)

In [46]:
find_missing(two_three_five, two_two_eight)

array(['G21', 'G23', 'G24', 'G75', 'G76', 'G78', 'G79', 'G80', 'G81',
       'G82', 'G83'], dtype=object)

In [47]:
find_missing(two_three_five, two_one_five)

array(['G124', 'G131', 'G137', 'G194', 'G21', 'G211', 'G23', 'G24', 'G3',
       'G50', 'G74', 'G75', 'G76', 'G78', 'G79', 'G80', 'G81', 'G82',
       'G83', 'G89', 'G90', 'G913', 'G93', 'G99'], dtype=object)

In [48]:
mmx.head()

Unnamed: 0,loc_date,date,water_name_slug,location,code,pcs_m,quantity,river_bassin,length,groupname,city
176,"('tiger-duck-beach', '2021-10-07')",2021-10-07,lac-leman,tiger-duck-beach,G1,0.0,0,rhone,30.0,food and drink,Saint-Sulpice (VD)
177,"('tiger-duck-beach', '2021-10-07')",2021-10-07,lac-leman,tiger-duck-beach,G10,0.07,2,rhone,30.0,food and drink,Saint-Sulpice (VD)
178,"('tiger-duck-beach', '2021-10-07')",2021-10-07,lac-leman,tiger-duck-beach,G100,0.3,9,rhone,30.0,waste water,Saint-Sulpice (VD)
179,"('tiger-duck-beach', '2021-10-07')",2021-10-07,lac-leman,tiger-duck-beach,G101,0.0,0,rhone,30.0,personal items,Saint-Sulpice (VD)
180,"('tiger-duck-beach', '2021-10-07')",2021-10-07,lac-leman,tiger-duck-beach,G102,0.0,0,rhone,30.0,personal items,Saint-Sulpice (VD)


In [49]:
two_three_five

array(['G942', 'G23', 'G35', 'G70', 'G89', 'G941', 'G112', 'G30', 'G67',
       'G79', 'G93', 'G95', 'G27', 'G38', 'G78', 'G81', 'G200', 'G48',
       'G170', 'G709', 'G100', 'G204', 'G705', 'G919', 'G178', 'G24',
       'G122', 'G915', 'G146', 'G922', 'G208', 'G74', 'G142', 'G921',
       'G198', 'G931', 'G25', 'G33', 'G87', 'G124', 'G916', 'G936',
       'G703', 'G707', 'G139', 'G152', 'G75', 'G20', 'G905', 'G704',
       'G711', 'G191', 'G934', 'G158', 'G210', 'G7', 'G926', 'G923',
       'G702', 'G177', 'G706', 'G34', 'G712', 'G159', 'G901', 'G710',
       'G65', 'G708', 'G145', 'G175', 'G211', 'G12', 'G171', 'G8', 'G156',
       'G26', 'G943', 'G155', 'G3', 'G929', 'G28', 'G194', 'G90', 'G101',
       'G82', 'G91', 'G134', 'G939', 'G22', 'G149', 'G165', 'G31', 'G201',
       'G938', 'G137', 'G213', 'G76', 'G98', 'G10', 'G66', 'G935', 'G135',
       'G21', 'G144', 'G154', 'G904', 'G32', 'G51', 'G940', 'G157',
       'G182', 'G126', 'G73', 'G96', 'G148', 'G933', 'G914', 'G147',
    

In [50]:
# Rows of the combined set that have no length value.
lq = mmx[mmx["length"].isnull()].copy()

# Rows with pcs_m above 0.01, reduced to the columns needed to derive a length.
lx = mmx[mmx["pcs_m"] > 0.01][["loc_date", "quantity", "pcs_m"]].copy()

# lx already satisfies the pcs_m > 0.01 condition, so a plain copy yields the
# same rows and columns as filtering it again.
fif = lx.copy()
fif["length"] = 1/(fif.pcs_m/fif.quantity)

# One value per loc_date: the mean of the per-row values, cast to int.
fill_map = fif.groupby(["loc_date"])["length"].mean().astype("int")
fill_map[fill_map > 150]

loc_date
('gasi-strand', '2021-03-23')             153
('rhein-beach-tinguely', '2017-04-22')    184
('rhein-beach-tinguely', '2017-05-20')    179
('rhein-beach-tinguely', '2017-06-24')    183
('rhein-beach-tinguely', '2017-07-22')    179
('rhein-beach-tinguely', '2017-08-19')    178
('rhein-beach-tinguely', '2017-09-23')    180
('rhein-beach-tinguely', '2017-10-21')    180
('rhein-beach-tinguely', '2017-11-25')    180
('rhein-beach-tinguely', '2017-12-16')    175
('rhein-beach-tinguely', '2018-01-20')    182
('rhein-beach-tinguely', '2018-02-24')    188
('rhein-beach-tinguely', '2018-03-24')    175
('versoix', '2016-11-17')                 165
Name: length, dtype: int64

In [51]:
li = mmx[~mmx.length.isnull()].copy()
li.isna().any()

loc_date           False
date               False
water_name_slug    False
location           False
code               False
pcs_m              False
quantity           False
river_bassin       False
length             False
groupname          False
city               False
dtype: bool

In [52]:
find_missing(lq.loc_date.unique(), fill_map.index)

array([], dtype=object)

In [53]:
lq["length"] = lq.loc_date.apply(lambda x: fill_map.loc[x])
lq.isna().any()

loc_date           False
date               False
water_name_slug    False
location           False
code               False
pcs_m              False
quantity           False
river_bassin       False
length             False
groupname          False
city               False
dtype: bool

In [54]:
new_all = pd.concat([lq, li])
new_all = new_all.drop_duplicates(["loc_date", "location", "code", "quantity"])
collect_vitals(new_all)

(196842, 0.0, 1352, 239, 245, 59, '2022-10-06', '2015-11-23')

In [55]:
collect_vitals(t_t)

(196842, 0.0, 1352, 239, 245, 59, '2022-10-06', '2015-11-23')

In [56]:
new_all.isnull().any()

loc_date           False
date               False
water_name_slug    False
location           False
code               False
pcs_m              False
quantity           False
river_bassin       False
length             False
groupname          False
city               False
dtype: bool

In [57]:
new_all.isna().any()

loc_date           False
date               False
water_name_slug    False
location           False
code               False
pcs_m              False
quantity           False
river_bassin       False
length             False
groupname          False
city               False
dtype: bool

In [58]:
new_all[new_all.code.isin(["G909", "G910", "G911", "G912"])]

Unnamed: 0,loc_date,date,water_name_slug,location,code,pcs_m,quantity,river_bassin,length,groupname,city


In [59]:
new_all[new_all.length > 150].location.unique()

array(['gasi-strand', 'versoix', 'rhein-beach-tinguely',
       'walensee_walenstadt_wysse', 'thunersee_spiez_meierd_1',
       'rocky-plage', 'greifensee_fallanden_simmenc'], dtype=object)

In [60]:
# new_all.to_csv("data/end_process/new_all.csv", index=False)

In [61]:
xi = aggregate_gcaps_gfoams_gfrags(new_all.copy(), dfCodes)
collect_vitals(xi)

(196842, 0.0, 1352, 227, 245, 59, '2022-10-06', '2015-11-23')

In [62]:
xii = aggregate_gcaps_gfoams_gfrags(t_t.copy(), dfCodes)
collect_vitals(xii)

(196842, 0.0, 1352, 227, 245, 59, '2022-10-06', '2015-11-23')

In [63]:
a_c_vitals = collect_vitals(a_c)
a_c_vitals

(194965, 0.0, 1348, 184, 245, 59, '2021-08-28', '2015-11-23')

In [64]:
collect_vitals(test_this)

(196842, 0.0, 1352, 239, 245, 59, '2022-10-06', '2015-11-23')

In [65]:
collect_vitals(aggregate_gcaps_gfoams_gfrags(test_this.copy(), dfCodes))

(196842, 0.0, 1352, 227, 245, 59, '2022-10-06', '2015-11-23')

In [66]:
dfCodes.loc["Gfrags"]

material                   Plastic
description    Fragmented plastics
source                   Undefined
parent_code                 Gfrags
single_use                   False
groupname           plastic pieces
Name: Gfrags, dtype: object

In [67]:
# Relabel caps/foams/fragments on a copy of t_t, then take the mean length of
# each loc_date.
t_tx = aggregate_gcaps_gfoams_gfrags(t_t.copy(), dfCodes)
lengths = t_tx.groupby("loc_date").length.mean()
# Overwrite `length` in both frames (in place) with that per-loc_date mean, so
# every row of a survey carries a single length value.
t_t["length"] = t_t.loc_date.apply(lambda x: lengths.loc[x])
test_this["length"] = test_this.loc_date.apply(lambda x: lengths.loc[x])
# Re-aggregate test_this with the updated length as one of the grouping keys.
t_txi = test_this.groupby(c_cols, as_index=False).agg(agg_this)

In [68]:
# Date window for the check below. Both comparisons are strict, so records
# dated exactly start_date or end_date are excluded.
# NOTE(review): the date values appear to be ISO-formatted strings (see the
# collect_vitals output), in which case the comparison is lexicographic.
start_date = "2020-04-01"
end_date = "2021-05-31"




# NOTE(review): t_tx is narrowed in place, so re-running this cell works on the
# already-filtered frame rather than on the full one.
t_tx = t_tx[(t_tx["date"] > start_date) & (t_tx["date"]<end_date)].copy()
t_tx = t_tx[t_tx.water_name_slug == "lac-leman"].copy()

# Median pcs_m per survey for code G27 on lac-leman within the window.
tii = t_tx[t_tx.code.isin(["G27"])]
ti2 = tii.groupby("loc_date", as_index=False).agg(agg_this)
ti2.pcs_m.median()

0.47

In [69]:
# NOTE(review): the chained assignment binds both td and t_tx to the same new
# frame, replacing the filtered t_tx from the previous cell — confirm that
# rebinding t_tx is intended.
td = t_tx = aggregate_gcaps_gfoams_gfrags(t_txi.copy(), dfCodes)
# Same date window and lake filter as in the previous cell.
t_txii = td[(td["date"] > start_date) & (td["date"]<end_date)].copy()
t_txii = t_txii[t_txii.water_name_slug == "lac-leman"].copy()

# Median pcs_m per survey for code G30.
tii2 = t_txii[t_txii.code.isin(["G30"])]
tii2 = tii2.groupby("loc_date", as_index=False).agg(agg_this)
tii2.pcs_m.median()

0.215

In [70]:
test_this.loc[test_this.location == "cully", "city"] = 'Bourg-en-Lavaux'

In [72]:
# Writes test_this back to the same path it was read from, now including the
# `length` and `city` changes made in the cells above.
# NOTE(review): this overwrites the notebook's own input file, and the
# execution counts around this cell are out of order (70, 72, 74, 71).
test_this.to_csv("data/end_process/new_all.csv", index=False)

In [74]:
ic = dfCodes.index

In [71]:
# NOTE(review): `break` is not valid outside a loop, so this cell fails with a
# SyntaxError. Presumably it is used to halt "Run All" before the Plastock
# cells below, several of which rely on names that are not defined in this
# notebook — confirm, and replace with an explicit stop or remove.
break

SyntaxError: 'break' outside loop (668683560.py, line 1)

### Plastock

In [None]:
# Plastock records, with the "pcs/m" column renamed to "pcs_m".
ps = pd.read_csv("data/from_prev/u_pstk.csv").rename(columns={"pcs/m": "pcs_m"})
ps.head()

In [None]:
ps.location.unique()

In [None]:
# Plastock location names that are given a "-p" suffix.
change_me = {"vidy":"vidy-p", "tolochenaz":"tolochenaz-p", "preverenges":"preverenges-p"}

# Apply the mapping itself instead of repeating each pair by hand; values that
# are not keys of change_me are left unchanged.
ps["location"] = ps["location"].replace(change_me)
ps.location.unique()

In [None]:
dfCodes.head()

In [None]:
# Every Plastock record gets the same river basin. The original cell first
# stored the water-body name in this column, copied it to water_name_slug and
# then overwrote it; the final values are assigned directly here, in the same
# column order.
ps["river_bassin"] = "rhone"

# g216 = {
#     "material":"undefined",
#     "description" : "various rubbish (worked wood, metal parts)",
#     "source":"Undefined",
#     "parent_code":"Parent code",
#     "single_use":False,
#     "groupname":"unclassified",
# }

# dfCodes.loc["G216"] = g216

# Rename "Gfoam" to "Gfoams" — presumably the spelling used in dfCodes; verify.
ps.loc[ps.code == "Gfoam","code"] = "Gfoams"

# Group name of each code; the .loc lookup raises KeyError for a code that is
# not in dfCodes.
ps["groupname"] = ps.code.apply(lambda x: dfCodes["groupname"].loc[x])
ps["water_name_slug"] = "lac-leman"
ps.head()

In [None]:
# NOTE(review): `alr_pm` is not defined in any cell visible in this notebook;
# this cell fails with a NameError on a fresh kernel unless it is created
# elsewhere — confirm where it comes from.
alr_pm.loc[alr_pm.code == "Gfoam", "code"] = "Gfoams"

In [None]:
alr_pm[alr_pm.city.isin(["Bourg-en-Lavaux", "Tolochenaz", "Lausanne", "Versoix", "Genève"])].location.unique()

In [None]:
alr_pm.head()

In [None]:
# Stack the earlier records and the Plastock records on a shared column set.
# NOTE(review): neither `alr_pm` nor `merge_cols` is defined in any cell
# visible in this notebook — confirm where they come from.
alr_pm_ps = pd.concat([alr_pm[merge_cols], ps[merge_cols]])

In [None]:
alr_pm_ps.loc[alr_pm_ps.location == "cully-plage"].city.unique()

In [None]:
alr_pm_ps.loc[alr_pm_ps.location == "cully", "city"] = 'Bourg-en-Lavaux'

In [None]:
alr_pm_ps.loc[alr_pm_ps.location == "vidy-p", "city"] = "Lausanne"

In [None]:
codes = alr_pm_ps.groupby("loc_date", as_index=False).code.nunique()
codes.code.unique()

In [None]:
# pcode = dfCodes[dfCodes.parent_code == "Parent code"].copy()
# pcode["parent_code"] = pcode.index

In [None]:
dfCodes.loc["G710", "parent_code"] = "G124"

In [None]:
dfCodes.loc["G79"]

In [None]:
gh = dfCodes["parent_code"]
gh.loc["G79"]

In [None]:
# wpcode = dfCodes[dfCodes.parent_code != "Parent code"].copy()
# new_codes = pd.concat([wpcode, pcode])
# new_codes.to_csv("data/end_process/codes.csv", index="code")

In [None]:

def assign_to_parent_code(data, codes):
    """Replace each code in `data` by its parent code.

    Parameters
    ----------
    data : DataFrame
        Must have `code` and `loc_date` columns. It is modified in place: a
        `new_code` column is added and `code` is overwritten with it.
    codes : DataFrame
        Indexed by code, with a `parent_code` column.

    Returns
    -------
    DataFrame
        The rows of `data` whose `loc_date` is not missing.

    Notes
    -----
    A code that has no entry in `codes` (or whose parent is missing) keeps its
    own value. The previous implementation left such codes as NaN and raised a
    KeyError when no code matched at all, because `new_code` was only created
    by the first successful match.
    """
    # Plain dict lookup; if a code is indexed more than once the last entry
    # wins, as it did in the original per-code loop.
    parent_of = dict(zip(codes.index, codes["parent_code"]))
    mapped = data["code"].map(parent_of)

    # Fall back to the original code wherever no parent was found.
    data["new_code"] = mapped.where(mapped.notna(), data["code"])
    data["code"] = data["new_code"]

    return data[~data.loc_date.isna()]
    

x = assign_to_parent_code(alr_pm_ps.copy(), dfCodes)

In [None]:
codes = x.groupby("loc_date", as_index=False).code.nunique()
codes.code.unique()

In [None]:
codes[codes.code == 164]

In [None]:
collect_vitals(x)

In [None]:
collect_vitals(alr_pm_ps)

In [None]:
# idnm = alr_pm_ps[alr_pm_ps.loc_date ==  "('maladaire', '2021-06-02')"]
# idnm_c = idnm.code.unique()
# not_these_one = idnm[~idnm.code.isin(["Gfrags", "Gfoams", "Gcaps"])]

# ty = alr_pm_ps[alr_pm_ps.loc_date !=  "('maladaire', '2021-06-02')"]

# alr_pm_x = pd.concat([ty, not_these_one])
codes = alr_pm_ps.groupby("loc_date", as_index=False).code.nunique()
codes.code.unique()

In [None]:
codes[codes.code == 228]

In [None]:
codes[codes.code == 184].head()

In [None]:
idn = alr_pm_ps[alr_pm_ps.loc_date == "('parc-des-pierrettes', '2022-10-06')"]

In [None]:
hmm = alr_pm_ps[alr_pm_ps.loc_date == "('aabach', '2020-10-22')"]

In [None]:
these_are_missing = find_missing(idn.code.unique(), hmm.code.unique())
idn[idn.code.isin(these_are_missing)]

In [None]:
# Surveys to inspect.
these = ["('parc-des-pierrettes', '2022-10-06')",
       "('tiger-duck-beach', '2021-10-07')",
       "('villa-barton', '2021-11-14')"]

# NOTE(review): `alr_pm_x` and `not_these_one` are only assigned in a
# commented-out block earlier in the notebook, so this cell fails with a
# NameError on a fresh kernel.
chx = alr_pm_x[alr_pm_x.loc_date.isin(these)].copy()
# Codes that occur in the selected surveys but not in not_these_one.
ok = [x for x in chx.code.unique() if x not in not_these_one.code.unique()]

In [None]:
x[x.code == "G79"]

In [None]:
dfCodes.head()

In [None]:
collect_vitals(alr_pm_x)

In [None]:
collect_vitals(alr_pm_ps)

In [None]:
codes_after = x.groupby("loc_date", as_index=False).code.nunique()

In [None]:
codes_after.code.unique()

In [None]:
codes_after[codes_after.code == ]