In [77]:
import pandas as pd
import numpy as np

# Aggregating all previous data

## Notes

Taking another look at the data prior to writing the software. This also defines the methods of collecting the data from the app-dev standpoint. For the moment we can use the contents of <data/from_prev/u_all_data.csv> as a proposed data model.

### quantity

In [78]:
a = pd.read_csv("data/from_prev/agg_results_with_land_use_2015.csv")
not_these = [
    'sihlsee_einsiedeln_schilligerllacherl',
    'schiffenensee_duedingen_hirschij',
    'lac-leman-hammerdirt'
]
    
a = a[~a.location.isin(not_these)].copy()
a_cols = [
    'loc_date',
    'date',
    'water_name_slug',
    'location',
    'code',
    'pcs_m',
    'quantity',
    'river_bassin',
    'length', 
    'groupname',
    'city']

a_c = a[a_cols].copy()

In [79]:
def collect_vitals(data):
    total = data.quantity.sum()
    median = data.pcs_m.median()
    samples = data.loc_date.nunique()
    ncodes = data.code.nunique()
    nlocations = data.location.nunique()
    nbodies = data.water_name_slug.nunique()
    return total, median, samples, ncodes, nlocations, nbodies

b = pd.read_csv("data/from_prev/u_all_data.csv")
b = b[~b.loc_date.isin(not_these)].copy()
b_vitals = collect_vitals(b)
b_vitals

(192380, 0.0, 1432, 228, 232, 54)

In [80]:
a_c_vitals = collect_vitals(a_c)
a_c_vitals

(198824, 0.0, 1449, 184, 252, 61)

In [81]:
def find_missing(more_than, less_than):
    return np.setdiff1d(more_than, less_than)

missing = find_missing(a_c.loc_date.unique(), b.loc_date.unique())
matched_a_b = a_c[~a_c.loc_date.isin(missing)].copy()
diff_a_b = a_c[a_c.loc_date.isin(missing)]

In [82]:
matched_vitals = collect_vitals(matched_a_b)
matched_vitals

(191048, 0.0, 1429, 184, 232, 54)

In [83]:
missing_m = find_missing(b.loc_date.unique(), matched_a_b.loc_date.unique())
missing_matched = b[b.loc_date.isin(missing_m)]
matched_b_vitals = collect_vitals(missing_matched)

In [85]:
matched_vitals[0] + matched_b_vitals[0] - b_vitals[0]

0

### codes

In [120]:
dfCodes = pd.read_csv("data/from_prev/codes_with_group_names_2015.csv")
a_codes = matched_a_b.code.unique()
b_codes = b_all_match.code.unique()

# these are codes from the alpes and other codes established after 2020
# these codes are in b and not in a
not_in_a = find_missing(b_codes, a_codes)

In [121]:
# these codes are in a not in b
# these are the aggregated codes for Gfoam, Gfrags and Gcaps
not_in_b = find_missing(a_codes, b_codes)

In [122]:
b_all_match = b[~b.loc_date.isin(missing_m)]
b_a_m_vitals = collect_vitals(b_all_match)
b_a_m_vitals
b_a_m_vitals

(191048, 0.0, 1429, 176, 232, 54)

In [123]:
matched_vitals

(191048, 0.0, 1429, 184, 232, 54)

In [124]:
columns = ['code', 'material', 'description', 'source', 'source_two',
       'source_three', 'parent_code', 'single_use', 'groupname']

In [125]:
dfCodes = dfCodes[columns].copy()
dfCodes = dfCodes.set_index("code", drop=True)

In [126]:
not_in_b

array(['G21', 'G23', 'G24', 'G75', 'G76', 'G78', 'G79', 'G80', 'G81',
       'G82', 'G83'], dtype=object)

In [127]:
not_in_a

array(['Gcaps', 'Gfoam', 'Gfrags'], dtype=object)

In [128]:
dfCodes.head()

Unnamed: 0_level_0,material,description,source,source_two,source_three,parent_code,single_use,groupname
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
G708,Metal,Batons de ski,Usagers,Where does it come from,Where does it come from,G199,False,recreation
G212,Chemicals,Oil nodules or coal fragments -- not from a BBQ,Undefined,Where does it come from,none,Parent code,False,unclassified
G213,Chemicals,Paraffin wax,Undefined,Where does it come from,none,Parent code,False,recreation
G214,Chemicals,Oil/tar,Construction,Where does it come from,none,Parent code,False,infrastructure
G135,Cloth,"Clothes, footware, headware, gloves",Clothing,Where does it come from,none,Parent code,False,personal items


In [129]:
dfCodes.loc[['G21', 'G22','G23', 'G24'], "parent_code"] = "Gcaps"

dfCodes.loc[['G75', 'G78', 'G79',"G80"], "parent_code"] = "Gfrags"
dfCodes.loc[['G76','G81','G82', 'G83'], "parent_code"] = "Gfoams"

dfCodes["test"] = dfCodes.parent_code.wher

AttributeError: 'DataFrame' object has no attribute 'pare'