### general tenplate


Fetches data from the local source and esyablishes the following variables:

1. dutch_codes
2. swiss_codes
3. dutch_surveys
4. swiss_surveys
5. swiss_beaches


Establishes directory variables for fetching and putting to all subdirectories:

1. data
2. beaches
3. codes
4. geo
5. output

provides a script to update the remote data.

In [1]:
# sys things
import os
import sys
import json

# networks
import requests

# data
import pandas as pd
import numpy as np
import scipy
import math
import seaborn as sns

import resources.utilities.utility_functions as ut

In [2]:
# get folder extesions
data, beaches, codes, geo, output=ut.make_local_paths()
print("look for resources here\n")
print(data, beaches, codes, geo, output)


look for resources here

resources/surveydata resources/locationdata resources/mlwcodedefs resources/geodata output


In [3]:
# code data
dutch_codes = pd.read_csv(codes+'/dutch_codes.csv')
swiss_codes = pd.read_csv(codes+'/swiss_codes.csv')
# this list was recieved from david fleet: one of the authors of the monitoring guide
joint_list = pd.read_csv(F"{codes}/jointcodes/fleetjcodes.csv")

# housekeeping
dutch_codes.fillna(0, inplace=True)
dutch_codes.rename(columns={'OSPAR_ID':'ospar_id', 'Description':'description'}, inplace=True)
swiss_codes.rename(columns={'ospar_code':'ospar_id'}, inplace=True)
swiss_codes.drop('Unnamed: 0', axis=1,inplace=True)

# survey_data
dutch_surveys = pd.read_csv(data+'/dataset_macrolitter_NL.csv')

# use the aggregated hd data. This accounts for the custom codes used in Switzerland
swiss_surveys = pd.read_csv(data+'/aggregated_hd_surveys.csv')

# location data
swiss_beaches = pd.read_csv(beaches+'/hammerdirt_beaches.csv')

In [4]:
print("Columns from cleaned up dutch data\n")
print(dutch_codes.columns)

print("\nColumns from cleaned up swiss data\n")
print(swiss_codes.columns)

Columns from cleaned up dutch data

Index(['ID', 'description', 'category', 'ospar_id'], dtype='object')

Columns from cleaned up swiss data

Index(['code', 'material', 'description', 'source', 'source_two',
       'source_three', 'parent_code', 'direct', 'single_use', 'micro',
       'ospar_id'],
      dtype='object')


In [5]:
# process the ducth codes:
# identify codes that are common to both 'ospar_id' columns
dutch_codes['parent_code'] = dutch_codes.ospar_id.round(0)
dutch_codes['parent_code'] = dutch_codes['parent_code'].astype('int') 
dutch_codes['child_code'] = dutch_codes.ospar_id - dutch_codes.parent_code


# the number of child codes:
child_codes = dutch_codes.loc[dutch_codes.child_code > 0]
ccodes = child_codes.parent_code.unique()

# all the codes with no remainder:
parent_codes = dutch_codes.loc[dutch_codes.child_code == 0]
pcodes = parent_codes.parent_code.unique()

# all the dutch codes that are not child codes:
dcodesall = dutch_codes.parent_code.unique()

print("""
This is the OSPAR code list from the dutch data.\n
OSPAR codes that could not be typed to 'int' were counted as 0.\n
Any code with an ospar value of 0 was excluded\n
""")
print(dutch_codes['parent_code'].unique())
print(F"\nThese are the detail codes used to better define the object:\n\n{ccodes}")


This is the OSPAR code list from the dutch data.

OSPAR codes that could not be typed to 'int' were counted as 0.

Any code with an ospar value of 0 was excluded


[   1    2    3    4    5    6    7    9   10   13   14   15   16   20
   21   24   25  113   31   32   33   36   38   40   42   43   44  117
   46   48 1172  462   47   22   19  472  212  481   11   39    8   17
   35   49   52   53   54   55   57   59   60   61   63   64   65   66
   67   62   68   69   72   73   74   75   81   78   79   83   77   84
   88   76   86   80   82  120   89   90   91   92   93   98  982  102
   97   99   18  100  101  103  104  105]

These are the detail codes used to better define the object:

[  4 117  46   6  47  22  19   2  43  38  39  62  67  81 102   1]


In [6]:
# process the swiss codes
# get child and parent codes:
swiss_codes_parent = swiss_codes.loc[swiss_codes.parent_code == 'Parent code'].copy()
swiss_codes_child = swiss_codes.loc[swiss_codes.parent_code != 'Parent code'].copy()

# identify the codes that have actually been used:
swiss_pcodes_used = swiss_surveys.code.unique()

# make a list of the codes in use:
scodes_used = swiss_codes.loc[swiss_codes.code.isin(swiss_pcodes_used)].copy()

def drop_bad_codes(x):
    try:
        the_x = int(x)
    except:
        the_x = 0
    else:
        pass     
    finally:
        return the_x 

scodes_used['ospar_id']=scodes_used.ospar_id.map(lambda x: drop_bad_codes(x))
scodes_used['ospar_id'] = scodes_used['ospar_id'].astype('int')
scodes_used['paired'] = list(zip(scodes_used.code, scodes_used.ospar_id))
scodes_noospar = scodes_used[scodes_used.ospar_id == 0]
scodes_ospar = scodes_used[scodes_used.ospar_id != 0]

print("""
This is the mlw/ospar code list from the swiss codes.\n
OSPAR codes that could not be typed to 'int' were counted as 0.\n
Any code with an ospar value of 0 was excluded\n
""")
print(scodes_ospar.paired.unique())
print(F"\nThese are the mlw codes that do not have a valid OSPAR code from the swiss list:\n\n{scodes_noospar.paired.unique()}")



This is the mlw/ospar code list from the swiss codes.

OSPAR codes that could not be typed to 'int' were counted as 0.

Any code with an ospar value of 0 was excluded


[('G213', 181) ('G214', 111) ('G135', 54) ('G137', 54) ('G138', 57)
 ('G140', 56) ('G141', 55) ('G144', 100) ('G145', 59) ('G200', 91)
 ('G201', 93) ('G204', 94) ('G210', 96) ('G175', 78) ('G176', 82)
 ('G177', 81) ('G178', 77) ('G181', 89) ('G182', 80) ('G188', 89)
 ('G194', 89) ('G150', 118) ('G151', 62) ('G152', 63) ('G153', 67)
 ('G155', 67) ('G1', 1) ('G10', 6) ('G100', 103) ('G11', 7) ('G12', 7)
 ('G13', 12) ('G20', 15) ('G21', 15) ('G22', 15) ('G24', 15) ('G25', 48)
 ('G26', 16) ('G27', 64) ('G28', 17) ('G29', 18) ('G3', 2) ('G30', 19)
 ('G31', 19) ('G32', 20) ('G33', 21) ('G34', 22) ('G35', 22) ('G36', 23)
 ('G38', 40) ('G4', 3) ('G40', 25) ('G41', 113) ('G43', 114) ('G6', 11)
 ('G66', 39) ('G67', 40) ('G7', 4) ('G73', 45) ('G74', 45) ('G75', 117)
 ('G76', 46) ('G78', 117) ('G79', 46) ('G8', 4) ('G81', 117) ('G

In [7]:
# process the joint_list:
# these columns names are outrageous:
joint_list.rename(columns={'TSG_ML General- Code Guidance on Monitoring 2013.2':'mlw_code','OSPAR- Code':'ospar_id'}, inplace=True)
joint_list.fillna('0', inplace=True)

joint_list.ospar_id = joint_list.ospar_id.map(lambda x: drop_bad_codes(x))

jlmlw_only = joint_list[joint_list.mlw_code != '0'].copy()

jlmlw_only['paired'] = list(zip( jlmlw_only.mlw_code,jlmlw_only.ospar_id))

# make some code pairs
jlistkeys = joint_list[['mlw_code','ospar_id']].copy()

# set up a mapper:
mlwkeyed = {x[1]:x[0] for x in list(jlmlw_only.paired.unique())}
osparkeyed = {x[0]:x[0] for x in list(jlmlw_only.paired.unique())}

In [8]:
# mlw codes are no greater than 4 characters and ospar no greater than three
# remove and save any values that don't match that criteria

a_paired_list = jlmlw_only.paired.unique()
fails = []
paired = []
def check_length(x, paired, fails):
    
    xnot = len(x[0])
    
    if xnot > 4:
        fails.append((x[0], int(x[1])))
    else:
        paired.append(x)
        
for a_pair in a_paired_list:
    check_length(a_pair, paired, fails)

print("""
This is the mlw/ospar code list from david fleet.\n
MlW codes that had a length greater than 4 were counted as 0.\n
OSPAR codes that could not be typed to 'int' were counted as 0.\n
Any code with a value of 0 was excluded\n
""")
print(paired)
print(F"\nThese are the mlw codes that did not match from davids list:\n{fails}")


This is the mlw/ospar code list from david fleet.

MlW codes that had a length greater than 4 were counted as 0.

OSPAR codes that could not be typed to 'int' were counted as 0.

Any code with a value of 0 was excluded


[('G89', 48), ('G90', 48), ('G39', 25), ('G39', 113), ('G92', 48), ('G59', 35), ('G52', 33), ('G52', 115), ('G52', 116), ('G61', 48), ('G42', 26), ('G60', 36), ('G44', 27), ('G70', 43), ('G211', 105), ('G100', 103), ('G99', 104), ('G2', 24), ('G2', 121), ('G2', 23), ('G2', 2), ('G2', 3), ('G2', 112), ('G6', 7), ('G6', 4), ('G6', 8), ('G6', 9), ('G6', 6), ('G6', 12), ('G6', 5), ('G6', 10), ('G6', 11), ('G84', 48), ('G38', 40), ('G1', 1), ('G20', 15), ('G18', 13), ('G66', 39), ('G95', 98), ('G98', 102), ('G97', 101), ('G86', 48), ('G32', 20), ('G91', 48), ('G64', 48), ('G26', 16), ('G25', 48), ('G27', 64), ('G68', 41), ('G73', 45), ('G124', 48), ('G65', 38), ('G93', 48), ('G29', 18), ('G87', 48), ('G166', 73), ('G28', 17), ('G43', 114), ('G88', 48), ('G72', 48), ('G19',

In [9]:
codepaires_from_swiss_data = scodes_ospar['paired'].unique()
codepairs_from_fleet_data = paired

a_list_of_unique_pairs = list(set(codepaires_from_swiss_data) | set(codepairs_from_fleet_data))

gcodes = [x[0] for x in a_list_of_unique_pairs]

# check to see if there are any duplicate pairs mlw ==> ospar:

instances = {}
duplicates = []

for x in gcodes:
    if x not in instances:
        instances[x] = 1
    else:
        if instances[x] == 1:
            duplicates.append(x)
        instances[x] += 1

print("These codes have more than one ospar id attributed to them\n")
print(duplicates)

print("\nThis is how they are attributed\n")
print([x for x in a_list_of_unique_pairs if x[0] in duplicates])

print("\nThis is the number of different definitions\n")
print({k:v for k,v in instances.items() if k in duplicates})

print("\nThis is the MLW definition for those codes:\n")
print(swiss_codes_parent.loc[swiss_codes_parent.code.isin(duplicates)][['code', 'description']])

print("\nThis is how many times those mlw codes have been registered in the swiss data:\n")
print(swiss_surveys[swiss_surveys.code.isin(duplicates)].groupby('code').quantity.sum())

These codes have more than one ospar id attributed to them

['G194', 'G6', 'G149', 'G2', 'G173', 'G52', 'G95', 'G153', 'G210', 'G91', 'G39', 'G197']

This is how they are attributed

[('G6', 11), ('G194', 89), ('G95', 67), ('G194', 0), ('G210', 93), ('G6', 4), ('G173', 75), ('G149', 61), ('G2', 3), ('G52', 116), ('G6', 6), ('G149', 118), ('G149', 63), ('G39', 113), ('G153', 65), ('G2', 23), ('G197', 90), ('G173', 74), ('G149', 60), ('G52', 33), ('G2', 112), ('G2', 2), ('G2', 121), ('G95', 98), ('G6', 8), ('G149', 62), ('G153', 67), ('G6', 10), ('G6', 12), ('G91', 48), ('G52', 115), ('G6', 5), ('G210', 96), ('G6', 7), ('G2', 24), ('G91', 91), ('G6', 9), ('G39', 25), ('G197', 89)]

This is the number of different definitions

{'G6': 9, 'G194': 2, 'G95': 2, 'G210': 2, 'G173': 2, 'G149': 5, 'G2': 6, 'G52': 3, 'G39': 2, 'G153': 2, 'G197': 2, 'G91': 2}

This is the MLW definition for those codes:

     code                                        description
25   G210                         

### Choose the correct definition for MLW codes that have many OSPAR ids.

The EU is putting together a list of harmonized codes that makes it easier to switch between different systems. We will try and consult that list before making any hasty decisions.

### Account for equivalencies for dutch child codes

Both projects use a coding system for items of local concern (sub codes or child codes) we need to find each projects analog and use appropriate OSPAR code.



In [10]:
## !!! refresh the data from the hammerdirt api here:

# a = requests.get('https://mwshovel.pythonanywhere.com/api/surveys/daily-totals/code-totals/swiss/')
# b = requests.get('https://mwshovel.pythonanywhere.com/api/list-of-beaches/swiss/')
# c = requests.get('https://mwshovel.pythonanywhere.com/api/mlw-codes/list/')

# # the surveys need to be unpacked:
# swiss_surveys = ut.unpack_survey_results(a.json())
# swiss_surveys = pd.DataFrame(swiss_surveys)

# # adding location date column
# swiss_surveys['loc_date'] = list(zip(swiss_surveys['location'], swiss_surveys['date']))

# # hold the original
# x = a.json()

# print("survey columns")
# print(swiss_surveys.columns)

# swiss_beaches = pd.DataFrame(b.json())
# print("beach columns")
# print(swiss_beaches.columns)

# print("code columns")
# swiss_codes = pd.DataFrame(c.json())
# print(swiss_codes.columns)

# swiss_surveys.to_csv(data+'/hammerdirt_data.csv')
# swiss_beaches.to_csv(beaches+'/hammerdirt_beaches.csv')
# swiss_codes.to_csv(codes+'/swiss_codes.csv')
