### General template


Fetches data from the local source and establishes the following variables:

1. dutch_codes
2. swiss_codes
3. dutch_surveys
4. swiss_surveys
5. swiss_beaches


Establishes directory variables for fetching and putting to all subdirectories:

1. data
2. beaches
3. codes
4. geo
5. output

Provides a script to update the remote data.

In [1]:
# sys things
import os
import sys
import json

# networks
import requests

# data
import pandas as pd
import numpy as np
import scipy
import math
import seaborn as sns

import resources.utilities.utility_functions as ut

In [2]:
# get folder extensions: unpack the five local directory paths returned by
# the project's utility module, then echo them as one space-separated line
data, beaches, codes, geo, output = ut.make_local_paths()
print(" ".join(map(str, (data, beaches, codes, geo, output))))

resources/surveydata resources/locationdata resources/mlwcodedefs resources/geodata output


In [3]:
# code data
# housekeeping is chained onto the read: missing values in the dutch codes
# become 0, and both code tables end up with an 'ospar_id' column
dutch_codes = (
    pd.read_csv(codes+'/dutch_codes.csv')
    .fillna(0)
    .rename(columns={'OSPAR_ID':'ospar_id', 'Description':'description'})
)
swiss_codes = (
    pd.read_csv(codes+'/swiss_codes.csv')
    .rename(columns={'ospar_code':'ospar_id'})
)

# survey data
dutch_surveys = pd.read_csv(data+'/dataset_macrolitter_NL.csv')

# use the aggregated hd data. This accounts for the custom codes used in Switzerland
swiss_surveys = pd.read_csv(data+'/aggregated_hd_surveys.csv')

# location data
swiss_beaches = pd.read_csv(beaches+'/hammerdirt_beaches.csv')

In [4]:
# process the dutch codes:
# split every 'ospar_id' into its whole-number part (parent_code) and the
# fractional remainder (child_code).
# np.floor is used instead of round(0): rounding sends an id whose fractional
# part is above .5 to the *next* whole number, which yields a negative
# child_code and drops the row from both selections below.
dutch_codes['parent_code'] = np.floor(dutch_codes.ospar_id)
dutch_codes['child_code'] = dutch_codes.ospar_id - dutch_codes.parent_code

# the rows that carry a fractional remainder (child codes) and the
# distinct parent codes they belong to:
child_codes = dutch_codes.loc[dutch_codes.child_code > 0]
ccodes = child_codes.parent_code.unique()

# all the codes with no remainder:
parent_codes = dutch_codes.loc[dutch_codes.child_code == 0]
pcodes = parent_codes.parent_code.unique()

# every distinct parent code present in the dutch codes, whether it comes
# from a whole-number id or from the integer part of a child id:
dcodesall = dutch_codes.parent_code.unique()

In [5]:
# process the swiss codes
# split the definitions on the literal 'Parent code' marker found in the
# parent_code column:
swiss_codes_parent = swiss_codes[swiss_codes['parent_code'].eq('Parent code')]
swiss_codes_child = swiss_codes[swiss_codes['parent_code'].ne('Parent code')]

# the distinct codes that appear in the survey records:
swiss_pcodes_used = swiss_surveys['code'].unique()

# keep only the definitions of codes that are in use (copied, because the
# 'ospar_id' column of this frame is rewritten further down):
scodes_used = swiss_codes[swiss_codes['code'].isin(swiss_pcodes_used)].copy()
scodesu = scodes_used['ospar_id'].unique()

def drop_bad_codes(x):
    """Coerce a code value to ``int``, mapping unconvertible values to 0.

    Parameters
    ----------
    x : any
        The value to convert with ``int(x)``.

    Returns
    -------
    int
        ``int(x)`` when the conversion succeeds, otherwise ``0``.

    Notes
    -----
    Only conversion failures are turned into ``0``. Other exceptions
    (for example ``KeyboardInterrupt``) propagate to the caller instead of
    being silently swallowed.
    """
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None or an unsupported type
        # ValueError: non-numeric text or NaN
        # OverflowError: positive or negative infinity
        return 0

# de-duplicated integer version of the ospar ids in use
fixed_swiss_codes = list({drop_bad_codes(code) for code in scodesu})

# apply the same conversion to the column itself, then re-read the unique ids
scodes_used['ospar_id'] = scodes_used.ospar_id.map(drop_bad_codes)
scodesu = scodes_used.ospar_id.unique().astype('int')

# make sure they match

# check length
same_length = len(fixed_swiss_codes) == len(scodesu)
print(same_length)

# check types
print(type(fixed_swiss_codes[0]), type(scodesu[0]))

# check the symmetric difference:
print(list(set(fixed_swiss_codes).symmetric_difference(set(scodesu))))


True
<class 'int'> <class 'numpy.int64'>
[]


In [6]:
# codes in swisscode not in dutch codes:
# the dutch parent codes are put in a set so each membership test is a hash
# lookup rather than a scan of the whole array; the order of
# fixed_swiss_codes is kept
known_dutch_codes = set(dcodesall)
noncodes = [x for x in fixed_swiss_codes if x not in known_dutch_codes]

print("The ospar codes from the swiss data unaccounted for\n")
print(noncodes)
print("\n")

# code definitions for noncodes:
noncodesdf = scodes_used.loc[scodes_used.ospar_id.isin(noncodes)]

# mlw codes not accounted for:
# (message corrected from "The the MLW codes ..." to match the recorded output)
print("The MLW codes unaccounted for\n")
print(noncodesdf.code.unique())
print("\nDefining the unaccounted ospar codes will reduce the above list to zero")

The ospar codes from the swiss data unaccounted for

[0, 12, 23, 45, 181, 56, 94, 96, 111, 114, 118]


The MLW codes unaccounted for

['G213' 'G214' 'G136' 'G139' 'G140' 'G142' 'G143' 'G202' 'G203' 'G204'
 'G205' 'G208' 'G210' 'G174' 'G179' 'G180' 'G185' 'G186' 'G190' 'G191'
 'G193' 'G195' 'G197' 'G198' 'G199' 'G146' 'G147' 'G148' 'G149' 'G150'
 'G154' 'G156' 'G157' 'G158' 'G101' 'G102' 'G103' 'G104' 'G105' 'G106'
 'G107' 'G108' 'G109' 'G111' 'G112' 'G113' 'G114' 'G115' 'G116' 'G117'
 'G118' 'G119' 'G122' 'G123' 'G124' 'G13' 'G14' 'G17' 'G19' 'G2' 'G23'
 'G36' 'G37' 'G39' 'G43' 'G48' 'G49' 'G5' 'G50' 'G52' 'G53' 'G55' 'G56'
 'G59' 'G60' 'G61' 'G62' 'G63' 'G64' 'G65' 'G68' 'G70' 'G71' 'G73' 'G74'
 'G80' 'G83' 'G84' 'G89' 'G90' 'G92' 'G94' 'G943' 'G97' 'G99' 'G126'
 'G128' 'G129' 'G131' 'G132' 'G134' 'G999' 'G159' 'G160' 'G162' 'G166'
 'G167' 'G170' 'G171' 'G172' 'G173']

Defining the unaccounted ospar codes will reduce the above list to zero


In [7]:
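## !!! Uncomment this cell to refresh the local data from the hammerdirt API.
## NOTE: the surveys are written to hammerdirt_data.csv, while the loading cell
## reads aggregated_hd_surveys.csv, so this refresh alone does not change the
## survey file that is loaded.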

# a = requests.get('https://mwshovel.pythonanywhere.com/api/surveys/daily-totals/code-totals/swiss/')
# b = requests.get('https://mwshovel.pythonanywhere.com/api/list-of-beaches/swiss/')
# c = requests.get('https://mwshovel.pythonanywhere.com/api/mlw-codes/list/')

# # the surveys need to be unpacked:
# swiss_surveys = ut.unpack_survey_results(a.json())
# swiss_surveys = pd.DataFrame(swiss_surveys)

# # adding location date column
# swiss_surveys['loc_date'] = list(zip(swiss_surveys['location'], swiss_surveys['date']))

# # hold the original
# x = a.json()

# print("survey columns")
# print(swiss_surveys.columns)

# swiss_beaches = pd.DataFrame(b.json())
# print("beach columns")
# print(swiss_beaches.columns)

# print("code columns")
# swiss_codes = pd.DataFrame(c.json())
# print(swiss_codes.columns)

# swiss_surveys.to_csv(data+'/hammerdirt_data.csv')
# swiss_beaches.to_csv(beaches+'/hammerdirt_beaches.csv')
# swiss_codes.to_csv(codes+'/swiss_codes.csv')
