In [81]:
import pandas as pd
import slugify
import datetime as dt
import numpy as np
import statsmodels.api as sm
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.cbook as cbook
from matplotlib.dates import MO, WeekdayLocator
# from blume.table import table
import random
from dataclasses import dataclass
import dataclasses

# Coefficient for plate counts.
# NOTE(review): not referenced in the cells visible here - confirm where it is applied.
coef = 100
# Placeholder for entries that are not usable counts (same default as handle_non_counts).
the_default = "tmtc"
# Formatter that renders dates as month-day.
myFmt = mdates.DateFormatter('%m-%d')
project = "Montreux 2022 - do it together water quality sampling"
# Lookup tables keyed by site code / count category; presumably plot markers and
# colors - TODO confirm against the plotting cells.
site_markers = {"svt":"o", "vnx":"D", "mrd":"X"}
species_colors = { "bioindicator":"dodgerblue", "coliform":"magenta"}
marker_colors = {"svt":"black", "vnx":"green", "mrd":"goldenrod"}
sites = ["svt", "vnx", "mrd"]

# Important dates: samples dated from f_start to f_end (inclusive) get is-jazz = True.
f_start = dt.date(2022, 7, 1)
f_end = dt.date(2022, 7, 16)

# Formatting and counting methods
# Nd, nd, nan, NaN or tmtc are all converted to tmtc.
# These counts are eliminated from the data.
# Another possibility is to replace tmtc with
# an expected value based on the rest of the data.
# Counting it as zero would be underestimating, and
# estimating a max value would not be supportable
# unless we wanted to do some big sidebar into probability
# theory.

def handle_non_counts(x: int = None, a_func: callable = None, the_default: str = "tmtc") -> str:
    """Convert a raw plate-count entry to an int, or to a placeholder when it is not a count.

    Args:
        x: The raw cell value. May be an int, a string such as "12" or "nd",
            a float (pandas yields NaN for empty cells), or None.
        a_func: Optional callable applied to non-numeric strings; its result
            is returned instead of ``the_default``.
        the_default: Placeholder returned for anything that is not a count.

    Returns:
        The integer count when ``x`` is an integer, a string of decimal
        digits, or a float with an integral value; ``None`` when ``x`` is
        None; otherwise ``a_func(x)`` (non-numeric strings, when ``a_func``
        is given) or ``the_default``.
    """
    # Local import keeps the function self-contained; numpy integer scalars are
    # registered as numbers.Integral but are not instances of the builtin int.
    import numbers

    if x is None:
        return None

    if isinstance(x, numbers.Integral):
        return x

    if isinstance(x, str):
        # isdecimal() only accepts characters that int() can parse; isdigit()
        # also accepts e.g. superscript digits, which make int() raise.
        if x.isdecimal():
            return int(x)
        return a_func(x) if a_func is not None else the_default

    # Floats come from pandas when a column holds missing values: 3.0 is a
    # valid count. NaN and infinities fail is_integer() and fall through.
    if isinstance(x, float) and x.is_integer():
        return int(x)

    # Anything else (NaN, fractional values, unexpected types) is not a count.
    return the_default

def correct_name_spelling(x, possible_names: set = None):
    """Return the site part of a plate label, e.g. ``"MRD 5"`` -> ``"MRD"``.

    All spaces are removed from the label and the final character (the plate
    number) is dropped.

    Args:
        x: The label as a string.
        possible_names: Accepted for backward compatibility; it is not used.

    Returns:
        The label without spaces and without its last character.

    Raises:
        ValueError: If ``x`` is not a string (for example NaN from an empty
            cell).
    """
    # The original `except ValueError("...")` named an exception instance, so
    # any failure surfaced as a TypeError; and had it been caught, `site`
    # would have been unbound at the return. Validate the input up front.
    if not isinstance(x, str):
        raise ValueError(f"That was not the expected input: {x!r}")

    return x.replace(" ", "")[:-1]
    
# converting the date to an iso standard    
def iso_standard_date(x):
    """Parse a ``dd.mm.yy`` string and return the matching ``datetime.date``."""
    parsed = dt.datetime.strptime(x, "%d.%m.%y")
    return parsed.date()

def get_the_date_range(x):
    """Return ``(smallest, largest)`` of the values in ``x``."""
    largest = max(x)
    return (min(x), largest)
# CSV file holding the raw plate counts; the path is relative, so it is
# resolved against the notebook's working directory.
this_file ="MCBP2022_12oct22.csv"

data = pd.read_csv(this_file)
# new_cols = ['date', 'site', 'bioindicator', 'coliform', 'turquoise', 'image']
# cols = {x:new_cols[i] for i, x in enumerate(data.columns)}
# data.rename(columns=cols, inplace=True)


# Platform sample labels as they appear in the SITE column.
# pform05 holds the "0.5m off platform <letter>" spellings.
pform05 = ['0.5m off platform B', '0.5m off platform C',
       '0.5m off platform D', '0.5m off platform E',
       '0.5m off platform F', '0.5m off platform A']

# pform5 holds both the "platform 0.5<letter>" and the "platform 5m <letter>"
# spellings (note the irregular 'platform 0.5 E').
pform5 = ['platform 0.5A', 'platform 0.5B', 'platform 0.5C', 'platform 0.5D',
       'platform 0.5 E', 'platform 0.5F', 'platform 5m A',
       'platform 5m B', 'platform 5m C', 'platform 5m D', 'platform 5m E',
       'platform 5m F']

# Maps every platform label to a normalized code: 'pform05' (both 0.5 spellings)
# or 'pform5' (5m labels), followed by a digit 1-6 that stands for letters A-F.
new_pform = {
    '0.5m off platform B': 'pform052',
    '0.5m off platform C': 'pform053',
    '0.5m off platform D': 'pform054',
    '0.5m off platform E': 'pform055',
    '0.5m off platform F': 'pform056',
    '0.5m off platform A': 'pform051',
    'platform 0.5A': 'pform051',
    'platform 0.5B': 'pform052', 
    'platform 0.5C': 'pform053', 
    'platform 0.5D': 'pform054',
    'platform 0.5 E': 'pform055',
    'platform 0.5F': 'pform056',
    'platform 5m A': 'pform51',
    'platform 5m B': 'pform52',
    'platform 5m C': 'pform53',
    'platform 5m D': 'pform54',
    'platform 5m E': 'pform55',
    'platform 5m F': 'pform56',
       
}

# post jazz sampling
# september 12 2022
# also consider the samples from svt, mrd and vnx on the same days



# Clarens labels: samples from the 18th of July and the first of August, taken
# from officially monitored locations (the original note ends here - presumably
# bathing sites; TODO confirm).
clarens = ['Clarens 1', 'Clarens 2', 'Clarens 4', 'Clarens 5',  'Clarens 3' ]


# platforms = data[data["Unnamed: 1"].isin(alist)]

# this_file = pd.read_csv("counts_dc.csv")
# this_file

# Accepted labels for the three survey sites: site code followed by the plate
# number 1-6. MRD is listed both with and without a space before the number.
possible_names = [
    'SVT1', 'SVT2', 'SVT3', 'SVT4', 'SVT5', 'SVT6', 'VNX1', 'VNX2', 'VNX3', 'VNX4',
    'VNX5', 'VNX6', 'MRD1', 'MRD2', 'MRD3', 'MRD4', 'MRD5', 'MRD6', 'MRD 5', 'MRD 6',
    'MRD 3', 'MRD 4', 'MRD 1', 'MRD 2', 
]

# Every label that is kept: survey sites, platform samples and Clarens.
combined = [*possible_names, *pform05, *pform5, *clarens]

# Keep the columns of interest and give them short lowercase names.
# rename() returns a new frame, so the column subset is not modified in place.
data = data[['DATE', 'SITE', 'Count Blue', 'Pink', 'Turquoise', 'Image']]
new_cols = ['date', 'site', 'bioindicator', 'coliform', 'turquoise', 'image']
cols = dict(zip(data.columns, new_cols))
data = data.rename(columns=cols)

# Skip the first row, then keep only rows with a recognised label.
# NOTE(review): the reason for skipping row 0 is not visible here - presumably
# it is a sub-header row; confirm against the CSV.
work_data = data[1:].loc[lambda df: df["site"].isin(combined)].copy()

# make the dates iso standard
work_data["date"] = work_data["date"].map(iso_standard_date)
sampled = work_data[work_data["site"].isin(possible_names)].copy()

# The last character of the label is the plate number.
sampled["plate-number"] = sampled["site"].map(lambda x: int(x[-1]))
# Drop the plate digit BEFORE slugifying. Slugifying the full label first turns
# 'MRD 5' into 'mrd-5', which left 'mrd-' as the site once the digit was cut.
sampled["new_site"] = sampled["site"].map(lambda x: slugify.slugify(x[:-1]))
sampled["site"] = sampled["new_site"]
sampled.site.unique()

array(['svt', 'vnx', 'mrd'], dtype=object)

In [82]:
# Clarens rows: split the label into a plate number (last character) and the
# remaining site name.
# NOTE(review): 'Clarens 1'[:-1] is 'Clarens ' - the site value keeps the
# space that preceded the plate number.
clrn = (
    work_data.loc[work_data["site"].isin(clarens)]
    .assign(site=lambda df: df["site"].map(str.strip))
    .assign(**{"plate-number": lambda df: df["site"].map(lambda label: int(label[-1]))})
    .assign(new_site=lambda df: df["site"].map(lambda label: label[:-1]))
    .assign(site=lambda df: df["new_site"])
)
clrn.head()

Unnamed: 0,date,site,bioindicator,coliform,turquoise,image,plate-number,new_site
212,2022-07-18,Clarens,2,58,1,IMG_7991.JPG,1,Clarens
213,2022-07-18,Clarens,1,84,2,IMG_7992.JPG,2,Clarens
214,2022-07-18,Clarens,1,65,0,IMG_7993.JPG,3,Clarens
263,2022-08-01,Clarens,0,20,4,IMG_8290.JPG,1,Clarens
264,2022-08-01,Clarens,0,29,1,IMG_8290.JPG,2,Clarens


In [83]:
# Platform rows: translate each raw label to its normalized code, then split
# the code into the plate number (last character) and the site prefix.
pformdata = (
    work_data.loc[work_data["site"].isin(list(new_pform))]
    .assign(site=lambda df: df["site"].map(new_pform))
    .assign(**{"plate-number": lambda df: df["site"].map(lambda code: int(code[-1]))})
    .assign(site=lambda df: df["site"].map(lambda code: code[:-1]))
)
pformdata.head()

Unnamed: 0,date,site,bioindicator,coliform,turquoise,image,plate-number
104,2022-07-04,pform05,0,1,0,IMG_7678.JPG,1
105,2022-07-04,pform05,0,2,0,IMG_7678.JPG,2
106,2022-07-04,pform05,0,0,0,IMG_7678.JPG,3
107,2022-07-04,pform05,0,1,0,IMG_7678.JPG,4
108,2022-07-04,pform05,0,0,0,IMG_7678.JPG,5


In [84]:
# Stack the platform, Clarens and survey-site rows into one table.
survey_data = pd.concat([pformdata, clrn, sampled], axis=0)
# Subset without the date and new_site columns.
t = survey_data.loc[:, ["site", "bioindicator", "coliform", "turquoise", "image", "plate-number"]]
survey_data.columns

Index(['date', 'site', 'bioindicator', 'coliform', 'turquoise', 'image',
       'plate-number', 'new_site'],
      dtype='object')

In [85]:
# Long format: one row per (date, site, plate, category) with the raw count in `value`.
sdlong = survey_data.melt(
    id_vars=["date", "site", "plate-number"],
    value_vars=["bioindicator", "coliform", "turquoise"],
)
# Check which plate numbers are present for the pform05 rows.
sdlong.loc[sdlong["site"] == "pform05", "plate-number"].unique()

array([1, 2, 3, 4, 5, 6])

In [86]:
# Show the column labels of the long-format table.
sdlong.columns

Index(['date', 'site', 'plate-number', 'variable', 'value'], dtype='object')

In [87]:
# Preview the first rows of the long-format table.
sdlong.head()

Unnamed: 0,date,site,plate-number,variable,value
0,2022-07-04,pform05,1,bioindicator,0
1,2022-07-04,pform05,2,bioindicator,0
2,2022-07-04,pform05,3,bioindicator,0
3,2022-07-04,pform05,4,bioindicator,0
4,2022-07-04,pform05,5,bioindicator,0


In [88]:
# List every distinct raw entry in the value column to spot non-numeric markers.
sdlong.value.unique()

array(['0', 'nd', '2', '3', '4', '1', '12', '7', '15', '6', '8', '9',
       '14', '16', '11', '19', '36', '25', '26', '58', '84', '65', '20',
       '29', '23', '5', '13', '10', '35', '51', '45', '55', '28', '37',
       '21', '31', '27', '44', '24', '49', '32', '41', '39', '18', '40',
       '56', '79', '64', '61', '34', '43', '22', '48', '54', '57', '134',
       '214', '145', '165', '144', '142', '50', '62', '42', '53', '52',
       '38', '33', '60', '47', '72', '46', '17', '102', '100', '76', '59',
       '70', '74', '68', '95'], dtype=object)

In [89]:
# Drop the rows whose entry is the "nd" marker. The explicit .copy() makes the
# result an independent frame, so adding or renaming columns on it later does
# not raise SettingWithCopyWarning.
sdlong = sdlong.loc[sdlong["value"] != "nd"].copy()

In [93]:
# we may want to include these results?
# combined = [*possible_names, *controls]
# combined_names = possible_names & {'eMRD1' 'eMRD2' 'eMRD3' 'eMRD4' 'eMRD5' 'eMRD6' }

# The place names are given with the plate number. The process involves
# computing the expected value from multiple plates from one place on the
# same day (replicates). The plate number is removed from the place name
# and given its own column, which allows the data to be in long form,
# i.e. <date>, <site>, <category>, <count>, <plate number> for each plate in
# the survey.

# Rename the variable and value columns. rename() returns a new frame, so the
# filtered frame produced by an earlier cell is not mutated in place.
sdlong = sdlong.rename(columns={"variable": "category", "value": "count"})

# NOTE(review): "count" still holds the raw entries exactly as read from the
# file; no integer conversion is applied in this cell.

# Assign the ISO week number to each sample. Index [1] of isocalendar() is the
# week on every Python 3 version (the .week attribute needs 3.9+).
sdlong["week"] = sdlong["date"].map(lambda x: x.isocalendar()[1])

# Re-index the weeks as 1..n. This previously read `long_work_data`, which is
# never defined in this notebook (NameError on a fresh kernel). The ISO weeks
# are sorted first, because rows are not in date order and first-appearance
# numbering would not be chronological. Assumes all samples share one year.
week_nums = {x: i + 1 for i, x in enumerate(sorted(sdlong["week"].unique()))}
sdlong["week"] = sdlong["week"].map(week_nums)

# Flag samples dated from f_start to f_end inclusive.
sdlong["is-jazz"] = (sdlong["date"] >= f_start) & (sdlong["date"] <= f_end)
sdlong.to_csv("long_work_data.csv", index=False)

In [94]:
# Display the finished long-format table.
sdlong

Unnamed: 0,date,site,plate-number,category,count,week,is-jazz
0,2022-07-04,pform05,1,bioindicator,0,1,True
1,2022-07-04,pform05,2,bioindicator,0,1,True
2,2022-07-04,pform05,3,bioindicator,0,1,True
3,2022-07-04,pform05,4,bioindicator,0,1,True
4,2022-07-04,pform05,5,bioindicator,0,1,True
...,...,...,...,...,...,...,...
787,2022-09-12,mrd,2,turquoise,0,9,False
788,2022-09-12,mrd,3,turquoise,0,9,False
789,2022-09-12,mrd,4,turquoise,2,9,False
790,2022-09-12,mrd,5,turquoise,2,9,False
