# fsrs2emolt.py
# Routine to process FSRS csv files and LatLong.txt files to make eMOLT-like, ORACLE-ready files
# modeled after the old "emolt_pd.py" routine
# Manual step for now: put the relevant csv and txt files in the proper subdirectory under "inputdir", such as /data5/jmanning/fsrs/YYYY/LFAXX/
#
# Jim Manning Feb 2020
#
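# Expected inputs for each year/zone (per the glob patterns and "latlon_file" construction below):
# a tab-delimited "LatLong LFA <zone>_<yy><yy+1>.txt" log file plus one "Minilog-T_<serialnumber>*.csv"
# (or "Asc-<serialnumber>*.000") temperature file per probe.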
import pandas as pd
#import fsrs2emolt_functions as func # functions included below for now
import datetime
from datetime import timedelta
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from pylab import * # provides ginput, mean, std, unique, isnan, where, array, etc.
import glob
from matplotlib.dates import num2date,date2num
import os.path
from math import radians, cos, sin, atan2, sqrt
#from conversions import c2f,f2cc
## HARDCODES ###
year='2010' # the start year; the data usually straddles the new year
LFAzone='34'
pc_or_linux='pc' # directory names differ depending on which machine I am working on
if pc_or_linux=='pc':
    inputdir='c:\\Users\\Joann\\Downloads\\FSRS\\'+year+'\\LFA'+LFAzone+'\\' # the directory with all the csv & txt files for this year and LFAzone
    webdir='c:\\Users\\Joann\\Downloads\\FSRS\\web\\' # folder to store all png and html results for the web
    outfile='c:\\Users\\Joann\\Downloads\\FSRS\\Prep_for_oracle.dat' # result file with hourly data
    sets_info_file='c:\\Users\\Joann\\Downloads\\FSRS\\sets_info.csv' # statistics on each deployment
else:
    inputdir='/net/data5/jmanning/fsrs/'+year+'/LFA'+LFAzone+'/'
    webdir='/net/newfish_www/html/nefsc/emolt/fsrs/'
    outfile='/net/data5/jmanning/fsrs/Prep_for_oracle.dat' # result file with hourly data
    sets_info_file='/net/data5/jmanning/fsrs/sets_info.csv' # statistics on each deployment
latlon_file=inputdir+'LatLong LFA '+LFAzone+'_'+year[2:4]+str(int(year[2:4])+1)+'.txt' # standard log file for this LFA and year
Sc='FS'+LFAzone # site code
skipr=8 # number of rows to skip in the csv file since it seems to vary from year to year
crit=1.0 # number of standard deviations of the first derivative used to reject spikes (see the "clean_time_series" function)
methSE='use_lalo_file' # method to define the start and end of the data (i.e. in the water); options are 'click' and 'use_lalo_file'
#################
### FUNCTIONS #########
def c2f(*c):
    """
    convert Celsius to Fahrenheit;
    accepts multiple values and returns a list when arguments are given
    """
    if not c:
        c = float(input('Enter Celsius value: '))
        f = 1.8 * c + 32
        return f
    else:
        f = [(i * 1.8 + 32) for i in c]
        return f
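# Example (not part of the processing): c2f(0.0, 100.0) returns [32.0, 212.0]; passing a pandas
# Series returns a one-element list holding the converted Series, which is why the plotting code
# below indexes the result as FT[0].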
def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great-circle distance (km) between two points
    on the earth (specified in decimal degrees)
    """
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * atan2(sqrt(a), sqrt(1-a))
    r = 6371 # mean Earth radius in kilometers
    d = c * r
    return d
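# Example (not part of the processing): 0.1 degrees of latitude is roughly 11 km, so
# haversine(-66.0, 43.7, -66.0, 43.8) returns ~11.1.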
def find_range_dist(lat, lon):
    # find the distance between successive logged positions plus its mean & max
    d = [] # distance (km) between consecutive points
    for k in range(len(lat)-1):
        d.append(haversine(lon[k+1], lat[k+1], lon[k], lat[k]))
    return d, mean(d), max(d)
def parse(datet, hrmn): # parses dates in the asc file
    dt = datetime.datetime.strptime(datet[0:10], '%Y-%m-%d') # normal format
    delta = timedelta(hours=int(hrmn[0:2]), minutes=int(hrmn[3:5]), seconds=int(hrmn[6:8]))
    return dt + delta
def my_x_axis_format(ax, dt):
    if dt > timedelta(days=6):
        intr = int(dt.days/6)
    else:
        intr = 2
    #ax.xaxis.set_minor_locator(mdates.WeekdayLocator(byweekday=(1),interval=intr))
    #ax.xaxis.set_minor_locator(mdates.DayLocator(interval=intr))
    #ax.xaxis.set_minor_formatter(mdates.DateFormatter('%b%d'))
    years = mdates.YearLocator() # every year
    yearsFmt = mdates.DateFormatter('')
    ax.xaxis.set_major_locator(years)
    ax.xaxis.set_major_formatter(yearsFmt)
    return ax
def chooseSE(df, start, end, skipr, crit): # zoom in on the clicked points and pick the exact start/end
    # 1) chooses start and end points by zooming into the initial click locations in time
    # 2) eliminates any changes > "crit" times the standard deviation
    # 3) plots the final raw and cleaned series with two axes
    '''
    def parse(datet):
        #print datet[0:10],datet[11:13],datet[14:16]
        dt=datetime.strptime(datet[0:10],'%Y-%m-%d')# normal format
        #dt=datetime.strptime(datet[0:10],'%d-%m-%Y') # Jerry Prezioso format
        delta=timedelta(hours=int(datet[11:13]),minutes=int(datet[14:16]))
        return dt+delta
    '''
    startfront = start[0]-2 # look 2 days either side of the point clicked
    startback = start[0]+2
    sfforplot = (num2date(startfront)).replace(minute=0,second=0,microsecond=0).isoformat(" ") # convert the clicked number to an appropriately formatted date string
    sbforplot = (num2date(startback)).replace(minute=0,second=0,microsecond=0).isoformat(" ")
    ZIF = df[sfforplot[0:19]:sbforplot[0:19]] # the DataFrame for the zoomed-in start figure
    fig = plt.figure()
    plt.plot(ZIF.index.to_pydatetime(), ZIF.values)
    sfinal = ginput(n=1) # click the exact start point
    sfinaltime = (num2date(sfinal[0][0])).replace(tzinfo=None)
    sfinalforplot = sfinaltime.replace(minute=0,second=0,microsecond=0).isoformat(" ")
    plt.clf()
    ##### zoomed-in figure for the end point ###########
    # same procedure as above, but to pick the exact point at the end of the record
    endfront = end[0]-2 # look 2 days either side of the point clicked
    endback = end[0]+2
    efforplot = (num2date(endfront)).replace(minute=0,second=0,microsecond=0).isoformat(" ")
    ebforplot = (num2date(endback)).replace(minute=0,second=0,microsecond=0).isoformat(" ")
    ZIB = df[efforplot[0:19]:ebforplot[0:19]]
    fig = plt.figure()
    plt.plot(ZIB.index.to_pydatetime(), ZIB.values)
    efinal = ginput(n=1)
    efinaltime = (num2date(efinal[0][0])).replace(tzinfo=None)
    efinalforplot = efinaltime.replace(minute=0,second=0,microsecond=0).isoformat(" ")
    plt.clf()
    return sfinalforplot, efinalforplot
def clean_time_series(FFR, criteria):
    # clean the time series by replacing spikes (first differences exceeding "criteria") with NaN
    a = 0
    for i in range(len(FFR)-4):
        diff1 = abs(FFR.values[i+1]-FFR.values[i])
        diff2 = abs(FFR.values[i+2]-FFR.values[i+1])
        diff3 = abs(FFR.values[i+3]-FFR.values[i+2])
        diff4 = abs(FFR.values[i+4]-FFR.values[i+3]) # added 11 Feb 2020 to deal with 4 data points in the FSRS case
        if diff1[0] > criteria and (diff2[0] > criteria or diff3[0] > criteria) or diff4[0] > criteria: # allows for two bad points in a row (added 2/5/2020)
            print(str(FFR.index[i]) + ' is replaced by NaN')
            a += 1
            FFR.values[i+1] = float('NaN')
            if diff3[0] > criteria:
                FFR.values[i+2] = float('NaN')
            if diff4[0] > criteria:
                FFR.values[i+3] = float('NaN')
    print('There are ' + str(a) + ' points that have been replaced')
    variables = ['Date','Time','RawTemp']
    FFC = FFR
    return FFC
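# Note on the threshold: "criteria" passed in above is crit*std(FF.values), computed in the main
# loop below, so with crit=1.0 spikes are judged against one standard deviation of the whole
# deployment record.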
def plot_final(FFC, lalo, d, title1, title2):
    # plots the original and the cleaned FFC (assumes temperature degC) and returns degF
    # "d" is the previously calculated movement of the trap
    # puts on the title (assumes fisherman's name)
    fig, ax = plt.subplots(1, 1) #, figsize=(9,3) )
    #TimeDelta=FFC.index[-1]-FFC.index[0]
    df = pd.read_csv(asc_file,skiprows=skipr,parse_dates={'datet':[0,1]},index_col='datet',date_parser=parse,names=['Date','Time','Temp'],encoding='unicode_escape')
    ax.plot(df.index, df.values, '-', color='r', label="raw data") #,zorder=0)
    #ax.set_ylim(min(FFC.values),max(FFC.values))
    ax.set_ybound(min(FFC.values), max(FFC.values))
    ax.set_xlim(min(FFC.index), max(FFC.index)) # limit the plot to logged data
    ax.set_ylabel('celsius')
    if FFC.index[0].year < FFC.index[-1].year: # when data straddles the new year, label both years
        year = str(int(FFC.index[0].year))+'-'+str(int(FFC.index[-1].year))
    else:
        year = str(int(FFC.index[0].year))
    ax.set_xlabel(year)
    plt.suptitle(title1)
    plt.title(title2)
    FT = c2f(FFC['Temp'])
    LT = c2f(lalo['Temp'].values) # logged temp in degC
    OT = c2f(df['Temp'])
    ax2 = ax.twinx()
    ax2.set_ylim(min(FT[0]), max(FT[0]))
    #ax2.plot(OT[0].index,OT[0],color='m',label="raw data")
    #ax2.plot(FT[0].index.to_pydatetime(),FT[0],color='b',label="clean data")
    ax2.plot(FT[0].index, FT[0], color='b', label="clean data") # plots degF on the right-hand side
    ax2.plot(lalo.index, LT[0], 'c*', label='logged data') #,zorder=10) # plots logged data from the LatLong file
    moveid = list(where(array(d) > 1.0)[0]) # movements logged greater than 1 km
    for kk in moveid:
        ax2.annotate('%s' % float('%6.1f' % d[kk])+' kms',
                     xy=(lalo.index[kk], LT[0][kk]-.3), xycoords='data',
                     xytext=(lalo.index[kk], min(FT[0])),
                     arrowprops=dict(facecolor='black', shrink=0.05),
                     horizontalalignment='right', verticalalignment='bottom')
        #ax2.text(lalo.index[kk],LT[0][kk],'%s' % float('%6.1f' % d[kk])+' kms')
    ax2.set_ylabel('fahrenheit')
    lines, labels = ax.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax2.legend(lines + lines2, labels + labels2, loc='best')
    fig.autofmt_xdate()
    #plt.show()
    return FT
def write_html_file(Sc, year, fisherman, sn):
    # writes an html file for this case (or appends to an existing one for this fisherman)
    #file="/net/pubweb_html/epd/ocean/MainPage/lob/"+fisherman+".html"
    file = webdir + fisherman + ".html"
    ex = os.path.exists(file)
    if ex:
        with open(file, 'r+') as fd:
            contents = fd.readlines()
            insert_string = '<tr><td></td><td></td><td><a href="'+fisherman+'_'+str(year)+'.png">'+str(year)+'</td></tr>'
            for index, line in enumerate(contents):
                if '</table>' in line:
                    contents.insert(index - 1, insert_string)
                    break
            fd.seek(0)
            fd.writelines(contents)
    else:
        #fin = open("/net/pubweb_html/epd/ocean/MainPage/lob/template.html", "rt")
        #fout = open("/net/pubweb_html/epd/ocean/MainPage/lob/"+fisherman+'.html', "wt")
        fin = open(webdir + "template.html", "rt")
        fout = open(webdir + fisherman + '.html', "wt")
        for line in fin:
            if 'AA' in line:
                fout.write(line.replace('AA', fisherman))
            elif 'XX01' in line:
                fout.write(line.replace('XX01', Sc).replace('ZZZ', str(int(sn))).replace('first setting', '<a href="./fsrs/'+Sc+'_'+fisherman+'.png">'+str(year)))
            elif 'next setting?' in line:
                fout.write(line.replace('next setting?', '<a href="./fsrs/'+Sc+'_'+fisherman+'.png">'+str(year)))
            else:
                fout.write(line)
        fin.close()
        fout.close()
####################################
# MAIN CODE
# read the LatLong file for this year and region (eventually we will loop through these variables year and LFA)
lalo_all=pd.read_csv(latlon_file,sep='\t')#.dropna()#,parse_dates={'datet':[1]},index_col='datet',date_parser=parsedate) # read in the tab-delimited header info
for k in range(len(lalo_all)):
    try:
        lalo_all['Date'][k]=pd.to_datetime(lalo_all['Date'][k],format='%d/%m/%Y') # the format most latlon files use to store dates
    except:
        try:
            lalo_all['Date'][k]=pd.to_datetime(lalo_all['Date'][k],format='%d-%b-%y') # case of 25-May-12 in, for example, '/net/data5/jmanning/fsrs/2011/LFA33/LatLong LFA 33_1112.txt'
        except:
            print('unacceptable date format in '+latlon_file)
sns=unique(lalo_all['Gauge']) # finds the unique serial numbers in the header file
sns=sns[~isnan(sns)] # keeps only those that are non-nan
#sns=[3377.0] # testing with this sn
#sns=sns[0:2] # testing with just the first couple of sns
numprobes=0
numhours=0 # track the total number of hours
for j in sns: # loop through all the serial numbers and read the appropriate csv file
    numprobes=numprobes+1
    lalo=lalo_all.loc[lalo_all['Gauge']==j] # restrict to this serial number
    lalo=lalo.set_index('Date') # sets the index
    lalo=lalo[~lalo.index.duplicated()] # gets rid of duplicates
    if len(lalo)>1:
        [d,mean_d,max_d]=find_range_dist(lalo['Latitude (degrees)'].values,lalo['Longitude (degrees)'].values)
    else:
        d,mean_d,max_d=[],0,0 # for the case of only one line in the log, such as 2012 sn 1332 (no movement to compute)
    # read the minilog file for this sn
    #file_path=inputdir+'Minilog-T_'+str(int(j))+'*.csv'
    if (len(sorted(glob.glob(inputdir+'Minilog-T_'+str(int(j))+'*.csv')))>0) or (len(sorted(glob.glob(inputdir+'Asc-'+str(int(j))+'*.000')))>0): # if a csv file exists for this serial number
        try:
            asc_file=sorted(glob.glob(inputdir+'Minilog-T_'+str(int(j))+'*.csv'))[0]
        except:
            asc_file=sorted(glob.glob(inputdir+'Asc-'+str(int(j))+'*.000'))[0] # if it is not a "Minilog-T" file, it might be available as an "Asc-" file
        f=open(asc_file,'rb') # reads the Minilog ascii file; we need 'rb' in Python 3 or we get a "utf-8" error
        lines=f.readlines()
        if lines[0][0:4].decode('utf-8')=='Date': # case of Minilog-T_5664 in 2016 LFA34 missing header lines
            continue
        if 'Description' in lines[2].decode('utf-8'): # as in 2011 cases
            fisherman=lines[2][19:-2].decode('utf-8').replace(' ','_') # the fisherman's name is usually in this field in 2012 files
        else:
            fisherman=lines[2][11:-2].decode('utf-8').replace(' ','_') # the fisherman's name is usually here in 2011 files
        f.close()
        fout=open(sets_info_file,'a')
        df=pd.read_csv(asc_file,skiprows=skipr,parse_dates={'datet':[0,1]},index_col='datet',date_parser=parse,names=['Date','Time','Temp'],encoding='unicode_escape') # dataframe with datetime index and tempC
        # prepare to call the "chooseSE" function but first choose Start and End
        if methSE=='click':
            # make a rough plot
            plt.plot(df.index,df['Temp'])
            # draw a marker for each haul documented in lalo with this sn
            plt.plot(lalo.index,lalo['Temp'],'r*')
            plt.show()
            print("click on start and stop times to save data")
            [start,end]=ginput(n=2)
            plt.clf()
            sfinalforplot,efinalforplot=chooseSE(df,start,end,skipr,crit) # refine the clicked start/end times
        else:
            start=min(lalo.index)
            end=max(lalo.index)
            sfinalforplot=start.replace(minute=0,second=0,microsecond=0).isoformat(" ")
            efinalforplot=end.replace(minute=0,second=0,microsecond=0).isoformat(" ")
        ###### for the final figure ################
        FF=df[sfinalforplot:efinalforplot] # FF is the DataFrame with all the records chosen to plot
        #criteria=crit*FF.values.std() # standard deviations
        criteria=crit*std(FF.values)
        FFC=clean_time_series(FF,criteria)
        titlelabel=fisherman+' at '+Sc+' in '+'%s' % float('%6.1f' % mean(lalo['Depth (m)']))+'m ('+'%s' % float('%6.1f' % min(lalo['Depth (m)']))+'-'+'%s' % float('%6.1f' % max(lalo['Depth (m)']))+') using SN = '+str(int(j))
        titlelabel2='max dist change = '+'%s' % float('%6.1f' % max_d)+' kilometers'
        FT=plot_final(FFC,lalo,d,titlelabel,titlelabel2)
        #plt.savefig('/net/pubweb_html/epd/ocean/MainPage/lob/fsrs/'+Sc+'_'+fisherman+'_'+year+'.png')
        plt.savefig(webdir+Sc+'_'+fisherman+'_'+year+'.png') # where "webdir" is hardcoded at the top of the code
        # output site, latitude, longitude, time, depth, temp to load into ORACLE,
        # taking the nearest logged lat, lon, depth for each temperature record
        lat,lon,dep=[],[],[]
        idex=[]
        for k in range(len(FFC)):
            idex=lalo.index.get_loc(FFC.index[k], method='nearest')
            lat.append(lalo['Latitude (degrees)'].iloc[idex])
            lon.append(lalo['Longitude (degrees)'].iloc[idex])
            dep.append(lalo['Depth (m)'].iloc[idex])
        FFC['site']=Sc
        FFC['lat']=lat
        FFC['lon']=lon
        FFC['depth']=dep
        #with open(outfilename, 'a') as f: # write to file with header the first time
        #if numprobes==1:
        #    FF.reindex(columns=['site','lat','lon','depth','Temp']).to_csv(outfile,header=outfile.tell()==0,float_format='%8.4f')
        #else:
        #    FF.reindex(columns=['site','lat','lon','depth','Temp']).to_csv(outfile,mode='a',header=outfile.tell()==0,float_format='%8.4f')
        FFC.reindex(columns=['site','lat','lon','depth','Temp']).to_csv(outfile,mode='a',float_format='%8.4f')
        fout.write(Sc+','+str(int(j))+','+fisherman+','+str(len(FFC))+','+str(round(mean(lalo['Latitude (degrees)']),4))+','+str(round(mean(lalo['Longitude (degrees)']),4))+','+str(round(mean_d,1))+','+str(round(max_d,1))+','+str(round(mean(lalo['Depth (m)']),1))+','+str(round(min(lalo['Depth (m)']),1))+','+str(round(max(lalo['Depth (m)']),1))+'\n')
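        # Columns appended to sets_info.csv by the line above: site code, gauge serial number, fisherman,
        # number of hourly records kept, mean latitude, mean longitude, mean and max trap movement (km),
        # and mean/min/max logged depth (m).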
        numhours=numhours+len(FFC)
        write_html_file(Sc,year,fisherman,j)
        fout.close()
print('Total number of hours = '+str(numhours))
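# Summary of outputs (per the writes above): hourly rows of site, lat, lon, depth, Temp appended to
# Prep_for_oracle.dat; one png and one html page per fisherman under "webdir"; and one summary line
# per deployment appended to sets_info.csv.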