In [1]:
import pandas as pd
import numpy as np
import feather
import os as os
import glob
from joblib import Parallel, delayed
import multiprocessing
import gc
from datetime import datetime, timedelta

In [65]:
def spherical_dist_populate(lat_lis, lon_lis, r=3958.75):
    lat_mtx = np.array([lat_lis]).T * np.pi / 180
    lon_mtx = np.array([lon_lis]).T * np.pi / 180

    cos_lat_i = np.cos(lat_mtx)
    cos_lat_j = np.cos(lat_mtx)
    cos_lat_J = np.repeat(cos_lat_j, len(lat_mtx), axis=1).T

    lat_Mtx = np.repeat(lat_mtx, len(lat_mtx), axis=1).T
    cos_lat_d = np.cos(lat_mtx - lat_Mtx)

    lon_Mtx = np.repeat(lon_mtx, len(lon_mtx), axis=1).T
    cos_lon_d = np.cos(lon_mtx - lon_Mtx)

    mtx = r * np.arccos(cos_lat_d - cos_lat_i*cos_lat_J*(1 - cos_lon_d))
    return mtx

def stationary_vessel(data):
    min_d_lat = data.sort_values(['lat', 'lon']).lat.iloc[0]
    min_d_lon = data.sort_values(['lat', 'lon']).lon.iloc[0]
    max_d_lat = data.sort_values(['lat', 'lon']).lat.iloc[-1]
    max_d_lon = data.sort_values(['lat', 'lon']).lon.iloc[-1]

    dist = spherical_dist_populate([min_d_lat, max_d_lat], [min_d_lon, max_d_lon])
    dist = dist[1,0]

    if dist >= 1:
        return 1
    else:
        return 0

In [92]:
import pandas as pd

def subx_y(data):
    diff = data['y'].max() - data['x'].min()
    print(data['group'])
    print('min(x) = ', data['x'].min())
    print('max(y) = ', data['y'].max())
    #print(diff)
        
    if diff == 60:
        return 0
    else:
        return 1
    
test = pd.DataFrame({'group': [1, 1, 1, 2, 2, 2], 'x': [50, 55, 60, 50, 50, 50], 'y': [100, 105, 110, 100, 100, 100]})
#print(test)
grouped = test.groupby('group').apply(subx_y)
#print(grouped)

0    1
1    1
2    1
Name: group, dtype: int64
min(x) =  50
max(y) =  110
0    1
1    1
2    1
Name: group, dtype: int64
min(x) =  50
max(y) =  110
3    2
4    2
5    2
Name: group, dtype: int64
min(x) =  50
max(y) =  100


In [54]:
outdat = test.groupby('mmsi').apply(stationary_vessel).transform()
print(outdat)

1
1
2


TypeError: transform() missing 1 required positional argument: 'func'

In [7]:
GFW_DIR = '/data2/GFW_point/'
GFW_OUT_DIR_CSV = '/home/server/pi/homes/woodilla/Data/GFW_point/Patagonia_Shelf/csv/'
GFW_OUT_DIR_FEATHER = '/home/server/pi/homes/woodilla/Data/GFW_point/Patagonia_Shelf/feather/'

In [8]:
gfw_vessel_dat = pd.read_csv('~/Data/GFW_public/fishing_vessels/fishing_vessels.csv')
dat = pd.read_csv('/data2/GFW_point/2016-01-01/messages-2016-01-01-000000000000.csv')
#gfw_identities = gfw_identities.rename(index=str, columns={"ssvid": "mmsi"})
#gfw_identities['year'] = pd.DatetimeIndex(gfw_identities['timestamp']).year 
#gfw_identities['month'] = pd.DatetimeIndex(gfw_identities['timestamp']).month
#gfw_identities['day'] = pd.DatetimeIndex(gfw_identities['timestamp']).day

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
def data_step(data): 
    # Patagonia shelf coordinates
    #lower left lat: -58
    #lower left lon: -77
    #upper right lat: -23
    #upper right lon: -22
    
    # (1) Subset all vessels that are in Patagonia Shelf region
    # (2) Keep incomplete segments
    lon1 = -77
    lon2 = -22
    lat1 = -58
    lat2 = -23
    retdat = data[(data['lon'] >= lon1) & (data['lon'] <= lon2) & (data['lat'] >= lat1) & (data['lat'] <= lat2)]

    # Get list of all vessels in region
    unique_vessels = list(retdat['mmsi'].unique())

    # Subset to allow for all segments even if outside of range
    retdat = data[data['mmsi'].isin(unique_vessels)]
    
    # (3) Remove boats from land and at port
    retdat = retdat['distance_from_shore_m' > 0]
    retdat = retdat['distance_from_port_m' > 0]    
    
    # (4) Stationary
    retdat = retdat.grouby('mmsi').transform(stationary_vessel)
    

    
    # (5) In/Out of EEZ (country)
    
    # Separate Year, month, day, hour, minute, second
    retdat.loc[:, 'timestamp'] = pd.to_datetime(retdat['timestamp'], format="%Y-%m-%d %H:%M:%S UTC")
    retdat.loc[:, 'year'] = pd.DatetimeIndex(retdat['timestamp']).year 
    retdat.loc[:, 'month'] = pd.DatetimeIndex(retdat['timestamp']).month
    retdat.loc[:, 'day'] = pd.DatetimeIndex(retdat['timestamp']).day
    retdat.loc[:, 'hour'] = pd.DatetimeIndex(retdat['timestamp']).hour
    retdat.loc[:, 'minute'] = pd.DatetimeIndex(retdat['timestamp']).minute
    retdat.loc[:, 'second'] = pd.DatetimeIndex(retdat['timestamp']).second
    
    # Merge GFW ID data
    retdat = pd.merge(retdat, gfw_vessel_dat, how='left', on='mmsi')  
    
    retdat = retdat[['timestamp', 'year', 'month', 'day', 'hour', 'minute', 'second', 'mmsi', 'lat', 'lon', \
                    'segment_id', 'message_id', 'type', 'speed', 'course', 'heading', 'shipname', 'callsign', \
                     'destination', 'elevation_m', 'distance_from_shore_m', 'distance_from_port_m', 'nnet_score', \
                     'logistic_score', 'flag', 'geartype', 'length', 'tonnage', 'engine_power', 'active_2012', \
                     'active_2013', 'active_2014', 'active_2015', 'active_2016']]
    
    return retdat

In [1]:
def GFW_directories(directory):
    
    dirs = os.listdir(directory)
    # Remove subfolders 'BK' and 'identities'
    if 'BK' in dirs:
        dirs.remove('BK')
    
    if 'identities' in dirs:
        dirs.remove('identities')
    
    return dirs

In [6]:
ndat = data_step(data=dat)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)


In [7]:
dirs = sorted(GFW_directories(GFW_DIR))
len(dirs)

1096

In [25]:
allFiles

NameError: name 'allFiles' is not defined

In [10]:
dirs = sorted(GFW_directories(GFW_dir))

for i in dirs[1:2]:  # Change!!!

    # Get subdirectory list of files
    subdir = GFW_DIR + i
    allFiles = glob.glob(subdir + "/*.csv")
    list_ = []
    
    # Append files in subdir
    for file_ in allFiles:
        df = pd.read_csv(file_, index_col=None, header=0)
        list_.append(df)
        dat = pd.concat(list_, axis = 0, ignore_index = True)
    
    # Append data
    outdat = data_step(data=dat)
    outdat = outdat.fillna()

    # Get string for filename from timestamp
    filename = f"{outdat['year'][1]}-" + f"{outdat['month'][1]}".zfill(2) + f"-" + f"{outdat['day'][1]}".zfill(2)
    
    # Save unique mmsi for each day
    unique_mmsi_data = outdat['mmsi'].unique()
    unique_mmsi = pd.DataFrame({'mmsi':unique_mmsi_data})
    unique_mmsi.to_feather('~/Data/GFW_point/Patagonia_Shelf/vessel_list/' + filename +  '_vessel_list'  + '.feather')
    
    # Save data
    outdat.to_csv('~/Data/GFW_point/Patagonia_Shelf/csv/' + filename + '.csv', index=False)
    outdat.to_feather('~/Data/GFW_point/Patagonia_Shelf/feather/' + filename + '.feather')
    print(i)

NameError: name 'GFW_dir' is not defined

In [9]:
gfw_list_dirs = sorted(GFW_directories(GFW_DIR))

# Check for missing files
# Get csv files from output
csv_files = glob.glob(GFW_OUT_DIR_CSV + "*.csv")
csv_files = [item.replace('/home/server/pi/homes/woodilla/Data/GFW_point/Patagonia_Shelf/csv/', '') for item in csv_files]
csv_files = [item.replace('.csv', '') for item in csv_files]

feather_files = glob.glob(GFW_OUT_DIR_CSV + "*.csv")
feather_files = [item.replace('/home/server/pi/homes/woodilla/Data/GFW_point/Patagonia_Shelf/csv/', '') for item in feather_files]
feather_files = [item.replace('.csv', '') for item in feather_files]
feather_files

csv_feather_diff = list(set(csv_files).difference(feather_files))
gfw_diff = list(set(gfw_list_dirs).difference(feather_files))
gfw_diff.extend(csv_feather_diff)
gfw_list_dirs = gfw_diff
gfw_list_dirs

new_gfw_list_dirs = []
for i in gfw_list_dirs:
    indate = i
    outdate = datetime.strptime(indate, "%Y-%m-%d")
    outdate = outdate + timedelta(days=-1)
    outdate = datetime.strftime(outdate, "%Y-%m-%d")
    new_gfw_list_dirs.append(outdate)
#new_gfw_list_dirs
#gfw_list_dirs
sorted(gfw_list_dirs)
#from datetime import datetime, timedelta
#s = gfw_list_dirs
#date = datetime.strptime(s, "%Y-%m-%d")
##modified_date = date + timedelta(days=-1)
#modified_date = datetime.strftime(modified_date, "%Y/%m/%d")
#modified_date

['2016-01-01']

In [232]:
inputs = dirs
def processGFW(i):
    subdir = GFW_DIR + i
    allFiles = glob.glob(subdir + "/*.csv")
    list_ = []
    
    # Append files in subdir
    for file_ in allFiles:
        df = pd.read_csv(file_, index_col=None, header=0)
        list_.append(df)
        dat = pd.concat(list_, axis = 0, ignore_index = True)
    outdat = data_step(data=dat, filename=i)
    outdat.to_csv('~/Data/GFW_point/Patagonia_Shelf/csv/' + i + '.csv')
    outdat.to_feather('~/Data/GFW_point/Patagonia_Shelf/feather/' + i + '.feather')
    #print(i)  
 
num_cores = 20
results = Parallel(n_jobs=num_cores)(delayed(processGFW)(i) for i in inputs)