# Boat Scraping

In [51]:
import requests
from bs4 import BeautifulSoup
from csv import writer

# Vessel page used for the exploratory cells below. Cavalier State is chosen
# because its coordinates match the outputs shown further down
# (32.02369 N / 81.04659 W) -- NOTE(review): confirm this was the page used.
url = 'https://www.vesselfinder.com/vessels/CAVALIER-STATE-IMO-0-MMSI-367340130'

# Browser-like User-Agent sent with the request.
agent = {"User-Agent":'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'}

# A timeout keeps a stalled connection from hanging the notebook, and
# raise_for_status() surfaces HTTP errors here instead of as a confusing
# IndexError when the page is parsed later.
response = requests.get(url, headers=agent, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text,"html.parser")

In [52]:
soup = BeautifulSoup(response.text,"html.parser")

In [3]:
#soup.find_all('table')

# Longitude (`latlon_clean[1]`) must be negative for western-hemisphere positions

In [53]:
# Every <td class="v3"> value cell on the vessel page, in document order.
table = soup.find_all(class_='v3')
coords = table[9] # coordinate row
text = coords.get_text() # Text from coordinate row, e.g. '32.02369 N/81.04659 W'
latlon = text.split('/')

# Convert '<magnitude> <hemisphere>' into a signed decimal string.
# South latitudes and west longitudes are negative; north/east stay positive.
# (Previously the longitude was always negated and the latitude never was.)
latlon_clean = []
for val in latlon:
    parts = val.split()
    magnitude = parts[0] if parts else ''
    hemisphere = parts[-1].upper() if len(parts) > 1 else ''
    if hemisphere in ('S', 'W'):
        magnitude = '-' + magnitude
    latlon_clean.append(magnitude)

position = ', '.join(latlon_clean)

In [54]:
coords

<td class="v3">32.02369 N/81.04659 W</td>

In [55]:
text

'32.02369 N/81.04659 W'

In [56]:
latlon

['32.02369 N', '81.04659 W']

In [57]:
latlon_clean

['32.02369', '-81.04659']

In [58]:
position

'32.02369, -81.04659'

## Reverse Address Lookup Using Lat/Long
`geopy` can reverse lookup an address from the lat/lon values.

In [59]:
from geopy.geocoders import Nominatim
# Geocoder client; "jwh" is the user agent string passed to Nominatim.
geolocator = Nominatim(user_agent="jwh")
# `position` is the 'lat, lon' string built in the coordinate-parsing cell above.
physical = geolocator.reverse(position)
# Full address text of the lookup result (a geopy Location, see type() below).
location = physical.address

In [63]:
physical

Location(Thunderbolt Marine Inc. Warehouse, 3124, River Drive, Thunderbolt, Chatham County, Georgia, 31404, United States, (32.0222342, -81.04666794933618, 0.0))

In [61]:
location

'Thunderbolt Marine Inc. Warehouse, 3124, River Drive, Thunderbolt, Chatham County, Georgia, 31404, United States'

In [64]:
type(physical)

geopy.location.Location

## Adding Time Delays
We don't want to flood the website with traffic. 

`time.sleep(n)` pauses the code for n seconds

In [49]:
import time
# Demonstration of a delay: execution pauses for three seconds between the two prints.
print('Start!')
time.sleep(3)
print('Ok...its been long enough!')

Start!
Ok...its been long enough!


## Putting it All Together
Time to assemble the entire script, including the dictionary of vessel URLs.

In [7]:
import requests
import time
import os
from bs4 import BeautifulSoup
from csv import writer
from geopy.geocoders import Nominatim
from tqdm import tqdm


# Browser-like User-Agent sent with every page request.
agent = {"User-Agent":'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'}
geolocator = Nominatim(user_agent="jwh")

# Timestamp for this run, e.g. '20201229-10_50_AM'; used in the output filename.
timestamp = time.strftime('%Y%m%d-%I_%M_%p')

# Output CSV for this run. Defined here so the script works on a fresh kernel
# instead of relying on a scratch cell further down the notebook.
DATA_DIR = 'data'
os.makedirs(DATA_DIR, exist_ok=True)
scrape_file = os.path.join(DATA_DIR, timestamp + '-Fleet Location.csv')

# Fleet lookup table: vessel display name -> its vesselfinder.com detail page URL.
# The scrape loop iterates over these items, requesting one page per vessel.
boats = {
    "Brangus": "https://www.vesselfinder.com/vessels/BRANGUS-IMO-0-MMSI-366899270",
    "Calcasieu River": "https://www.vesselfinder.com/vessels/CALCASIEU-RIVER-IMO-0-MMSI-367313850",
    "Cavalier State": "https://www.vesselfinder.com/vessels/CAVALIER-STATE-IMO-0-MMSI-367340130",
    "Columbia River": "https://www.vesselfinder.com/vessels/COLUMBIA-RIVER-IMO-0-MMSI-367187130",
    "Cooper River": "https://www.vesselfinder.com/vessels/COOPER-RIVER-IMO-0-MMSI-366867370",
    "East River": "https://www.vesselfinder.com/vessels/EAST-RIVER-IMO-0-MMSI-366898790",
    "Evergreen State": "https://www.vesselfinder.com/vessels/EVERGREEN-STATE-IMO-0-MMSI-367156370",
    "Lone Star State": "https://www.vesselfinder.com/vessels/LONESTAR-STATE-IMO-0-MMSI-367183730",
    "McCormack Boys": "https://www.vesselfinder.com/vessels/MCCORMACK-BOYS-IMO-0-MMSI-366872170",
    "Miami River": "https://www.vesselfinder.com/vessels/MIAMI-RIVER-IMO-0-MMSI-366898810",
    "Muskegon River": "https://www.vesselfinder.com/vessels/MUSKEGON-RIVER-IMO-0-MMSI-367509760",
    "Ohio River": "https://www.vesselfinder.com/vessels/OHIO-RIVER-IMO-0-MMSI-366899290",
    "Pearl River": "https://www.vesselfinder.com/vessels/PEARL-RIVER-IMO-0-MMSI-367187120",
    "Saginaw River": "https://www.vesselfinder.com/vessels/SAGINAW-RIVER-IMO-0-MMSI-367511230",
    "St Johns River": "https://www.vesselfinder.com/vessels/SAINT-JOHNS-RIVER-IMO-0-MMSI-367313760",
    "St Louis River": "https://www.vesselfinder.com/vessels/ST-LOUIS-RIVER-IMO-0-MMSI-367609770",
    "Volunteer State": "https://www.vesselfinder.com/vessels/VOLUNTEER-STATE-IMO-0-MMSI-367314110",
    "Wolf River": "https://www.vesselfinder.com/vessels/WOLF-RIVER-IMO-0-MMSI-367060910"
    }

def abbreviate_state(state):
    """Return the two-letter abbreviation for a full state or territory name.

    Args:
        state: Full name, e.g. 'Georgia'.

    Returns:
        The matching two-letter code (e.g. 'GA'), or `state` unchanged when
        the name is not in the lookup table.
    """
    codes_by_name = {
        'Alabama': 'AL',
        'Alaska': 'AK',
        'American Samoa': 'AS',
        'Arizona': 'AZ',
        'Arkansas': 'AR',
        'California': 'CA',
        'Colorado': 'CO',
        'Connecticut': 'CT',
        'Delaware': 'DE',
        'District of Columbia': 'DC',
        'Florida': 'FL',
        'Georgia': 'GA',
        'Guam': 'GU',
        'Hawaii': 'HI',
        'Idaho': 'ID',
        'Illinois': 'IL',
        'Indiana': 'IN',
        'Iowa': 'IA',
        'Kansas': 'KS',
        'Kentucky': 'KY',
        'Louisiana': 'LA',
        'Maine': 'ME',
        'Maryland': 'MD',
        'Massachusetts': 'MA',
        'Michigan': 'MI',
        'Minnesota': 'MN',
        'Mississippi': 'MS',
        'Missouri': 'MO',
        'Montana': 'MT',
        'Nebraska': 'NE',
        'Nevada': 'NV',
        'New Hampshire': 'NH',
        'New Jersey': 'NJ',
        'New Mexico': 'NM',
        'New York': 'NY',
        'North Carolina': 'NC',
        'North Dakota': 'ND',
        'Northern Mariana Islands': 'MP',
        'Ohio': 'OH',
        'Oklahoma': 'OK',
        'Oregon': 'OR',
        'Pennsylvania': 'PA',
        'Puerto Rico': 'PR',
        'Rhode Island': 'RI',
        'South Carolina': 'SC',
        'South Dakota': 'SD',
        'Tennessee': 'TN',
        'Texas': 'TX',
        'Utah': 'UT',
        'Vermont': 'VT',
        'Virgin Islands': 'VI',
        'Virginia': 'VA',
        'Washington': 'WA',
        'West Virginia': 'WV',
        'Wisconsin': 'WI',
        'Wyoming': 'WY',
    }
    # Unknown names fall through untouched rather than raising.
    return codes_by_name.get(state, state)

def get_city(raw):
    """Pick the most specific locality name from a reverse-geocode result.

    Args:
        raw: Dict with an 'address' sub-dict (as in `physical.raw`).

    Returns:
        The value of the first key present in the address, checked in the
        order 'city', 'town', 'village', 'county'; 'Error' if none exist.
    """
    address = raw['address']
    for key in ('city', 'town', 'village', 'county'):
        if key in address:
            return address[key]
    return 'Error'


# Scrape every vessel page, reverse-geocode its position, and write one CSV
# row per vessel to `scrape_file`. Total wall-clock time is reported at the end.
start = time.time()

with open(scrape_file, 'w', newline='') as file:
    csv_writer = writer(file)

    # Add Headers
    csv_writer.writerow(['Vessel', 'Lat-Long','Last Update','City', 'State'])

    for boat, url in tqdm(boats.items()):
        # The timeout stops one stalled connection from hanging the whole run;
        # raise_for_status() reports HTTP failures directly instead of as an
        # IndexError when the expected table cells are missing.
        response = requests.get(url, headers=agent, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text,"html.parser")

        # Isolate table of interest
        table = soup.find_all(class_='v3')

        # Isolate and clean lat, long coordinates
        coords = table[9] # coordinate row
        text = coords.get_text() # e.g. '32.02369 N/81.04659 W'

        # Convert '<magnitude> <hemisphere>' into signed decimal strings:
        # south latitudes and west longitudes are negative. (Previously the
        # longitude was always negated and the latitude never was.)
        latlon_clean = []
        for val in text.split('/'):
            parts = val.split()
            magnitude = parts[0] if parts else ''
            hemisphere = parts[-1].upper() if len(parts) > 1 else ''
            if hemisphere in ('S', 'W'):
                magnitude = '-' + magnitude
            latlon_clean.append(magnitude)
        position = ', '.join(latlon_clean)

        # Reverse lat-lon lookup, isolating town, state
        physical = geolocator.reverse(position)
        if physical is None:
            # No geocoding result for this position; use the same 'Error'
            # placeholder that get_city() returns when no locality is found.
            town = 'Error'
            state = 'Error'
        else:
            vessel_raw = physical.raw
            town = get_city(vessel_raw)
            # Not every result is guaranteed to carry a 'state' entry.
            state = vessel_raw['address'].get('state', 'Error')
            # Abbreviate state from full name to a 2-letter code
            state = abbreviate_state(state)

        # Isolate and clean last position update
        time_since_last_position = table[11].get_text() # last-update row
        # Drops the final three characters (presumably a trailing suffix -- TODO confirm)
        cleaned_time = time_since_last_position[:-3]

        # Write results to spreadsheet row
        csv_writer.writerow([boat, position, cleaned_time, town, state])

        # Wait n seconds in between requests
        time.sleep(1)

end = time.time()
duration = end - start

print(f'Finished. This script took {duration} seconds to run')

100%|██████████████████████████████████████████████████████████████████████████████████| 18/18 [00:32<00:00,  1.78s/it]

Finished. This script took 32.0228226184845 seconds to run





## View Data with Pandas

In [10]:
import pandas as pd


df = pd.read_csv('data/20201229-10_50_AM-Fleet Location.csv')

In [11]:
df

Unnamed: 0,Vessel,Lat-Long,Last Update,City,State
0,Brangus,"39.37314, -74.42712",49 days ago,Atlantic City,NJ
1,Calcasieu River,"32.76257, -79.92533",79 days ago,Charleston,SC
2,Cavalier State,"32.02369, -81.04659",7 days ago,Thunderbolt,GA
3,Columbia River,"30.39822, -81.54972",2 mins ago,Jacksonville,FL
4,Cooper River,"32.83351, -79.93199",0 min ago,North Charleston,SC
5,East River,"32.83273, -79.94228",28 mins ago,Charleston,SC
6,Evergreen State,"26.76782, -80.04475",25 hours ago,Palm Beach,FL
7,Lone Star State,"30.49491, -88.0194",197 days ago,Mobile County,AL
8,McCormack Boys,"37.01095, -76.23765",27 days ago,Hampton City,VA
9,Miami River,"31.10795, -81.43681",1 min ago,Brunswick,GA


Time units to consider for conversion:
* day
* min
* hour

In [17]:
# Show, for each scraped "Last Update" string, whether it mentions the unit keyword.
keyword = 'min'
for last_update in df['Last Update']:
    print(last_update, keyword in last_update)

49 days ago False
79 days ago False
7 days ago False
2 mins ago True
0 min ago True
28 mins ago True
25 hours ago False
197 days ago False
27 days ago False
1 min ago True
2 hours ago False
9 mins ago True
2 hours ago False
18 days ago False
0 min ago True
20 hours ago False
49 days ago False
16 days ago False


In [20]:
test = df['Last Update'][0]
test

'49 days ago'

In [26]:
number = ''.join(x for x in test if x.isdigit())
number = int(number)
number

49

In [27]:
units = ['day','hour','min']

In [38]:
def convert_time(last_update):
    """Convert a relative age string such as '49 days ago' into minutes.

    Args:
        last_update: Text containing a number and one of the units
            'day', 'hour' or 'min' (plural forms match as substrings).

    Returns:
        '<n> minutes ago' with the value converted to minutes, or
        'No valid units found' when the text has no digits or no known unit.
    """
    unit_to_minutes = {'day': 24*60, 'hour': 60, 'min': 1}

    # Extract the integer from the argument itself. The previous version read
    # the notebook-level variable `test`, so every call returned the value
    # for whatever `test` happened to hold.
    digits = ''.join(x for x in last_update if x.isdigit())
    if not digits:
        # Nothing to convert; avoid int('') raising ValueError.
        return 'No valid units found'
    number = int(digits)

    for unit, minutes_per_unit in unit_to_minutes.items():
        if unit in last_update:
            conversion = number * minutes_per_unit
            return f'{conversion} minutes ago'

    return 'No valid units found'

In [39]:
convert_time(test)

'70560 minutes ago'

### Scratch Work (Script Refinement)

In [103]:
import requests
import time
from bs4 import BeautifulSoup
from csv import writer
from geopy.geocoders import Nominatim

agent = {"User-Agent":'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'}
geolocator = Nominatim(user_agent="jwh")
url = 'https://www.vesselfinder.com/vessels/PEARL-RIVER-IMO-0-MMSI-367187120'

In [104]:
response = requests.get(url, headers=agent)

In [105]:
soup = BeautifulSoup(response.text,"html.parser")

In [106]:
soup.find_all('table')

[<table class="tparams npctable">
 <thead><tr class="table-header"><th class="v31">Current AIS Destination</th><th class="v31">Estimated Time of Arrival</th></tr></thead>
 <tbody>
 <tr><td class="n3ata">-</td><td class="n3ata">-</td></tr>
 </tbody>
 </table>,
 <table class="tparams">
 <thead><tr><th class="text1">Port</th><th class="text1">Arrival (UTC)</th><th class="text1">In Port</th></tr></thead>
 <tbody class="pctable" id="port-calls"></tbody></table>,
 <table class="tparams"><tbody>
 <tr><td class="n3">AIS Type</td><td class="v3">Resolution 18 ship</td></tr>
 <tr><td class="n3">Flag</td><td class="v3">USA</td></tr>
 <tr><td class="n3">Destination</td><td class="v3">-</td></tr>
 <tr><td class="n3">ETA</td><td class="v3">-</td></tr>
 <tr><td class="n3">IMO / MMSI</td><td class="v3"> -  / 367187120</td></tr>
 <tr><td class="n3">Callsign</td><td class="v3">WDD7486</td></tr>
 <tr><td class="n3">Length / Beam</td><td class="v3">-</td></tr>
 <tr><td class="n3">Current draught</td><td cl

In [107]:
# All class="v3" value cells on the vessel page, in document order.
table = soup.find_all(class_='v3')
time_since_last_position = table[11].get_text() # last position update row
# Drops the final three characters (presumably a trailing suffix -- TODO confirm)
cleaned_time = time_since_last_position[:-3]

In [108]:
cleaned_time

'58 mins ago'

In [109]:
# Find current lat, lon coordinates
table = soup.find_all(class_='v3')
coords = table[9] # coordinate row
text = coords.get_text() # Text from coordinate row, e.g. '32.02369 N/81.04659 W'
latlon = text.split('/')

# Convert '<magnitude> <hemisphere>' into signed decimal strings: south
# latitudes and west longitudes are negative. (Previously the longitude was
# always negated and the latitude never was.)
latlon_clean = []
for val in latlon:
    parts = val.split()
    magnitude = parts[0] if parts else ''
    hemisphere = parts[-1].upper() if len(parts) > 1 else ''
    if hemisphere in ('S', 'W'):
        magnitude = '-' + magnitude
    latlon_clean.append(magnitude)
position = ', '.join(latlon_clean)

# Reverse lat-lon lookup, giving address
physical = geolocator.reverse(position)

location = physical.address

# Isolate town, state
# vessel_raw = physical.raw
# town = vessel_raw['address']['town']
# state = vessel_raw['address']['state']

print(position)

32.72988, -79.82642


In [110]:
vessel_raw = physical.raw
print(vessel_raw)

{'place_id': 2514707, 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. https://osm.org/copyright', 'osm_type': 'node', 'osm_id': 357089972, 'lat': '32.7651767', 'lon': '-79.8367511', 'display_name': 'Sullivan\'s Island, Sullivan"s Island, Charleston County, South Carolina, 29482, United States', 'address': {'place': "Sullivan's Island", 'town': 'Sullivan"s Island', 'county': 'Charleston County', 'state': 'South Carolina', 'postcode': '29482', 'country': 'United States', 'country_code': 'us'}, 'boundingbox': ['32.4451767', '33.0851767', '-80.1567511', '-79.5167511']}


In [77]:
'village' in vessel_raw['address']

True

In [57]:
town = vessel_raw['address']['town']
state = vessel_raw['address']['state']

KeyError: 'town'

In [62]:
def get_city(raw):
    """Pick the most specific locality name from a reverse-geocode result.

    Args:
        raw: Dict with an 'address' sub-dict (as in `physical.raw`).

    Returns:
        The 'city', 'town', 'village' or 'county' value, whichever is found
        first in that order; 'Error' when none of them is present.
    """
    # Inspect the argument, not the notebook-level `vessel_raw`: the earlier
    # version tested membership on the global and could pick the wrong branch
    # (or raise KeyError) for any other input.
    address = raw['address']
    if 'city' in address:
        town = address['city']
    elif 'town' in address:
        town = address['town']
    elif 'village' in address:
        town = address['village']
    elif 'county' in address:
        town = address['county']
    else:
        town = 'Error'

    return town

In [64]:
get_city(vessel_raw)

'Atlantic City'

In [60]:
'city' in vessel_raw['address']

True

In [61]:
'town' in vessel_raw['address']

False

In [25]:
town

'Brunswick'

In [40]:
state = 'Georgia'

In [41]:
state

'Georgia'

In [42]:
def abbreviate_state(state):
    """Map a full state or territory name to its two-letter abbreviation.

    Args:
        state: Full name, e.g. 'Georgia'.

    Returns:
        The two-letter code when the name is known; otherwise the input
        value is handed back unchanged.
    """
    abbreviations = {
        'Alabama': 'AL',
        'Alaska': 'AK',
        'American Samoa': 'AS',
        'Arizona': 'AZ',
        'Arkansas': 'AR',
        'California': 'CA',
        'Colorado': 'CO',
        'Connecticut': 'CT',
        'Delaware': 'DE',
        'District of Columbia': 'DC',
        'Florida': 'FL',
        'Georgia': 'GA',
        'Guam': 'GU',
        'Hawaii': 'HI',
        'Idaho': 'ID',
        'Illinois': 'IL',
        'Indiana': 'IN',
        'Iowa': 'IA',
        'Kansas': 'KS',
        'Kentucky': 'KY',
        'Louisiana': 'LA',
        'Maine': 'ME',
        'Maryland': 'MD',
        'Massachusetts': 'MA',
        'Michigan': 'MI',
        'Minnesota': 'MN',
        'Mississippi': 'MS',
        'Missouri': 'MO',
        'Montana': 'MT',
        'Nebraska': 'NE',
        'Nevada': 'NV',
        'New Hampshire': 'NH',
        'New Jersey': 'NJ',
        'New Mexico': 'NM',
        'New York': 'NY',
        'North Carolina': 'NC',
        'North Dakota': 'ND',
        'Northern Mariana Islands': 'MP',
        'Ohio': 'OH',
        'Oklahoma': 'OK',
        'Oregon': 'OR',
        'Pennsylvania': 'PA',
        'Puerto Rico': 'PR',
        'Rhode Island': 'RI',
        'South Carolina': 'SC',
        'South Dakota': 'SD',
        'Tennessee': 'TN',
        'Texas': 'TX',
        'Utah': 'UT',
        'Vermont': 'VT',
        'Virgin Islands': 'VI',
        'Virginia': 'VA',
        'Washington': 'WA',
        'West Virginia': 'WV',
        'Wisconsin': 'WI',
        'Wyoming': 'WY',
    }
    try:
        return abbreviations[state]
    except KeyError:
        # Unrecognised name: pass it through as-is.
        return state

In [45]:
abbreviate_state('HAJAKJSALKDJ')

'HAJAKJSALKDJ'

In [31]:
# `state_dict` only exists as a local inside abbreviate_state(), so look the
# name up through the function; unknown names are returned unchanged.
state = abbreviate_state(state)

In [32]:
state

'GA'

In [29]:
len(state_dict)

56

In [3]:
import time
timestamp = time.strftime('%Y%m%d-%I_%M_%p')

filename = 'fleet-position-data/' + timestamp + '-Fleet Location.csv'

In [4]:
filename

'fleet-position-data/20201229-09_48_AM-Fleet Location.csv'

### Data Directory Manipulation

In [5]:
import os

DATA_DIR = 'data'

In [6]:
import os

DATA_DIR = 'data'
os.makedirs(DATA_DIR, exist_ok=True)

timestamp = time.strftime('%Y%m%d-%I_%M_%p')
filename = timestamp + '-Fleet Location.csv'
scrape_file = os.path.join(DATA_DIR, filename)

# with open(scrape_file)

In [None]:
timestamp = time.strftime('%Y%m%d-%I_%M_%p')


filename = timestamp + '-Fleet Location.csv'
file_dir = os.path.join('data',filename)