# Boat Scraping

In [51]:
import requests
from bs4 import BeautifulSoup
from csv import writer

# Browser-like User-Agent header sent with the page request.
agent = {"User-Agent":'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'}

# `url` was previously undefined in this cell, so it only ran when a stale kernel
# still held the name. The coordinates recorded in the outputs below match the
# "Cavalier State" row of the fleet table, so that vessel page is used here.
# NOTE(review): confirm this is the page the walkthrough was recorded against.
url = "https://www.vesselfinder.com/vessels/CAVALIER-STATE-IMO-0-MMSI-367340130"

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors here instead of as a confusing
# IndexError later when the expected cells are missing from an error page.
response = requests.get(url, headers=agent, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text,"html.parser")

In [52]:
# Parse the downloaded page body into a searchable tree using the "html.parser" builder.
soup = BeautifulSoup(response.text, features="html.parser")

In [3]:
#soup.find_all('table')

# Need to Make the Longitude (`latlon_clean[1]`) Negative for Western-Hemisphere Positions

In [53]:
# Every value cell on the vessel page carries class "v3".
table = soup.find_all(class_='v3')
coords = table[9] # coordinate row
text = coords.get_text() # Text from coordinate row, e.g. '32.02369 N/81.04659 W'
latlon = text.split('/')

# Convert each 'value HEMISPHERE' pair to a signed decimal string.
# The sign now comes from the hemisphere letter (S and W are negative) instead of
# always negating the longitude, so eastern longitudes and southern latitudes are
# no longer given the wrong sign.
latlon_clean = []
for val in latlon:
    number, hemisphere = val.split()
    latlon_clean.append('-' + number if hemisphere.upper() in ('S', 'W') else number)

# "lat, lon" string in the form passed to the reverse geocoder.
position = ', '.join(latlon_clean)

In [54]:
# Raw <td> element selected as the coordinate cell (table[9]).
coords

<td class="v3">32.02369 N/81.04659 W</td>

In [55]:
# Plain text of the coordinate cell, hemisphere letters still attached.
text

'32.02369 N/81.04659 W'

In [56]:
# Latitude and longitude parts after splitting the cell text on '/'.
latlon

['32.02369 N', '81.04659 W']

In [57]:
# Numeric strings with the hemisphere suffix removed; the longitude carries a leading '-'.
latlon_clean

['32.02369', '-81.04659']

In [58]:
# Final "lat, lon" string that is handed to the reverse geocoder.
position

'32.02369, -81.04659'

## Reverse Address Lookup Using Lat/Long
`geopy` can reverse lookup an address from the lat/lon values.

In [59]:
from geopy.geocoders import Nominatim

# Nominatim (OpenStreetMap) client; user_agent identifies this script to the service.
geolocator = Nominatim(user_agent="jwh")

# Reverse-geocode the "lat, lon" string built from the scraped coordinates.
physical = geolocator.reverse(position)

# reverse() returns None when the service has no match for the point (for example
# open water), so guard the attribute access instead of raising AttributeError.
location = physical.address if physical is not None else None

In [63]:
# Full geocoder result: the address text together with the matched point.
physical

Location(Thunderbolt Marine Inc. Warehouse, 3124, River Drive, Thunderbolt, Chatham County, Georgia, 31404, United States, (32.0222342, -81.04666794933618, 0.0))

In [61]:
# Address string only, taken from physical.address.
location

'Thunderbolt Marine Inc. Warehouse, 3124, River Drive, Thunderbolt, Chatham County, Georgia, 31404, United States'

In [64]:
# Check which class geolocator.reverse() handed back.
type(physical)

geopy.location.Location

## Adding Time Delays
We don't want to flood the website with traffic. 

`time.sleep(n)` pauses the code for n seconds

In [49]:
import time

# Length of the demonstration pause, in seconds (the n in time.sleep(n)).
pause_seconds = 3

print('Start!')
time.sleep(pause_seconds)  # execution blocks here until the pause has elapsed
print('Ok...its been long enough!')

Start!
Ok...its been long enough!


## Putting it All Together
Time to compile the entire script, including vessel URL dictionaries

In [3]:
import requests
import time
from bs4 import BeautifulSoup
from csv import writer
from geopy.geocoders import Nominatim
from tqdm import tqdm


# Browser-like User-Agent header attached to every vessel page request.
agent = {
    "User-Agent": (
        'Mozilla/5.0 (Windows NT 6.3; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/59.0.3071.115 Safari/537.36'
    )
}

# Reverse-geocoding client; user_agent identifies this script to Nominatim.
geolocator = Nominatim(user_agent="jwh")
# Run stamp such as 20201228-08_29_PM: date, then 12-hour clock time with its AM/PM marker.
timestamp = time.strftime('%Y%m%d-%I_%M_%p')

# Output CSV name: the run stamp followed by a fixed suffix.
filename = f'{timestamp}-Fleet Location.csv'

# Every vessel detail page shares one URL shape; only the slug and the MMSI differ.
_VESSEL_URL = "https://www.vesselfinder.com/vessels/{slug}-IMO-0-MMSI-{mmsi}"

# (display name, URL slug, MMSI) for each vessel, in the order rows are written.
_FLEET = (
    ("Brangus", "BRANGUS", "366899270"),
    ("Calcasieu River", "CALCASIEU-RIVER", "367313850"),
    ("Cavalier State", "CAVALIER-STATE", "367340130"),
    ("Columbia River", "COLUMBIA-RIVER", "367187130"),
    ("Cooper River", "COOPER-RIVER", "366867370"),
    ("East River", "EAST-RIVER", "366898790"),
    ("Evergreen State", "EVERGREEN-STATE", "367156370"),
    ("Lone Star State", "LONESTAR-STATE", "367183730"),
    ("McCormack Boys", "MCCORMACK-BOYS", "366872170"),
    ("Miami River", "MIAMI-RIVER", "366898810"),
    ("Muskegon River", "MUSKEGON-RIVER", "367509760"),
    ("Ohio River", "OHIO-RIVER", "366899290"),
    ("Pearl River", "PEARL-RIVER", "367187120"),
    ("Saginaw River", "SAGINAW-RIVER", "367511230"),
    ("St Johns River", "SAINT-JOHNS-RIVER", "367313760"),
    ("St Louis River", "ST-LOUIS-RIVER", "367609770"),
    ("Volunteer State", "VOLUNTEER-STATE", "367314110"),
    ("Wolf River", "WOLF-RIVER", "367060910"),
)

# Vessel display name -> vesselfinder detail-page URL.
boats = {
    name: _VESSEL_URL.format(slug=slug, mmsi=mmsi)
    for name, slug, mmsi in _FLEET
}

# Scrape every vessel page in `boats`, reverse-geocode its reported position and
# write one CSV row per vessel (name, "lat, lon", age of the fix, street address).
start = time.time()

# Write UTF-8 explicitly: reverse-geocoded addresses can contain non-ASCII
# characters, and the platform default encoding may be unable to encode them.
# UTF-8 is also what pandas.read_csv assumes by default when the file is read back.
with open(filename, 'w', newline='', encoding='utf-8') as file:
    csv_writer = writer(file)

    # Add Headers
    csv_writer.writerow(['Vessel', 'Lat-Long','Last Update','Address'])

    for boat, url in tqdm(boats.items()):
        # A timeout stops one stalled connection from hanging the whole run, and
        # raise_for_status() reports HTTP errors directly instead of as an
        # IndexError on the cell lookups below.
        response = requests.get(url, headers=agent, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text,"html.parser")

        # Isolate table of interest: every value cell on the page has class "v3"
        table = soup.find_all(class_='v3')

        # Isolate and clean lat, long coordinates
        coords = table[9] # coordinate row
        text = coords.get_text() # e.g. '32.02369 N/81.04659 W'
        latlon = text.split('/')

        # Sign each value from its hemisphere letter (S and W are negative)
        # rather than always negating the longitude, which gave the wrong sign
        # for eastern longitudes and southern latitudes.
        latlon_clean = []
        for val in latlon:
            number, hemisphere = val.split()
            latlon_clean.append('-' + number if hemisphere.upper() in ('S', 'W') else number)
        position = ', '.join(latlon_clean)

        # Reverse lat-lon lookup, giving address.
        # reverse() returns None when nothing matches the point; csv.writer then
        # writes an empty Address field instead of the run dying on .address.
        physical = geolocator.reverse(position)
        location = physical.address if physical is not None else None

        # Isolate and clean last position update
        time_since_last_position = table[11].get_text() # last-update cell
        cleaned_time = time_since_last_position[:-3] # drop the trailing 3 characters

        # Write results to spreadsheet row
        csv_writer.writerow([boat, position, cleaned_time, location])

        # Wait n seconds in between requests so the sites are not flooded
        time.sleep(1)

end = time.time()
duration = end - start

print(f'Finished. This script took {duration} seconds to run')

100%|██████████████████████████████████████████████████████████████████████████████████| 17/17 [00:32<00:00,  1.91s/it]

Finished. This script took 32.54650521278381 seconds to run





In [4]:
# Number of vessels in the fleet dictionary.
# NOTE(review): the `boats` literal visible in this notebook has 18 entries, while the
# recorded output and progress bar show 17 - presumably an entry was added after the
# last run; re-run to refresh the outputs.
len(boats)

17

## View Data with Pandas

In [21]:
import pandas as pd


# CSV snapshot to inspect; the name follows the '<timestamp>-Fleet Location.csv' pattern.
fleet_csv = '20201228-08_29_PM-Fleet Location.csv'
df = pd.read_csv(fleet_csv)

In [22]:
# Display the loaded fleet snapshot.
df

Unnamed: 0,Vessel,Lat-Long,Last Update,Address
0,Brangus,"39.37314, -74.42712",49 days ago,"710, North Maryland Avenue, Carver Hall Apartm..."
1,Calcasieu River,"32.76257, -79.92533",78 days ago,"Charleston, Charleston County, South Carolina,..."
2,Cavalier State,"32.02369, -81.04659",6 days ago,"Thunderbolt Marine Inc. Warehouse, 3124, River..."
3,Columbia River,"30.38538, -81.55102",2 min ago,"Propeller Drive, Jacksonville, Duval County, F..."
4,Cooper River,"32.8328, -79.94211",1 min ago,"Herbert Street, Hibernian Heights, Charleston,..."
5,East River,"32.83273, -79.9423",5 hours ago,"Herbert Street, Hibernian Heights, Charleston,..."
6,Evergreen State,"26.76782, -80.04475",10 hours ago,"Peanut Island Shutte Boat, Lake Court, Palm Be..."
7,Lone Star State,"30.49491, -88.0194",196 days ago,"Mobile County, Alabama, United States"
8,McCormack Boys,"37.01095, -76.23765",26 days ago,"Thimble Shoal Light, Hampton City, Virginia, U..."
9,Miami River,"31.10606, -81.44053",2 min ago,"Brunswick, Glynn County, Georgia, United States"


In [6]:
import requests
import time
from bs4 import BeautifulSoup
from csv import writer
from geopy.geocoders import Nominatim

# Single vessel page fetched in the next cell.
url = 'https://www.vesselfinder.com/vessels/MIAMI-RIVER-IMO-0-MMSI-366898810'

# Reverse-geocoding client; user_agent identifies this script to Nominatim.
geolocator = Nominatim(user_agent="jwh")

# Browser-like User-Agent header attached to the page request.
agent = {
    "User-Agent": (
        'Mozilla/5.0 (Windows NT 6.3; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/59.0.3071.115 Safari/537.36'
    )
}

In [7]:
# Fetch the vessel page. The timeout prevents an indefinite hang on a stalled
# connection; raise_for_status() turns an HTTP error into an immediate exception
# rather than letting an error page be parsed as if it were vessel data.
response = requests.get(url, headers=agent, timeout=30)
response.raise_for_status()

In [9]:
# Parse the downloaded page body into a searchable tree using the "html.parser" builder.
soup = BeautifulSoup(response.text, features="html.parser")

In [10]:
# List every <table> on the page to see which one holds the n3/v3 label-value rows.
soup.find_all('table')

[<table class="tparams npctable">
 <thead><tr class="table-header"><th class="v31">Current AIS Destination</th><th class="v31">Estimated Time of Arrival</th></tr></thead>
 <tbody>
 <tr><td class="n3ata">-</td><td class="n3ata">-</td></tr>
 </tbody>
 </table>,
 <table class="tparams">
 <thead><tr><th class="text1">Port</th><th class="text1">Arrival (UTC)</th><th class="text1">In Port</th></tr></thead>
 <tbody class="pctable" id="port-calls"></tbody></table>,
 <table class="tparams"><tbody>
 <tr><td class="n3">AIS Type</td><td class="v3">Port tender</td></tr>
 <tr><td class="n3">Flag</td><td class="v3">USA</td></tr>
 <tr><td class="n3">Destination</td><td class="v3">-</td></tr>
 <tr><td class="n3">ETA</td><td class="v3">-</td></tr>
 <tr><td class="n3">IMO / MMSI</td><td class="v3"> -  / 366898810</td></tr>
 <tr><td class="n3">Callsign</td><td class="v3">WDB4478</td></tr>
 <tr><td class="n3">Length / Beam</td><td class="v3">12 / 4 m</td></tr>
 <tr><td class="n3">Current draught</td><td cl

In [18]:
# Every value cell on the vessel page carries class "v3".
table = soup.find_all(class_='v3')

# Cell 11 holds the age of the last position report.
last_update_cell = table[11]
time_since_last_position = last_update_cell.get_text()

# Trim the final three characters of the cell text, leaving e.g. '2 min ago'.
cleaned_time = time_since_last_position[:-3]

In [19]:
# Last-position age with the trailing three characters trimmed.
cleaned_time

'2 min ago'

In [None]:
# Find current lat, lon coordinates
table = soup.find_all(class_='v3')  # every value cell on the page has class "v3"
coords = table[9] # coordinate row
text = coords.get_text() # Text from coordinate row, e.g. '32.02369 N/81.04659 W'
latlon = text.split('/')

# Sign each value from its hemisphere letter (S and W are negative) rather than
# always negating the longitude, which gave the wrong sign for eastern longitudes
# and southern latitudes.
latlon_clean = []
for val in latlon:
    number, hemisphere = val.split()
    latlon_clean.append('-' + number if hemisphere.upper() in ('S', 'W') else number)
position = ', '.join(latlon_clean)

# Reverse lat-lon lookup, giving address.
# reverse() returns None when nothing matches the point, so guard the attribute
# access instead of raising AttributeError.
physical = geolocator.reverse(position)
location = physical.address if physical is not None else None

In [24]:
conda install progress


Note: you may need to restart the kernel to use updated packages.
