# Boat Scraping

In [51]:
import requests
from bs4 import BeautifulSoup
from csv import writer

# Browser-like User-Agent header sent with the page request.
agent = {"User-Agent":'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'}

# `url` was previously undefined, so this cell failed on a fresh kernel.
# Any vessel page from the fleet dictionary further down works here; this is
# its first entry.
# NOTE(review): the outputs recorded below may come from a different vessel.
url = "https://www.vesselfinder.com/vessels/BRANGUS-IMO-0-MMSI-366899270"

# A timeout keeps the notebook from hanging on an unresponsive server, and
# raise_for_status() stops us parsing an HTTP error page as if it were data.
response = requests.get(url, headers=agent, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text,"html.parser")

In [52]:
# Parse the downloaded HTML with Python's built-in parser.
soup = BeautifulSoup(markup=response.text, features="html.parser")

In [3]:
# Exploration helper: uncomment to list every <table> element on the page.
#soup.find_all('table')

# Need to Make `latlon_clean[1]` Negative (West Longitudes Are Negative in Decimal Degrees)

In [53]:
# Collect every element with class "v3"; the coordinate cell is the tenth one
# (index 9). The index depends on page layout, so re-check it if the page changes.
table = soup.find_all(class_='v3')
coords = table[9] # coordinate row
text = coords.get_text() # Text from coordinate row, e.g. '32.02369 N/81.04659 W'
latlon = text.split('/')

# Turn the hemisphere letter into a sign instead of assuming "north / west":
# south latitudes and west longitudes are negative in decimal degrees.
# part.split() raises ValueError if a part is not exactly "<number> <letter>",
# which is preferable to silently producing a mangled coordinate.
latlon_clean = []
for part in latlon:
    value, hemisphere = part.split()
    latlon_clean.append('-' + value if hemisphere.upper() in ('S', 'W') else value)
position = ', '.join(latlon_clean)

In [54]:
# The element picked out of the "v3" results as the coordinate row.
coords

<td class="v3">32.02369 N/81.04659 W</td>

In [55]:
# Plain text extracted from the coordinate element.
text

'32.02369 N/81.04659 W'

In [56]:
# Coordinate text split on '/' into its latitude and longitude parts.
latlon

['32.02369 N', '81.04659 W']

In [57]:
# Numeric strings with the hemisphere letters removed and the longitude negated.
latlon_clean

['32.02369', '-81.04659']

In [58]:
# The 'lat, lon' string that is handed to the reverse geocoder.
position

'32.02369, -81.04659'

## Reverse Address Lookup Using Lat/Long
`geopy` can reverse lookup an address from the lat/lon values.

In [59]:
from geopy.geocoders import Nominatim
# Nominatim requires an identifying user_agent for every client.
geolocator = Nominatim(user_agent="jwh")

# Look up the address nearest to the 'lat, lon' string built earlier.
physical = geolocator.reverse(position)

# reverse() returns None when no address is found (e.g. open water), so guard
# the attribute access instead of failing with AttributeError.
location = physical.address if physical is not None else None

In [63]:
# Result object returned by geolocator.reverse(position).
physical

Location(Thunderbolt Marine Inc. Warehouse, 3124, River Drive, Thunderbolt, Chatham County, Georgia, 31404, United States, (32.0222342, -81.04666794933618, 0.0))

In [61]:
# Address string taken from physical.address.
location

'Thunderbolt Marine Inc. Warehouse, 3124, River Drive, Thunderbolt, Chatham County, Georgia, 31404, United States'

In [64]:
# Check what kind of object the reverse lookup returned.
type(physical)

geopy.location.Location

## Adding Time Delays
We don't want to flood the website with traffic. 

`time.sleep(n)` pauses the code for n seconds

In [49]:
import time
# Demonstrate a blocking pause: the second message is printed only after the
# wait has elapsed.
pause_seconds = 3
print('Start!')
time.sleep(pause_seconds)
print('Ok...its been long enough!')

Start!
Ok...its been long enough!


## Putting it All Together
Time to compile the entire script, including the dictionary of vessel URLs.

In [90]:
import time
from csv import writer

import requests
from bs4 import BeautifulSoup
from geopy.exc import GeopyError
from geopy.geocoders import Nominatim


# --- Configuration -----------------------------------------------------------
# Browser-like User-Agent header sent with every vessel page request.
agent = {"User-Agent":'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'}
geolocator = Nominatim(user_agent="jwh")

# Vessel name -> vesselfinder.com page for that vessel.
boats = {
    "Brangus": "https://www.vesselfinder.com/vessels/BRANGUS-IMO-0-MMSI-366899270",
    "Calcasieu River": "https://www.vesselfinder.com/vessels/CALCASIEU-RIVER-IMO-0-MMSI-367313850",
    "Cavalier State": "https://www.vesselfinder.com/vessels/CAVALIER-STATE-IMO-0-MMSI-367340130",
    "Columbia River": "https://www.vesselfinder.com/vessels/COLUMBIA-RIVER-IMO-0-MMSI-367187130",
    "Cooper River": "https://www.vesselfinder.com/vessels/COOPER-RIVER-IMO-0-MMSI-366867370",
    "East River": "https://www.vesselfinder.com/vessels/EAST-RIVER-IMO-0-MMSI-366898790",
    "Evergreen State": "https://www.vesselfinder.com/vessels/EVERGREEN-STATE-IMO-0-MMSI-367156370",
    "Lone Star State": "https://www.vesselfinder.com/vessels/LONESTAR-STATE-IMO-0-MMSI-367183730",
    "McCormack Boys": "https://www.vesselfinder.com/vessels/MCCORMACK-BOYS-IMO-0-MMSI-366872170",
    "Miami River": "https://www.vesselfinder.com/vessels/MIAMI-RIVER-IMO-0-MMSI-366898810",
    "Muskegon River": "https://www.vesselfinder.com/vessels/MUSKEGON-RIVER-IMO-0-MMSI-367509760",
    "Ohio River": "https://www.vesselfinder.com/vessels/OHIO-RIVER-IMO-0-MMSI-366899290",
    "Pearl River": "https://www.vesselfinder.com/vessels/PEARL-RIVER-IMO-0-MMSI-367187120",
    "Saginaw River": "https://www.vesselfinder.com/vessels/SAGINAW-RIVER-IMO-0-MMSI-367511230",
    "St Johns River": "https://www.vesselfinder.com/vessels/SAINT-JOHNS-RIVER-IMO-0-MMSI-367313760",
    "Volunteer State": "https://www.vesselfinder.com/vessels/VOLUNTEER-STATE-IMO-0-MMSI-367314110",
    "Wolf River": "https://www.vesselfinder.com/vessels/WOLF-RIVER-IMO-0-MMSI-367060910"
    }

REQUEST_TIMEOUT = 30   # seconds to wait for a vessel page before giving up
REQUEST_DELAY = 2      # seconds to pause between vessels
COORD_CELL_INDEX = 9   # position of the coordinate cell among the "v3" elements

timestamp = time.strftime('%Y%m%d-%I_%M_%p')
filename = timestamp + '-Fleet Location.csv'


def parse_position(text):
    """Convert text such as '32.02369 N/81.04659 W' into '32.02369, -81.04659'.

    Args:
        text: Latitude and longitude separated by '/', each written as
            '<number> <hemisphere letter>'.

    Returns:
        A 'lat, lon' string in which south latitudes and west longitudes
        carry a leading minus sign.

    Raises:
        ValueError: If the text does not have the expected layout.
    """
    parts = text.split('/')
    if len(parts) != 2:
        raise ValueError(f'unexpected coordinate text {text!r}')

    signed = []
    for part in parts:
        fields = part.split()
        if len(fields) != 2 or fields[1].upper() not in ('N', 'S', 'E', 'W'):
            raise ValueError(f'unexpected coordinate text {text!r}')
        value, hemisphere = fields
        float(value)  # validation only: raises ValueError if not numeric
        signed.append('-' + value if hemisphere.upper() in ('S', 'W') else value)
    return ', '.join(signed)


start = time.time()

# An explicit encoding keeps the output independent of the platform default;
# reverse-geocoded addresses are not guaranteed to be ASCII.
with open(filename, 'w', newline='', encoding='utf-8') as file:
    csv_writer = writer(file)

    # Add Headers
    csv_writer.writerow(['Vessel', 'Lat-Long','Address'])

    for index, (boat, url) in enumerate(boats.items()):
        # Wait between vessels (not after the last one) so neither site is
        # flooded with requests.
        if index:
            time.sleep(REQUEST_DELAY)

        # Blank values are written if any lookup step fails, so one bad
        # vessel does not abort the rest of the fleet.
        position = ''
        location = ''
        try:
            response = requests.get(url, headers=agent, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
            soup = BeautifulSoup(response.text,"html.parser")

            # Find current lat, lon coordinates
            table = soup.find_all(class_='v3')
            if len(table) <= COORD_CELL_INDEX:
                raise ValueError('coordinate cell not found on page')
            position = parse_position(table[COORD_CELL_INDEX].get_text())

            # Reverse lat-lon lookup, giving address; reverse() returns None
            # when nothing is found for the coordinates.
            physical = geolocator.reverse(position)
            if physical is not None:
                location = physical.address
        except (requests.RequestException, GeopyError, ValueError) as error:
            print(f'{boat}: lookup failed ({error})')

        # Write results to spreadsheet row
        csv_writer.writerow([boat, position, location])

end = time.time()
duration = end - start

print(f'This script took {duration} seconds to run')

This script took 55.678189754486084 seconds to run
