Some preliminary notes
- It seems that marinevesseltraffic.com doesn't expose each ship's position in a form that is easy to retrieve, so the position will need to be obtained elsewhere. An attempt is currently being made with marinetraffic.com: https://copyprogramming.com/howto/how-to-scrape-location-data-from-marinetraffic#how-to-scrape-data-from-an-html-page 

Pseudocode
```
df = empty df
for each page in marinevesseltraffic database of chinese ships: 
    for each row:
        if (row's Vessel Name contains "Guard" or "Coast"), OR (row's type is "Law Enforce"):
            df.append(grab that ship's data()) 
            


// for simplicity, we just grab its position log
//may or may not find it useful to also scrape https://www.vesselfinder.com/vessels/details/[IMO] 's RECENT PORT CALLS, VESSEL PARTICULARS, HISTORY


// marinetraffic also seems to at least sometimes have a more recent position than marinevesseltraffic
[BELOW THIS LINE IS DEPRECATED since marinetraffic blocks us from scraping. A possible way to circumvent this: https://copyprogramming.com/howto/how-to-scrape-location-data-from-marinetraffic#how-to-scrape-data-from-an-html-page]     
def grabbing_data(IMO number):
    go to https://www.marinetraffic.com/en/ais/details/ships/imo:[IMO] //will get redirected to the complete url of the page belonging to this IMO. some imos don't have pages on here.
    click on Events Timeline
    click on VIEW FULL LIST
    scrape the location data into a df
    return the df
```

In [6]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

import time # for timing webscraping operations

In [7]:
def process_page(rows: list[BeautifulSoup]) -> pd.DataFrame:
    """Convert one page of vessel rows from marinevesseltraffic.com into a DataFrame.

    Each row keeps its fields in ``<td class="vessel_td ...">`` cells, e.g.::

        <td class="vessel_td td_type"> Passenger </td>
        <td class="vessel_td"> FAN KE </td>

    Args:
        rows: Parsed ``<tr>`` elements, one per vessel.

    Returns:
        A DataFrame with one row per element of ``rows`` and the columns
        IMO, MMSI, Vessel Name, Call Sign, Type, Length and Beam. Every value
        is the whitespace-stripped text of the corresponding cell.

    Raises:
        IndexError: If a row has fewer than eight ``vessel_td`` cells.
    """
    columns = ["IMO", "MMSI", "Vessel Name", "Call Sign", "Type", "Length", "Beam"]
    # The output columns sit, in order, at positions 1..7 of the row's cells.
    first_cell, last_cell = 1, 7

    potential_cc_rows = []  # cc = chinese coastguard
    for row in rows:
        # Selecting on the "vessel_td" class excludes the web link, which isn't very useful.
        row_features = row.find_all('td', class_='vessel_td')

        # NOTE: no filtering is applied at the moment. To keep only potential
        # Chinese Coastguard ships, skip the row unless
        #   "coast" in name.lower() or "guard" in name.lower() or type == "Law Enforce"
        # where name is cell 3 and type is cell 5.

        # Explicit indexing (rather than slicing) so a malformed row fails loudly.
        potential_cc_rows.append(
            [
                row_features[position].get_text(strip=True)
                for position in range(first_cell, last_cell + 1)
            ]
        )

    return pd.DataFrame(potential_cc_rows, columns=columns)

In [8]:
def process_pages(num_pages: int, field_to_sort_by: str, sort_order: str) -> pd.DataFrame:
    """Scrape the first ``num_pages`` listing pages of China-flagged vessels into one DataFrame.

    Args:
        num_pages: Number of listing pages to request, starting at page 1.
        field_to_sort_by: Value sent as the ``sort`` query parameter.
        sort_order: Value sent as the ``direction`` query parameter.

    Returns:
        The rows of every successfully retrieved page, concatenated with a
        fresh index. If no page could be retrieved (or ``num_pages`` is 0),
        an empty DataFrame with the usual columns is returned.

    Raises:
        requests.exceptions.RequestException: If a request fails or times out.
    """
    page_frames = []
    for page_number in range(1, num_pages + 1):

        # Send request. The timeout prevents one stalled connection from
        # hanging the whole scrape indefinitely.
        url = f'https://www.marinevesseltraffic.com/vessels?page={page_number}&vessel=&flag=China&sort={field_to_sort_by}&direction={sort_order}'
        response = requests.get(url, timeout=30)
        if response.status_code != 200:
            print(f'Failed to retrieve the webpage: {url}\nStatus code:', response.status_code)
            # Skip the page: an error body is not a vessel listing.
            continue

        # Extract the info from this page into a df to put in page_frames.
        # Each <tr class="vessel_row"> element represents one vessel.
        page_soup = BeautifulSoup(response.text, 'html.parser')
        vessel_rows = page_soup.find_all('tr', class_='vessel_row')
        page_frames.append(process_page(vessel_rows))

    if not page_frames:
        # pd.concat raises ValueError on an empty list; return an empty frame
        # that still carries the expected column labels instead.
        return process_page([])

    return pd.concat(page_frames, ignore_index=True)

In [15]:
# Accumulates one DataFrame per (sort field, sort order) scrape; the scraping
# cell further down appends to it. Re-running this cell discards what was collected.
dfs = []


Log of how many unique rows there are after retrieving a given combination:
(each df is stored in dfs)

none, asc: 2000
none, desc: 2000 (+0 from previous)

imo, asc: 3447 
name, asc: 4638 
type, asc: 4930
length, asc: 5139
beam, asc: 5155

imo, desc: 6031
name, desc: 6724
type, desc: 7093
length, desc: 7497
beam, desc: 7524 / 7863

In [39]:
"""
Extract as many ships as possible working around the 2000 chinese ships constraint. How this cell was run was changing field and sort_order to one of the 12 possible combinations, and then calling process_pages() with that combination. Repeat this for each of the 12 combinations. 
"""

# sort_orders: "asc", "desc"
# fields_to_sort_by: "none", "imo", "name", "type", "lenght", "beam" (yes, length is indeed misspelled; "none" is really sorting by mmsi)
field = "beam"
sort_order = "desc"

t_start = time.time()
df = process_pages(200, field, sort_order)
# Measure the elapsed time BEFORE reporting it. Assigning it after the print
# raised NameError on a fresh kernel and reported the previous run's
# duration on every later run.
elapsed_time = time.time() - t_start
dfs.append(df)
print(f"Finished creating df corresponding to: [{field}, {sort_order}] in {elapsed_time:.6f} seconds")
print(f"dfs now has {len(dfs)} dataframes.")


Finished creating df corresponding to: [beam, desc] in 148.375614 seconds
dfs now has 12 dataframes.


In [42]:
# Stack the 12 per-combination dataframes into one frame with a fresh index,
# then discard rows that are exact duplicates of an earlier row.
ch_df = pd.concat(dfs, ignore_index=True)
ch_df = ch_df.drop_duplicates()
ch_df

Unnamed: 0,IMO,MMSI,Vessel Name,Call Sign,Type,Length,Beam
0,8914934,412000001,FAN KE,DLMU,Passenger,98,18
1,1111117,412000001,ZHELINYU00868,n.a.,Other Type,n.a.,n.a.
2,8626757,412000002,TERNA,AUUL,Unspecified,30,6
3,5035000,412000004,XX 04,n.a.,Unspecified,105,14
4,9203112,412000006,TIANRONGHAI,H3GD,Cargo,288,45
...,...,...,...,...,...,...,...
23745,9648570,413046110,SHEN QIAN HAO,BSIS,Dive Vessel,125,25
23778,9710000,414110000,PING AN DA 60,BFHL,Cargo,78,25
23817,9567829,413302640,HAI YANG SHI YOU 720,BFAQ5,Cargo,107,24
23821,9483700,413350450,KANG HAI YU YANG,BHLF,Cargo,115,24


In [51]:
# One-row summary: number of distinct values in each identifying column,
# next to the total row count, to show how unique each identifier is.
pd.DataFrame(
    {
        **{
            f'Unique {column} Count': [ch_df[column].nunique()]
            for column in ('IMO', 'MMSI', 'Vessel Name')
        },
        'Total Rows Count': [len(ch_df)],
    }
)

Unnamed: 0,Unique IMO Count,Unique MMSI Count,Unique Vessel Name Count,Total Rows Count
0,7486,7466,7467,7524


In [52]:
# Export the combined vessel table to CSV; index=False keeps the DataFrame
# index out of the file.
ch_df.to_csv('marinevesseltraffic chinese vessels.csv', index=False)

### Everything below is experimentation and messy

In [7]:
"""
Extra ships. These are ships that aren't on the first 2000 but are still of use. We keep these in a separate df, which can be concatenated with df_2k if desired. 
"""
# ...

"\nExtra ships. These are ships that aren't on the first 2000 but are still of use. We keep these in a separate df, which can be concatenated with df_2k if desired. \n"

In [30]:
def get_pos_history(imo: str, data_dict: dict) -> None:
    """Store the position history of one ship in ``data_dict``.

    Placeholder implementation: no positions are retrieved yet, so the ship
    is simply mapped to a new empty list.

    Args:
        imo: IMO number of the ship; used as the dictionary key.
        data_dict: Mapping updated in place. Any existing entry for ``imo``
            is replaced.
    """
    data_dict[imo] = []

In [34]:
"""Process each ship in df_2k by getting their position info. We temporarily store their info in a dictionary."""

# Maps each IMO to its position history; filled in place by get_pos_history.
pos_data_2k = {}
# apply() is used here only for its side effect on pos_data_2k: get_pos_history
# returns None, so the Series this expression evaluates to contains only None.
df_2k["IMO"].apply(get_pos_history, args=(pos_data_2k,))

0       None
1       None
2       None
3       None
4       None
        ... 
1995    None
1996    None
1997    None
1998    None
1999    None
Name: IMO, Length: 2000, dtype: object

In [35]:
# Display the collected IMO -> position-history mapping.
pos_data_2k

{'8914934': [],
 '1111117': [],
 '8626757': [],
 '5035000': [],
 '9203112': [],
 '3247661': [],
 '9055981': [],
 '8306773': [],
 '4908470': [],
 '1460362': [],
 '9058402': [],
 '1': [],
 '7820746': [],
 '9043627': [],
 '8929410': [],
 '9043639': [],
 '7529196': [],
 '9651618': [],
 '8919594': [],
 '8481121': [],
 '8919609': [],
 '1591422': [],
 '1362065': [],
 '8916334': [],
 '9055993': [],
 '9058397': [],
 '8319483': [],
 '8710211': [],
 '8844024': [],
 '8844048': [],
 '8614857': [],
 '8614869': [],
 '8512566': [],
 '8832186': [],
 '9177533': [],
 '9175444': [],
 '8601422': [],
 '2179348': [],
 '9175420': [],
 '8002250': [],
 '8875683': [],
 '8848238': [],
 '9128659': [],
 '8400414': [],
 '7378030': [],
 '9259678': [],
 '9069011': [],
 '9069035': [],
 '8843977': [],
 '8639807': [],
 '8354598': [],
 '8354366': [],
 '9666273': [],
 '9129079': [],
 '8520628': [],
 '9267120': [],
 '8733225': [],
 '8733237': [],
 '8658205': [],
 '8658190': [],
 '9549073': [],
 '9535747': [],
 '9162019': []

In [37]:
"""
    go to https://www.marinetraffic.com/en/ais/details/ships/imo:[IMO] //will get redirected to the complete url of the page belonging to this IMO. some imos don't have pages on here.
    click on Events Timeline
    click on VIEW FULL LIST
    scrape the location data into a df
    return the df
"""

# Testing just one page
url = 'https://www.marinetraffic.com/en/ais/details/ships/imo:9338498'

# The timeout keeps the cell from hanging indefinitely if the server never answers.
response = requests.get(url, timeout=30)

if response.status_code != 200:
    print('Failed to retrieve the webpage. Status code:', response.status_code)

# The body is parsed and displayed even on failure, so the server's error
# message can be read.
soup = BeautifulSoup(response.text, 'html.parser')
soup.text

Failed to retrieve the webpage. Status code: 403


"\nAccess Denied\nYou don't have permission to access MarineTraffic.\nThis might be due to system abuse and/or violation of the Terms of Service.\nIf you believe this is an error, please contact us via http://help.marinetraffic.com\n\n\n"