In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

import pandas as pd
import csv

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
def _parse_ship_page(html_content, imo_number):
    """Parse one vessel page and return its ship-info table as a DataFrame.

    Args:
        html_content: HTML source of the vessel page.
        imo_number: IMO number the page was requested for; only used in the
            diagnostic messages printed when an element is missing.

    Returns:
        A DataFrame with one column per ``<th>`` label found in the table
        (values are the stripped text of the ``<td>`` cells in the same row),
        or ``None`` when the expected container or table is not present.
    """
    soup = BeautifulSoup(html_content, "html.parser")

    # Both lookups match on the exact inline style as well as the class, so
    # they only succeed when the page markup carries these precise attributes.
    ship_info_container = soup.find("div", {"class": "ship-info-container", "style": "position: relative;"})
    if not ship_info_container:
        print(f"Could not find the ship information section for IMO {imo_number}")
        return None

    ship_info_table = ship_info_container.find("table", {"class": "table ship-info", "style": "min-height: 710px;"})
    if not ship_info_table:
        print(f"Could not find the ship information table for IMO {imo_number}")
        return None

    # Each usable row contributes one column: first <th> text -> list of <td> texts.
    data = {}
    for row in ship_info_table.find_all("tr"):
        headers = row.find_all("th")
        cells = row.find_all("td")
        if headers and cells:
            data[headers[0].text.strip()] = [cell.text.strip() for cell in cells]

    return pd.DataFrame(data)


def extract_ship_info(imo_numbers):
    """Scrape ship characteristics from balticshipping.com for each IMO number.

    Opens a Chrome session, loads ``/vessel/imo/<imo_number>`` for every entry
    and parses the ship-info table from the resulting page source. Pages where
    the expected section or table is missing are reported with ``print`` and
    skipped.

    Args:
        imo_numbers: Iterable of IMO numbers (anything that formats into the URL).

    Returns:
        A DataFrame with the rows of every successfully parsed vessel, indexed
        0..n-1. An empty DataFrame is returned when nothing could be parsed.
    """
    # Initialize the Chrome driver
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service)

    # Collect per-vessel frames and concatenate once at the end; growing a
    # DataFrame with concat inside the loop copies all previous rows each time.
    frames = []
    try:
        for imo_number in imo_numbers:
            driver.get(f"https://www.balticshipping.com/vessel/imo/{imo_number}")
            ship_df = _parse_ship_page(driver.page_source, imo_number)
            if ship_df is not None:
                frames.append(ship_df)
    finally:
        # Always close the browser, even if a page load or parse step raises,
        # so no Chrome/chromedriver process is left running.
        driver.quit()

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)


In [3]:
# Open Imos.csv and store every value of its 'Imo' column in a list.
# newline='' hands line-ending handling to the csv module, as its docs require.
with open('Imos.csv', 'r', newline='') as file:
    reader = csv.DictReader(file)
    imo_numbers = [row['Imo'] for row in reader]

In [4]:
# Remove duplicate IMO numbers while keeping first-seen order.
# dict.fromkeys preserves insertion order, so the list (and therefore the
# scraping order) is the same on every run, unlike list(set(...)).
imo_numbers = list(dict.fromkeys(imo_numbers))

In [5]:
# Report how many IMO numbers are in imo_numbers and preview the first ten.
print(f"Total IMOs: {len(imo_numbers)}", imo_numbers[:10], sep="\n")

Total IMOs: 405
['9326603', '9778117', '9955789', '9349813', '9800025', '9720287', '9301897', '9482562', '9719513', '9294991']


In [6]:
# Scrape the ship characteristics for every IMO number into a single DataFrame.
df_imos_caracteristicas = extract_ship_info(imo_numbers=imo_numbers)

Could not find the ship information section for IMO 9955789
Could not find the ship information section for IMO 9877339
Could not find the ship information section for IMO 9857171
Could not find the ship information section for IMO 9785445
Could not find the ship information section for IMO 9857468
Could not find the ship information section for IMO 9746255
Could not find the ship information section for IMO 9851610
Could not find the ship information section for IMO 9851725
Could not find the ship information section for IMO 9920497
Could not find the ship information section for IMO 9861031
Could not find the ship information section for IMO 9756731
Could not find the ship information section for IMO 9936616
Could not find the ship information section for IMO 9785471
Could not find the ship information section for IMO 9785483
Could not find the ship information section for IMO 9909572
Could not find the ship information section for IMO 9898204
Could not find the ship information sect

In [7]:
# Export df_imos_caracteristicas to CSV, leaving out the index column.
df_imos_caracteristicas.to_csv(
    path_or_buf='df_imos_caracteristicas.csv',
    index=False,
)

In [11]:
# Count the null values in each column.
null_counts = df_imos_caracteristicas.isna().sum()
print(null_counts)

IMO number                  0
MMSI                        1
Name of the ship            0
Former names               86
Vessel type                 0
Operating status            1
Flag                        0
Gross tonnage               0
Deadweight                  1
Length                      1
Breadth                     1
Engine type               148
Year of build               0
Builder                    14
Classification society     45
Home port                  96
Owner                      37
Manager                    37
Description                 0
Engine model              173
Engine power              151
dtype: int64
