In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Fetch the webpage
url = "https://en.wikipedia.org/wiki/1975_Pacific_hurricane_season"
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

In [4]:
# Placeholder for storing hurricane data
hurricanes = {}

# Extract relevant data (looking for hurricane systems)
for header in soup.find_all( class_='mw-heading mw-heading3'):
    storm_name = header.text
    
    # Extract start date, end date, deaths, and affected areas
    # We could also extract the duration dates from the infobox table but we will use the <p> text for simplicity
    storm_text = header.find_next('p').text  # You might need to adjust this logic
    hurricanes.update({storm_name:storm_text}) 

    
print(hurricanes)
    

{'Hurricane Agatha[edit]': 'An area of disturbed weather about 290\xa0mi (467\xa0km) southwest of Acapulco formed on June 1. It organized into a tropical depression the next day. After heading southwestward, it turned to the northwest and strengthened into Tropical Storm Agatha on June 2. Agatha maintained its course and steadily intensified. It reached hurricane intensity on June 3 while located about 170\xa0mi (270\xa0km) southwest of Zihuatanejo. Hurricane Agatha started weakening thereafter, becoming a tropical storm on June 4 and a depression on June 5. It dissipated shortly afterwards. At this time, Agatha was located about 140\xa0mi (230\xa0km) south of the Tres Marias Islands.[4]\n', 'Tropical Storm Bridget[edit]': 'On June 27, a tropical depression formed about 575\xa0mi (925\xa0km) south of the tip of the Baja California Peninsula at a location atypical for tropical cyclogenesis. The depression moved generally westward, and intensified into Tropical Storm Bridget on June 28. 

In [32]:
for h, text in hurricanes.items():
    print(h, text)

Hurricane Agatha[edit] An area of disturbed weather about 290 mi (467 km) southwest of Acapulco formed on June 1. It organized into a tropical depression the next day. After heading southwestward, it turned to the northwest and strengthened into Tropical Storm Agatha on June 2. Agatha maintained its course and steadily intensified. It reached hurricane intensity on June 3 while located about 170 mi (270 km) southwest of Zihuatanejo. Hurricane Agatha started weakening thereafter, becoming a tropical storm on June 4 and a depression on June 5. It dissipated shortly afterwards. At this time, Agatha was located about 140 mi (230 km) south of the Tres Marias Islands.[4]

Tropical Storm Bridget[edit] On June 27, a tropical depression formed about 575 mi (925 km) south of the tip of the Baja California Peninsula at a location atypical for tropical cyclogenesis. The depression moved generally westward, and intensified into Tropical Storm Bridget on June 28. It started accelerating as it turned

In [33]:
import os
from openai import OpenAI
import json

client = OpenAI(
  api_key=os.environ['OPENAI_API_KEY'],
)

In [37]:
extract_info = [
    {
        'name': 'extract_hurricane_info',
        'description': 'Get the hurricane information: date_start,date_end,number_of_deaths and list_of_areas_affected from the body of the input text',
        'parameters': {
            'type': 'object',
            'properties': {
                'date_start': {
                    'type': 'string',
                    'description': 'Date when the hurricane / storm started in 1975 '
                },
                'date_end': {
                    'type': 'string',
                    'description': 'Date when the hurricane / storm ended in 1975.'
                },
                'number_of_deaths': {
                    'type': 'integer',
                    'description': 'The number of deaths during the hurricane / storm .'
                },
                'list_of_areas_affected': {
                    'type': 'string',
                    'description': 'the areas affected by the hurricane / storm separated by ",".'
                }
                
            }
        }
    }
]

In [43]:

hurri_list = []
for h, text in hurricanes.items():
    h = {}
    response = client.chat.completions.create(
    model = 'gpt-4o-2024-08-06',
    messages = [{'role': 'user', 'content': text}],
    functions = extract_info,
    function_call = 'auto'
    )
    response_message = response.choices[0].message
    # Extracting the arguments
    function_args  = json.loads(response_message.function_call.arguments)
    h.update(function_args)
    hurri_list.append(h)


In [47]:
# print(hurri_list)

[{'date_start': '1975-06-01', 'date_end': '1975-06-05'}, {'date_start': 'June 27, 1975', 'date_end': 'July 3, 1975', 'number_of_deaths': 0, 'list_of_areas_affected': ''}, {'date_start': 'July 2, 1975', 'date_end': 'July 11, 1975', 'number_of_deaths': 0, 'list_of_areas_affected': 'none'}, {'date_start': 'July 4, 1975', 'date_end': 'July 14, 1975', 'number_of_deaths': 0, 'list_of_areas_affected': 'Mexico'}, {'date_start': 'July 10, 1975', 'date_end': 'July 12, 1975', 'number_of_deaths': 0, 'list_of_areas_affected': 'Acapulco, Manzanillo'}, {'date_start': 'July 27, 1975', 'date_end': 'July 30, 1975', 'number_of_deaths': 0}, {'date_start': 'August 11, 1975', 'date_end': 'August 14, 1975', 'number_of_deaths': 0, 'list_of_areas_affected': 'none'}, {'date_start': 'August 11, 1975', 'date_end': 'August 17, 1975', 'number_of_deaths': 0, 'list_of_areas_affected': ''}, {'date_start': 'August 18, 1975', 'date_end': 'August 26, 1975'}, {'date_start': 'August 24, 1975', 'date_end': 'August 31, 1975'

In [57]:
print(hurri_list[1])

{'date_start': 'June 27, 1975', 'date_end': 'July 3, 1975', 'number_of_deaths': 0, 'list_of_areas_affected': ''}


In [83]:

# Create final DataFrame
df = pd.DataFrame(hurricanes, columns=['hurricane_storm_name', 'date_start', 'date_end', 'number_of_deaths', 
                                       'list_of_areas_affected'])

# populate the dataframe
for index, (key, value) in enumerate(hurricanes.items()):
    key = key.replace('[edit]', '')
    new_row = hurri_list[index]
    new_row.update(hurricane_storm_name=key)
    df = df.append(new_row, ignore_index=True)
    
    
# Save to CSV
df.to_csv('hurricanes_1975.csv', index=False)

print("Data saved to hurricanes_1975.csv")


Data saved to hurricanes_1975.csv
