In [1]:
from bs4 import BeautifulSoup

In [2]:
# Function to parse the HTML file and extract the required sections
def parse_html_file(file_path):
    try:
        # Open and read the HTML file
        with open(file_path, 'r', encoding='utf-8') as file:
            html_content = file.read()
    except FileNotFoundError:
        print(f"The file at {file_path} was not found.")
        return []
    except Exception as e:
        print(f"An error occurred: {e}")
        return []

    # Parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')

    # Initialize a list to store extracted data
    extracted_data = []

    # Find all <h4> tags with the 'rf-tile-title' class
    h4_tags = soup.find_all('h4', class_='rf-tile-title')

    for h4_tag in h4_tags:
        
        # Find the <a> tag within the <h4> tag
        a_tag = h4_tag.find('a', tabindex=True)
        
        

        if a_tag:
            # Extract the text from the <a> tag
            a_text = a_tag.get_text(strip=True)

            # Find the next sibling <p> tag with the specified class
            p_tag = h4_tag.find_next_sibling('p', class_='rf-tile-info rf-tile-line-one')

            if p_tag:
                # Extract the text from the <p> tag
                p_text = p_tag.get_text(strip=True)

                # Append the extracted texts to the list
                extracted_data.append({'a_text': a_text, 'p_text': p_text})

    return extracted_data

# Replace 'your_html_file.html' with the path to your actual HTML file
file_path = 'Sponsors _ GTC AI Conference 2024 _ NVIDIA.html'
extracted_data = parse_html_file(file_path)


# Print the extracted data
company_entries = []
if extracted_data:
    print(f"Found {len(extracted_data)} companies.")
    for data in extracted_data:
        company_entries.append({'name': data['a_text'], 'description': data['p_text']})
    print(f"Total Words = {sum([len((i['name'] + i['description']).split(' ')) for i in company_entries])}")
else:
    print("No data extracted. Please check your HTML structure.")

Found 297 companies.
Total Words = 12873


In [3]:
import pandas as pd
df = pd.DataFrame(company_entries)

In [4]:
from llama_index.embeddings.openai import OpenAIEmbedding
embeddings = OpenAIEmbedding()

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
df['embds'] = df['description'].apply(lambda x: embeddings.get_text_embedding(x))

In [6]:
df.head()

Unnamed: 0,name,description,embds
0,"Amazon Web Services, Inc.",Amazon Web Services (AWS) is the world’s most ...,"[-0.008136449381709099, -0.012214291840791702,..."
1,Dell Technologies,Dell Technologies helps organizations and indi...,"[-0.012612163089215755, -0.029623154550790787,..."
2,Google Cloud,Google Cloud accelerates every organization’s ...,"[-0.0015818976098671556, -0.02260083146393299,..."
3,Hewlett Packard Enterprise,Hewlett Packard Enterprise is the global edge-...,"[-0.007676055654883385, -0.007578725926578045,..."
4,Microsoft Azure,"In this new era of AI, Microsoft is helping or...","[-0.008186150342226028, -0.035149574279785156,..."


In [7]:
df.to_json('company_full.json')