In [10]:
import pandas as pd
print(pd.__version__)


2.3.3


In [None]:
from bs4 import BeautifulSoup
import pandas as pd
import os

def get_name_price(html):
    """Parse GoodRx HTML and return a DataFrame with columns ['pharmacy', 'price'].

    Uses the passed-in `html` parameter (fixes the previous `html_content` NameError).
    Returns an empty DataFrame if the expected container is not found.
    """

    data = []

    # Parse the HTML content using the passed-in argument
    soup = BeautifulSoup(html, 'html.parser')

    # Find container divs; return empty DataFrame if none found
    tables = soup.find_all('div', class_='pt-2')
    if not tables:
        return pd.DataFrame(columns=['pharmacy', 'price'])

    relevant = tables[0]
    listitems = relevant.find_all('li')

    for li in listitems:

        seller_names = li.find_all('span', attrs={'data-qa': 'seller-name'})
        seller_price = li.find_all('span', attrs={'data-qa': 'seller-price'})

        # Only access [0] if lists are non-empty
        if seller_names and seller_price:
            name = seller_names[0].get_text(strip=True)
            price = seller_price[0].get_text(strip=True)
            data.append([name, price])

    # Build DataFrame and return
    df = pd.DataFrame(data, columns=['pharmacy', 'price'])
    return df



########## Look for GoodRx HTML files inside the 'Drugs_To_Get' folder ##########
folder = 'Drugs_To_Get'
fnames = [os.path.join(folder, fname) for fname in os.listdir(folder) if fname.endswith('GoodRx.html')]

results = []

for fname in fnames:
    # Read file with explicit encoding
    with open(fname, 'r', encoding='utf-8') as f:
        html_content = f.read()

    # Parse the HTML into a DataFrame
    df = get_name_price(html_content)

    # Extract a drug name from the filename (first token of the base name)
    drug_name = os.path.basename(fname).split(' ')[0]

    # Ensure df has the expected columns even if empty
    if df.empty:
        df = pd.DataFrame(columns=['pharmacy', 'price'])

    # Add the drug name column
    df['name'] = drug_name

    results.append(df)

# Concatenate results safely
if results:
    all_data = pd.concat(results)
else:
    all_data = pd.DataFrame(columns=['pharmacy', 'price', 'name'])

all_data.to_csv('all_data.csv', index=False)
print(all_data)




                       pharmacy   price           name
0                     Walgreens   $2.55  Acetaminophen
1                       Walmart   $5.66  Acetaminophen
2   Walmart Neighborhood Market   $3.57  Acetaminophen
3                        Costco   $4.80  Acetaminophen
4                 Harris Teeter  $18.78  Acetaminophen
..                          ...     ...            ...
4                 Harris Teeter  $18.37        Vitamin
5                  Target (CVS)  $22.70        Vitamin
6   Walmart Neighborhood Market   $3.67        Vitamin
7                        Publix   $4.03        Vitamin
8               Kroger Pharmacy  $18.37        Vitamin

[122 rows x 3 columns]
