In [1]:
!pip install pdfplumber

Collecting pdfplumber
  Downloading pdfplumber-0.11.5-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.5/42.5 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.5-py3-none-any.whl (59 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.5/59.5 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
pip install groq

Collecting groq
  Downloading groq-0.15.0-py3-none-any.whl.metadata (14 kB)
Downloading groq-0.15.0-py3-none-any.whl (109 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/109.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m109.6/109.6 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: groq
Successfully installed groq-0.15.0


In [7]:
import pdfplumber
from bs4 import BeautifulSoup
import pandas as pd

def process_pdf(file_path):
    """
    Extract text from a PDF file.
    """
    try:
        with pdfplumber.open(file_path) as pdf:
            text = ""
            for page in pdf.pages:
                text += page.extract_text()
        print("PDF Text Extracted Successfully.")
        return text
    except Exception as e:
        print(f"Error processing PDF: {e}")
        return None

def process_html(file_path):
    """
    Extract text from an HTML file.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            html_content = file.read()
        soup = BeautifulSoup(html_content, 'html.parser')
        text = soup.get_text(separator="\n")
        title = soup.find('title').text if soup.find('title') else "No Title Found"
        print(f"HTML Title Extracted: {title}")
        return text
    except Exception as e:
        print(f"Error processing HTML: {e}")
        return None

def extract_info_with_groq(document_text, model="llama-3.3-70b-versatile"):
    """
    Send document text to Groq API and extract structured information.
    """
    prompt = f"""
    Extract the following information from this document:
    - Bid Number
    - Title
    - Due Date
    - Bid Submission Type
    - Term of Bid
    - Pre-Bid Meeting
    - Installation
    - Bid Bond Requirement
    - Delivery Date
    - Payment Terms
    - Any Additional Documentation Required
    - MFG for Registration
    - Contract or Cooperative to use
    - Model_no
    - Part_no
    - Product
    - contact_info
    - company_name
    - Bid Summary
    - Product Specification

    Document:
    {document_text}
    """
    try:
        chat_completion = client.chat.completions.create(
            messages=[
                {"role": "system", "content": "You are a helpful assistant skilled in extracting structured data."},
                {"role": "user", "content": prompt},
            ],
            model=model,
            temperature=0.5,
            max_completion_tokens=1024,
            top_p=1,
            stop=None,
            stream=False,
        )
        return chat_completion.choices[0].message.content
    except Exception as e:
        print(f"Error querying Groq API: {e}")
        return None

def save_to_table(extracted_info, output_file):
    """
    Save extracted information into a CSV or table format.
    """
    try:
        data = {"Field": [], "Value": []}
        for line in extracted_info.split("\n"):
            if ":" in line:
                field, value = line.split(":", 1)
                data["Field"].append(field.strip())
                data["Value"].append(value.strip())
        df = pd.DataFrame(data)
        df.to_csv(output_file, index=False)
        print(f"Data saved to {output_file}")
    except Exception as e:
        print(f"Error saving data to table: {e}")

def process_document(file_path, file_type, output_file):
    """
    Process a single document and save the extracted information to a table.
    """
    if file_type == "pdf":
        document_text = process_pdf(file_path)
    elif file_type == "html":
        document_text = process_html(file_path)
    else:
        print("Unsupported file type.")
        return

    if document_text:
        extracted_info = extract_info_with_groq(document_text)
        if extracted_info:
            save_to_table(extracted_info, output_file)
        else:
            print("No information extracted.")
    else:
        print("No text extracted from document.")

# Example usage:
# For a PDF file
process_document("/content/Addendum 1 RFP JA-207652 Student and Staff Computing Devices.pdf", "pdf", "output_pdf.csv")

# For an HTML file
process_document("//content/Student and Staff Computing Devices __SOURCING #168884__ - Bid Information - {3} _ BidNet Direct.html", "html", "output_html.csv")


PDF Text Extracted Successfully.
Error querying Groq API: name 'client' is not defined
No information extracted.
HTML Title Extracted: Student and Staff Computing Devices **SOURCING #168884** - Bid Information - {3} | BidNet Direct
Error querying Groq API: name 'client' is not defined
No information extracted.


In [13]:
pip install python-dotenv


Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1


In [18]:
import os
from groq import Groq
import pdfplumber
from bs4 import BeautifulSoup

# Set up Groq API client
api_key = "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"  # Replace with your actual API key
client = Groq(api_key=api_key)

# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    try:
        with pdfplumber.open(pdf_path) as pdf:
            text = ""
            for page in pdf.pages:
                text += page.extract_text()
        return text
    except Exception as e:
        print(f"Error extracting text from PDF: {e}")
        return None

# Function to parse HTML and extract information
def extract_info_from_html(html_path):
    try:
        with open(html_path, 'r', encoding='utf-8') as file:
            html_content = file.read()
        soup = BeautifulSoup(html_content, 'html.parser')
        title = soup.find('title').text if soup.find('title') else "No title found"
        return {"title": title, "html_content": html_content}
    except Exception as e:
        print(f"Error extracting information from HTML: {e}")
        return None

# Function to query Groq API
def extract_info_with_groq(text):
    prompt = f"""
    Extract the following information from this document:
    - Bid Number
    - Title
    - Due Date
    - Bid Submission Type
    - Term of Bid
    - Pre-Bid Meeting
    - Installation
    - Bid Bond Requirement
    - Delivery Date
    - Payment Terms
    - Any Additional Documentation Required
    - MFG for Registration
    - Contract or Cooperative to use
    - Model_no
    - Part_no
    - Product
    - contact_info
    - company_name
    - Bid Summary
    - Product Specification

    Document:
    {text}
    """
    try:
        chat_completion = client.chat.completions.create(
            messages=[
                {"role": "system", "content": "You are a helpful assistant skilled in extracting structured data."},
                {"role": "user", "content": prompt},
            ],
            model="llama-3.3-70b-versatile",  # Replace with the desired model
            temperature=0.5,
            max_completion_tokens=1024,
            top_p=1,
            stop=None,
            stream=False,
        )
        return chat_completion.choices[0].message.content
    except Exception as e:
        print(f"Error querying Groq API: {e}")
        return None

# Paths to input files
pdf_path = "/content/Addendum 1 RFP JA-207652 Student and Staff Computing Devices.pdf"
html_path = "/content/Student and Staff Computing Devices __SOURCING #168884__ - Bid Information - {3} _ BidNet Direct.html"

# Extract text from PDF
pdf_text = extract_text_from_pdf(pdf_path)
if pdf_text:
    print("PDF Text Extracted Successfully.")

# Extract information from HTML
html_info = extract_info_from_html(html_path)
if html_info:
    print(f"HTML Title Extracted: {html_info['title']}")

# Combine PDF and HTML content for Groq
combined_text = pdf_text + "\n\n" + html_info["html_content"] if pdf_text and html_info else pdf_text or html_info

# Query Groq API for structured data
if combined_text:
    extracted_info = extract_info_with_groq(combined_text)
    print("Extracted Information:")
    print(extracted_info)
else:
    print("No content to process with Groq.")


PDF Text Extracted Successfully.
HTML Title Extracted: Student and Staff Computing Devices **SOURCING #168884** - Bid Information - {3} | BidNet Direct
Error querying Groq API: Error code: 413 - {'error': {'message': 'Request too large for model `llama-3.3-70b-versatile` in organization `org_01jenh9152ea4bk6asc1ndave2` service tier `on_demand` on tokens per minute (TPM): Limit 6000, Requested 24992, please reduce your message size and try again. Visit https://console.groq.com/docs/rate-limits for more information.', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
Extracted Information:
None


In [19]:
import pdfplumber
from bs4 import BeautifulSoup
import pandas as pd

def process_pdf(file_path):
    """
    Extract text from a PDF file.
    """
    try:
        with pdfplumber.open(file_path) as pdf:
            text = ""
            for page in pdf.pages:
                text += page.extract_text()
        print("PDF Text Extracted Successfully.")
        return text
    except Exception as e:
        print(f"Error processing PDF: {e}")
        return None

def process_html(file_path):
    """
    Extract text from an HTML file.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            html_content = file.read()
        soup = BeautifulSoup(html_content, 'html.parser')
        text = soup.get_text(separator="\n")
        title = soup.find('title').text if soup.find('title') else "No Title Found"
        print(f"HTML Title Extracted: {title}")
        return text
    except Exception as e:
        print(f"Error processing HTML: {e}")
        return None

def extract_info_with_groq(document_text, model="llama-3.3-70b-versatile"):
    """
    Send document text to Groq API and extract structured information.
    """
    prompt = f"""
    Extract the following information from this document:
    - Bid Number
    - Title
    - Due Date
    - Bid Submission Type
    - Term of Bid
    - Pre-Bid Meeting
    - Installation
    - Bid Bond Requirement
    - Delivery Date
    - Payment Terms
    - Any Additional Documentation Required
    - MFG for Registration
    - Contract or Cooperative to use
    - Model_no
    - Part_no
    - Product
    - contact_info
    - company_name
    - Bid Summary
    - Product Specification

    Document:
    {document_text}
    """
    try:
        chat_completion = client.chat.completions.create(
            messages=[
                {"role": "system", "content": "You are a helpful assistant skilled in extracting structured data."},
                {"role": "user", "content": prompt},
            ],
            model=model,
            temperature=0.5,
            max_completion_tokens=1024,
            top_p=1,
            stop=None,
            stream=False,
        )
        return chat_completion.choices[0].message.content
    except Exception as e:
        print(f"Error querying Groq API: {e}")
        return None

def save_to_table(extracted_info, output_file):
    """
    Save extracted information into a CSV or table format.
    """
    try:
        data = {"Field": [], "Value": []}
        for line in extracted_info.split("\n"):
            if ":" in line:
                field, value = line.split(":", 1)
                data["Field"].append(field.strip())
                data["Value"].append(value.strip())
        df = pd.DataFrame(data)
        df.to_csv(output_file, index=False)
        print(f"Data saved to {output_file}")
    except Exception as e:
        print(f"Error saving data to table: {e}")

def process_document(file_path, file_type, output_file):
    """
    Process a single document and save the extracted information to a table.
    """
    if file_type == "pdf":
        document_text = process_pdf(file_path)
    elif file_type == "html":
        document_text = process_html(file_path)
    else:
        print("Unsupported file type.")
        return

    if document_text:
        extracted_info = extract_info_with_groq(document_text)
        if extracted_info:
            save_to_table(extracted_info, output_file)
        else:
            print("No information extracted.")
    else:
        print("No text extracted from document.")

# Example usage:
# For a PDF file
process_document("/content/Addendum 1 RFP JA-207652 Student and Staff Computing Devices.pdf", "pdf", "output_pdf.csv")

# For an HTML file
process_document("//content/Student and Staff Computing Devices __SOURCING #168884__ - Bid Information - {3} _ BidNet Direct.html", "html", "output_html.csv")


PDF Text Extracted Successfully.
Data saved to output_pdf.csv
HTML Title Extracted: Student and Staff Computing Devices **SOURCING #168884** - Bid Information - {3} | BidNet Direct
Data saved to output_html.csv


In [None]:
import pdfplumber
from bs4 import BeautifulSoup
import pandas as pd
import json
import requests  # Assuming you are using HTTP requests for the Groq API

# Replace with your Groq API endpoint and key
GROQ_API_URL = "https://api.groq.com/v1/extract"
API_KEY = "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"

def process_pdf(file_path):
    """
    Extract text from a PDF file.
    """
    try:
        with pdfplumber.open(file_path) as pdf:
            text = ""
            for page in pdf.pages:
                text += page.extract_text()
        print("PDF Text Extracted Successfully.")
        return text
    except Exception as e:
        print(f"Error processing PDF: {e}")
        return None

def process_html(file_path):
    """
    Extract text from an HTML file.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            html_content = file.read()
        soup = BeautifulSoup(html_content, 'html.parser')
        text = soup.get_text(separator="\n")
        title = soup.find('title').text if soup.find('title') else "No Title Found"
        print(f"HTML Title Extracted: {title}")
        return text
    except Exception as e:
        print(f"Error processing HTML: {e}")
        return None

def extract_info_with_groq(document_text):
    """
    Send document text to Groq API and extract structured information.
    """
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json"
    }
    data = {
        "document_text": document_text
    }

    try:
        response = requests.post(GROQ_API_URL, json=data, headers=headers)
        if response.status_code == 200:
            return response.json()  # Assuming the API returns structured JSON data
        else:
            print(f"Error querying Groq API: {response.text}")
            return None
    except Exception as e:
        print(f"Error querying Groq API: {e}")
        return None

def save_to_json(extracted_info, output_file):
    """
    Save extracted information into a JSON format.
    """
    try:
        with open(output_file, 'w', encoding='utf-8') as json_file:
            json.dump(extracted_info, json_file, ensure_ascii=False, indent=4)
        print(f"Data saved to {output_file}")
    except Exception as e:
        print(f"Error saving data to JSON: {e}")

def process_document(file_path, file_type, output_file):
    """
    Process a single document and save the extracted information to a JSON file.
    """
    if file_type == "pdf":
        document_text = process_pdf(file_path)
    elif file_type == "html":
        document_text = process_html(file_path)
    else:
        print("Unsupported file type.")
        return

    if document_text:
        extracted_info = extract_info_with_groq(document_text)
        if extracted_info:
            save_to_json(extracted_info, output_file)
        else:
            print("No information extracted.")
    else:
        print("No text extracted from document.")

# Example usage:
# For a PDF file
process_document("/content/Addendum 1 RFP JA-207652 Student and Staff Computing Devices.pdf", "pdf", "output_pdf.json")

# For an HTML file
process_document("//content/Student and Staff Computing Devices __SOURCING #168884__ - Bid Information - {3} _ BidNet Direct.html", "html", "output_html.json")
