In [5]:
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv
import os
import json
from openpyxl import load_workbook

# Load variables from a local .env file (if present) into the environment
# before the OpenAI client is constructed, since the client reads its key
# from there.
load_dotenv()

client = OpenAI() # defaults to os.environ.get('OPENAI_API_KEY')

# Load the Excel file into a module-level DataFrame; main() reads a row window
# from it and writes the extracted columns back into it.
file_path = './file_folder/Data_Extraction_Concentration.xlsx'
df = pd.read_excel(file_path)
# Uncomment to restrict a trial run to the first 105 rows:
# df = df.head(105)

# Alternative module-level key assignment, kept for reference only; the
# client above already takes the key from the environment.
# openai.api_key = os.environ.get('OPENAI_API_KEY')

def _strip_code_fence(text):
    """Remove a surrounding Markdown code fence and a leading ``json`` tag."""
    text = text.strip()
    if text.startswith("```"):
        text = text[3:]
    if text.endswith("```"):
        text = text[:-3]
    text = text.strip()
    # Drop the language tag as a prefix. str.strip("json\n") would treat the
    # argument as a character set and could eat real payload characters.
    if text[:4].lower() == "json":
        text = text[4:]
    return text.strip()


def extract_value(note, task_description, operational_guidelines, examples):
    """Ask the chat model to classify one note and return its parsed answer.

    The prompt is assembled from the task description, the guidelines, the
    examples and the note itself, and asks for a length-2 JSON array.

    Args:
        note: Note content for one row. It is interpolated into the prompt
            with an f-string, so a missing cell (float NaN) is sent as the
            text "nan".
        task_description: Text completing the sentence "Your task is to ...".
        operational_guidelines: Rules block inserted under
            "Operational Guidelines:".
        examples: Example block inserted under "Examples:".

    Returns:
        The parsed two-item list ``[value, confidence]``. When the reply is
        empty, is not valid JSON, or is not a two-item array, the placeholder
        ``['?', '?']`` is returned so callers can always unpack two items.
    """
    prompt_template = f"""You are an expert data extractor. Your task is to {task_description}

    Operational Guidelines:
    {operational_guidelines}

    Examples:
    {examples}

    Here is the note: "{note}"
    Return ONLY the resulting length-2 JSON array.
    """

    response = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "system", "content": prompt_template}]
        #, temperature?
    )

    # Guard against a reply without text content so .strip() cannot fail.
    response_text = (response.choices[0].message.content or "").strip()
    print("API response: ", response_text, "to note: ", note)

    # The model sometimes wraps its answer in a ```json ... ``` fence.
    response_text = _strip_code_fence(response_text)
    print('clen',response_text)

    # Parse the JSON response
    try:
        extracted_values = json.loads(response_text)
    except json.JSONDecodeError as e:
        print(f"Error parsing JSON response: {e}")
        return ['?', '?']

    # Valid JSON of the wrong shape would break the caller's two-way unpack.
    if not isinstance(extracted_values, list) or len(extracted_values) != 2:
        print(f"Unexpected response shape: {response_text!r}")
        return ['?', '?']
    return extracted_values
    
def process_notes(df, column_name, task_description, operational_guidelines, examples):
    """Run ``extract_value`` on every note in one column.

    Args:
        df: Table-like object; ``df[column_name]`` must be iterable and yield
            one note per row.
        column_name: Name of the column holding the notes.
        task_description: Passed through to ``extract_value``.
        operational_guidelines: Passed through to ``extract_value``.
        examples: Passed through to ``extract_value``.

    Returns:
        A tuple ``(values, confidences)`` of two lists, each with one entry
        per note in iteration order. A note whose extraction result is not a
        two-item list or tuple gets ``'?'`` in both lists.
    """
    values, confidences = [], []
    for note in df[column_name]:
        print('note: ', note)
        result = extract_value(note, task_description, operational_guidelines, examples)
        # One malformed result must not abort the whole run (and discard the
        # answers already collected), so validate before unpacking.
        if isinstance(result, (list, tuple)) and len(result) == 2:
            extracted_value, confidence = result
        else:
            print(f"Unexpected extraction result for note {note!r}: {result!r}")
            extracted_value, confidence = '?', '?'
        values.append(extracted_value)
        confidences.append(confidence)
    return values, confidences

def main(start_row=3500, end_row=4642, column_name='Concentrations Notes',
         output_path='./file_folder/Data_Extraction_Concentration_Edit.xlsx'):
    """Classify a window of concentration notes and save the updated workbook.

    Reads rows ``start_row`` (inclusive) to ``end_row`` (exclusive), by
    position, from the module-level ``df``, sends each note in
    ``column_name`` through ``process_notes``, writes the results into the
    'Yes / No' and 'Concentrations Confidence' columns of ``df``, and saves
    the whole of ``df`` to ``output_path``.

    Args:
        start_row: Zero-based position of the first row to process.
        end_row: Zero-based position one past the last row to process.
        column_name: Column of ``df`` that holds the notes.
        output_path: Destination Excel file; written without the index.

    Returns:
        None.
    """
    task_description = "extract the concentration value from the following note."
    # NOTE(review): the numbering below repeats 3 and 4. The text is left
    # untouched because it is part of the prompt sent to the model.
    operational_guidelines = """
    1. Convert the note to one of the following categories: "Yes", "No", "Unknown", or "Blank".
    2. This is a boolean flag to answering the question of whether there is concentration (>20% of revenue cut-off) in this company or not.
    3. If there is potentially a concentration, return "Yes".
    3. If the company sells to many different companies, return "No".
    4. If the input is an empty string or nan, return "Blank".
    4. If the value is vague, return "Unknown".
    5. Provide a confidence level in the range (0,1] for the extracted value. The confidence level should reflect how certain you are of your converted value.
    6. Return ONLY a length-2 JSON array with the extracted value and confidence level.
    """

    examples = """
    Note: "Large Contract" -> ["Yes", 1.0]
    Note: "-" -> ["Unknown", 1.0]
    Note: "top is 7-8% of revenue" -> ["No", 0.9]
    Note: " " -> ["Blank", 1.0]
    Note: "some medicare / medicaid" -> ["Yes", 0.6]
    Note: "None known" -> ["No", 0.9]
    Note: "carfax is a 40% customer" -> ["Yes", 1.0]
    Note: "Unknown" -> ["Unknown", 1.0]
    Note: "potentially has some concentration, tbd" -> ["Yes", 0.7]
    Note: "" -> ["Blank", 1.0]
    Note: "Need to diligence" -> ["Unknown", 0.8]
    Note: "1 customer at 23%. PF for year end 18%" -> ["Yes", 0.8]
    """

    # Original author's progress note for the default window:
    # Excel rows 1-1601 filled in Data_Extraction (1-1601 4o-mini, 1602-3501 4, )
    df_subset = df.iloc[start_row:end_row]

    # Apply the batch processing function
    concentration_values, concentration_confidences = process_notes(
        df_subset, column_name, task_description, operational_guidelines, examples
    )

    print('results: ', concentration_values, concentration_confidences)

    # Write back through the subset's own index labels so the target rows are
    # exactly the rows that were read, whatever index the frame carries.
    df.loc[df_subset.index, 'Yes / No'] = concentration_values
    df.loc[df_subset.index, 'Concentrations Confidence'] = concentration_confidences

    # Save the updated dataframe back to Excel
    df.to_excel(output_path, index=False)

    # Output path for user reference
    print(f"Updated Excel file saved to {output_path}")

# Start the extraction only when this code is executed directly (script or
# notebook cell), not when the module is imported from elsewhere.
if __name__ == "__main__":
    main()

note:  1mm+ customers
API response:  ["No", 1.0] to note:  1mm+ customers
clen ["No", 1.0]
note:  14% customer
API response:  ["No", 0.9] to note:  14% customer
clen ["No", 0.9]
note:  nan
API response:  ["Blank", 1.0] to note:  nan
clen ["Blank", 1.0]
note:  3000 customers
API response:  ["No", 0.9] to note:  3000 customers
clen ["No", 0.9]
note:  none known
API response:  ["No", 0.9] to note:  none known
clen ["No", 0.9]
note:  4.5K pro services firms
API response:  ["No", 1.0] to note:  4.5K pro services firms
clen ["No", 1.0]
note:  none
API response:  ["No", 1.0] to note:  none
clen ["No", 1.0]
note:  none known
API response:  ["No", 0.9] to note:  none known
clen ["No", 0.9]
note:  none
API response:  ["No", 1.0] to note:  none
clen ["No", 1.0]
note:  None known
API response:  ["No", 0.9] to note:  None known
clen ["No", 0.9]
note:  nan
API response:  ["Blank", 1.0] to note:  nan
clen ["Blank", 1.0]
note:  none
API response:  ["No", 1.0] to note:  none
clen ["No", 1.0]
note:  nan