In [20]:
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv
import os
import json
from openpyxl import load_workbook

# Load variables from a local .env file (if present) into the environment.
load_dotenv()

# Module-level API client used by extract_value().
client = OpenAI() # defaults to os.environ.get('OPENAI_API_KEY')

# Load the Excel file once; main() reads rows from, and writes results into, this module-level frame.
file_path = './file_folder/Data_Extraction_Growth.xlsx'
df = pd.read_excel(file_path)
# df = df.head(105)  # uncomment to restrict a run to the first 105 rows

# Older module-level way of setting the key; not needed because OpenAI() above
# already reads OPENAI_API_KEY from the environment.
# openai.api_key = os.environ.get('OPENAI_API_KEY')

def _strip_code_fence(text):
    """Remove a surrounding ``` fence and an optional leading "json" language tag."""
    text = text.strip()
    if len(text) >= 6 and text.startswith("```") and text.endswith("```"):
        text = text[3:-3].strip()
    # Drop the tag as a prefix. str.strip("json\n") would instead remove any of the
    # characters j/s/o/n from BOTH ends of the payload.
    if text[:4].lower() == "json":
        text = text[4:].lstrip()
    return text


def extract_value(note, task_description, operational_guidelines, examples, model="gpt-4o"):
    """Ask the chat model for a ``[value, confidence]`` pair describing one note.

    Builds a single system prompt from the arguments, sends it through the
    module-level ``client`` and parses the reply as JSON.

    Args:
        note: The note text; interpolated into the prompt as-is (a NaN cell is sent as "nan").
        task_description: Sentence fragment that completes "Your task is to ...".
        operational_guidelines: Rules block inserted into the prompt.
        examples: Few-shot examples block inserted into the prompt.
        model: Chat model name passed to the API.

    Returns:
        The parsed length-2 list ``[value, confidence]``, or ``['?', '?']`` when the
        reply is empty, is not valid JSON, or is not a length-2 array. The result
        can therefore always be unpacked into two names.
    """
    prompt_template= f"""You are an expert data extractor. Your task is to {task_description}

    Operational Guidelines:
    {operational_guidelines}

    Examples:
    {examples}

    Here is the note: "{note}"
    Return ONLY the resulting length-2 JSON array.
    """

    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "system", "content": prompt_template}]
        #, temperature?
    )

    # message.content can be None (e.g. a refusal); treat that as an empty reply.
    response_text = (response.choices[0].message.content or "").strip()
    print("API response: ", response_text, "to note: ", note)

    # Models sometimes wrap the array in a Markdown code fence; unwrap it before parsing.
    response_text = _strip_code_fence(response_text)
    print('clen',response_text)

    # Parse the JSON response
    try:
        extracted_values = json.loads(response_text)
    except json.JSONDecodeError as e:
        print(f"Error parsing JSON response: {e}")
        # The placeholder must have the same shape as a real answer so that
        # `value, confidence = extract_value(...)` keeps working.
        return ['?', '?']

    if not isinstance(extracted_values, list) or len(extracted_values) != 2:
        print(f"Unexpected response shape (expected a length-2 array): {extracted_values!r}")
        return ['?', '?']
    return extracted_values
    
def process_notes(df, column_name, task_description, operational_guidelines, examples):
    """Run extract_value over every note in one column.

    Args:
        df: DataFrame holding the notes.
        column_name: Name of the column whose cells are sent to extract_value.
        task_description, operational_guidelines, examples: Passed through to
            extract_value unchanged.

    Returns:
        ``(values, confidences)``: two lists with one entry per row of ``df``,
        in row order. Rows whose extraction did not yield a 2-item pair are
        recorded as ``'?'`` in both lists.
    """
    values, confidences = [], []
    for note in df[column_name]:
        print('note: ', note)
        result = extract_value(note, task_description, operational_guidelines, examples)
        # Guard the unpack: one malformed result (e.g. a parse-failure placeholder)
        # must not raise and throw away every result gathered so far.
        if isinstance(result, (list, tuple)) and len(result) == 2:
            extracted_value, confidence = result
        else:
            extracted_value, confidence = '?', '?'
        values.append(extracted_value)
        confidences.append(confidence)
    return values, confidences

def main(start_row=4000, end_row=4400,
         output_path='./file_folder/Data_Extraction_Growth_Edit.xlsx'):
    """Extract growth percentages for a slice of rows and save the frame to Excel.

    Runs process_notes over the 'Growth Notes' column of
    ``df.iloc[start_row:end_row]`` (``df`` is the module-level frame), stores the
    results in the 'Growth Percentage' and 'Growth Confidence' columns of those
    same rows, and writes the whole frame to ``output_path``.

    Args:
        start_row: First positional row to process (inclusive).
        end_row: Positional row to stop at (exclusive); clipped to the frame length.
        output_path: Path of the workbook to write.
    """
    # Default range note: Excel rows 1-4001 filled in Data_Extraction
    task_description = "extract the growth percentages from the following note."
    operational_guidelines = """
    1. Convert the percentage to a decimal format (between 0 and 1, returning values greater than 1 if applicable).
    2. If there is a range, use the lower number.
    3. If the input is an empty string or nan, return "Blank".
    4. If the value is vague, return "Unknown".
    5. Provide a confidence level in the range (0,1] for the extracted value. The confidence level should reflect how certain you are of your response.
    6. Return ONLY a length-2 JSON array with the extracted value and confidence level.
    """
    examples = """
    Note: "--10% growth y/y" -> [0.10, 0.9]
    Note: "Double digits" -> ["Unknown", 0.9]
    Note: "--2020-12E:  budget is to grow ARR 40%" -> [0.40, 0.7]
    Note: " " -> ["Blank", 1.0]
    Note: "80%" -> [0.80, 1.0]
    Note: "-12% (due to COVID)" -> [-0.12, 0.9]
    Note: "" -> ["Blank", 1.0]
    Note: "- Revenue has increased by 3x+ over past ~10 years. 12% YoY growth today" -> [0.12, 0.7]
    Note: "100% growth in 2019E" -> [1.0, 0.7]
    """

    df_subset = df.iloc[start_row:end_row]
    # Apply the batch processing function
    growth_values, growth_confidences = process_notes(df_subset, 'Growth Notes', task_description, operational_guidelines, examples)

    print('results: ', growth_values, growth_confidences)

    # Insert the values and confidences back into the DataFrame.
    for column, extracted in (('Growth Percentage', growth_values),
                              ('Growth Confidence', growth_confidences)):
        # Results mix floats with strings such as "Unknown"/"Blank"/"?"; make the
        # target column object-typed so pandas never has to upcast a numeric
        # column during the assignment.
        if column in df.columns:
            df[column] = df[column].astype(object)
        else:
            df[column] = pd.Series(index=df.index, dtype=object)
        # Address the rows by the subset's own index labels so the write-back
        # always matches exactly the rows that were processed.
        df.loc[df_subset.index, column] = extracted

    # Save the updated dataframe back to Excel
    df.to_excel(output_path, index=False)

    # Output path for user reference
    print(f"Updated Excel file saved to {output_path}")

main()

note:  growing but tbd
API response:  ["Unknown", 1.0] to note:  growing but tbd
clen ["Unknown", 1.0]
note:  good ARR bookings/flat growth
API response:  [
    "Unknown",
    0.9
] to note:  good ARR bookings/flat growth
clen [
    "Unknown",
    0.9
]
note:  High single digit growth two-year CAGR
API response:  [
    "Unknown",
    0.9
] to note:  High single digit growth two-year CAGR
clen [
    "Unknown",
    0.9
]
note:  100% YOY MRR growth (June 2019 time frame)
API response:  [1.0, 0.7] to note:  100% YOY MRR growth (June 2019 time frame)
clen [1.0, 0.7]
note:  nan
API response:  ["Blank", 1.0] to note:  nan
clen ["Blank", 1.0]
results:  ['Unknown', 'Unknown', 'Unknown', 1.0, 'Blank'] [1.0, 0.9, 0.9, 0.7, 1.0]
Updated Excel file saved to ./file_folder/Data_Extraction_Growth_Edit.xlsx


In [4]:
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv
import os
import json
from openpyxl import load_workbook

# Load variables from a local .env file (if present) into the environment.
load_dotenv()

# Module-level API client used by extract_value().
client = OpenAI() # defaults to os.environ.get('OPENAI_API_KEY')

# Load the Excel file once; main() reads rows from, and writes results into, this module-level frame.
file_path = './file_folder/Data_Extraction_Growth.xlsx'
df = pd.read_excel(file_path)
# df = df.head(105)  # uncomment to restrict a run to the first 105 rows

# Older module-level way of setting the key; not needed because OpenAI() above
# already reads OPENAI_API_KEY from the environment.
# openai.api_key = os.environ.get('OPENAI_API_KEY')

def _strip_code_fence(text):
    """Remove a surrounding ``` fence and an optional leading "json" language tag."""
    text = text.strip()
    if len(text) >= 6 and text.startswith("```") and text.endswith("```"):
        text = text[3:-3].strip()
    # Drop the tag as a prefix. str.strip("json\n") would instead remove any of the
    # characters j/s/o/n from BOTH ends of the payload.
    if text[:4].lower() == "json":
        text = text[4:].lstrip()
    return text


def extract_value(note, task_description, operational_guidelines, examples, model="gpt-4o-mini"):
    """Ask the chat model for a ``[value, confidence]`` pair describing one note.

    Builds a single system prompt from the arguments, sends it through the
    module-level ``client`` and parses the reply as JSON.

    Args:
        note: The note text; interpolated into the prompt as-is (a NaN cell is sent as "nan").
        task_description: Sentence fragment that completes "Your task is to ...".
        operational_guidelines: Rules block inserted into the prompt.
        examples: Few-shot examples block inserted into the prompt.
        model: Chat model name passed to the API.

    Returns:
        The parsed length-2 list ``[value, confidence]``, or ``['?', '?']`` when the
        reply is empty, is not valid JSON, or is not a length-2 array. The result
        can therefore always be unpacked into two names.
    """
    prompt_template= f"""You are an expert data extractor. Your task is to {task_description}

    Operational Guidelines:
    {operational_guidelines}

    Examples:
    {examples}

    Here is the note: "{note}"
    Return ONLY the resulting length-2 JSON array.
    """

    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "system", "content": prompt_template}]
        #, temperature?
    )

    # message.content can be None (e.g. a refusal); treat that as an empty reply.
    response_text = (response.choices[0].message.content or "").strip()
    print("API response: ", response_text, "to note: ", note)

    # Models sometimes wrap the array in a Markdown code fence; unwrap it before parsing.
    response_text = _strip_code_fence(response_text)
    print('clen',response_text)

    # Parse the JSON response
    try:
        extracted_values = json.loads(response_text)
    except json.JSONDecodeError as e:
        print(f"Error parsing JSON response: {e}")
        # The placeholder must have the same shape as a real answer so that
        # `value, confidence = extract_value(...)` keeps working.
        return ['?', '?']

    if not isinstance(extracted_values, list) or len(extracted_values) != 2:
        print(f"Unexpected response shape (expected a length-2 array): {extracted_values!r}")
        return ['?', '?']
    return extracted_values
    
def process_notes(df, column_name, task_description, operational_guidelines, examples):
    """Run extract_value over every note in one column.

    Args:
        df: DataFrame holding the notes.
        column_name: Name of the column whose cells are sent to extract_value.
        task_description, operational_guidelines, examples: Passed through to
            extract_value unchanged.

    Returns:
        ``(values, confidences)``: two lists with one entry per row of ``df``,
        in row order. Rows whose extraction did not yield a 2-item pair are
        recorded as ``'?'`` in both lists.
    """
    values, confidences = [], []
    for note in df[column_name]:
        print('note: ', note)
        result = extract_value(note, task_description, operational_guidelines, examples)
        # Guard the unpack: one malformed result (e.g. a parse-failure placeholder)
        # must not raise and throw away every result gathered so far.
        if isinstance(result, (list, tuple)) and len(result) == 2:
            extracted_value, confidence = result
        else:
            extracted_value, confidence = '?', '?'
        values.append(extracted_value)
        confidences.append(confidence)
    return values, confidences

def main(start_row=4000, end_row=4537,
         output_path='./file_folder/Data_Extraction_Growth_Edit.xlsx'):
    """Extract growth percentages for a slice of rows and save the frame to Excel.

    Runs process_notes over the 'Growth Notes' column of
    ``df.iloc[start_row:end_row]`` (``df`` is the module-level frame), stores the
    results in the 'Growth Percentage' and 'Growth Confidence' columns of those
    same rows, and writes the whole frame to ``output_path``.

    Args:
        start_row: First positional row to process (inclusive).
        end_row: Positional row to stop at (exclusive); clipped to the frame length.
        output_path: Path of the workbook to write.
    """
    # Default range note: Excel rows 1-4001 filled in Data_Extraction (1-3001, 4002-4537 4o-mini, rest 4o)
    task_description = "extract the growth percentages from the following note."
    operational_guidelines = """
    1. Convert the percentage to a decimal format (between 0 and 1, returning values greater than 1 if applicable).
    2. If there is a range, use the lower number.
    3. If the input is an empty string or nan, return "Blank".
    4. If the value is vague, return "Unknown".
    5. Provide a confidence level in the range (0,1] for the extracted value. The confidence level should reflect how certain you are of your response.
    6. Return ONLY a length-2 JSON array with the extracted value and confidence level.
    """

    examples = """
    Note: "--10% growth y/y" -> [0.10, 0.9]
    Note: "Double digits" -> ["Unknown", 0.9]
    Note: "--2020-12E:  budget is to grow ARR 40%" -> [0.40, 0.7]
    Note: " " -> ["Blank", 1.0]
    Note: "80%" -> [0.80, 1.0]
    Note: "-12% (due to COVID)" -> [-0.12, 1.0]
    Note: "" -> ["Blank", 1.0]
    Note: "- Revenue has increased by 3x+ over past ~10 years. 12% YoY growth today" -> [0.12, 0.7]
    Note: "100% growth in 2019E" -> [1.0, 0.7]
    """

    df_subset = df.iloc[start_row:end_row]
    # Apply the batch processing function
    growth_values, growth_confidences = process_notes(df_subset, 'Growth Notes', task_description, operational_guidelines, examples)

    print('results: ', growth_values, growth_confidences)

    # Insert the values and confidences back into the DataFrame.
    for column, extracted in (('Growth Percentage', growth_values),
                              ('Growth Confidence', growth_confidences)):
        # Results mix floats with strings such as "Unknown"/"Blank"/"?"; make the
        # target column object-typed so pandas never has to upcast a numeric
        # column during the assignment.
        if column in df.columns:
            df[column] = df[column].astype(object)
        else:
            df[column] = pd.Series(index=df.index, dtype=object)
        # Address the rows by the subset's own index labels so the write-back
        # always matches exactly the rows that were processed.
        df.loc[df_subset.index, column] = extracted

    # Save the updated dataframe back to Excel
    df.to_excel(output_path, index=False)

    # Output path for user reference
    print(f"Updated Excel file saved to {output_path}")

main()

note:  growing but tbd
API response:  ["Unknown", 0.9] to note:  growing but tbd
clen ["Unknown", 0.9]
note:  good ARR bookings/flat growth
API response:  ["Unknown", 0.8] to note:  good ARR bookings/flat growth
clen ["Unknown", 0.8]
note:  High single digit growth two-year CAGR
API response:  ["Unknown", 0.9] to note:  High single digit growth two-year CAGR
clen ["Unknown", 0.9]
note:  100% YOY MRR growth (June 2019 time frame)
API response:  [1.0, 1.0] to note:  100% YOY MRR growth (June 2019 time frame)
clen [1.0, 1.0]
note:  nan
API response:  ["Blank", 1.0] to note:  nan
clen ["Blank", 1.0]
note:  nan
API response:  ["Blank", 1.0] to note:  nan
clen ["Blank", 1.0]
note:  High
API response:  ["Unknown", 0.9] to note:  High
clen ["Unknown", 0.9]
note:  80%+ (22.3M '22B ARR / 12.8M '21A ARR / $7.3M '20A ARR)
API response:  [0.80, 0.9] to note:  80%+ (22.3M '22B ARR / 12.8M '21A ARR / $7.3M '20A ARR)
clen [0.80, 0.9]
note:  2.5mm in 17', 8mm in 18', 11mm LTM Sep 19'
API response:  ["U

In [5]:
import os

# Report only whether the key is configured. Cell output is stored inside the
# notebook file, so printing the value itself leaks the credential to anyone who
# can read the notebook. Any key that has ever appeared in saved output should be
# revoked and reissued.
print('OPENAI_API_KEY is set:', bool(os.environ.get('OPENAI_API_KEY')))

[REDACTED: API key removed from saved cell output. Revoke and rotate this key.]


In [8]:
v = [['?','?']]*5
v

[['?', '?'], ['?', '?'], ['?', '?'], ['?', '?'], ['?', '?']]

In [162]:
strr = f"""Is this
literal?"""
print(strr)

Is this
literal?


['',
 'U',
 'n',
 'k',
 'n',
 'o',
 'w',
 'n',
 '$25M',
 '$14-$15m of ebitda',
 '$13mm ARR / FY22E $18mm.  Near 100% logo attrition. Net Revenue Attrition >150%']

In [None]:
file_path = './file_folder/Data_Extraction.xlsx'
df = pd.read_excel(file_path)
# df = df.head(105)

start_row, end_row = 150,200
df_subset = df.iloc[start_row:end_row]

df_subset


# Other prompts

In [None]:
# 5. Only return the number as a long digit, 'Blank', or 'Unknown'.

    # Current Growth Percentage prompt - x in (0,1), blank, unknown
    growth_note = '--15% growth y/y'
    # growth_note = '--2020-12E:  budget is to grow ARR 20%'
    # growth_note = '30% yoy'
    growth_note = '--Still growing the top line at 20% year over year'
    # growth_note = '40% growth'
    # growth_note = 'Double digits'
    # growth_note = 'Growing'
    # growth_note = ''
    # growth_note = '121% in 2022 (100%+ in 5 of last 6 years)'
    
    # prompt = f"""You are an expert data extractor. Your task is to extract the growth percentages from the following list of notes. 
    
    # Operational Guidelines for each note:
    
    # 1. Convert the percentage to a decimal format (between 0 and 1, returning values greater than 1 if applicable).
    # 2. If there is a range, use the lower number.
    # 3. If the input is empty, return 'Blank'.
    # 4. If the value is vague or non-numerical, return 'Unknown'.
    # 5. Provide a confidence level in the range (0,1] for the extracted value. The confidence level should reflect how certain you are of your response.
    # 6. Return only a tuple with the extracted value and confidence level.
    
    # Here is the note: {growth_note}
    # """
    
    profit_note = 'Burning $3mm'
    profit_note = "BE by Dec'20"
    # profit_note = 'Profitable'
    # profit_note = '50% EBITDA'
    # profit_note = 'burning $2mm'
    # profit_note = '2m'
    # profit_note = '$2m+ of EBITDA'
    # profit_note = '12'
    # profit_note = '$10 '
    # profit_note = '$12m in ebitda'
    # profit_note = '-'
    profit_note = '-'
    # profit_note = '"Very profitable"'
    # profit_note = ' '
    # profit_note = '$2M Adj EBITDA'
    # profit_note = '20% EBITDA in UK, burning at corporate level given US investment'
    # profit_note = 'Breakeven'
    # profit_note = 'burning $1m'
    # profit_note = '$4.6m EBITDA 2020'
    # profit_note = 'breakeven'
    # profit_note = '$5m of ebitda'
    # profit_note = '30%'

    # prompt = f"""You are an expert data extractor. Your task is to extract the profitability value from the following list of notes. 
    
    # Operational Guidelines for each note:
    
    # 1. The value should be one of the following categories: 'Profitable','Not Profitable', 'Break Even', 'Burning', 'Unknown', or 'Blank'. 
    # 2. If the value is a positive number or percentage, it is profitable. BE stands for 'Break Even'.
    # 3. If the input is only whitespace, return 'Blank'. If the value is vague, return 'Unknown'.
    # 4. Provide a confidence level in the range (0,1] for the extracted value. The confidence level should reflect how certain you are of your response.
    # 5. Return only a tuple with the extracted value and confidence level.
    
    # Here is the note: {profit_note}
    # """

    conc_note = 'Large J&J Contract (details to be received)'
    conc_note = ''
    conc_note = 'top is 7-8% of revenue'
    # conc_note = ''
    conc_note = 'none, top channel 20%'
    conc_note = '12'
    # conc_note = ''
    conc_note = 'some channel concentration with century link'
    conc_note = 'potentially has some concentration, tbd' # Should return yes(?)
    # conc_note = '-'
    # conc_note = 'None'
    # conc_note = ''
    # conc_note = ''
    # conc_note = ''
    # conc_note = 'None'
    # conc_note = 'carfax is a 40% customer'
    # conc_note = 'no meaningful concentration'
    # conc_note = 'Unknown'
    # conc_note = 'None'
    # conc_note = 'No - confirmed none above our 15% threshold; multi-million dollar deals with every major pharma co'
    # conc_note = ''
    # conc_note = 'No customer more than 0.5% of revenue'
    # conc_note = 'None known'

    # prompt = f"""You are an expert data extractor. Your task is to extract the concentration value from the following list of notes. 
    
    # Operational Guidelines for each note:
    
    # 1. Convert the note to one of the following categories: 'Yes', 'No', 'Unknown', or 'Blank'.
    # 2. This is a boolean flag to note whether the company in question has a concentration or not.'
    # 3. If there is potentially a concentration, return 'Yes'.
    # 3. If the input is empty, return 'Blank'.
    # 4. If the value is vague, return 'Unknown'.
    # 5. Provide a confidence level in the range (0,1] for the extracted value. The confidence level should reflect how certain you are of your response.
    # 6. Return only a tuple with the extracted value and confidence level. 
    
    # Here is the note: {conc_note}
    # """


### Old iterations

In [None]:
    # Process the response to extract the list of tuples
    response_text = response_text.choices[0].message.content.strip()
    # Convert response text to a list of tuples
    response_lines = response_text.split('\n')
    extracted_values = [eval(line.strip()) for line in response_lines if line.strip().startswith("(")]

    return extracted_values
    

    # TODO: regex safeguard to find tuple in case of errant response
    print("response", response_text.choices[0].message.content.strip())
    
    # responses = response.choices[0].message.strip().split('\n')
    # extracted_values = [res.split(': ')[-1].strip() for res in responses]
    
    # return extracted_values



    


# df['Revenue Value'] = df['Revenue Value'].apply(postprocess_value)

# Save the updated dataframe back to Excel
# output_path = '/mnt/data/Updated_Data_Extraction.xlsx'
# df.to_excel(output_path, index=False)

# Output path for user reference
# output_path

# batch_query(['asdf'])






# rev_notes = []
    # rev_note = ''
    # rev_notes += [rev_note]
    # rev_note = 'Unknown'
    # rev_notes += [rev_note]
    # rev_note = "$25M"
    # rev_notes += [rev_note]
    # rev_note = "$14-$15m of ebitda"
    # rev_notes += [rev_note]
    # rev_note = "$13mm ARR / FY22E $18mm.  Near 100% logo attrition. Net Revenue Attrition >150%"
    # rev_notes += [rev_note]
    # prompt =f"You are an expert data extractor. Your task is to extract the ARR values from the following list of notes. Operational Guidelines for each note: If there is a range, take the lower number. Convert the values to long digits. If a value is not clear, return 'Unknown'. Only return the number as a long digit or 'Unknown'. Here is the note: {rev_note}\n"




# Apply the batch processing function
# df['Revenue Value'] = process_batch(df)




# Current Revenue Value prompt
    
    prompt_template =f"""You are an expert data extractor. Your task is to extract the ARR values from the following list of notes.
    
    Operational Guidelines for each note:
    
    1. Construct a length-2 JSON array.
    2. Convert the value to long digits. Value must be in the millions.
    3. If there is a range, use the lower number.
    4. If the input is an empty string, set the value to 'Blank'.
    5. If the value is vague, set the value to 'Unknown'.
    6. Set the value as the first element.
    7. Provide a confidence level in the range (0,1] for the extracted value. The confidence level should reflect how certain you are of your response.
    8. Set the condence level as the second element.
    9. Return ONLY a JSON length-2 array with the extracted value and confidence level properties.
    
    Here are the notes: {notes}. Return ONLY a list of the resulting length-2 JSON arrays.
    """




def process_batch(df, column_name, task_description, operational_guidelines, batch_size=5):
    """Extract values for one column by sending its notes to batch_query in chunks.

    The column is walked in consecutive slices of ``batch_size`` rows. Each slice
    is converted to a list and handed to ``batch_query``, which is expected to
    yield ``(value, confidence)`` pairs.

    Returns:
        ``(values, confidences)``: two lists accumulated across all chunks.
    """
    collected_values = []
    collected_confidences = []
    row_count = len(df)
    for offset in range(0, row_count, batch_size):
        chunk = df[column_name][offset:offset + batch_size].tolist()
        pairs = batch_query(chunk, task_description, operational_guidelines)
        for pair in pairs:
            value, confidence = pair
            collected_values.append(value)
            collected_confidences.append(confidence)
    return collected_values, collected_confidences







# Postprocess to ensure extracted values are either floats or 'unknown'
def postprocess_value(value):
    """Coerce an extracted value to ``float``, or return ``'Unknown'`` if it is not numeric.

    Args:
        value: Anything ``float()`` might accept (number or numeric string).

    Returns:
        ``float(value)`` when the conversion succeeds, otherwise the string ``'Unknown'``.
    """
    try:
        return float(value)
    except (TypeError, ValueError):
        # float() raises ValueError for non-numeric strings but TypeError for
        # None, lists and other non-string/non-number inputs; both mean "not a number".
        return 'Unknown'
    


    
    # # file_path = './file_folder/Data_Extraction_Test.xlsx'
    # workbook = load_workbook(file_path)
    # writer = pd.ExcelWriter(file_path, engine='openpyxl')
    # writer.book = workbook
    # writer.sheets = {ws.title: ws for ws in workbook.worksheets}

    # # Write the updated DataFrame to the existing sheet
    # df.to_excel(writer, index=False, sheet_name=writer.sheets.keys()[0])

    # writer.save()
    # writer.close()





    
    # for idx, note in enumerate(notes):
    #     prompt_template += f"Note {idx+1}: {note}\n"

    # prompt_template += "Return ONLY the resulting length-2 JSON array."

In [None]:
# Hand-picked revenue notes for trying the extraction prompt: an empty note, an
# explicit "Unknown", a plain figure, a range, and a long multi-metric note.
rev_notes = [
    '',
    'Unknown',
    "$25M",
    "$14-$15m of ebitda",
    "$13mm ARR / FY22E $18mm.  Near 100% logo attrition. Net Revenue Attrition >150%",
]
# Leave `rev_note` bound to the last sample, as the step-by-step version did.
rev_note = rev_notes[-1]
rev_notes