In [1]:
import pandas as pd
import openai
import os
import time
import re
import glob



# try to use batch call with openai 


In [2]:

def is_substantive_safe(text):
    """Check if the text has substantive content, handling missing or non-string values safely."""
    if not isinstance(text, str):
        return False
    
    # Define keywords and minimum length to consider the content substantive
    min_length = 10
    financial_terms = [
        "earnings", "revenue", "forecast", "guidance", "expense", "growth", "margin", 
        "profit", "quarter", "operating", "investment", "cash flow", "assets", "liabilities",
        "dividend", "debt", "capital", "equity", "net income", "costs", "shareholder", 
        "acquisition", "merger", "balance sheet", "market conditions", "supply chain", 
        "inflation", "interest rate", "valuation", "EBITDA", "tax", "cost reduction", 
        "production", "sales", "pricing", "strategy", "projections", "financial performance"
    ]
    
    # Check if the text length is above a threshold or contains financial terms
    if len(text.split()) > min_length or any(term in text.lower() for term in financial_terms):
        return True
    
    # Check if there are sentences beyond initial greetings
    greeting_pattern = r"^(hi|hello|good morning|good afternoon|thank you|thanks),?\s*"
    cleaned_text = re.sub(greeting_pattern, "", text.lower()).strip()
    if len(cleaned_text.split()) > min_length or any(term in cleaned_text for term in financial_terms):
        return True
    
    return False

def clean_qa_dataframe_safe(df):
    """Clean the dataframe by removing rows with non-substantive questions and answers, handling edge cases."""
    # Filter rows where both Question and Answer have substantive content
    df_cleaned = df[df['Question'].apply(is_substantive_safe) & df['Answer'].apply(is_substantive_safe)]
    return df_cleaned

def clean_all_csvs_in_directory(base_dir):
    """Clean all CSV files in the specified base directory and save them in corresponding company directories."""
    # Iterate through all company directories inside the base directory
    for company_name in os.listdir(base_dir):
        company_path = os.path.join(base_dir, company_name)
        
        if os.path.isdir(company_path):
            print(f"Processing company: {company_name}")
            
            # Create a corresponding directory inside 'clean_QAs'
            clean_company_dir = os.path.join(
                "/Users/petersapountzis/Desktop/tulane/fall2024/cmps4010/Entergy-AI/data/clean_QAs",
                f"{company_name}_clean_QA"
            )
            os.makedirs(clean_company_dir, exist_ok=True)

            # Process all CSV files within the company's directory
            csv_files = glob.glob(os.path.join(company_path, "*.csv"))
            for file_path in csv_files:
                try:
                    df = pd.read_csv(file_path)
                    cleaned_df = clean_qa_dataframe_safe(df)

                    # Save the cleaned CSV in the company's clean_QA directory
                    cleaned_filename = os.path.join(clean_company_dir, f"cleaned_{os.path.basename(file_path)}")
                    cleaned_df.to_csv(cleaned_filename, index=False)
                    print(f"Cleaned file saved to {cleaned_filename}")

                except Exception as e:
                    print(f"Error processing {file_path}: {e}")

# Example usage
base_directory = "/Users/petersapountzis/Desktop/tulane/fall2024/cmps4010/Entergy-AI/data/earningsQAs/UTY Transcripts-selected/UTY csvs"
clean_all_csvs_in_directory(base_directory)

Processing company: duke_csvs
Cleaned file saved to /Users/petersapountzis/Desktop/tulane/fall2024/cmps4010/Entergy-AI/data/clean_QAs/duke_csvs_clean_QA/cleaned_Duke_energy_transcripts_earnings_qa_2020-8-10.csv
Cleaned file saved to /Users/petersapountzis/Desktop/tulane/fall2024/cmps4010/Entergy-AI/data/clean_QAs/duke_csvs_clean_QA/cleaned_Duke_energy_transcripts_earnings_qa_2021-8-05.csv
Cleaned file saved to /Users/petersapountzis/Desktop/tulane/fall2024/cmps4010/Entergy-AI/data/clean_QAs/duke_csvs_clean_QA/cleaned_Duke_energy_transcripts_earnings_qa_2019-2-14.csv
Cleaned file saved to /Users/petersapountzis/Desktop/tulane/fall2024/cmps4010/Entergy-AI/data/clean_QAs/duke_csvs_clean_QA/cleaned_Duke_energy_transcripts_earnings_qa_2019-5-09.csv
Cleaned file saved to /Users/petersapountzis/Desktop/tulane/fall2024/cmps4010/Entergy-AI/data/clean_QAs/duke_csvs_clean_QA/cleaned_Duke_energy_transcripts_earnings_qa_2023-2-09.csv
Cleaned file saved to /Users/petersapountzis/Desktop/tulane/fall2