The docs with a name are the input docs; the docs that start with MSR are the output docs.

I want to create a Python Streamlit app that lets the managers upload the input docs; the app will then look at each input doc.

Process:

* Create a master file: just a variable in the script. (save to a word or text file later)
   a. Look into each file,
       a. extract the file name.
       b. extract the data from the file.
   b. Do the same for all input docs.


* Create output docs
   1. Dump all into the LLM:
      1. Sonnet: 1 hallucination
      2. Gemini:
   2. By task order

Design decisions:
- Do one task order at a time, instead of dumping all files in at once.
  - Why: 
    - Simplifies the task: avoids the complexity of managing many files.
    - Reduces the risk of hallucination with a very long context.

Edge case:
- Some people may not have a report:
    - Administrative Services Support   
        Kyerra Jones
        o	NO INPUT DOCUMENT
- Make sure their name is in the file name: "MSR TO1 BernardonL September 24.docx"
- Highlight the accuracy of the information in the prompt!!! 



# Testing:
- Create a good example file.
- Combine all docx files into 1 file.
- Upload to LLM with example file

In [None]:
!python -m pip install python-docx google-generativeai python-dotenv pypandoc

- Upload TO files
- Extract text from docx file into 1 master file
- Pass the master file with example file to LLM
- Output the MSR file

In [4]:
import os
from docx import Document
from typing import List, Dict

def read_word_file(file_path: str) -> str:
    """
    Return the text of a Word (.docx) file as one string.
    
    Args:
    file_path (str): Path to the Word file
    
    Returns:
    str: The text of each entry in the document's ``paragraphs`` collection,
         joined with newline characters
    """
    # NOTE(review): only `paragraphs` is read; text held in tables is
    # presumably not part of that collection -- confirm against python-docx.
    paragraphs = Document(file_path).paragraphs
    return '\n'.join(paragraph.text for paragraph in paragraphs)

def process_input_docs(directory: str) -> Dict[str, str]:
    """
    Read every Word document found directly inside a directory.
    
    Files are visited in sorted name order so the resulting dictionary (and
    anything written from it) is the same on every run. Entries that are not
    regular files, that do not end in ".docx" (case-insensitive), or that are
    Word lock files (names starting with "~$") are skipped.
    
    Args:
    directory (str): Path to the directory containing Word files
    
    Returns:
    Dict[str, str]: A dictionary with filenames as keys and file contents as values
    """
    input_docs = {}
    # os.listdir returns names in arbitrary order; sort for a stable result.
    for filename in sorted(os.listdir(directory)):
        # "~$name.docx" is the lock file Word leaves next to an open document;
        # it carries the .docx extension but is not a readable document.
        if filename.startswith('~$'):
            continue
        if not filename.lower().endswith('.docx'):
            continue
        file_path = os.path.join(directory, filename)
        if not os.path.isfile(file_path):
            continue
        input_docs[filename] = read_word_file(file_path)
    return input_docs

# Example usage: combine every input document of one task order into a
# single master text file.
if __name__ == "__main__":
    input_directory = "./data_files/inputs/TO4_LeasesFacilitiesDivision(LFD)"
    output_file = "./data_files/master_file/master_file.txt"

    processed_docs = process_input_docs(input_directory)

    # Export into a text file in the data_files/master_file folder
    os.makedirs("./data_files/master_file", exist_ok=True)

    # Each document becomes one record: its name, its text, a dashed
    # separator line, and two blank lines.
    separator = "-" * 50 + "\n"
    with open(output_file, 'w') as master:
        master.writelines(
            f"File: {name}\nContent: {text}\n{separator}\n\n"
            for name, text in processed_docs.items()
        )

    print(f"Combined documents exported to: {output_file}")

    # # Print the results
    # for filename, content in processed_docs.items():
    #     print(f"File: {filename}")
    #     print(f"Content: {content}")
    #     print("-" * 50)
    #     print("\n\n")

Combined documents exported to: ./data_files/master_file/master_file.txt


In [26]:
import os
import google.generativeai as genai
from typing import List, Dict
from docx import Document  # Add this import for Word document handling
from dotenv import load_dotenv
from IPython.display import Markdown, display
import pypandoc


# Prompt management
def load_prompt(prompt_name: str) -> str:
    """
    Return the saved text of a prompt, or an empty string if none is saved.

    The prompt is looked up as "<prompt_name>.txt" inside "./prompt_folder";
    that folder is created if it does not exist yet.

    Args:
    prompt_name (str): Name of the prompt (file name without ".txt")

    Returns:
    str: Content of the prompt file, or "" when the file does not exist
    """
    folder = "./prompt_folder"
    os.makedirs(folder, exist_ok=True)
    target = os.path.join(folder, f"{prompt_name}.txt")

    if not os.path.exists(target):
        return ""

    with open(target, 'r') as handle:
        return handle.read()

def save_prompt(prompt_name: str, prompt_content: str):
    """
    Store a prompt as "<prompt_name>.txt" inside "./prompt_folder".

    The folder is created when missing; an existing file with the same name
    is overwritten.

    Args:
    prompt_name (str): Name of the prompt (file name without ".txt")
    prompt_content (str): Text to write to the prompt file
    """
    folder = "./prompt_folder"
    os.makedirs(folder, exist_ok=True)

    with open(os.path.join(folder, f"{prompt_name}.txt"), 'w') as handle:
        handle.write(prompt_content)

# Setup: read GOOGLE_API_KEY (optionally from a .env file) and hand it to the
# Gemini client library.
load_dotenv()
api_key = os.getenv("GOOGLE_API_KEY")
genai.configure(api_key=api_key)

# Creating a Model
# Temperature 0 and an output cap of 8192 tokens.
generation_config = dict(temperature=0, max_output_tokens=8192)

# Every listed harm category is given the same "BLOCK_NONE" threshold.
safety_settings = [
    {"category": category, "threshold": "BLOCK_NONE"}
    for category in (
        "HARM_CATEGORY_DANGEROUS",
        "HARM_CATEGORY_HARASSMENT",
        "HARM_CATEGORY_HATE_SPEECH",
        "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "HARM_CATEGORY_DANGEROUS_CONTENT",
    )
]

model = genai.GenerativeModel(
    model_name="gemini-1.5-pro",
    generation_config=generation_config,
    safety_settings=safety_settings,
)

def generate_monthly_status_report(master_content: str, example_content: str) -> str:
    """
    Ask the configured Gemini model to write a Monthly Status Report in Markdown.

    The prompt is built from the combined input documents and an example
    report, saved through ``save_prompt`` under the name
    "monthly_status_report", and sent to the module-level ``model`` as a
    streamed request. Token usage is printed when the response exposes it.

    Args:
    master_content (str): Combined text of the input documents
    example_content (str): Example report whose structure should be followed

    Returns:
    str: The generated Markdown report. If generation was stopped by the
         safety filters, or any exception occurred, a human-readable message
         describing the problem is printed and returned instead.
    """
    prompt = f"""
    You are tasked with creating a Monthly Status Report based on the following information:

    1. Master File Content (combined MSR files):
    {master_content}

    2. Example Report Format (example of a combined MSR file output):
    {example_content}

    Please generate a Monthly Status Report following the structure and style of the example report, 
    using the information provided in the master file. 
    - Ensure that you maintain the accuracy of the information from the master file (only fix grammar, do not change the information)
    - Follow the formatting of the example report.
    - Make sure to seperated the work of each person.
    
    Use Markdown to represent the following:
    - Headings
    - Bullet points
    - Bold for emphasis
    - Tables where applicable
    
    Your generated report should be comprehensive, accurate, and well-structured in Markdown format.
    """
    save_prompt("monthly_status_report", prompt)

    try:
        response = model.generate_content(prompt, stream=True)

        pieces = []
        text_error = None
        for chunk in response:
            try:
                text = chunk.text
            except ValueError as err:
                # The `.text` accessor raises ValueError for a chunk that
                # carries no text part (for example the chunk that reports a
                # blocked response). Remember it and decide below, once the
                # finish reason is known.
                text_error = err
                continue
            if text:
                pieces.append(text)
        full_response = "".join(pieces)

        # Handling Safety Filters
        candidates = response.candidates
        if candidates:
            finish_reason = candidates[0].finish_reason
            # finish_reason is an enum member, so compare its name rather
            # than the member itself against the string "SAFETY".
            if getattr(finish_reason, "name", finish_reason) == "SAFETY":
                safety_message = "Content was filtered due to safety concerns:\n"
                for rating in candidates[0].safety_ratings:
                    safety_message += f"- Category: {rating.category}, Probability: {rating.probability}\n"
                print(safety_message)
                return safety_message

        if text_error is not None and not pieces:
            # No text at all and not a safety stop: report it like any other
            # failure through the handler below.
            raise text_error

        # Retrieving Usage Metadata
        if hasattr(response, 'usage_metadata'):
            prompt_tokens = response.usage_metadata.prompt_token_count
            candidates_tokens = response.usage_metadata.candidates_token_count
            print(f"Prompt tokens: {prompt_tokens}")
            print(f"Response tokens: {candidates_tokens}")

        return full_response

    except Exception as e:
        # Boundary handler: callers expect a string back, never an exception.
        error_message = f"An error occurred: {e}"
        print(error_message)
        return error_message



# Save the generated Markdown to a file
def save_markdown_to_file(markdown_content: str, file_path: str):
    """
    Write Markdown text to a file, replacing any existing content.

    The file is always encoded as UTF-8 instead of the platform's default
    encoding: the file is later handed to pandoc (see
    ``convert_markdown_to_docx``), which expects UTF-8 input.

    Args:
    markdown_content (str): Markdown text to store
    file_path (str): Destination path of the Markdown file
    """
    with open(file_path, 'w', encoding='utf-8') as md_file:
        md_file.write(markdown_content)

# Convert Markdown to .docx using pypandoc
def convert_markdown_to_docx(markdown_file_path: str, output_file_path: str):
    """
    Convert a Markdown file into a Word document with pypandoc.

    Args:
    markdown_file_path (str): Path of the Markdown file to convert
    output_file_path (str): Path where the .docx result is written
    """
    target_format = 'docx'
    pypandoc.convert_file(
        markdown_file_path,
        target_format,
        outputfile=output_file_path,
    )

if __name__ == "__main__":
    # Output locations for the generated report
    markdown_file = "./generated_report.md"
    docx_file = "./generated_report.docx"

    # Load the combined input text and the example report passed to the model
    with open("./data_files/master_file/master_file.txt", 'r') as master_handle:
        master_content = master_handle.read()

    with open("./example/example.txt", 'r') as example_handle:
        example_content = example_handle.read()

    # Generate the report as Markdown, store it, then convert it to Word
    report = generate_monthly_status_report(master_content, example_content)
    save_markdown_to_file(report, markdown_file)
    convert_markdown_to_docx(markdown_file, docx_file)

    print(f"Monthly Status Report has been generated and saved to '{docx_file}'")



Prompt tokens: 3732
Response tokens: 863
Monthly Status Report has been generated and saved to './generated_report.docx'
