In [31]:
#!pip install together

In [32]:
import base64
import os
from nbconvert import export
import requests
from together import Together
from PIL import Image
from io import BytesIO
import json

# Credentials must never be committed to a notebook. The key is read from the
# TOGETHER_API_KEY environment variable; when it is not set, prompt for it
# interactively so it never lands in the saved file.
# NOTE(review): a real key was previously hard-coded on this line and has
# therefore been exposed -- revoke it and issue a new one.
if not os.environ.get("TOGETHER_API_KEY"):
    from getpass import getpass

    os.environ["TOGETHER_API_KEY"] = getpass("Together AI API key: ")

# Initialize Together AI client (it takes the API key from the environment variable)
client = Together()

def load_image(image_source, is_url=False):
    """
    Load an image from a URL or local file and return it as a base64-encoded string with content type.

    The image is decoded with Pillow and re-encoded before being base64-encoded.
    The target encoding is chosen from the Content-Type header (URLs) or the
    file extension (local files); local files that are not jpeg/jpg/png are
    re-encoded as JPEG.

    Args:
        image_source (str): URL or local file path to the image.
        is_url (bool): True if image_source is a URL, False if local file.

    Returns:
        tuple: (base64-encoded image string, content type)

    Raises:
        ValueError: If the image cannot be fetched, opened or re-encoded. The
            underlying exception is chained as ``__cause__``.
    """
    try:
        if is_url:
            response = requests.get(image_source, timeout=10)
            response.raise_for_status()
            source = BytesIO(response.content)
            # Keep only the media type; a header such as
            # "image/png; charset=binary" must not leak into the format name.
            header = response.headers.get('Content-Type', 'image/jpeg')
            media_type = header.split(';')[0].strip().lower()
            # A non-image type (e.g. application/octet-stream) says nothing
            # about the encoding; defer to what Pillow detects below.
            subtype = media_type.split('/')[-1] if media_type.startswith('image/') else None
        else:
            source = image_source
            subtype = os.path.splitext(image_source)[1][1:].lower()
            if subtype not in ['jpeg', 'jpg', 'png']:
                subtype = 'jpeg'

        buffer = BytesIO()
        # The context manager closes the underlying file handle once encoded.
        with Image.open(source) as img:
            if subtype is None:
                subtype = (img.format or 'jpeg').lower()
            # Pillow's encoder is registered as "JPEG"; "jpg" is not a valid
            # format name for save(), and "image/jpg" is not a standard MIME type.
            if subtype == 'jpg':
                subtype = 'jpeg'

            encodable = img
            # JPEG cannot store alpha or palette data, so normalise such images
            # to RGB instead of letting save() fail.
            if subtype == 'jpeg' and img.mode not in ('L', 'RGB', 'CMYK'):
                encodable = img.convert('RGB')
            encodable.save(buffer, format=subtype)

        img_str = base64.b64encode(buffer.getvalue()).decode("utf-8")
        return img_str, f"image/{subtype}"

    except Exception as e:
        raise ValueError(f"Failed to load image: {e}") from e

def generate_image_summary(image_source, is_url=False, max_tokens=2500):
    """
    Generate a summary of an image using Llama 4 Maverick via Together AI API.

    The image is sent as a base64 data URL in an ``image_url`` message part,
    next to a text prompt that asks for a JSON object with the keys
    ``applications_open``, ``text_content`` and ``summary``.

    Args:
        image_source (str): URL or local file path to the image.
        is_url (bool): True if image_source is a URL, False if local file.
        max_tokens (int): Maximum tokens for the summary.

    Returns:
        str: The model's reply with surrounding whitespace stripped. It is the
            raw text; no JSON parsing or validation happens here.

    Raises:
        ValueError: If the image cannot be loaded (raised by load_image).
    """

    # The prompt deliberately contains no image placeholder: it is a plain
    # string (never formatted), and the image itself is attached as a separate
    # image_url part below.
    prompt = """You are an expert computer vision analyst specializing in desktop environment analysis. Examine the provided screenshot with meticulous attention to detail and deliver your analysis in the following JSON format:

        {
            "applications_open": ["comprehensive list of ALL visible applications, software, and browser tabs in the screenshot"],
            "text_content": ["ALL readable text visible in the screenshot, including application names, window titles, menu items, document content, code snippets, terminal commands, browser content, notifications, taskbar information, and any other visible text elements"],
            "summary": "A comprehensive yet concise analysis that integrates all observations from applications_open and text_content into a cohesive narrative. Describe what the user appears to be working on, the relationship between open applications, and provide context for the visible content. This summary must be detailed enough to stand alone as a complete analysis of the screenshot."
        }

        Analysis guidelines:
        1. Be exhaustive in identifying ALL open applications - include minimized apps in taskbars, system trays, docks, browser tabs, and background processes with visual indicators
        2. Capture ALL visible text regardless of size or prominence - include menu items, file paths, code, commands, URLs, and partial text if readable
        3. When analyzing code or technical content, note the programming language, frameworks, or technologies in use
        4. Pay attention to timestamps, usernames, file names, and other contextual information
        5. Consider the relationship between open applications to infer the user's workflow
        6. In the summary, reconstruct the likely sequence of the user's activities based on the visible evidence

        Provide only the JSON response without any introduction or additional text."""

    img_base64, content_type = load_image(image_source, is_url)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": f"data:{content_type};base64,{img_base64}"}}
            ]
        }
    ]

    response = client.chat.completions.create(
        model="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
        messages=messages,
        max_tokens=max_tokens,
        temperature=0.4,
    )

    return response.choices[0].message.content.strip()

```json
{
    "applications_open": ["Windows PowerShell", "Google Chrome"],
    "text_content": [
        "Windows PowerShell",
        "Copyright (C) Microsoft Corporation. All rights reserved.",
        "Install the latest PowerShell for new features and improvements! https://aka.ms/PSWindows",
        "PS C:\\Users\\Sathya Rajesh A>",
        "Get-PSDrive -PSProvider FileSyste",
        "Get-PSDrive : Cannot find a provider with the name 'FileSyste'.",
        "At line:1 char:5",
        "+ Get-PSDrive -PSProvider FileSyste",
        "+ CategoryInfo          : ObjectNotFound: (System.String[]:String[]) [Get-PSDrive], ProviderNotFoundException",
        "+ FullyQualifiedErrorId : GetLocationNoMatchingDrive,Microsoft.PowerShell.Commands.GetPSDriveCommand",
        "PS C:\\Users\\Sathya Rajesh A>",
        "Get-PSDrive -PSProvider FileSystem",
        "Name  Used (GB) Free (GB) Provider Root",
        "----  --------- ---------- -------- ----",
        "C     549.82    402.70   FileSys

In [33]:
import json
import re

def convert_output_to_json(output_str):
    """
    Convert the output string to a JSON object with robust error handling.

    Parsing happens in two stages:

    1. Strict: a surrounding Markdown code fence (with an optional language
       tag) is removed, then the text from the first "{" to the last "}" is
       handed to ``json.loads``. If that yields a dict, it is returned as-is,
       including any extra keys.
    2. Lenient fallback, for malformed or truncated replies: the three
       expected fields are pulled out with regular expressions. Complete
       string items are recovered even when the reply was cut off before the
       array or the summary was closed.

    Args:
        output_str (str): The string containing the JSON-like data structure

    Returns:
        dict: The parsed object, or (in the fallback) a dict with exactly the
            keys "applications_open" (list), "text_content" (list) and
            "summary" (str); fields that cannot be found keep their empty
            default.
    """
    # Body of a JSON string literal: escaped pairs or any char except quote/backslash.
    string_token = re.compile(r'"((?:\\.|[^"\\])*)"')

    def _unescape(raw):
        """Decode the text found between the quotes of one JSON string literal."""
        try:
            # strict=False accepts raw newlines/tabs inside the literal.
            return json.loads(f'"{raw}"', strict=False)
        except json.JSONDecodeError:
            # Invalid escape sequences (e.g. an unescaped Windows path):
            # only undo escaped quotes and doubled backslashes.
            return raw.replace('\\"', '"').replace('\\\\', '\\')

    # Stage 1: strict parsing. The fence must be removed *before* looking for
    # the braces, otherwise fenced replies never reach json.loads.
    candidate = output_str.strip()
    candidate = re.sub(r'^```[A-Za-z]*\s*', '', candidate)
    candidate = re.sub(r'\s*```$', '', candidate)
    first, last = candidate.find("{"), candidate.rfind("}")
    if first != -1 and last > first:
        try:
            parsed = json.loads(candidate[first:last + 1])
        except json.JSONDecodeError:
            parsed = None  # Continue with regex-based parsing
        if isinstance(parsed, dict):
            return parsed

    # Stage 2: lenient extraction into a fixed structure.
    result = {
        "applications_open": [],
        "text_content": [],
        "summary": ""
    }

    # applications_open: everything up to the first "]" (or the end of a
    # truncated reply).
    apps_match = re.search(r'"applications_open"\s*:\s*\[(.*?)(?:\]|$)', output_str, re.DOTALL)
    if apps_match:
        result["applications_open"] = [
            _unescape(raw) for raw in string_token.findall(apps_match.group(1))
        ]

    # text_content: items may themselves contain "]", so the array is taken to
    # end where the "summary" key starts; if that key is never reached, the
    # rest of the text is scanned instead.
    text_match = re.search(
        r'"text_content"\s*:\s*\[([\s\S]*?)(?:\]\s*,\s*"summary"\s*:|$)', output_str
    )
    if text_match:
        raw_items = string_token.findall(text_match.group(1))
        # Empty strings are dropped unless they are all there is.
        kept_items = [raw for raw in raw_items if raw] or raw_items
        result["text_content"] = [_unescape(raw) for raw in kept_items]

    # summary: the closing quote is optional so that a summary cut off
    # mid-sentence is still returned.
    summary_match = re.search(r'"summary"\s*:\s*"((?:\\.|[^"\\])*)', output_str, re.DOTALL)
    if summary_match:
        result["summary"] = _unescape(summary_match.group(1))

    return result

{'applications_open': ['Windows PowerShell', 'Google Chrome'], 'text_content': ['Windows PowerShell', 'Copyright (C) Microsoft Corporation. All rights reserved.', 'Install the latest PowerShell for new features and improvements! https://aka.ms/PSWindows', 'PS C:\\Users\\Sathya Rajesh A>', 'Get-PSDrive -PSProvider FileSyste', "Get-PSDrive : Cannot find a provider with the name 'FileSyste'.", 'At line:1 char:5', '+ Get-PSDrive -PSProvider FileSyste', '+ CategoryInfo          : ObjectNotFound: (System.String[]:String[]) [Get-PSDrive], ProviderNotFoundException', '+ FullyQualifiedErrorId : GetLocationNoMatchingDrive,Microsoft.PowerShell.Commands.GetPSDriveCommand', 'PS C:\\Users\\Sathya Rajesh A>', 'Get-PSDrive -PSProvider FileSystem', 'Name  Used (GB) Free (GB) Provider Root', '----  --------- ---------- -------- ----', 'C     549.82    402.70   FileSystem C:\\', 'CurrentLocation', '---------------', 'Users\\Sathya Rajesh A', 'Get-PSDrive -PSProvider FileSystem', 'AI responses may include

In [34]:
from typing import Union, Dict, List, Any
import json
from datetime import datetime

def json_to_markdown(
    json_data: Union[str, Dict[str, Any]], 
    output_file: str = "analysis_output_excel.md",
    username: str = "Aarav",
    timestamp: Union[str, None] = None
) -> str:
    """
    Convert JSON data to a nicely formatted Markdown file.

    The report has four sections: Metadata (only when a timestamp or username
    is given), Applications Open, Text Content (inside a fenced code block)
    and Summary. Missing or empty fields are replaced by an italic
    placeholder line.

    Args:
        json_data: Either a JSON string or a parsed JSON object containing analysis data.
            A string may be wrapped in a Markdown code fence; if it is not valid
            JSON it is handed to convert_output_to_json.
        output_file: Path/name of the output markdown file. Missing parent
            directories are created.
        username: Current user's login (optional; pass "" to omit)
        timestamp: Timestamp shown in the report. None (the default) means
            "now", evaluated on every call; pass "" to omit it.

    Returns:
        str: Path to the created markdown file, or an error message beginning
            with "Error writing to file:" if the file could not be written.
    """
    # A default of datetime.now() in the signature would be evaluated only
    # once, when the function is defined; resolve it per call instead.
    if timestamp is None:
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Parse JSON if a string is provided
    if isinstance(json_data, str):
        # Try to parse it directly first
        try:
            # Clean string of code block markers if present
            clean_str = json_data.strip()
            if clean_str.startswith("```") and clean_str.endswith("```"):
                clean_str = clean_str[3:-3].strip()
                if clean_str.startswith("json"):
                    clean_str = clean_str[4:].strip()

            data = json.loads(clean_str)
        except json.JSONDecodeError:
            # If the standard parsing fails, use our custom parser
            data = convert_output_to_json(json_data)
    else:
        data = json_data

    # Create markdown content
    md_content = []

    # Add title and metadata
    md_content.append("# Desktop Screenshot Analysis\n")

    if timestamp or username:
        md_content.append("## Metadata\n")
        if timestamp:
            md_content.append(f"**Timestamp:** {timestamp}\n")
        if username:
            md_content.append(f"**User:** {username}\n")
        md_content.append("\n")

    # Add applications section
    md_content.append("## Applications Open\n")
    if data.get("applications_open"):
        for app in data["applications_open"]:
            md_content.append(f"- {app}\n")
    else:
        md_content.append("*No applications detected*\n")
    md_content.append("\n")

    # Add text content section
    md_content.append("## Text Content\n")
    if data.get("text_content"):
        md_content.append("```\n")  # Start code block
        for text_item in data["text_content"]:
            md_content.append(f"{text_item}\n")
        md_content.append("```\n")  # End code block
    else:
        md_content.append("*No text content detected*\n")
    md_content.append("\n")

    # Add summary section
    md_content.append("## Summary\n")
    if data.get("summary"):
        md_content.append(f"{data['summary']}\n")
    else:
        md_content.append("*No summary available*\n")

    # Write to file
    try:
        # Create the target directory on first use so a fresh checkout
        # without e.g. ./Outputs/ does not fail.
        parent_dir = os.path.dirname(output_file)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(''.join(md_content))
        print(f"Successfully created Markdown file: {output_file}")
        return output_file
    except Exception as e:
        # Kept for backward compatibility: failures are reported through the
        # return value rather than raised.
        error_msg = f"Error writing to file: {str(e)}"
        print(error_msg)
        return error_msg

Successfully created Markdown file: ./Outputs/analysis_output_powershell_error_NonAdmin.md


'./Outputs/analysis_output_powershell_error_NonAdmin.md'

In [37]:
# Input screenshots (local paths)
image_url_terminal = "./Screenshots/Terminal.png"
image_url_excel = "./Screenshots/Excel.png"
image_url_chrome = "./Screenshots/Chrome.png"
image_path_pdf = "./Screenshots/PDF.png"
image_path_powershell_error = "./Screenshots/Powershell_error.png"
image_path_powershell_success = "./Screenshots/Powershell_error_free.png"

# Report destinations
output_pdf = "./Outputs/analysis_output_pdf.md"
output_powershell_error = "./Outputs/analysis_output_powershell_error_nonAdmin.md"
output_powershell_success = "./Outputs/analysis_output_powershell_Admin.md"
output_terminal = "./Outputs/analysis_output_terminal.md"
output_excel = "./Outputs/analysis_output_excel.md"
output_chrome = "./Outputs/analysis_output_chrome.md"

# Reset first: if the call below fails, the later steps must neither hit a
# NameError (fresh kernel) nor silently reuse the summary of an earlier run.
summary = None
try:
    summary = generate_image_summary(image_path_powershell_success, is_url=False)
    print(summary)
except Exception as e:
    print(f"Error: {e}")

report_path = None
if summary is not None:
    # Convert the output to JSON
    summary_json = convert_output_to_json(summary)
    print(summary_json)

    # Example usage
    report_path = json_to_markdown(
        summary_json, 
        output_file=output_powershell_success, 
        username="current_user", 
        timestamp="2023-10-01 12:00:00"
    )

# Last expression of the cell: shows the report path (nothing when the run failed).
report_path

```
{
    "applications_open": ["Windows PowerShell"],
    "text_content": [
        "Administrator: Windows PowerShell",
        "Windows PowerShell",
        "Copyright (C) Microsoft Corporation. All rights reserved.",
        "Install the latest PowerShell for new features and improvements! https://aka.ms/PSWindows",
        "PS C:\\WINDOWS\\system32>",
        "Get-PSDrive -PSProvider FileSystem",
        "Name",
        "Used (GB)",
        "Free (GB)",
        "Provider",
        "Root",
        "C",
        "549.85",
        "402.67",
        "FileSystem",
        "C:\\",
        "PS C:\\WINDOWS\\system32>",
        "CurrentLocation",
        "WINDOWS\\system32",
        "Watchlist +5.43%",
        "Search web & PC",
        "ENG US",
        "10:44 AM 6/26/2025"
    ],
    "summary": "The user is working with Windows PowerShell as an administrator. They have executed the command 'Get-PSDrive -PSProvider FileSystem' to retrieve information about the file system drives. The outpu

'./Outputs/analysis_output_powershell_Admin.md'