In [None]:
import openai

from openai import OpenAI
client = OpenAI()
# key is stored in the environment variable

## Code to handdling the labeling work

In [None]:
import openai
import pandas as pd
import time  # Import time module for sleep
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)


def process_visualizations(csv_file, output_file, start_id=0, end_id=10000):
    # Load the CSV file
    df = pd.read_csv(csv_file)
    
    # If the output file exists, load the processed part to avoid reprocessing; else create an empty DataFrame with required columns
    try:
        processed_df = pd.read_csv(output_file)
    except FileNotFoundError:
        processed_df = pd.DataFrame(columns=['id', 'paper_id','local_path', 'server_path', 'T1', 'T1-Score', 'T2', 'T2-Score', 'T3', 'T3-Score'])
    
    batch_counter = 0  # Initialize batch counter
    error_rows = []    # Initialize list to keep track of error rows

    # Iterate over the rows in the CSV
    for index, row in df.iterrows():
        # Ensure ID is within the specified range and not already processed
        if row['id'] < start_id or row['id'] > end_id or (processed_df['id'] == row['id']).any():
            continue

        # Use server_path as the image URL
        image_url = row['server_path']
        retries = 0  # Initialize retry counter

        while retries < 3:
            try:
                # send the request to the API
                response = client.chat.completions.create(
                    model="gpt-4o-mini",
                    messages=[
                        {
                            "role": "user",
                            "content": [
                                {"type": "text", "text": 
                                '''
                                Please identify the types of visual content in this image and return results in the format: type1,confidence;type2,confidence;type3,confidence. Only reply with those terms and confidence levels, with no extra words. Here are the options:

                                "Violin", "Density", "Histogram", "Boxplot", "Ridgeline", "Scatter", "Heatmap", "Correlogram", "Bubble", "Connected Scatter", "Density 2d", "Barplot", "Spider / Radar", "Wordcloud", "Parallel", "Lollipop", "Circular Barplot", "Treemap", "Venn Diagram", "Doughnut", "Pie Chart", "Dendrogram", "Circular Packing", "Sunburst", "Line Plot", "Area", "Stacked Area", "Streamchart", "Map", "Choropleth", "Hexbin Map", "Cartogram", "Connection", "Bubble Map", "Chord Diagram", "Network", "Sankey", "Arc Diagram", "Edge Bundling", "Complex Diagram", "Circuit Diagram (includes electrical or electronic circuits)", "Flow Chart (process flows, decision flows)", "Geometry (includes shapes, geometric diagrams)", "Scientific Viz (includes graphs of scientific data, models)", "Other (if none of the above categories fit)".

                                '''
                                },
                                {
                                    "type": "image_url",
                                    "image_url": {
                                        "url": image_url,
                                    },
                                },
                            ],
                        }
                    ],
                    max_tokens=350,
                )
                
                # Extract the response from the API
                parsed_response = response.choices[0].message.content
            
                # Split the response into types and scores
                types_scores = parsed_response.split(';')
                types_scores = [ts.strip().split(',') for ts in types_scores if len(ts.strip().split(',')) == 2]
        
                # Create a new row to store the results
                new_row = {
                    'id': row['id'],
                    'paper_id': row['paper_id'],
                    'local_path': row['local_path'],
                    'server_path': row['server_path'],
                    'T1': types_scores[0][0] if len(types_scores) >= 1 else None,
                    'T1-Score': float(types_scores[0][1]) if len(types_scores) >= 1 else None,
                    'T2': types_scores[1][0] if len(types_scores) >= 2 else None,
                    'T2-Score': float(types_scores[1][1]) if len(types_scores) >= 2 else None,
                    'T3': types_scores[2][0] if len(types_scores) >= 3 else None,
                    'T3-Score': float(types_scores[2][1]) if len(types_scores) >= 3 else None,
                }
                
                # Append the new row to processed_df
                processed_df = pd.concat([processed_df, pd.DataFrame([new_row])], ignore_index=True)
                
                batch_counter += 1  # Increment batch counter

                if batch_counter == 3:
                    # Save processed_df to the output CSV file
                    processed_df.to_csv(output_file, index=False)
                    # Reset batch counter
                    batch_counter = 0

                time.sleep(3)  # Wait for 3 seconds before sending the next request
                break  # Exit retry loop on success

            except Exception as e:
                retries += 1
                print(f"Attempt {retries} failed for row with id {row['id']}: {e}")
                if retries < 3:
                    time.sleep(5)  # Wait for 5 seconds before retrying
                else:
                    error_rows.append(row['id'])
                    print(f"Max retries reached for row with id {row['id']}. Skipping...")
                    break  # Exit retry loop after max retries

    # After processing all rows, save any remaining data
    if batch_counter > 0:
        processed_df.to_csv(output_file, index=False)

    # Save the list of error rows to a separate file
    if error_rows:
        error_df = pd.DataFrame({'id': error_rows})
        error_df.to_csv('error_rows.csv', index=False)

# Usage example
process_visualizations('../data/figure.csv', '../data/result/figure_result.csv', start_id=0, end_id=11574)


Attempt 1 failed for row with id 226: Error code: 400 - {'error': {'message': 'Error while downloading https://data.cyverse.org/dav-anon/iplant/home/carolinarr/vis-sieve/Princeton_content/3009742497/Figure9-1.png.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_image_url'}}
Attempt 1 failed for row with id 483: Error code: 400 - {'error': {'message': 'Timeout while downloading https://data.cyverse.org/dav-anon/iplant/home/carolinarr/vis-sieve/Princeton_content/3126256773/Figure1-1.png.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_image_url'}}
Attempt 1 failed for row with id 527: could not convert string to float: '95%'
Attempt 2 failed for row with id 527: could not convert string to float: ' 90%'
Attempt 3 failed for row with id 527: could not convert string to float: ' 95%'
Max retries reached for row with id 527. Skipping...
Attempt 1 failed for row with id 592: Error code: 400 - {'error': {'message': 'Timeout while downloading https://data.

## Erorr Msg Handling, process those failed image

In [26]:
import re


log_data = """

Successfully processed id: 527
Attempt 1 failed for row with id 3991: could not convert string to float: '90%'
Attempt 2 failed for row with id 3991: could not convert string to float: ' 100%'
Attempt 3 failed for row with id 3991: could not convert string to float: ' 90%'
Max retries reached for row with id 3991. Skipping...
Successfully processed id: 3991
Successfully processed id: 4435
Successfully processed id: 5342
Attempt 1 failed for row with id 5571: could not convert string to float: '95%'
Attempt 2 failed for row with id 5571: could not convert string to float: '95%'
Successfully processed id: 5571/
Successfully processed id: 5658
Successfully processed id: 5659
Attempt 1 failed for row with id 7067: could not convert string to float: '90%'
Attempt 2 failed for row with id 7067: could not convert string to float: '95%'
Attempt 3 failed for row with id 7067: could not convert string to float: '90%'
Max retries reached for row with id 7067. Skipping...
Successfully processed id: 7067
Successfully processed id: 7349
Successfully processed id: 7724
Successfully processed id: 8123
Attempt 1 failed for row with id 8172: could not convert string to float: ' 98%'
Successfully processed id: 8172
Successfully processed id: 8381
Successfully processed id: 8689
Successfully processed id: 8773
Successfully processed id: 9022
Successfully processed id: 9361
Successfully processed id: 9533
Successfully processed id: 10348
Successfully processed id: 10878
Successfully processed id: 11318
Successfully processed id: 11319
Successfully processed id: 11414

"""

# Regular expression to extract skipped IDs
pattern = r"Max retries reached for row with id (\d+)"
skipped_ids = []
skipped_ids = re.findall(pattern, log_data)
skipped_ids = [int(id) for id in skipped_ids] 


print(skipped_ids)


[3991, 7067]


## Reexcute the Labeling work based on the above array

In [None]:
import openai
import pandas as pd
import time  # Import time module for sleep
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)


def process_visualizations(csv_file, output_file, start_id=0, end_id=10000):
    # Load the CSV file
    df = pd.read_csv(csv_file)
    
    # If the output file exists, load the processed part to avoid reprocessing; else create an empty DataFrame with required columns
    try:
        processed_df = pd.read_csv(output_file)
    except FileNotFoundError:
        processed_df = pd.DataFrame(columns=['id', 'paper_id','local_path', 'server_path', 'T1', 'T1-Score', 'T2', 'T2-Score', 'T3', 'T3-Score'])
    
    batch_counter = 0  # Initialize batch counter
    error_rows = []    # Initialize list to keep track of error rows

    # Iterate over the rows in the CSV
    for index, row in df.iterrows():
        # Ensure ID is within the specified range and not already processed
        if row['id'] < start_id or row['id'] > end_id or (processed_df['id'] == row['id']).any():
            continue

        # Use server_path as the image URL
        image_url = row['server_path']
        retries = 0  # Initialize retry counter

        while retries < 3:
            try:
                # send requests to API
                response = client.chat.completions.create(
                    model="gpt-4o-mini",
                    messages=[
                        {
                            "role": "user",
                            "content": [
                                {"type": "text", "text": 
                                '''
                                Please identify the types of visual content in this image and return results in the format: type1,confidence;type2,confidence;type3,confidence. Only reply with those terms and confidence levels, with no extra words. The confidence level could be float only Here are the options:

                                "Violin", "Density", "Histogram", "Boxplot", "Ridgeline", "Scatter", "Heatmap", "Correlogram", "Bubble", "Connected Scatter", "Density 2d", "Barplot", "Spider / Radar", "Wordcloud", "Parallel", "Lollipop", "Circular Barplot", "Treemap", "Venn Diagram", "Doughnut", "Pie Chart", "Dendrogram", "Circular Packing", "Sunburst", "Line Plot", "Area", "Stacked Area", "Streamchart", "Map", "Choropleth", "Hexbin Map", "Cartogram", "Connection", "Bubble Map", "Chord Diagram", "Network", "Sankey", "Arc Diagram", "Edge Bundling", "Complex Diagram", "Circuit Diagram (includes electrical or electronic circuits)", "Flow Chart (process flows, decision flows)", "Geometry (includes shapes, geometric diagrams)", "Scientific Viz (includes graphs of scientific data, models)", "Other (if none of the above categories fit)".

                                '''
                                },
                                {
                                    "type": "image_url",
                                    "image_url": {
                                        "url": image_url,
                                    },
                                },
                            ],
                        }
                    ],
                    max_tokens=350,
                )
                
                # extract the response from the API
                parsed_response = response.choices[0].message.content
            
                # Split the response into types and scores
                types_scores = parsed_response.split(';')
                types_scores = [ts.strip().split(',') for ts in types_scores if len(ts.strip().split(',')) == 2]
        
                # Create a new row to store the results
                new_row = {
                    'id': row['id'],
                    'paper_id': row['paper_id'],
                    'local_path': row['local_path'],
                    'server_path': row['server_path'],
                    'T1': types_scores[0][0] if len(types_scores) >= 1 else None,
                    'T1-Score': float(types_scores[0][1]) if len(types_scores) >= 1 else None,
                    'T2': types_scores[1][0] if len(types_scores) >= 2 else None,
                    'T2-Score': float(types_scores[1][1]) if len(types_scores) >= 2 else None,
                    'T3': types_scores[2][0] if len(types_scores) >= 3 else None,
                    'T3-Score': float(types_scores[2][1]) if len(types_scores) >= 3 else None,
                }
                
                # Append the new row to processed_df
                processed_df = pd.concat([processed_df, pd.DataFrame([new_row])], ignore_index=True)
                
                batch_counter += 1  # Increment batch counter

                if batch_counter == 3:
                    # Save processed_df to the output CSV file
                    processed_df.to_csv(output_file, index=False)
                    # Reset batch counter
                    batch_counter = 0

                time.sleep(3)  # Wait for 3 seconds before sending the next request
                break  # Exit retry loop on success

            except Exception as e:
                retries += 1
                print(f"Attempt {retries} failed for row with id {row['id']}: {e}")
                if retries < 3:
                    time.sleep(5)  # Wait for 5 seconds before retrying
                else:
                    error_rows.append(row['id'])
                    print(f"Max retries reached for row with id {row['id']}. Skipping...")
                    break  # Exit retry loop after max retries

    # After processing all rows, save any remaining data
    if batch_counter > 0:
        processed_df.to_csv(output_file, index=False)

    # Save the list of error rows to a separate file
    if error_rows:
        error_df = pd.DataFrame({'id': error_rows})
        error_df.to_csv('error_rows.csv', index=False)


def process_skipped_ids(skipped_ids):
    for id in skipped_ids:
        try:
            
            process_visualizations(
                '../data/figure.csv',
                '../data/result/figure_result.csv',
                start_id=id,
                end_id=id
            )
            print(f"Successfully processed id: {id}")
        except Exception as e:
            print(f"Error processing id {id}: {e}")


process_skipped_ids(skipped_ids)



Successfully processed id: 3991
Successfully processed id: 7067


# result modification

In [29]:
figure_result = pd.read_csv('../data/result/figure_result.csv')

In [32]:
figure_result.sort_values(by='id', inplace=True)
figure_result.reset_index(drop=True, inplace=True)

In [52]:
figure_result.to_csv("../data/result/figure_result_sorted.csv", index=False)