## Description

This project is set up to run on GitHub Codespaces with the following requirements:

- azure-ai-inference~=1.0.0b4
- openai~=1.37.1
- mistralai~=0.4.2
- python-dotenv~=1.0.1
- ipykernel~=6.29.5

You can also run this locally.


In [1]:
import time
import os
import requests
from openai import OpenAI
import pandas as pd

# Set the pandas display option 'display.max_colwidth' to 0.
# NOTE(review): presumably meant to keep long text cells (the model responses)
# from being truncated when DataFrames are displayed -- confirm against the
# pandas options documentation for the installed version.
pd.set_option('display.max_colwidth', 0)

### Load Files 

As input, you need a CSV file containing the full text of each project description in a column named 'description'. The rubric CSV should have two columns: 'Criterion' and 'Description'.

In [2]:
# Load the project descriptions and the rubric from CSV files (relative paths).
# Later code reads the 'description' column of projects_df and the
# 'Criterion' / 'Description' columns of rubric_df.
projects_df = pd.read_csv("projects_selected.csv")
rubric_df = pd.read_csv("devpost_creativity_rubric.csv")

### Microsoft AI


In [3]:
#pip install azure-ai-inference

import os
import time
from azure.ai.inference import ChatCompletionsClient
from azure.ai.inference.models import SystemMessage, UserMessage
from azure.core.credentials import AzureKeyCredential
import pandas as pd

# Configuration
# `endpoint` is passed to the client constructor; `model_name` is passed with
# each completion request made through the client.
endpoint = "https://models.inference.ai.azure.com"
model_name = "Phi-3-medium-4k-instruct"
# Read the access token from the environment; raises KeyError if
# GITHUB_TOKEN is not set.
token = os.environ["GITHUB_TOKEN"]

# Initialize Microsoft client (shared by every model call defined below)
client = ChatCompletionsClient(
    endpoint=endpoint,
    credential=AzureKeyCredential(token),
)

# Function to call the hosted chat model with rate-limit handling
def call_microsoft_model_with_rate_limiting(prompt, max_tokens=300, temperature=1.0, top_p=1.0):
    """
    Send a prompt to the configured chat model, waiting and retrying on rate limits.

    The request is made through the module-level ``client`` using
    ``model_name``. When the raised error text contains "RateLimitExceeded",
    the call sleeps and then retries (there is no cap on the number of
    retries); any other error is re-raised unchanged.

    Args:
        prompt (str): Input prompt for the model.
        max_tokens (int): Maximum tokens to generate.
        temperature (float): Sampling temperature.
        top_p (float): Nucleus sampling parameter.
    Returns:
        str: The generated text response, stripped of surrounding whitespace.
    """
    default_wait = 60  # seconds to wait when the error gives no usable hint

    def retry_delay(error):
        # The error may carry no response (attribute missing or set to None),
        # and the Retry-After header may be absent or not a plain number of
        # seconds (HTTP also allows a date there). Fall back to the default in
        # all of those cases instead of crashing inside the error handler.
        headers = getattr(getattr(error, "response", None), "headers", None) or {}
        try:
            seconds = int(headers.get("Retry-After", default_wait))
        except (TypeError, ValueError):
            return default_wait
        # time.sleep() rejects negative values.
        return max(seconds, 0)

    while True:
        try:
            response = client.complete(
                messages=[
                    SystemMessage(content="You are a helpful assistant."),
                    UserMessage(content=prompt),
                ],
                temperature=temperature,
                top_p=top_p,
                max_tokens=max_tokens,
                model=model_name,
            )
            return response.choices[0].message.content.strip()
        except Exception as e:
            # Rate limiting is recognised by the error text; everything else
            # is a real failure and must propagate to the caller.
            if "RateLimitExceeded" not in str(e):
                raise
            retry_after = retry_delay(e)
            print(f"Rate limit exceeded. Retrying after {retry_after} seconds...")
            time.sleep(retry_after)

# Function to process project descriptions
def process_projects_with_microsoft_model(projects_df, rubric_df, batch_size=3, pause_time=60):
    """
    Generate a summary and two rubric assessments for every project, in batches.

    For each row, the text in the 'description' column is sent to the model
    three times (project summary, Innovation assessment, Curiosity
    assessment). The responses are written into ``projects_df`` in place, in
    the columns 'Project_Summary', 'Innovation_Assessment' and
    'Curiosity_Assessment'.

    Args:
        projects_df (DataFrame): Project data; must have a 'description' column.
        rubric_df (DataFrame): Rubric with 'Criterion' and 'Description'
            columns and rows for "Innovation" and "Curiosity".
        batch_size (int): Number of descriptions to process per batch (>= 1).
        pause_time (int): Time to wait between consecutive batches in seconds.
    Returns:
        DataFrame: The same ``projects_df`` object with the new columns filled
        in. An empty input is returned as-is, without the new columns.
    Raises:
        ValueError: If ``batch_size`` is less than 1.
        IndexError: If the rubric has no row for "Innovation" or "Curiosity".
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Extract rubric descriptions
    innovation_desc = rubric_df[rubric_df["Criterion"] == "Innovation"]["Description"].iloc[0]
    curiosity_desc = rubric_df[rubric_df["Criterion"] == "Curiosity"]["Description"].iloc[0]

    def generate_reworded_text(description, criterion, criterion_desc):
        """Ask the model to assess one description against one rubric criterion."""
        # NOTE(review): the adjacent string pieces below are joined without a
        # separating space after "judgments." and "description.", and
        # "'did not meet'for" lacks a space. Left untouched because the prompt
        # text determines model output; fix deliberately if desired.
        prompt = (
            f"Analyze the project description below based on the '{criterion}' criteria. "
            f"Extract and list specific aspects of the project that demonstrate alignment with the '{criterion}' criteria, using only the information from the description without adding interpretive commentary or value judgments."
            f"Extract and list specific aspects of the project that misalign with or fail to fully meet the '{criterion}' criteria, focusing only on evidence from the description."
            f"Describe how this project met or did not meet each level of the scoring scale for the '{criterion}' criteria. Use  only 'fully met,' 'partially met,' or 'did not meet'for each level, and provide a concise explanation.\n\n"
            f"Definition of {criterion}: {criterion_desc}\n\n"
            f"Project Description: {description}\n\n"
            f"Response:"
        )
        return call_microsoft_model_with_rate_limiting(prompt)

    def generate_project_summary(description):
        """Ask the model for a one-sentence overview and a one-sentence problem statement."""
        prompt = (
            f"Given the project description:\n\n"
            f"Provide one sentence describing the project.\n"
            f"Provide one sentence stating the problem the project is trying to solve.\n\n"
            f"Project Description: {description}\n\n"
            f"Response:"
        )
        return call_microsoft_model_with_rate_limiting(prompt)

    total_projects = len(projects_df)
    # Ceiling division: an exact multiple of batch_size must not count an
    # extra, empty batch.
    total_batches = (total_projects + batch_size - 1) // batch_size

    # Process in batches
    for batch_number, start in enumerate(range(0, total_projects, batch_size), start=1):
        batch = projects_df.iloc[start:start + batch_size]
        print(f"Processing batch {batch_number}/{total_batches}...")

        # Apply project summary generation
        projects_df.loc[batch.index, "Project_Summary"] = batch["description"].apply(
            generate_project_summary
        )
        # Apply assessment generation for innovation and curiosity
        projects_df.loc[batch.index, "Innovation_Assessment"] = batch["description"].apply(
            lambda x: generate_reworded_text(x, "Innovation", innovation_desc)
        )
        projects_df.loc[batch.index, "Curiosity_Assessment"] = batch["description"].apply(
            lambda x: generate_reworded_text(x, "Curiosity", curiosity_desc)
        )

        if batch_number < total_batches:
            # Pause between batches to avoid exceeding rate limits
            print(f"Batch {batch_number} completed. Pausing for {pause_time} seconds...")
            time.sleep(pause_time)
        else:
            # Nothing follows the last batch, so waiting would only delay the result.
            print(f"Batch {batch_number} completed.")

    return projects_df


In [None]:
# Run the pipeline with the default batch size and pause time (this calls the
# model for every project), then write the result to processed_projects.csv
# without the index column.
processed_df = process_projects_with_microsoft_model(projects_df, rubric_df)
processed_df.to_csv("processed_projects.csv", index=False)


Processing batch 1/4...


In [None]:
# Keep only the three model-generated columns for review.
creativity_df = processed_df[['Project_Summary','Innovation_Assessment','Curiosity_Assessment']]
# Bare expression as the last line of the cell, so the notebook displays the table.
creativity_df