Notebook to add better categories to RSO data

In [13]:
import json
import asyncio
from groq import Groq
import os
from typing import Dict, List, Optional
import time
from tqdm import tqdm
from dotenv import load_dotenv
from datetime import datetime, timedelta
import tiktoken
import random


In [14]:
# Configuration cell: load secrets, build the API client and the shared rate limiter.
load_dotenv()  # This loads the variables from .env

# os.environ[...] raises KeyError if GROQ_API_KEY is not set after loading .env.
client = Groq(
    api_key=os.environ["GROQ_API_KEY"],
)
# NOTE(review): TokenBucket is defined in a LATER cell of this notebook, so on a
# fresh kernel with "Run All" this line raises NameError. The class cell must be
# executed (or moved) before this one.
token_bucket = TokenBucket(tokens_per_minute=4500)  # Using 4500 to be conservative


In [12]:
class TokenBucket:
    """Token-bucket limiter that meters a per-minute token budget.

    The bucket starts full with ``tokens_per_minute`` tokens and refills
    continuously at that rate, never exceeding its starting capacity.

    Args:
        tokens_per_minute: Refill rate and bucket capacity. Must be positive.

    Raises:
        ValueError: If ``tokens_per_minute`` is not positive.
    """

    def __init__(self, tokens_per_minute: int = 5000):
        if tokens_per_minute <= 0:
            # A zero rate would otherwise surface later as ZeroDivisionError in consume().
            raise ValueError("tokens_per_minute must be positive")
        self.max_tokens = tokens_per_minute
        self.tokens = tokens_per_minute
        self.last_update = datetime.now()
        self.tokens_per_minute = tokens_per_minute

    def update_tokens(self):
        """Credit the tokens accrued since the last update, capped at capacity."""
        now = datetime.now()
        # datetime.now() is wall-clock time and can step backwards; never
        # let that drain the bucket.
        elapsed_seconds = max(0.0, (now - self.last_update).total_seconds())
        self.tokens = min(
            self.max_tokens,
            self.tokens + (elapsed_seconds * self.tokens_per_minute / 60)
        )
        self.last_update = now

    def consume(self, tokens: int) -> float:
        """Reserve ``tokens`` and return the number of seconds to wait first.

        The reservation is always recorded. When the bucket cannot cover it the
        balance goes negative, and the debt is repaid by the normal refill.
        Without this, a caller that sleeps for the returned time and then sends
        its request would never be charged, letting later requests overrun the
        budget.

        Callers should sleep for the returned time and then proceed; they
        should not call ``consume`` again for the same request.

        Args:
            tokens: Number of tokens the upcoming request is expected to use.

        Returns:
            ``0.0`` if the tokens were available immediately, otherwise the
            time in seconds until the refill covers the shortfall.
        """
        self.update_tokens()
        shortfall = tokens - self.tokens
        self.tokens -= tokens
        if shortfall > 0:
            return shortfall * 60 / self.tokens_per_minute
        return 0.0

In [15]:
def count_tokens(text: str) -> int:
    """Return how many tokens ``text`` encodes to under tiktoken's cl100k_base.

    cl100k_base is only an approximation of the target model's tokenizer,
    so the result is an estimate.
    """
    tokenizer = tiktoken.get_encoding("cl100k_base")
    token_ids = tokenizer.encode(text)
    return len(token_ids)

def exponential_backoff(attempt: int, base_delay: float = 1) -> float:
    """Return a retry delay in seconds: doubling per attempt, plus jitter.

    The un-jittered delay is ``base_delay * 2 ** attempt`` capped at 300
    seconds (5 minutes); up to 10% of that value is then added at random.
    """
    doubled = base_delay * (2 ** attempt)
    capped = min(300, doubled)  # Cap at 5 minutes
    return capped + random.uniform(0, 0.1 * capped)

In [19]:
# Closed vocabulary of category labels offered to the model; anything the model
# returns outside this list is discarded during validation.
# Every entry must be followed by a comma: adjacent string literals are silently
# concatenated by Python (e.g. "Sports" "Team Sports" -> "SportsTeam Sports").
VALID_CATEGORIES = [
    "Biology", "Business", "Chemistry", "Physics", "Mathematics", "Computer Science",
    "Data Science", "Economics", "Psychology", "Sociology", "Political Science",
    "History", "Philosophy", "Literature", "Languages", "Law", "Medicine",
    "Nursing", "Public Health", "Engineering", "Environmental Science",
    "Finance", "Investment", "Quantitative Trading", "Private Equity",
    "Venture Capital", "Consulting", "Marketing", "Entrepreneurship",
    "Real Estate", "Technology", "Software Development", "Product Management",
    "Healthcare", "Legal", "Research", "Journalism", "Media Production",
    "Visual Arts", "Painting", "Photography", "Digital Art", "Music",
    "Band", "Choir", "A Cappella", "Theater", "Dance", "Film",
    "Creative Writing", "Design", "Cultural", "International", "Religious",
    "LGBTQ+", "Gender & Sexuality", "Social Justice", "Political", "Activism",
    "Community Service/volunteering", "Mentorship", "Environmental", "Sports",
    "Team Sports", "Individual Sports", "Gaming",
    "Debate", "Model UN", "Food & Cooking", "Travel", "Outdoor Activities",
    "Student Government", "Publications", "Mental Health", "Wellness",
    "Career Development", "Academic Support", "Leadership", "Greek Life",
]

In [6]:
# Worked examples embedded in the prompt to show the model the expected output.
# Every label in "ideal_categories" must come from the valid-category list;
# otherwise the examples teach the model a label that validation later discards.
FEW_SHOT_EXAMPLES = [
    {
        "name": "Investment Banking Group",
        "description": "A professional organization dedicated to educating members about investment banking, private equity, and financial markets. We host networking events, technical workshops, and mock interviews to prepare students for careers in finance.",
        "ideal_categories": ["Finance", "Investment", "Career Development"],
        "explanation": "This RSO focuses on finance education and career preparation, warranting multiple related financial categories."
    },
    {
        "name": "Data Science for Social Good",
        "description": "We apply data science and machine learning techniques to tackle social issues in healthcare, education, and environmental sustainability. Members work on real-world projects while learning technical skills.",
        "ideal_categories": ["Data Science", "Computer Science", "Community Service/volunteering"],
        "explanation": "Combines technical data science work with social impact, deserving both technical and service categories."
    },
    {
        "name": "Mental Health Alliance",
        "description": "A student organization focused on promoting mental health awareness, reducing stigma, and connecting students with resources. We organize wellness workshops, peer support groups, and educational events.",
        # "Student Life" was removed here: it is not one of the valid categories.
        "ideal_categories": ["Mental Health", "Wellness"],
        "explanation": "Focuses on mental health awareness and student wellness."
    }
]

In [7]:
def create_prompt(rso: Dict) -> str:
    """Build the categorization prompt for one RSO record.

    The prompt lists the valid categories, renders each few-shot example, and
    ends with the RSO's name and description plus the JSON shape the reply
    should follow. The description is taken from ``full_description`` and falls
    back to ``description_preview`` when that is missing or empty.
    """
    example_blocks = []
    for example in FEW_SHOT_EXAMPLES:
        example_categories = ', '.join(example['ideal_categories'])
        example_blocks.append(
            f"\nName: {example['name']}\n"
            f"Description: {example['description']}\n"
            f"Categories: {example_categories}\n"
            f"Explanation: {example['explanation']}"
        )
    few_shot_text = "\n---\n".join(example_blocks)

    allowed_categories = ', '.join(VALID_CATEGORIES)
    rso_name = rso['name']
    rso_description = rso.get('full_description', '') or rso.get('description_preview', '')

    return f"""You are an expert at categorizing university student organizations. Given an RSO's name and description, assign it relevant categories from the provided list. Each RSO can have multiple categories if appropriate.

Valid categories: {allowed_categories}

Here are some examples:
{few_shot_text}

For the following RSO, please provide:
1. A list of relevant categories (can be multiple)
2. A confidence score (0-100) for each category
3. A brief explanation of your categorization

Name: {rso_name}
Description: {rso_description}

Response should be in JSON format:
{{
  "categories": [
    {{"name": "category_name", "confidence": 95}},
    {{"name": "another_category", "confidence": 85}}
  ],
  "explanation": "Brief explanation of categorization"
}}"""

In [16]:
def _parse_categorization(raw: str) -> Dict:
    """Turn the model's reply text into a validated categorization dict.

    Raises:
        ValueError: If the reply is not a JSON object with a list under
            ``"categories"`` (``json.JSONDecodeError`` is a ``ValueError``).
    """
    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError:
        # Chat models often wrap the JSON in prose or a Markdown code fence;
        # retry on the outermost {...} span before giving up.
        start, end = raw.find('{'), raw.rfind('}')
        if start == -1 or end <= start:
            raise
        parsed = json.loads(raw[start:end + 1])

    if not isinstance(parsed, dict) or not isinstance(parsed.get('categories'), list):
        raise ValueError("Response is not a JSON object with a 'categories' list")

    # Validate categories against our list; skip entries that are not
    # {"name": ...} objects instead of crashing on them.
    parsed['categories'] = [
        cat for cat in parsed['categories']
        if isinstance(cat, dict) and cat.get('name') in VALID_CATEGORIES
    ]
    # Callers read 'explanation' unconditionally, so make sure it exists.
    parsed.setdefault('explanation', '')
    return parsed


def categorize_rso(rso: Dict, attempt: int = 0) -> Optional[Dict]:
    """Categorize a single RSO using the Groq API with rate limiting.

    Args:
        rso: RSO record; its name and description are used to build the prompt.
        attempt: Zero-based retry counter, used internally when the API reports
            a rate-limit error.

    Returns:
        A dict with ``"categories"`` (only entries whose name is a valid
        category) and ``"explanation"``, or ``None`` if the request or the
        parsing of its reply failed.
    """
    try:
        prompt = create_prompt(rso)
        estimated_tokens = count_tokens(prompt) + 500  # Add buffer for response

        # Check token bucket
        wait_time = token_bucket.consume(estimated_tokens)
        if wait_time > 0:
            print(f"\nRate limit approaching, waiting {wait_time:.2f} seconds...")
            time.sleep(wait_time)

        completion = client.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            model="mixtral-8x7b-32768",
            temperature=0.3,
            max_tokens=1000
        )

        return _parse_categorization(completion.choices[0].message.content)

    except Exception as e:
        # .get() so that a record without a name cannot make the handler itself raise.
        rso_name = rso.get('name')
        if "rate_limit" in str(e).lower():
            if attempt < 5:  # Max 5 retries
                delay = exponential_backoff(attempt)
                print(f"\nRate limit hit for {rso_name}, waiting {delay:.2f} seconds...")
                time.sleep(delay)
                return categorize_rso(rso, attempt + 1)
            else:
                print(f"\nMax retries reached for {rso_name}")
        print(f"Error categorizing RSO {rso_name}: {str(e)}")
        return None

In [17]:
def process_rsos(input_file: Optional[str] = None, output_file: Optional[str] = None, rsos: Optional[List[Dict]] = None):
    """Process all RSOs from input file or list and save results to output file.

    Each RSO dict is annotated in place with ``ai_categories`` and
    ``categorization_explanation`` when categorization succeeds, and is
    included in the results either way.

    Args:
        input_file: Path to a JSON file holding a list of RSO dicts. When
            given, it takes precedence over ``rsos``.
        output_file: Where to write the final JSON. Intermediate results are
            written to ``<output_file>.partial`` after every batch, and to
            ``<output_file>.error_partial`` if processing aborts.
        rsos: RSO dicts to process when no ``input_file`` is given.

    Returns:
        The list of processed RSO dicts. On error, whatever had been processed
        so far (possibly an empty list); the error is printed, not raised.
    """
    # Bound before the try block so the error handler can always reference it,
    # even when loading the input fails.
    results = []
    try:
        # Read input JSON if file provided, otherwise use provided list
        if input_file:
            with open(input_file, 'r', encoding='utf-8') as f:
                rsos = json.load(f)

        if not rsos:
            raise ValueError("No RSOs provided")

        batch_size = 3  # Reduced batch size

        # Process RSOs in batches with progress bar
        for i in tqdm(range(0, len(rsos), batch_size)):
            batch = rsos[i:i + batch_size]

            # Process each RSO in batch; append immediately so a failure
            # mid-batch does not lose the RSOs already handled.
            for rso in batch:
                categorization = categorize_rso(rso)
                if categorization:
                    # .get(): a reply missing a key must not abort the whole run.
                    rso['ai_categories'] = categorization.get('categories', [])
                    rso['categorization_explanation'] = categorization.get('explanation', '')
                results.append(rso)

            # Save intermediate results every batch
            if output_file:
                with open(f"{output_file}.partial", 'w', encoding='utf-8') as f:
                    json.dump(results, f, indent=2)

            # Add delay between batches
            time.sleep(2)  # Conservative delay between batches

        # Write final results to output file if provided
        if output_file:
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(results, f, indent=2)
            print(f'\nCategorization complete! Results saved to {output_file}')

        return results

    except Exception as e:
        print(f'\nError processing RSOs: {str(e)}')
        # Save partial results if available
        if results and output_file:
            with open(f"{output_file}.error_partial", 'w', encoding='utf-8') as f:
                json.dump(results, f, indent=2)
            print(f'Partial results saved to {output_file}.error_partial')
        return results

In [18]:
# Run the full categorization: read RSOs from rso_data_detailed.json, call the
# model once per RSO, and write the annotated list to categorized_rsos.json
# (with a .partial snapshot after every batch).
results = process_rsos('rso_data_detailed.json', 'categorized_rsos.json')

  1%|          | 1/136 [00:03<07:58,  3.55s/it]


Rate limit approaching, waiting 0.72 seconds...

Rate limit approaching, waiting 10.49 seconds...


  1%|▏         | 2/136 [00:18<22:40, 10.15s/it]


Rate limit approaching, waiting 11.77 seconds...

Rate limit approaching, waiting 2.90 seconds...


  2%|▏         | 3/136 [00:44<38:51, 17.53s/it]


Rate limit hit for American Lung Cancer Screening Initiative, waiting 1.07 seconds...

Rate limit approaching, waiting 1.47 seconds...


  3%|▎         | 4/136 [01:29<1:02:28, 28.40s/it]


Rate limit approaching, waiting 5.96 seconds...


  4%|▎         | 5/136 [01:57<1:01:29, 28.16s/it]


Rate limit approaching, waiting 0.56 seconds...

Rate limit hit for Apsara, waiting 1.09 seconds...

Rate limit approaching, waiting 8.01 seconds...


  4%|▍         | 6/136 [02:37<1:09:46, 32.20s/it]


Rate limit approaching, waiting 5.38 seconds...


  5%|▌         | 7/136 [03:05<1:06:19, 30.85s/it]


Rate limit hit for ArtShould, waiting 1.08 seconds...

Rate limit approaching, waiting 1.74 seconds...

Rate limit approaching, waiting 5.22 seconds...


  6%|▌         | 8/136 [03:48<1:14:19, 34.84s/it]


Rate limit hit for Badminton, waiting 1.06 seconds...

Rate limit approaching, waiting 1.96 seconds...


  7%|▋         | 9/136 [04:28<1:16:39, 36.22s/it]


Rate limit approaching, waiting 5.32 seconds...


  7%|▋         | 10/136 [04:56<1:10:41, 33.66s/it]


Rate limit approaching, waiting 1.67 seconds...

Rate limit hit for Black Student Fellowship, waiting 1.08 seconds...


  8%|▊         | 11/136 [05:37<1:14:49, 35.92s/it]


Rate limit approaching, waiting 4.30 seconds...

Rate limit approaching, waiting 0.96 seconds...


  9%|▉         | 12/136 [06:05<1:09:27, 33.61s/it]


Rate limit hit for Blue Chips, waiting 1.08 seconds...

Rate limit approaching, waiting 4.05 seconds...


 10%|▉         | 13/136 [06:44<1:12:29, 35.36s/it]


Rate limit approaching, waiting 7.73 seconds...


 10%|█         | 14/136 [07:13<1:07:40, 33.28s/it]


Rate limit approaching, waiting 2.26 seconds...

Rate limit hit for Business Organization for Latino Development, waiting 1.03 seconds...

Rate limit approaching, waiting 1.02 seconds...


 11%|█         | 15/136 [07:53<1:11:28, 35.44s/it]


Rate limit approaching, waiting 3.95 seconds...

Rate limit hit for Catholic Students Association, waiting 1.01 seconds...


 12%|█▏        | 16/136 [08:31<1:12:25, 36.22s/it]


Rate limit approaching, waiting 8.44 seconds...


 12%|█▎        | 17/136 [08:59<1:06:57, 33.76s/it]


Rate limit approaching, waiting 2.47 seconds...

Rate limit approaching, waiting 0.48 seconds...

Rate limit hit for Cheerleading, waiting 1.02 seconds...


 13%|█▎        | 18/136 [09:38<1:09:33, 35.37s/it]


Rate limit approaching, waiting 0.01 seconds...


 14%|█▍        | 19/136 [10:12<1:08:02, 34.90s/it]


Rate limit approaching, waiting 3.83 seconds...

Rate limit hit for Chicago Italian Appreciation Organization, waiting 1.10 seconds...


 15%|█▍        | 20/136 [10:50<1:09:15, 35.83s/it]


Rate limit approaching, waiting 3.28 seconds...

Rate limit approaching, waiting 0.92 seconds...


 15%|█▌        | 21/136 [11:18<1:04:11, 33.49s/it]


Rate limit hit for Chicago Raas Team, waiting 1.10 seconds...

Rate limit approaching, waiting 5.06 seconds...


 16%|█▌        | 22/136 [12:00<1:08:04, 35.83s/it]


Rate limit approaching, waiting 1.18 seconds...


 17%|█▋        | 23/136 [12:28<1:03:33, 33.75s/it]


Rate limit approaching, waiting 1.45 seconds...

Rate limit hit for Chinese Student Christian Fellowship, waiting 1.01 seconds...

Rate limit approaching, waiting 1.55 seconds...


 18%|█▊        | 24/136 [13:05<1:04:16, 34.43s/it]


Rate limit approaching, waiting 3.55 seconds...


 18%|█▊        | 25/136 [13:32<59:41, 32.27s/it]  


Rate limit hit for College Council, waiting 1.08 seconds...

Rate limit approaching, waiting 4.52 seconds...

Rate limit approaching, waiting 4.54 seconds...


 19%|█▉        | 26/136 [14:09<1:01:55, 33.77s/it]


Rate limit approaching, waiting 1.92 seconds...


 20%|█▉        | 27/136 [14:40<59:41, 32.86s/it]  


Rate limit hit for Community Health Initiative, waiting 1.02 seconds...

Rate limit approaching, waiting 0.74 seconds...


 21%|██        | 28/136 [15:18<1:01:50, 34.36s/it]


Rate limit approaching, waiting 3.05 seconds...

Rate limit approaching, waiting 2.14 seconds...

Rate limit hit for Critical Understanding of Liturgies and Traditions, waiting 1.09 seconds...


 21%|██▏       | 29/136 [15:53<1:01:44, 34.62s/it]


Rate limit approaching, waiting 0.17 seconds...


 22%|██▏       | 30/136 [16:24<59:19, 33.58s/it]  


Rate limit approaching, waiting 7.18 seconds...

Rate limit hit for None, waiting 1.07 seconds...

Rate limit approaching, waiting 4.90 seconds...


 23%|██▎       | 31/136 [17:01<1:00:42, 34.69s/it]


Rate limit approaching, waiting 7.88 seconds...


 24%|██▎       | 32/136 [17:29<56:36, 32.66s/it]  


Rate limit hit for Documentary Films, waiting 1.08 seconds...

Rate limit approaching, waiting 1.42 seconds...


 24%|██▍       | 33/136 [18:08<59:05, 34.42s/it]


Rate limit approaching, waiting 3.96 seconds...

Rate limit approaching, waiting 2.17 seconds...

Rate limit hit for Edmund Burke Society, waiting 1.10 seconds...


 25%|██▌       | 34/136 [18:44<59:24, 34.94s/it]


Rate limit approaching, waiting 5.16 seconds...


 26%|██▌       | 35/136 [19:18<58:31, 34.77s/it]


Rate limit approaching, waiting 2.25 seconds...

Rate limit hit for Environmental Law Society, waiting 1.00 seconds...

Rate limit approaching, waiting 0.41 seconds...


 26%|██▋       | 36/136 [19:58<1:00:22, 36.22s/it]


Rate limit approaching, waiting 1.07 seconds...

Rate limit hit for Euphony Journal, waiting 1.09 seconds...


 27%|██▋       | 37/136 [20:35<1:00:17, 36.54s/it]


Rate limit approaching, waiting 5.86 seconds...


 28%|██▊       | 38/136 [21:05<56:25, 34.55s/it]  


Rate limit approaching, waiting 5.90 seconds...

Rate limit hit for Festival of the Arts, waiting 1.04 seconds...

Rate limit approaching, waiting 0.16 seconds...


 29%|██▊       | 39/136 [21:46<59:04, 36.55s/it]


Rate limit approaching, waiting 3.54 seconds...


 29%|██▉       | 40/136 [22:14<54:24, 34.01s/it]


Rate limit approaching, waiting 0.80 seconds...

Rate limit hit for Food Recovery Network, waiting 1.04 seconds...


 30%|███       | 41/136 [22:53<55:49, 35.26s/it]


Rate limit approaching, waiting 9.84 seconds...


 31%|███       | 42/136 [23:26<54:25, 34.74s/it]


Rate limit approaching, waiting 6.04 seconds...

Rate limit hit for Gateway to the Great Outdoors, waiting 1.05 seconds...

Rate limit approaching, waiting 0.81 seconds...


 32%|███▏      | 43/136 [24:05<56:00, 36.13s/it]


Rate limit approaching, waiting 4.14 seconds...


 32%|███▏      | 44/136 [24:32<51:10, 33.38s/it]


Rate limit approaching, waiting 2.84 seconds...

Rate limit hit for Graduate Christian Fellowship, waiting 1.05 seconds...


 33%|███▎      | 45/136 [25:09<52:12, 34.42s/it]


Rate limit approaching, waiting 3.14 seconds...

Rate limit approaching, waiting 0.86 seconds...


 34%|███▍      | 46/136 [25:38<49:10, 32.78s/it]


Rate limit hit for Health Professions Recruitment and Exposure Program, waiting 1.01 seconds...

Rate limit approaching, waiting 2.61 seconds...


 35%|███▍      | 47/136 [26:20<52:40, 35.51s/it]


Rate limit approaching, waiting 6.14 seconds...


 35%|███▌      | 48/136 [26:51<49:59, 34.08s/it]


Rate limit hit for Hong Kong Student Association, waiting 1.09 seconds...

Rate limit approaching, waiting 4.90 seconds...

Rate limit approaching, waiting 1.80 seconds...


 36%|███▌      | 49/136 [27:30<51:35, 35.58s/it]


Rate limit approaching, waiting 4.85 seconds...


 37%|███▋      | 50/136 [27:57<47:32, 33.17s/it]


Rate limit hit for Indonesian Student Association at The University of Chicago, waiting 1.00 seconds...

Rate limit approaching, waiting 9.33 seconds...


 38%|███▊      | 51/136 [28:37<49:44, 35.12s/it]


Rate limit approaching, waiting 9.35 seconds...


 38%|███▊      | 52/136 [29:15<50:28, 36.06s/it]


Rate limit approaching, waiting 0.36 seconds...


 39%|███▉      | 53/136 [29:43<46:32, 33.64s/it]


Rate limit approaching, waiting 6.64 seconds...

Rate limit hit for Judo Club, waiting 1.00 seconds...


 40%|███▉      | 54/136 [30:22<47:57, 35.10s/it]


Rate limit approaching, waiting 5.46 seconds...

Rate limit approaching, waiting 4.17 seconds...


 40%|████      | 55/136 [30:51<45:04, 33.39s/it]


Rate limit approaching, waiting 3.27 seconds...

Rate limit hit for Kojo Daiko, waiting 1.05 seconds...


 41%|████      | 56/136 [31:28<45:44, 34.31s/it]


Rate limit approaching, waiting 10.33 seconds...


 43%|████▎     | 58/136 [33:23<54:13, 41.71s/it]  


Rate limit approaching, waiting 10.29 seconds...


 43%|████▎     | 59/136 [33:37<42:49, 33.38s/it]


Rate limit approaching, waiting 12.98 seconds...

Rate limit approaching, waiting 0.48 seconds...


 44%|████▍     | 60/136 [34:13<43:21, 34.22s/it]


Rate limit approaching, waiting 6.48 seconds...

Rate limit hit for Major Activities Board, waiting 1.01 seconds...

Rate limit approaching, waiting 1.64 seconds...


 45%|████▍     | 61/136 [34:43<41:19, 33.06s/it]


Rate limit approaching, waiting 0.56 seconds...


 46%|████▌     | 62/136 [35:11<38:41, 31.37s/it]


Rate limit approaching, waiting 1.89 seconds...

Rate limit hit for Maroons for Israel, waiting 1.08 seconds...


 46%|████▋     | 63/136 [35:50<40:56, 33.64s/it]


Rate limit approaching, waiting 4.79 seconds...

Rate limit approaching, waiting 3.40 seconds...


 47%|████▋     | 64/136 [36:18<38:38, 32.19s/it]


Rate limit approaching, waiting 1.35 seconds...


 48%|████▊     | 65/136 [36:50<37:47, 31.93s/it]


Rate limit hit for Memoryhouse, waiting 1.06 seconds...

Rate limit approaching, waiting 4.25 seconds...


 49%|████▊     | 66/136 [37:27<39:06, 33.52s/it]


Rate limit approaching, waiting 8.08 seconds...


 49%|████▉     | 67/136 [37:54<36:18, 31.57s/it]


Rate limit approaching, waiting 3.20 seconds...

Rate limit hit for Minorities in Public Policy Studies, waiting 1.10 seconds...

Rate limit approaching, waiting 3.71 seconds...
