In [None]:
import ace_lib as ace

In [None]:
# Start a session through ace_lib; `s` is the session object passed to the
# helper functions in the cells below.
s = ace.start_session()

In [None]:
# Refresh the session when the value reported by check_session_timeout falls
# below 3000 (presumably seconds of remaining validity — TODO confirm the unit
# against ace_lib). `s` is rebound to whatever check_session_and_relogin returns.
if ace.check_session_timeout(s) < 3000:
    s = ace.check_session_and_relogin(s)

In [None]:
def get_operators_reference(brain_session) -> str:
    """Build a Markdown reference of the REGULAR-scope operators.

    Fetches the operator list through ``ace.get_operators``, keeps the rows
    whose ``scope`` equals ``'REGULAR'``, flattens line breaks inside the
    descriptions, and renders the result as a GitHub-style Markdown table
    preceded by a fixed explanatory header.

    Args:
        brain_session: Session object forwarded unchanged to
            ``ace.get_operators``.

    Returns:
        The header text, a newline, and the Markdown table as a single string.
    """
    operators_df = ace.get_operators(brain_session)
    # Filter rows and pick columns in one .loc call, then .copy(): the column
    # assignment below must write to an independent frame, not to a slice of
    # the fetched one (that pattern triggers pandas' SettingWithCopyWarning).
    operators_df = operators_df.loc[
        operators_df['scope'] == 'REGULAR',
        ['name', 'category', 'definition', 'description'],
    ].copy()
    # A raw line break inside a cell would split the Markdown table row, so
    # every newline variant is replaced by a space; non-string values pass through.
    operators_df['description'] = operators_df['description'].apply(
        lambda x: x.replace('\r\n', ' ').replace('\n', ' ').replace('\r', ' ') if isinstance(x, str) else x
    )
    operators_table = operators_df.to_markdown(index=False, tablefmt='github')
    operators_table_description = """## WorldQuant BRAIN Operators Reference

### Table Structure
- `name`: The name of the operator or function (e.g., `abs`, `add`, `log`).
- `category`: The classification of the operator based on its functionality (e.g., Arithmetic, Logical, Time Series, etc.).
- `definition`: A concise syntax or formula that describes how the operator is used.
- `description`: A brief explanation of what the operator does, including its purpose and behavior.

### Categories
1. Arithmetic: Operators for mathematical operations like addition, subtraction, multiplication, division, logarithms, and exponentials.

2. Logical: Operators for logical comparisons and conditions, such as AND, OR, NOT, and equality checks..

3. Time Series: Operators for analyzing and manipulating time-series data, such as calculating moving averages, delays, or correlations over a specified number of days..

4. Cross Sectional: Operators for working with data across multiple instruments or entities at a single point in time, such as ranking, scaling, and normalization.

5. Vector: Operators for vector-based calculations, such as finding the sum, mean, or standard deviation of elements in a vector.

6. Transformational: Operators for transforming data, such as filtering, clamping, or tailing values based on conditions.

7. Group: Operators for group-based calculations, such as neutralizing, ranking, or scaling values within groups (e.g., sectors or industries).

8. Special: Operators for specific use cases, such as converting between units or calculating profit and loss.

"""
    return operators_table_description + "\n" + operators_table


In [None]:
# Example output: print the operators reference built for session `s`.
print(get_operators_reference(s))

In [None]:
def get_dataset_reference(brain_session, dataset_id, region='USA', universe='TOP3000') -> str:
    """Build a Markdown reference for one dataset and its data fields.

    Looks the dataset up in the frame returned by ``ace.get_datasets``,
    formats its ID, name, category, subcategory and description as a header,
    then appends a GitHub-style Markdown table of the data fields returned by
    ``ace.get_datafields``.

    Args:
        brain_session: Session object forwarded to the ``ace`` calls.
        dataset_id: Value matched against the ``id`` column of the datasets frame.
        region: Region forwarded to both ``ace`` calls.
        universe: Universe forwarded to both ``ace`` calls.

    Returns:
        The dataset header, a newline, and the data-field table as one string.

    Raises:
        IndexError: If no dataset row has ``id == dataset_id`` for the given
            region and universe.
    """
    datasets_df = ace.get_datasets(brain_session, region=region, universe=universe)
    matching_datasets = datasets_df[datasets_df['id'] == dataset_id]
    if matching_datasets.empty:
        # Same exception type the bare .iloc[0] used to raise, but with a
        # message that says which lookup failed.
        raise IndexError(
            f"Dataset {dataset_id!r} not found for region={region!r}, universe={universe!r}"
        )
    dataset = matching_datasets.iloc[0]
    dataset_details = f"""
## Dataset Details

ID: {dataset['id']}
Name: {dataset['name']}
Category Name: {dataset['category_name']}
Subcategory Name: {dataset['subcategory_name']}
Description: {dataset['description']}

Below is the reference table to available datafields:
"""
    datafields_df = ace.get_datafields(brain_session, dataset_id=dataset_id, region=region, universe=universe)
    # .copy() so the description rewrite below targets an independent frame
    # rather than a column slice (avoids pandas' SettingWithCopyWarning).
    datafields_df = datafields_df[['id', 'description', 'type', 'dateCoverage', 'coverage', 'alphaCount']].copy()  # feel free to add or remove columns
    # Line breaks inside a description would split the Markdown table row.
    datafields_df['description'] = datafields_df['description'].apply(
        lambda x: x.replace('\r\n', ' ').replace('\n', ' ').replace('\r', ' ') if isinstance(x, str) else x
    )
    datafields_table = datafields_df.to_markdown(index=False, tablefmt='github')

    return dataset_details + "\n" + datafields_table

In [None]:
# Example output: print the reference for dataset "model110" using the
# function's default region ('USA') and universe ('TOP3000').
print(get_dataset_reference(s, "model110"))

In [None]:
from typing import List, Dict
from pydantic import BaseModel, Field


# Schema for one generated alpha: the expression itself, its rationale, and the
# data fields / operators it uses.
# NOTE(review): pydantic normally exports the class docstring and every Field
# description into the model's JSON schema, and this notebook passes the model
# as `response_format` to the LLM call — so that text is presumably part of
# what the model sees. Treat edits to it as prompt changes; verify against the
# pydantic/openai versions in use.
class AlphaExpression(BaseModel):
    """Structured output for a single alpha expression"""
    alpha_expression: str = Field(description="Alpha expression in WorldQuant Brain syntax")
    economic_rationale: str = Field(description="Economic reasoning behind the alpha expression")
    data_fields_used: List[str] = Field(description="List of data fields/features used in the expression")
    operators_used: List[str] = Field(description="List of operators used in the expression")


# Top-level structured-output model: a single `alphas` list of AlphaExpression
# items. This is the class handed to call_llm by generate_alpha_expressions.
# NOTE(review): the docstring and Field description below are presumably
# exported into the JSON schema sent to the LLM — confirm before rewording.
class AlphaExpressions(BaseModel):
    """Collection of alpha expressions"""
    alphas: List[AlphaExpression] = Field(description="List of generated alpha expressions")


In [None]:
from openai import OpenAI

import os

# Credentials are read from the environment so real keys never have to be
# pasted into (and saved with) the notebook. The literals are placeholders that
# are used only when the variables are unset: export OPENAI_API_KEY and
# OPENAI_BASE_URL before starting the kernel, or replace the placeholders.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_OPENAI_API_KEY")
OPENAI_BASE_URL = os.getenv("OPENAI_BASE_URL", "OPENAI_BASE_URL")

def call_llm(messages: List[Dict], output_structure: BaseModel):
    """Send chat messages to the LLM and return its parsed structured output.

    Args:
        messages: Chat messages as dicts; the caller in this notebook supplies
            ``role`` and ``content`` keys.
        output_structure: Pydantic model *class* (not an instance) passed as
            ``response_format``; the caller in this notebook passes
            ``AlphaExpressions``.

    Returns:
        The ``parsed`` object of the first choice's message.

    Raises:
        ValueError: If the first choice carries no parsed object (for example
            when the model refused to answer), so the failure surfaces here
            rather than as an attribute error on ``None`` further downstream.
    """
    # Initialize OpenAI client
    client = OpenAI(base_url=OPENAI_BASE_URL, api_key=OPENAI_API_KEY)

    # Call OpenAI API with structured output
    completion = client.chat.completions.parse(
        model="gpt-5.1",  # Model that supports structured outputs
        messages=messages,
        response_format=output_structure,
        temperature=0.3,  # Fairly low; raise it for more diverse expressions
        max_tokens=4000
    )

    # Extract structured output from the first choice
    message = completion.choices[0].message
    llm_structured_output = message.parsed
    if llm_structured_output is None:
        refusal = getattr(message, "refusal", None)
        raise ValueError(
            f"LLM response contained no parsed structured output (refusal: {refusal!r})"
        )

    return llm_structured_output


In [None]:
def generate_alpha_expressions(hypothesis: str, operators_reference: str, dataset_reference: str, num_alphas: int = 10) -> AlphaExpressions:
    """Ask the LLM for alpha expressions that implement a hypothesis.

    Args:
        hypothesis: Investment idea quoted verbatim in the user prompt.
        operators_reference: Operators reference text embedded in the prompt.
        dataset_reference: Dataset / data-field reference text embedded in the prompt.
        num_alphas: Number of expressions requested in the prompt.

    Returns:
        Whatever ``call_llm`` returns for the ``AlphaExpressions`` structure.
    """
    # System message: fixed persona for the model.
    system_message_text = "You are an expert quantitative researcher with deep knowledge of WorldQuant Brain alpha expressions, financial markets, and quantitative trading strategies."

    # User message: hypothesis, both reference documents, and the output requirements.
    user_message_text = f"""You are an expert quantitative researcher specializing in creating alpha expressions for WorldQuant Brain platform.

Given the following investment hypothesis or idea:
"{hypothesis}"

Generate {num_alphas} diverse alpha expressions that capture different aspects of this hypothesis.

Here are the dataset details and available data fields:

{dataset_reference}

Here is the operators reference:

{operators_reference}

IMPORTANT REQUIREMENTS:

1. **WorldQuant Brain Syntax**: Use proper WorldQuant Brain syntax and strictly use provided operators references.

2. **Expression Diversity**: Each alpha should:
   - Use different combinations of operators and data fields
   - Capture different aspects of the hypothesis (momentum, value, quality, volatility, etc.)
   - Vary in complexity (some simple, some more sophisticated)
   - Consider different time horizons (short-term vs long-term)

3. **Economic Rationale**: Provide clear, concise economic reasoning that:
   - Explains WHY the alpha should work
   - Links back to the original hypothesis
   - Describes the market inefficiency or behavioral bias being exploited
   - Is specific to the alpha expression (not generic)

4. **Data Fields**: List ALL data fields actually used in the expression
   - Be specific (e.g., "close", "volume", "market_cap")
   - Include derived fields if relevant

5. **Operators**: List ALL operators used in the expression
   - Include both time-series and cross-sectional operators
   - Be comprehensive and accurate

EXAMPLE FORMAT (for reference):
- Alpha Expression: rank(ts_delta(close, 5) / ts_std_dev(close, 20))
- Economic Rationale: Captures short-term momentum normalized by recent volatility, identifying stocks with strong recent price moves relative to their volatility
- Data Fields: ["close"]
- Operators: ["rank", "ts_delta", "ts_std_dev"]

Now generate {num_alphas} high-quality alpha expressions based on the hypothesis provided."""

    # Two-message conversation: system first, then user.
    chat_messages = [
        {"role": "system", "content": system_message_text},
        {"role": "user", "content": user_message_text},
    ]

    # The structured result is handed straight back to the caller.
    return call_llm(chat_messages, AlphaExpressions)


In [None]:
def print_alpha_expressions(alpha_expressions: AlphaExpressions):
    """
    Write a human-readable report of the generated alphas to stdout.

    Args:
        alpha_expressions: AlphaExpressions object whose ``alphas`` are printed
    """
    heavy_rule = '=' * 80
    light_rule = '-' * 80
    generated = alpha_expressions.alphas

    # Banner with the total count.
    print(f"\n{heavy_rule}")
    print(f"Generated {len(generated)} Alpha Expressions")
    print(f"{heavy_rule}\n")

    # One numbered section per alpha, counting from 1.
    position = 0
    for entry in generated:
        position += 1
        print(f"Alpha #{position}")
        print(light_rule)
        print(f"Expression: {entry.alpha_expression}")
        print(f"\nEconomic Rationale:\n{entry.economic_rationale}")
        print(f"\nData Fields Used: {', '.join(entry.data_fields_used)}")
        print(f"Operators Used: {', '.join(entry.operators_used)}")
        print(f"\n{heavy_rule}\n")


def export_to_dict(alpha_expressions: AlphaExpressions) -> List[dict]:
    """Return the ``model_dump()`` of every alpha, in their original order."""
    exported = []
    for single_alpha in alpha_expressions.alphas:
        exported.append(single_alpha.model_dump())
    return exported

def export_to_json(alpha_dicts: List[Dict], path: str = 'alphas.json'):
    """Write the alpha dictionaries to a JSON file.

    Args:
        alpha_dicts: JSON-serialisable list of alpha dictionaries.
        path: Destination file; defaults to ``'alphas.json'`` in the current
            working directory. An existing file is overwritten.

    Returns:
        None.
    """
    import json
    # Explicit UTF-8 keeps the file encoding independent of the platform default.
    with open(path, 'w', encoding='utf-8') as json_file:
        json.dump(alpha_dicts, json_file, indent=4)



In [None]:
# Example hypothesis, plus the two reference documents embedded in the prompt.
# The dataset reference uses get_dataset_reference's defaults
# (region='USA', universe='TOP3000').
hypothesis = "Stocks with increasing revenue growth and low volatility outperform the market"
operators_reference = get_operators_reference(s)
dataset_reference = get_dataset_reference(s, dataset_id="model110")

print(f"Generating alpha expressions for hypothesis:\n'{hypothesis}'\n")

# Generate alpha expressions (one LLM call requesting 10 expressions)
result = generate_alpha_expressions(hypothesis, operators_reference, dataset_reference, num_alphas=10)

# Print results
print_alpha_expressions(result)

# Export to a list of plain dicts; `alpha_dicts` is reused by the next cell
alpha_dicts = export_to_dict(result)
print(f"Exported {len(alpha_dicts)} alpha expressions to dictionary format")


In [None]:
# Build one alpha payload per generated expression. Every expression gets the
# same settings; only the `regular` expression text differs.
# NOTE(review): this cell only calls ace.generate_alpha and stores its result —
# no simulation call is visible here; confirm where `alpha_list` is submitted.
shared_settings = dict(
    alpha_type="REGULAR",
    region="USA",
    universe="TOP1000",
    delay=1,
    neutralization="SECTOR",
    decay=4,
    truncation=0.02,
    pasteurization="ON",
    test_period="P2Y",
    unit_handling="VERIFY",
    nan_handling="ON",
    max_trade="OFF",
    visualization=True,
)

alpha_list = []
for alpha_dict in alpha_dicts:
    alpha = ace.generate_alpha(regular=alpha_dict['alpha_expression'], **shared_settings)
    # Keep the payload next to the expression it was built from.
    alpha_dict['alpha_settings'] = alpha
    alpha_list.append(alpha)


# Persist expressions together with their settings.
export_to_json(alpha_dicts)