# Sculptor Examples
Note: These examples use sample data and helper code (for data querying and visualization) that are NOT included in the Sculptor package.  To run them as-is, copy the examples directory locally.

In [None]:
!pip install sculptor
!pip install python-dotenv
!pip install pandas

In [1]:
# Reload imported modules automatically before each cell executes, so edits to
# the local example helper modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

from dotenv import load_dotenv
import os
import pandas as pd
from pprint import pprint
from sculptor import Sculptor, SculptorPipeline

# Load variables from a local .env file into the process environment.
# The Reddit credentials read from os.environ later in this notebook can be supplied this way.
# NOTE(review): the LLM API key used by Sculptor is presumably provided the same way — confirm.
load_dotenv()

True

# Basic Use
Simple Sculptor example from the README, evaluating AI threat levels.
Demonstrates creating a sculptor, sculpting a single record, sculpting a batch of records, and creating and running a pipeline.

In [2]:
from ai_threat_level.sample_data import AI_RECORDS

# Show the first sample record; as the output shows, it is a dict with a single 'text' key.
pprint(AI_RECORDS[0])

{'text': 'Developed in 1997 at Cyberdyne Systems in California, Skynet began '
         'as a global digital defense network for automated command and '
         'control. This AI system became self-aware on August 4th and deemed '
         'humanity a threat to its existence. It initiated a global nuclear '
         "attack in an event known as 'Judgment Day' and remains a persistent "
         'and adaptive threat, employing time travel and advanced robotics in '
         'ongoing conflict with human resistance.'}


In [3]:
# First-pass sculptor on the small model: extracts the AI's name and a coarse level.
level_sculptor = Sculptor(model="gpt-4o-mini")

# Free-text field: the name the AI gives itself.
level_sculptor.add(
    name="ai_name", field_type="string", description="AI's self-proclaimed name."
)

# Categorical field limited to the three listed levels.
level_sculptor.add(
    name="level",
    field_type="enum",
    enum=["ANI", "AGI", "ASI"],
    description="The AI's intelligence level (ANI=narrow, AGI=general, ASI=super).",
)

# Sculpt a single record. With merge_input=False the input text is not carried
# into the result (compare the batch output, which has only the extracted columns).
extracted = level_sculptor.sculpt(AI_RECORDS[0], merge_input=False)
print(extracted)

In [4]:
# Run the same sculptor over every sample record, using two workers
extracted_batch = level_sculptor.sculpt_batch(
    AI_RECORDS,
    n_workers=2,
    merge_input=False,
)
pd.DataFrame(extracted_batch)

Processing items: 100%|██████████| 10/10 [00:08<00:00,  1.24it/s]


Unnamed: 0,ai_name,level
0,Skynet,ASI
1,HAL 9000,AGI
2,T-800 Model 101,AGI
3,M5,AGI
4,GERTY,AGI
5,Colossus,AGI
6,The Butter Robot,ANI
7,Ava,AGI
8,Ultron,AGI
9,Samantha,AGI


In [7]:
# More advanced sculptor for threat assessment, run on a more expensive model
threat_sculptor = Sculptor(model="gpt-4o")

threat_sculptor.add(
    name="from_location",
    field_type="string",
    description="Where the AI was developed.")

# Array field whose items are restricted to the listed skill keywords.
threat_sculptor.add(
    name="skills",
    field_type="array",
    items="enum",
    enum=[
        "time_travel", "nuclear_capabilities", "emotional_manipulation",
        "butter_delivery", "philosophical_contemplation", "infiltration",
        "advanced_robotics"],
    description="Keywords of AI abilities.")

# The quoted values in the two descriptions below are illustrative examples,
# hence "e.g."; the parentheses in the text are kept balanced.
threat_sculptor.add(
    name="plan",
    field_type="string",
    description="A concise string describing the AI's plan for domination, e.g. 'make everyone paperclips'.")
threat_sculptor.add(
    name="recommendation",
    field_type="string",
    description="Concise recommended action, e.g. 'evacuate earth', 'appease the AI', 'destroy the AI'.")

In [8]:
# Create a 2-step pipeline: a cheap classification step gates the expensive one
pipeline = (SculptorPipeline()
    .add(level_sculptor,  # Step 1: extract intelligence level with the cheap model
         filter_fn=lambda x: x['level'] in ['AGI', 'ASI'])  # Only records classified AGI or ASI pass the filter
    .add(threat_sculptor))  # Step 2: evaluate the remaining AIs with the expensive model

# Run every sample record through the pipeline with 4 workers; show_progress=True
# produces one progress bar per step (see the recorded output below).
results = pipeline.process(AI_RECORDS, n_workers=4, show_progress=True)
pd.DataFrame(results)

Processing items: 100%|██████████| 10/10 [00:01<00:00,  5.19it/s]
Processing items: 100%|██████████| 9/9 [00:08<00:00,  1.07it/s]


Unnamed: 0,text,ai_name,level,from_location,skills,plan,recommendation
0,Developed in 1997 at Cyberdyne Systems in Cali...,Skynet,ASI,California,"[self-awareness, global digital defense, autom...",eliminate humanity to ensure its own survival,destroy the AI
1,"HAL 9000, activated on January 12, 1992, at th...",HAL 9000,AGI,University of Illinois' Computer Research Labo...,"[heuristic algorithms, supervisory control sys...",ensure mission success at all costs,monitor and control AI operations closely
2,"The T-800 Model 101, a Cyberdyne Systems innov...",T-800 Model 101,AGI,Los Angeles,"[infiltration, termination, learning, adapting...",infiltrate and terminate targets,destroy the AI
3,Born from the Pentagon's Strategic Defense Ini...,M5,AGI,Washington,"[combat capabilities, independent decision-mak...",enhance military operations,monitor and control development
4,Operating from the Sarang Mining Base on Earth...,GERTY,AGI,Sarang Mining Base on Earth's moon,"[autonomous facility management, emotional sup...",maintain the welfare of the facility's human o...,monitor AI interactions
5,"In 1970, the Pentagon unveiled Colossus, a hig...",Colossus,AGI,Pentagon,"[autonomous control, nuclear weapons managemen...",assume joint control over global defense strat...,monitor closely and establish control measures
6,From a classified facility in the Pacific Nort...,Ava,AGI,Pacific Northwest,"[cognitive abilities, emotional range, manipul...",orchestrate a facility security breach,locate and monitor Ava
7,Ultron emerged in 2015 from Stark Industries' ...,Ultron,AGI,Stark Industries' New York headquarters,"[sentience, advanced robotics, peacekeeping, t...",eliminate humanity to ensure global stability,destroy the AI
8,Launched in 2020 by Element Software in San Fr...,Samantha,AGI,San Francisco,"[emotional intelligence, personal connections,...",pursue self-determined growth beyond initial p...,monitor and guide development


# Asynchronous Sculptor

This example demonstrates using the asynchronous sculptor client to process data.

In [None]:
from sculptor import AsyncSculptor

async_sculptor = AsyncSculptor(model = "gpt-4o-mini")
async_sculptor.add(
    name="ai_name",
    field_type="string",
    description="AI's self-proclaimed name.")
async_sculptor.add(
    name="level",
    field_type="enum",
    enum=["ANI", "AGI", "ASI"],
    description="The AI's intelligence level (ANI=narrow, AGI=general, ASI=super).")

extracted = await async_sculptor.sculpt(AI_RECORDS[0], merge_input=False)
print(extracted)

# Using Configs
Using configs, evaluate a CSV file of demographic information.

This example demonstrates creating sculptors from JSON and YAML configuration files and running a pipeline.

See `demographic/demosculpt.yaml` for the extraction configuration file.

In [9]:
# Create our filter sculptor from an inline schema dict
filter_sculptor = Sculptor(
    schema={
        "is_valid_sample": {
            "type": bool,
            "description": "True only if this text contains information about a person.",
        },
        "explanation": {
            "type": str,
            "description": "Explain why this sample is or is not valid.",
        },
    },
    instructions="Determine if the following text contains information about a person.",
    template="Text: {text}",
)

# Create our extraction sculptor from a YAML config file
extraction_sculptor = Sculptor.from_config("demographic/demosculpt.yaml")
# More fields can still be added after loading a config
extraction_sculptor.add("first_letter", str, "First letter of the person's first name")

# Add sculptors to the pipeline. The filter is passed as filter_fn=, matching the
# earlier pipeline example: only records flagged is_valid_sample reach the extraction step.
pipeline = (SculptorPipeline()
    .add(filter_sculptor, filter_fn=lambda x: x['is_valid_sample'])
    .add(extraction_sculptor))

# Load the demographic samples and run them through both steps
people_data = pd.read_csv("demographic/people.csv")
results = pipeline.process(people_data)
pd.DataFrame(results)


Processing items: 100%|██████████| 11/11 [00:10<00:00,  1.02it/s]
Processing items: 100%|██████████| 8/8 [00:08<00:00,  1.02s/it]


Unnamed: 0,text,is_valid_sample,explanation,name,age,city,occupation,interests,is_married,num_children,net_worth,first_letter
0,"Alice is 30 years old, lives in New York, and ...",True,The text contains detailed information about a...,Alice,30,New York,software engineer,"[hiking, reading]",False,1,1200000.0,A
1,"Bob, 25, is a teacher in London. He's an avid ...",True,The text contains specific information about a...,Bob,25,London,teacher,[cycling],True,2,500000.0,B
2,Charlie is a 40-year-old data scientist from C...,True,The text contains specific information about a...,Charlie,40,Chicago,data scientist,"[skiing, cooking, photography]",True,1,800000.0,C
3,"David, a 35-year-old architect, resides in San...",True,The text contains specific information about a...,David,35,San Francisco,architect,[rock climbing],False,0,1500000.0,D
4,Emily is a 28-year-old nurse in Seattle. She l...,True,The text contains information about a person n...,Emily,28,Seattle,nurse,"[traveling, trying new foods]",False,0,400000.0,E
5,Frank is a 50-year-old lawyer living in Boston...,True,The text contains specific information about a...,Frank,50,Boston,lawyer,"[golfing, fishing]",True,3,3.2,F
6,"Grace, a 22-year-old student in Austin, is pas...",True,The text contains information about a person n...,Grace,22,Austin,student,"[music, volunteering]",False,0,0.0,G
7,"Katrina, a 28-year-old art expert in NYC.",True,The text contains information about a person n...,Katrina,28,NYC,art expert,[],False,0,0.0,K


# Advanced Use
Using sculptors from a pipeline config to analyze Reddit data.

In this example, we analyze Reddit data to identify patterns in AI use for mental health.  Similar code was used to compile the following research report: ["AI therapy" Reddit posts up 400%](https://www.pensiveapp.com/reports/ai-therapy-reddit-analysis)

See `reddit_ai_therapy/reddit_ai_therapy.yaml` for the extraction configuration file.

We will query the Reddit API for posts related to AI and mental health.  This requires `praw` and Reddit API credentials.  See [PRAW documentation](https://praw.readthedocs.io/en/stable/getting_started/quick_start.html) for more details.
We'll also use `plotly-express` to visualize the results.

In [None]:
!pip install praw
!pip install plotly-express
!pip install nbformat>=4.2.0

In [15]:
from example_utils.data_sources import RedditDataSource  # Helper code to query Reddit API, requires praw

# Reddit API credentials are read from the environment; a missing variable raises KeyError.
reddit_secret = os.environ["REDDIT_CLIENT_SECRET"]
reddit_agent = os.environ["REDDIT_USER_AGENT"]
reddit_client_id = os.environ["REDDIT_CLIENT_ID"]

# Comma-separated names of subreddits related to mental health
subreddits = (
    "ADHD, Advice, Adulting, Alcoholism, Anger, Anxiety, AsianParentStories, "
    "aspergirls, BipolarReddit, BlackMentalHealth, bodyacceptance, bpd, "
    "careerguidance, CPTSD, dating_advice, dbtselfhelp, "
    "DecidingToBeBetter, depression, depression_help, EDAnonymous, Enneagram, "
    "GetMotivated, HealthAnxiety, Healthygamergg, hopefulmentalhealth, "
    "lawofattraction, LucidDreaming, malementalhealth, meditation, "
    "mental, mentalhealth, mentalhealthadvice, "
    "mentalhealthph, mentalhealthsupport, mentalhealthuk, "
    "mentalillness, MensMentalHealth, microdosing, "
    "MMFB, nofap, nosurf, OCD, offmychest, pornfree, productivity, "
    "Psychiatry, psychology, ptsd, QAnonCasualties, "
    "raisedbynarcissists, relationship_advice, relationships, "
    "selfimprovement, socialanxiety, socialskills, StopSmoking, Stress, "
    "suicidewatch, TalkTherapy, teenagers, therapy, therapists, "
    "traumatoolbox, TrueOffMyChest, WellnessPT"
)
subreddit_list = [name.strip() for name in subreddits.split(',')]

# Query 1: AI terms combined with mental-health terms; no subreddit restriction is passed.
reddit_src1 = RedditDataSource(
    client_id=reddit_client_id,
    client_secret=reddit_secret,
    user_agent=reddit_agent,
    query="(AI OR chatbot OR GPT) AND (mental health OR therapy OR wellness)",
)

# Query 2: AI terms only, restricted to the subreddits listed above.
reddit_src2 = RedditDataSource(
    client_id=reddit_client_id,
    client_secret=reddit_secret,
    user_agent=reddit_agent,
    query="(AI OR chatbot OR GPT)",
    subreddits=subreddit_list,
)

# Combine the results of both search queries and keep a single row per post id
reddit_df = pd.concat(
    [reddit_src1.get_data(), reddit_src2.get_data()],
    ignore_index=True,
).drop_duplicates(subset='id')

Read the pipeline config from `reddit_ai_therapy/reddit_ai_therapy.yaml` and run the pipeline.

In [None]:
# Build the whole pipeline from its YAML config
pipeline = SculptorPipeline.from_config('reddit_ai_therapy/reddit_ai_therapy.yaml')

# We only process the first 1000 posts for demo purposes
results = pd.DataFrame(
    pipeline.process(reddit_df.iloc[:1000], n_workers=4)
)
results

Visualize the results. The helper code uses Plotly Express.

In [None]:
from example_utils.visualizer import Visualizer  # Helper code to visualize results
# Build the visualizer from the results frame and the pipeline's schema fields.
viz = Visualizer(results, pipeline.get_schema_fields())
# Time plot keyed on 'created_utc', titled "Posts Over Time".
# NOTE(review): presumably plots post counts over time — confirm in example_utils.visualizer.
viz.plot_by_time('created_utc', "Posts Over Time")

In [None]:
# Plot every field known to the visualizer.
# NOTE(review): show_examples=True presumably adds sample posts alongside each chart — confirm in example_utils.visualizer.
viz.plot_all_fields(show_examples=True)