In [15]:
%load_ext autoreload
%load_ext dotenv
%dotenv -o

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


In [16]:
import os

# The workspace root is the parent of the directory the kernel was started in.
current_directory = os.getcwd()
workspace_directory = os.path.normpath(os.path.join(current_directory, os.pardir))
print(workspace_directory)

/home/izlobin/wd/events-planner-sst


In [17]:
%%capture
# Install dspy-ai with %pip; the %%capture cell magic above keeps the installer output out of the notebook.
# NOTE(review): the version is unpinned and no cell visible in this part of the notebook imports dspy - confirm it is still needed.
%pip install dspy-ai

In [18]:
import os

# NOTE(review): this repeats the workspace-path computation from an earlier cell;
# it is kept so that this cell can be run on its own.
current_directory = os.getcwd()
workspace_directory = os.path.normpath(os.path.join(current_directory, os.pardir))
print(workspace_directory)

/home/izlobin/wd/events-planner-sst


In [19]:
# Load the event datasets into a single pandas DataFrame for analysis

import json

import pandas as pd

# Dataset directories to scan, one per event source.
datasets_base_path = f"{workspace_directory}/task/datasets"
datasets_paths = [
    f"{datasets_base_path}/local-luma",
    f"{datasets_base_path}/local-meetup",
]

# Collect one normalized frame per file and concatenate once at the end.
# The previous version tested `"df_all" not in globals()`, which is never true
# once `df_all = None` has run, so it only worked because pd.concat silently
# drops None; it also re-copied the accumulated frame on every file.
event_frames = []
for dataset_path in datasets_paths:
    for root, dirs, files in os.walk(dataset_path):
        for file in files:
            file_path = os.path.join(root, file)
            # Read as UTF-8 explicitly instead of relying on the platform default encoding.
            with open(file_path, "r", encoding="utf-8") as f:
                data = json.load(f)
            df = pd.json_normalize(data)
            event_frames.append(df)

# Fail with a clear message instead of an AttributeError on None further down.
if not event_frames:
    raise FileNotFoundError(f"No dataset files found under: {datasets_paths}")

df_all = pd.concat(event_frames, ignore_index=True)

# Drop the 'description', 'cover' and 'dateEnd' columns from the preview only;
# df_all itself keeps every column.
df_all_preview = df_all.drop(columns=["description", "cover", "dateEnd"])

# Display the first few rows of the DataFrame
print(df_all_preview.head().to_markdown())

|    | url                            | dateStart                | title                                                       | hosts                                                                                   | group                | address                                              | guests                                     |   attendees |   tags |   venue |   online |   eventSeriesDates |
|---:|:-------------------------------|:-------------------------|:------------------------------------------------------------|:----------------------------------------------------------------------------------------|:---------------------|:-----------------------------------------------------|:-------------------------------------------|------------:|-------:|--------:|---------:|-------------------:|
|  0 | https://lu.ma/mw005iqm         | 2025-01-23T12:30:00.000Z | Pitch and Run 730am Thursday Early Bird                     | ['Kevin Weatherman', 'Chaitenya Razdan']                 

In [20]:
# Remove the pandas column-width limit for displayed values.
pd.set_option("display.max_colwidth", None)

# Print the raw description of the first ten rows, one per line.
first_descriptions = df_all["description"].iloc[:10]
for description in first_descriptions:
    print(description)

<p>​<!-- -->Meet at 730am </p><p>​<!-- -->Intros till 740am</p><p>​<!-- -->4.5 miles up the piers and back down the WSH</p><p>​<a href="https://chat.whatsapp.com/JShpXGbgAoL8HC2A5bpNau" target="_blank" rel="nofollow noopener">Whatsapp</a></p><p>​<!-- -->Route <a href="https://www.strava.com/routes/3213205617339336616" target="_blank" rel="nofollow noopener">https://www.strava.com/routes/3213205617339336616</a></p>
<p>​<!-- -->Meet at 7:30AM at Grand Army Plaza</p><p>​<!-- -->Route: <a href="https://www.strava.com/routes/3250644464025093664" target="_blank" rel="nofollow noopener">https://www.strava.com/routes/3250644464025093664</a></p><p>​<!-- -->Photos by the Great Lawn and then at the Lake</p>
<p>​<strong>you are invited to</strong><strong><em> the january founder's sauna club</em></strong> - a unique experience for andrew and rho's founder community, taking place in an otherworldly sauna and ice bath space.</p><p>​<!-- -->​shift your state, unplug, and be yourself with others.</p><

In [21]:
from typing import Optional

from pydantic import BaseModel, Field


class EventDetails(BaseModel):
    """Structured attributes extracted from the free-text description of an event."""

    # ^ Doc-string for the entity EventDetails.
    # This doc-string is sent to the LLM as the description of the schema,
    # so it must describe an event (it previously said "a person", which
    # mislabels the schema for the model).

    # Note that:
    # 1. Each field is `Optional` with a default of None -- this allows the model
    #    to decline to extract a value it cannot find in the text.
    # 2. Each field has a `description` -- this description is used by the LLM.
    #    Having a good description can help improve extraction results.
    # 3. Commented-out fields are candidate features that are currently disabled.
    indoor: Optional[bool] = Field(default=None, description="1 for indoor, 0 for outdoor")
    # location_proximity: Optional[float] = Field(default=None, description="Normalized distance to the event (0 to 1)")
    event_duration_hours: Optional[float] = Field(default=None, description="Total event duration in hours")
    number_of_activities: Optional[int] = Field(default=None, description="Count of different agenda items")
    number_of_speakers: Optional[int] = Field(default=None, description="Total number of speakers/panelists")
    # speaker_popularity: Optional[float] = Field(default=None, description="Social media metrics for speakers (e.g., LinkedIn followers)")
    # diversity_of_topics: Optional[float] = Field(default=None, description="Semantic similarity score for topic diversity")
    food_availability: Optional[bool] = Field(default=None, description="1 if food/drinks are mentioned, 0 if not")
    free_refreshments: Optional[bool] = Field(default=None, description="1 if free food/drinks are available, 0 if not")
    number_of_sponsors: Optional[int] = Field(default=None, description="Total number of sponsors")
    sponsor_prestige: Optional[float] = Field(default=None, description="Score based on sponsors' reputation")
    social_media_links: Optional[bool] = Field(default=None, description="1 if social media links are present, 0 if not")
    registration_link: Optional[bool] = Field(default=None, description="1 if a registration link is present, 0 if not")
    # event_type_weight: Optional[int] = Field(default=None, description="Weight based on event type (e.g., conferences = 3, meetups = 2, workshops = 1)")
    # language_match: Optional[bool] = Field(default=None, description="1 if event language matches user preferences, 0 if not")


In [22]:
from typing import Optional

from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from pydantic import BaseModel, Field

# System instructions given to the extraction model: extract only what the
# text supports and return null for anything it does not state.
SYSTEM_INSTRUCTIONS = (
    "You are an expert extraction algorithm. "
    "Only extract relevant information from the text. "
    "If you do not know the value of an attribute asked to extract, "
    "return null for the attribute's value."
)

# Chat prompt: the system instructions followed by the text to analyse.
# Possible extensions:
#   1) add reference examples to improve extraction quality, e.g. by inserting
#      MessagesPlaceholder('examples') between the two messages;
#   2) add further template parameters to pass context, such as metadata about
#      the document the text was extracted from.
prompt_template = ChatPromptTemplate.from_messages(
    [
        ("system", SYSTEM_INSTRUCTIONS),
        ("human", "{text}"),
    ]
)


In [23]:
import getpass
import os

# Prompt for the OpenAI key only when the environment does not already hold a
# non-empty value for it.
if os.environ.get("OPENAI_API_KEY") in (None, ""):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")

from langchain_openai import ChatOpenAI

# Chat model used for the extraction.
llm = ChatOpenAI(model="gpt-4o-mini")

# Bind the EventDetails schema so responses come back as structured output.
structured_llm = llm.with_structured_output(schema=EventDetails)


In [24]:
import json

# Preview the rows that are about to be sent to the model.
print(df_all.head(10).to_markdown())

# Keep every extraction keyed by event URL. Results are appended as they
# arrive, so an interrupted run still leaves the completed ones in this list.
extraction_results = []
total_events = df_all.shape[0]

for index, row in df_all.iterrows():
    description = row["description"]

    # A missing (NaN/None) or blank description would otherwise be sent to the
    # paid API, e.g. as the literal text "nan".
    if not isinstance(description, str) or not description.strip():
        print(f"Skipping {index}/{total_events}: {row['title']} ({row['url']})")
        continue

    print(f"Analysing {index}/{total_events}: {row['title']} ({row['url']})")
    print(description)

    # invoke() keeps the system and human messages separate; format() would
    # flatten the whole chat prompt into one string and lose the system role.
    prompt = prompt_template.invoke({"text": description})
    result = structured_llm.invoke(prompt)
    result_dict = result.model_dump()
    print(json.dumps(result_dict, indent=4))

    extraction_results.append({"url": row["url"], **result_dict})


|    | url                            | dateStart                | dateEnd                  | title                                                                                                        | hosts                                                                                       | group                | address                                               | guests                                     |   attendees | description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

KeyboardInterrupt: 