# LLM Development: Tracking and Evaluation with Langfuse



# Understanding LLM Behavior
- Why tracking matters
- Challenges with LLM applications
- Importance of systematic evaluation

# 1. Environment Setup

In [1]:
%%capture
!pip install langfuse langchain langchain-groq --upgrade  tqdm

In [2]:
import os
import time
import pandas as pd
import time
from datetime import datetime
from langfuse import Langfuse
from typing import Dict, Any
from langchain_groq import ChatGroq
from langchain.prompts import ChatPromptTemplate
from langchain_core.prompts import MessagesPlaceholder
from langchain_core.messages import HumanMessage
from tqdm import tqdm

from google.colab import userdata


In [3]:
# setting environment variables
os.environ['LANGFUSE_PUBLIC_KEY'] = userdata.get('LANGFUSE_PUBLIC_KEY')
os.environ['LANGFUSE_SECRET_KEY'] = userdata.get('LANGFUSE_SECRET_KEY')
os.environ['GROQ_API_KEY'] = userdata.get('QROQ_API_KEY_4')

In [4]:
# Initialize Langfuse
langfuse = Langfuse()

# Initialize Groq LLM
llm = ChatGroq(
    temperature=0.0,
)

## Creating a sample Movie reviews CSV

In [5]:
movie_reviews_data = """review,sentiment
"This movie was absolutely fantastic! The acting was superb and the story kept me engaged throughout.",positive
"Terrible waste of time and money. The plot made no sense and the acting was wooden.",negative
"While it had some good moments, overall it was just okay. The pacing felt off.",mixed
"A masterpiece! Every scene was perfectly crafted and the performances were outstanding.",positive
"Not sure how I feel about this one. Some parts worked, others didn't.",mixed
"Avoid at all costs. Possibly the worst movie I've seen this year.",negative
"The special effects were amazing but the story was weak.",mixed
"Brilliant directing, amazing performances, and a perfect ending!",positive
"Boring and predictable. Don't waste your money.",negative
"A decent watch but nothing special. Wouldn't see it again.",mixed"""

In [6]:
# Save to CSV
with open("movie_reviews.csv", "w") as f:
    f.write(movie_reviews_data)

# Read the CSV
df = pd.read_csv("movie_reviews.csv")
print("Dataset shape:", df.shape)
print("\nSample reviews:")
df.head()

Dataset shape: (10, 2)

Sample reviews:


Unnamed: 0,review,sentiment
0,This movie was absolutely fantastic! The actin...,positive
1,Terrible waste of time and money. The plot mad...,negative
2,"While it had some good moments, overall it was...",mixed
3,A masterpiece! Every scene was perfectly craft...,positive
4,Not sure how I feel about this one. Some parts...,mixed


#2. Creating LangFuse Dataset


In [7]:
def create_dataset_from_csv(df: pd.DataFrame, dataset_name: str):
    """
    Create a Langfuse dataset from a pandas DataFrame
    """
    # Create the dataset
    langfuse.create_dataset(name=dataset_name)

    # Add each review to the dataset
    for id, row in df.iterrows():
        langfuse.create_dataset_item(
            dataset_name=dataset_name,
            input={"review": row['review']},
            expected_output=row['sentiment']
        )

    print(f"Created dataset '{dataset_name}' with {len(df)} items")

In [8]:
# Creating the dataset
create_dataset_from_csv(df, "movie_reviews")

Created dataset 'movie_reviews' with 10 items


# 3. Defining a simple sentiment analysis chain

In [9]:
def create_sentiment_chain(system_prompt: str):

    prompt = ChatPromptTemplate.from_messages([
        ("system", system_prompt),
        ("user", "{review}")
    ])

    return prompt | llm

# 4. Running Sentiment Analysis Expriments

In [10]:
def analyze_reviews(experiment_name: str, system_prompt: str):
    """
    Run sentiment analysis experiment with progress bar
    """
    dataset = langfuse.get_dataset("movie_reviews")
    chain = create_sentiment_chain(system_prompt)

    print(f"\nRunning experiment: {experiment_name}")
    print("-" * 50)

    # Create progress bar
    total_items = len(dataset.items)
    with tqdm(total=total_items, desc=f"Processing {experiment_name}") as pbar:
        for item in dataset.items:
            try:
                # Get the handler for tracing
                handler = item.get_langchain_handler(run_name=experiment_name)

                # Run the analysis
                result = chain.invoke(
                    {"review": item.input["review"]},
                    config={"callbacks": [handler]}
                )

                # Check accuracy
                prediction = result.content.strip().lower()
                is_correct = prediction == item.expected_output

                # Score the result
                handler.trace.score(
                    name="accuracy",
                    value=1.0 if is_correct else 0.0
                )

                # Update progress bar
                pbar.update(1)

            except Exception as e:
                print(f"\nError processing review: {str(e)}")
                pbar.update(1)
                continue

In [11]:
# Define different prompts to test
experiments = [
    {
        "name": "basic_prompt",
        "prompt": """
        Analyze the sentiment of movie reviews.
        Respond with exactly one word: 'positive', 'negative', or 'mixed'.
        Base your analysis on the overall tone and content of the review.
        """
    },
    {
        "name": "detailed_prompt",
        "prompt": """
        You are a film critic analyzing movie reviews.
        For each review, consider:
        1. Overall emotional tone
        2. Specific praise or criticism
        3. Balance of positive and negative comments

        Respond with ONLY ONE of these words:
        - 'positive' for clearly positive reviews
        - 'negative' for clearly negative reviews
        - 'mixed' for reviews with both positive and negative elements

        Do not include any explanation or additional text.
        """
    }
]

In [12]:
# Usage
print("\nStarting experiments...")
for experiment in experiments:
    accuracy = analyze_reviews(
        experiment["name"],
        experiment["prompt"]
    )


Starting experiments...

Running experiment: basic_prompt
--------------------------------------------------


Processing basic_prompt: 100%|██████████| 30/30 [00:23<00:00,  1.25it/s]



Running experiment: detailed_prompt
--------------------------------------------------


Processing detailed_prompt: 100%|██████████| 30/30 [01:24<00:00,  2.83s/it]
