## **Feature:** Dataset Summaries

**Names:** Zim, Dhruv

### **What it does**
Generates statistical summaries of a dataset, including descriptive statistics (mean, median, mode, variance, standard deviation, quartiles), for dataset exploration.

In [None]:
# Load dotenv
import os
from dotenv import load_dotenv
load_dotenv()

# Read the OpenAI credential from the process environment and warn (without
# stopping the notebook) when it is missing or empty
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if OPENAI_API_KEY is None or OPENAI_API_KEY == "":
    print("OpenAI API Key not found")
    
# Import libraries
from pathlib import Path
import pandas as pd
import numpy as np

# Langchain imports
from langchain_openai import ChatOpenAI  
from langchain.schema import HumanMessage, SystemMessage


### **Helper Functions**
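- `numerical_summary(df, columns, metrics)`: For the specified numeric columns (default: all numeric columns), calculates numeric stats such as count, missing, mean, median, standard deviation, variance, minimum, maximum, and skew, plus the five-number summary (min, Q1, median, Q3, max), IQR, and range.
- `categorical_summary(df, columns, metrics)`: For the specified columns (default: non-numeric columns), calculates count, missing, number of unique values, mode, and the frequency of the most common value.
- `calculate_mode_stats(df, columns, metrics)`: Calculates mode, mode frequency, number of unique values, and the three most frequent values for the specified columns (default: all columns).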

In [18]:
def numerical_summary(df, columns=None, metrics=None):
    """
    Prints a summary table (one row per column, one column per metric) for given columns of numeric type

    - params:
        - df: pd.DataFrame to summarise
        - columns: list[str], a single column name, or None (defaults to numeric columns);
          names that are not in df or not numeric are dropped
        - metrics: list[str] subset of ['count','missing','mean','std', 'var', 'min','median','q1','q3','max','skew','range', 'iqr']
          (case-insensitive; 'count' is always included; None/empty means all metrics)
    - returns: pd.DataFrame
        - an empty DataFrame when df is not a DataFrame or no numeric column is left to summarise
        - otherwise the input df itself, unchanged (the summary is only printed)
    """

    if not isinstance(df, pd.DataFrame):
        return pd.DataFrame()

    # A bare string would otherwise be iterated character by character
    if isinstance(columns, str):
        columns = [columns]
    if isinstance(metrics, str):
        metrics = [metrics]

    # Default to numeric columns if no specific columns given
    if columns:
        columns = [col for col in columns if col in df.columns and pd.api.types.is_numeric_dtype(df[col])]
    else:
        columns = df.select_dtypes(include=[np.number]).columns.tolist()

    if not columns:
        # Must be an instance: returning the class itself breaks any caller that uses the result
        return pd.DataFrame()

    filtered_df = df[columns]

    # Metric name -> calculation. Stored as callables so that only the requested
    # metrics are computed; dict order fixes the column order of the output.
    metric_funcs = {
        "count": lambda frame: frame.count(),
        "missing": lambda frame: frame.isna().sum(),
        "mean": lambda frame: frame.mean(),
        "std": lambda frame: frame.std(),
        "var": lambda frame: frame.var(),
        "min": lambda frame: frame.min(),
        "q1": lambda frame: frame.quantile(0.25),
        "median": lambda frame: frame.median(),
        "q3": lambda frame: frame.quantile(0.75),
        "max": lambda frame: frame.max(),
        "skew": lambda frame: frame.skew(),
        "iqr": lambda frame: frame.quantile(0.75) - frame.quantile(0.25),
        "range": lambda frame: frame.max() - frame.min(),
    }

    # Filter to requested metrics (always including count)
    if metrics:
        requested = {str(m).lower() for m in metrics}
        selected = [name for name in metric_funcs if name == "count" or name in requested]
    else:
        selected = list(metric_funcs)

    output_df = pd.DataFrame({name: metric_funcs[name](filtered_df) for name in selected})
    print(output_df)
    return df

In [19]:
def categorical_summary(df, columns=None, metrics=None):
    """
    Prints a summary table (one row per column, one column per metric) for given columns

    - params:
        - df: pd.DataFrame to summarise
        - columns: list[str], a single column name, or None (defaults to non-numeric columns);
          names that are not in df are dropped
        - metrics: list[str] subset of ['count', 'missing', 'nunique', 'mode', 'top_freq']
          (case-insensitive; 'count' is always included; None/empty means all metrics)
    - returns: pd.DataFrame
        - an empty DataFrame when df is not a DataFrame or there is no column to summarise
        - otherwise the input df itself, unchanged (the summary is only printed)
    """

    if not isinstance(df, pd.DataFrame):
        return pd.DataFrame()

    # A bare string would otherwise be iterated character by character
    if isinstance(columns, str):
        columns = [columns]
    if isinstance(metrics, str):
        metrics = [metrics]

    # Default to non-numeric columns if no specific columns given.
    # Excluding numbers (rather than including only 'object') also picks up
    # category, bool, datetime and dedicated string dtypes.
    if columns:
        columns = [col for col in columns if col in df.columns]
    else:
        columns = df.select_dtypes(exclude=[np.number]).columns.tolist()

    if not columns:
        return pd.DataFrame()

    filtered_df = df[columns]

    def _most_common_value(col):
        # First mode of the column, NaN when the column has no non-missing value
        modes = col.mode(dropna=True)
        return modes.iloc[0] if not modes.empty else np.nan

    def _most_common_count(col):
        # Occurrences of the most frequent value, 0 when there is none
        counts = col.value_counts(dropna=True)
        return counts.iloc[0] if not counts.empty else 0

    # Metric name -> calculation. Stored as callables so that only the requested
    # metrics are computed; dict order fixes the column order of the output.
    metric_funcs = {
        "count": lambda frame: frame.count(),
        "missing": lambda frame: frame.isna().sum(),
        "nunique": lambda frame: frame.nunique(dropna=True),
        "mode": lambda frame: frame.apply(_most_common_value),
        "top_freq": lambda frame: frame.apply(_most_common_count),
    }

    # Filter to requested metrics (always including count)
    if metrics:
        requested = {str(m).lower() for m in metrics}
        selected = [name for name in metric_funcs if name == "count" or name in requested]
    else:
        selected = list(metric_funcs)

    output_df = pd.DataFrame({name: metric_funcs[name](filtered_df) for name in selected})
    print(output_df)
    return df

In [20]:
def calculate_mode_stats(df, columns=None, metrics=None):
    """
    Prints a dict of mode / frequency statistics for given columns (any dtype)

    - params:
        - df: pd.DataFrame to summarise
        - columns: list[str], a single column name, or None (defaults to all columns);
          names that are not in df are skipped
        - metrics: list[str] subset of ['mode', 'mode_freq', 'unique_vals', 'most_freq']
          (case-insensitive; unknown names are ignored; None means all metrics)
            - mode: first mode of the column (None if the column has no values)
            - mode_freq: number of occurrences of the most frequent value (0 if none)
            - unique_vals: number of distinct non-missing values
            - most_freq: dict of the 3 most frequent values and their counts
    - returns: pd.DataFrame
        - an empty DataFrame when df is not a DataFrame
        - otherwise the input df itself, unchanged (the statistics are only printed)
    """
    # Same guard as the other summary helpers instead of failing with AttributeError
    if not isinstance(df, pd.DataFrame):
        return pd.DataFrame()

    if columns is None:
        columns = df.columns
    elif isinstance(columns, str):
        # A bare string would otherwise be iterated character by character
        columns = [columns]

    # Default metrics
    all_metrics = ['mode', 'mode_freq', 'unique_vals', 'most_freq']
    if metrics is not None:
        if isinstance(metrics, str):
            metrics = [metrics]
        # Keep the caller's order, normalise case, drop unknown names and duplicates
        selected = []
        for m in metrics:
            key = str(m).lower()
            if key in all_metrics and key not in selected:
                selected.append(key)
        metrics = selected
    else:
        metrics = all_metrics

    mode_stats = {}
    for col in columns:
        if col not in df.columns:
            continue
        series = df[col]
        # Computed once per column and shared by the metrics below
        modes = series.mode()
        counts = series.value_counts()
        stats = {
            'mode': modes.iloc[0] if not modes.empty else None,
            'mode_freq': counts.iloc[0] if not counts.empty else 0,
            'unique_vals': series.nunique(),
            'most_freq': counts.head(3).to_dict(),
        }
        # Only keep requested metrics
        mode_stats[col] = {k: stats[k] for k in metrics}
    print(mode_stats)
    return df

In [26]:
# Prompt text sent to the LLM describing the helper functions its generated code may call.
# The signatures, defaults and metric names below must match the helper definitions in this file.
helper_docs = """ Helper functions available: 
- numerical_summary(df, columns=None, metrics=None)
prints summary of given metrics for given columns in dataframe of numeric type only
    - params:
        - columns: list[str] or None (defaults to numeric columns)
        - metrics: list[str] subset of ["count","missing","mean","std","var","min","median","q1","q3","max","skew","range","iqr"]

- categorical_summary(df, columns=None, metrics=None)
prints summary of given metrics for given columns in dataframe
    - params:
        - columns: list[str] or None (defaults to non-numeric columns)
        - metrics: list[str] subset of ['count', 'missing', 'nunique', 'mode', 'top_freq']

- calculate_mode_stats(df, columns=None, metrics=None)
prints summary of given metrics for given columns in dataframe
    - params:
        - columns: list[str] or None (defaults to all columns)
        - metrics: list[str] subset of ['mode', 'mode_freq', 'unique_vals', 'most_freq']

Examples:
- "Find cardinality of categorical columns" -> categorical_summary(df, columns=None, metrics=['nunique'])
- "summary of all data" -> use multiple helper functions
"""

# **MAIN FEATURE FUNCTION**

In [None]:
def _strip_code_fences(text):
    """Return LLM output without a surrounding Markdown code fence (``` or ```python), if there is one."""
    text = text.strip()
    if not text.startswith("```"):
        return text
    lines = text.splitlines()[1:]
    if lines and lines[-1].strip() == "```":
        lines = lines[:-1]
    return "\n".join(lines).strip()


def get_summaries(df, user_query):
    """
    Main function that gets called by the main router.
    Takes (df, user_query) and returns df

    Asks the LLM for Python code that answers user_query with the summary helpers,
    then executes that code. The summaries reach the user through the print
    statements of the generated code.

    - params:
        - df: pd.DataFrame the generated code runs against (visible to it as `df`)
        - user_query: str, the user's request in natural language
    - returns: pd.DataFrame
        - the input df when the generated code ran without error
          (rebinding `df` inside the generated code does not change what is returned)
        - a copy of df taken before execution when the generated code raised
    """
    # Local imports: the prompt promises these modules to the generated code,
    # so they have to be placed in its namespace below
    import datetime
    import math
    import re

    # Create message chain
    messages = []
    messages.append(SystemMessage(content=helper_docs))
    messages.append(SystemMessage(content=f"""
    You are a data cleaning agent trying to generate dataset summaries.
    
    Dataset info: Shape: {df.shape}, Sample: {df.head(3).to_string()}

    Libraries available:
    - pd (pandas), np (numpy)
    - math, re, datetime
    
    BASIC SUMMARIES
    - df.shape: rows and columns
    - 5 number summary (numerical): 'Min', 'Q1', 'Median', 'Q3', 'Max'
    - (categorical): mode, nunique, top_freq

    Rules:
    - Return only executable Python code, no explanations, no markdown blocks
    - Use helper functions if needed
    - For specific columns mentioned in query, pass them as lists to helper functions
    - ASSUME DF IS ALREADY DEFINED
    - In order to generate a response/message to the user use print statements
    print("message")
    - Write a detailed print message to summarise actions taken and reasons
    """))
    messages.append(HumanMessage(content=f"User request: {user_query}"))

    # Call LLM with message chain
    llm = ChatOpenAI(temperature=0, model_name="gpt-4o-mini")
    response = llm.invoke(messages)
    # Models sometimes wrap code in a Markdown fence despite the rules above
    generated_code = _strip_code_fences(response.content)

    # Snapshot taken before the try block so the except branch can always rely on it
    original_df = df.copy()

    # One explicit namespace (module globals + df + promised modules). With a single
    # dict, functions and comprehensions inside the generated code can see `df`;
    # working on a copy of globals() keeps generated names out of the module.
    namespace = dict(globals())
    namespace.update(df=df, math=math, re=re, datetime=datetime)

    # Execute code
    try:
        # SECURITY: this runs LLM-generated code with full interpreter privileges.
        # Only use with trusted queries/data, or move execution into a sandbox.
        exec(generated_code, namespace)
        return df
    except Exception as e:
        print(f"Error: {e}")
        print(f"Generated Code:{generated_code}")
        return original_df

# **Testing**

In [None]:
# CSV filename to load; the file must be in the "datasets" folder
dataset_name = "Life Expectancy Data.csv"

# Resolve the CSV location from the PROJECT_ROOT environment variable
# (original note: built this way to avoid import errors)
load_dotenv()
PROJECT_ROOT = Path(os.environ["PROJECT_ROOT"])
path = PROJECT_ROOT.joinpath("datasets", dataset_name)

# Load the dataset and keep a separate copy for testing
df = pd.read_csv(path)
test_df = df.copy()