## **Feature:** Missing Vals

**Names:** Tanat

### **What it does**
Intelligently handles missing values in datasets using data cleaning best practices. The feature analyzes your data's characteristics (data types, distribution, missing percentages) and automatically suggests or applies the most appropriate imputation method.

Users can either get imputation recommendations or have the system automatically clean their data based on best practices.

In [3]:
# Load dotenv
import os
from dotenv import load_dotenv
load_dotenv()

# Read the OpenAI credential from the process environment (None when unset).
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# An unset or empty key is only reported here; execution continues.
if OPENAI_API_KEY is None or OPENAI_API_KEY == "":
    print("OpenAI API Key not found")

# Import libraries
from pathlib import Path
import pandas as pd
import numpy as np

# Langchain imports
from langchain_openai import ChatOpenAI  
from langchain.schema import HumanMessage, SystemMessage


### **Helper Functions**

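- `analyze_missing_values(df, drop_threshold=0.5)`: Suggests a missing-value imputation strategy for each column, based on best-practice heuristics.
- `auto_impute(df, drop_threshold=0.5)`: Automatically imputes missing values in a DataFrame based on simple best-practice heuristics.
- `impute_col(series, strategy)`: Imputes missing values in a single pandas Series using the named strategy (`'mean'`, `'median'`, `'mode'`, `'unknown'`, `'ffill'`, `'bfill'`).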

In [28]:
def analyze_missing_values(df, drop_threshold=0.5):
    """
    Suggest a missing-value imputation strategy for every column with gaps.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to inspect. It is only read, never modified.
    drop_threshold : float, default 0.5
        Columns whose fraction of missing values is strictly greater than
        this are recommended for dropping instead of imputing.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name, with the columns "dtype", "missing_pct"
        (rounded to 3 decimals) and "suggestion". Columns without missing
        values are left out, so the result is empty when nothing is missing.
    """
    suggestions = {}
    for col in df.columns:
        series = df[col]
        dtype = series.dtype
        missing_pct = series.isna().mean()

        # `not > 0` (rather than `== 0`) also skips the NaN ratio produced
        # by a zero-row frame.
        if not missing_pct > 0:
            continue

        if missing_pct > drop_threshold:
            suggestion = "Drop column (too many missing values)"

        # Boolean features. Checked before the numeric test because pandas
        # reports bool dtypes as numeric, which would make this unreachable.
        elif pd.api.types.is_bool_dtype(dtype):
            suggestion = "Mode imputation (True/False)"

        # Numerical features
        elif pd.api.types.is_numeric_dtype(dtype):
            non_null = series.dropna()
            if non_null.nunique() < 15:  # numeric but categorical (like codes)
                suggestion = "Mode imputation (numeric categorical)"
            elif len(non_null) < 10:
                suggestion = "Median imputation (small sample)"
            elif abs(non_null.skew()) < 1:
                suggestion = "Mean imputation"
            else:
                suggestion = "Median imputation (skewed)"

        # Categorical / text features. isinstance() replaces the deprecated
        # pd.api.types.is_categorical_dtype().
        elif (
            isinstance(dtype, pd.CategoricalDtype)
            or pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
        ):
            if series.nunique(dropna=True) <= 10:
                suggestion = "Mode imputation (most frequent)"
            else:
                suggestion = "Impute with 'Unknown' or predictive model"

        # Datetime features
        elif pd.api.types.is_datetime64_any_dtype(dtype):
            suggestion = "Forward/Backward fill or interpolation (time series)"

        else:
            suggestion = "Custom handling needed"

        suggestions[col] = {
            "dtype": str(dtype),
            "missing_pct": round(float(missing_pct), 3),
            "suggestion": suggestion,
        }

    return pd.DataFrame.from_dict(suggestions, orient="index")


In [None]:
def auto_impute(df, drop_threshold=0.5):
    """
    Automatically impute missing values in a DataFrame based on simple
    best-practice heuristics.

    Per column that actually has missing values:
      * more than ``drop_threshold`` missing -> the column is dropped
      * boolean                              -> most frequent value
      * numeric                              -> mean, or median when skewed
      * categorical / text, low cardinality  -> most frequent value
      * categorical / text, high cardinality -> the literal "Unknown"
      * datetime                             -> forward fill, then back fill
      * anything else                        -> the literal "Unknown"

    Parameters
    ----------
    df : pandas.DataFrame
        Data to clean. Columns are assigned on the frame that was passed in
        until a column is dropped, so callers should use the return value
        (``df = auto_impute(df)``) and must not rely on the input being
        left untouched.
    drop_threshold : float, default 0.5
        Columns whose fraction of missing values is strictly greater than
        this are dropped.

    Returns
    -------
    pandas.DataFrame
        The imputed frame.
    """
    for col in df.columns:
        series = df[col]
        dtype = series.dtype
        missing_pct = series.isna().mean()

        # Nothing to impute: skip, so no "imputed" message is printed for a
        # column that was not changed (also covers a zero-row frame).
        if not missing_pct > 0:
            continue

        # Drop if too many missing
        if missing_pct > drop_threshold:
            df = df.drop(columns=[col])
            continue

        # pandas dtype predicates are used throughout because
        # np.issubdtype() raises TypeError on extension dtypes such as
        # category, which previously crashed before the right branch ran.

        # Boolean features (checked first: pandas counts bool as numeric)
        if pd.api.types.is_bool_dtype(dtype):
            modes = series.mode(dropna=True)
            if not modes.empty:
                df[col] = series.fillna(modes.iloc[0])

        # Numerical features (mean or median)
        elif pd.api.types.is_numeric_dtype(dtype):
            skewness = series.dropna().skew()
            # Skewness is undefined for very small samples; treat that like
            # the skewed case and fall back to the median.
            if pd.notna(skewness) and abs(skewness) < 1:
                fill_value = series.mean()
                print(f"Imputed '{col}' with mean")
            else:
                fill_value = series.median()
                print(f"'{col}' is skewed, imputed with median")
            df[col] = series.fillna(fill_value)

        # Categorical / text features
        elif (
            isinstance(dtype, pd.CategoricalDtype)
            or pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
        ):
            n_unique = series.nunique(dropna=True)
            unique_ratio = n_unique / df.shape[0]
            modes = series.mode(dropna=True)
            # Low cardinality: fill with the mode, else impute with "Unknown"
            if (n_unique <= 20 or unique_ratio < 0.05) and not modes.empty:
                df[col] = series.fillna(modes.iloc[0])
                print(f"{col} has low Cardinality, imputed with mode")
            else:
                # A Categorical only accepts values that are declared
                # categories, so register "Unknown" before filling with it.
                if (
                    isinstance(dtype, pd.CategoricalDtype)
                    and "Unknown" not in series.cat.categories
                ):
                    series = series.cat.add_categories(["Unknown"])
                df[col] = series.fillna("Unknown")

        # Datetime features
        elif pd.api.types.is_datetime64_any_dtype(dtype):
            # ffill()/bfill() replace the deprecated fillna(method=...).
            df[col] = series.ffill().bfill()

        # Fallback
        else:
            df[col] = series.fillna("Unknown")

    return df

In [6]:
def impute_col(series, strategy):
    """
    Impute missing values in a pandas Series using the specified strategy.

    Supported strategies: 'mean', 'median', 'mode', 'unknown', 'ffill', 'bfill'
    (matched case-insensitively, ignoring surrounding whitespace).

    Parameters
    ----------
    series : pandas.Series
        Column to impute. It is not modified; a new Series is returned.
    strategy : str
        Name of the imputation strategy.

    Returns
    -------
    pandas.Series
        The imputed Series. For an unsupported strategy a message is printed
        and the input Series is returned unchanged.
    """
    method = str(strategy).strip().lower()

    if method == "mean":
        return series.fillna(series.mean())
    if method == "median":
        return series.fillna(series.median())
    if method == "ffill":
        # Series.ffill()/bfill() replace the deprecated fillna(method=...).
        return series.ffill()
    if method == "bfill":
        return series.bfill()

    if method == "mode":
        modes = series.mode(dropna=True)
        if not modes.empty:
            return series.fillna(modes.iloc[0])
        # No non-missing values to take a mode from: use the placeholder.
        method = "unknown"

    if method == "unknown":
        # A Categorical only accepts values that are declared categories,
        # so register "Unknown" before filling with it.
        if (
            isinstance(series.dtype, pd.CategoricalDtype)
            and "Unknown" not in series.cat.categories
        ):
            series = series.cat.add_categories(["Unknown"])
        return series.fillna("Unknown")

    print(f"Unknown strategy '{strategy}', no imputation performed.")
    return series

In [7]:
# Prompt fragment describing the imputation helpers that generated code may
# call. missing_vals() sends it to the LLM as a system message, so this text
# is part of the program's behaviour, not documentation.
helper_docs = """ Helper functions available: 
- auto_impute(df, drop_threshold=0.5): Automatically imputes missing values in a DataFrame based on simple best-practice heuristics.
- impute_col(series, strategy): Impute missing values in a pandas Series using the specified strategy. and returns modified series
    - Supported Strategies: 'mean', 'median', 'mode', 'unknown', 'ffill', 'bfill'

Examples:
- df = auto_impute(df)
- df[col] = impute_col(df[col], 'mean')
"""

# **MAIN FEATURE FUNCTION**

In [8]:
def _strip_code_fences(code):
    """Return ``code`` without a surrounding Markdown code fence, if any."""
    text = code.strip()
    if not text.startswith("```"):
        return text
    lines = text.splitlines()[1:]  # drop the opening ``` / ```python line
    if lines and lines[-1].strip().startswith("```"):
        lines = lines[:-1]
    return "\n".join(lines).strip()


def missing_vals(df, user_query):
    """
    Main function that gets called by the main router.

    Asks the LLM for Python code that handles the missing values in ``df``
    according to ``user_query``, runs that code, and returns the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to clean. The generated code runs on a copy, so the frame
        passed in is never modified.
    user_query : str
        The user's request in natural language.

    Returns
    -------
    pandas.DataFrame
        The frame produced by the generated code, or the unmodified input
        ``df`` when the generated code raises.

    Notes
    -----
    The signature is ``(df, user_query)``, in that order.
    """
    import datetime
    import math
    import re

    suggestions = analyze_missing_values(df)

    # Create message chain
    messages = []
    messages.append(SystemMessage(content=helper_docs))
    messages.append(SystemMessage(content=f"""
    You are a data cleaning agent trying to generate dataset summaries.
    
    Dataset info: Shape: {df.shape}, Sample: {df.head(3).to_string()}
    
    imputation suggestions: {suggestions if not suggestions.empty else "No Missing Values!"}

    Libraries available:
    - pd (pandas), np (numpy)
    - math, re, datetime
    - preprocessing, impute (from sklearn)
    
    Rules:
    - Return only executable Python code, no explanations, no markdown blocks
    - Use helper functions if needed
    - ASSUME "df" IS ALREADY DEFINED
    - In order to generate a response/message to the user use print statements
    print("message")
    - Write a detailed print message to summarise actions taken and reasons
    """))
    messages.append(HumanMessage(content=f"User request: {user_query}"))

    # Call LLM with message chain
    llm = ChatOpenAI(temperature=0, model_name="gpt-4o-mini")
    response = llm.invoke(messages)
    # Models sometimes wrap the answer in a Markdown fence despite the rules
    # above; a fence would make exec() fail with a SyntaxError.
    generated_code = _strip_code_fences(response.content)

    # Run the code in an explicit namespace. exec() inside a function cannot
    # rebind that function's local variables, so a statement such as
    # `df = auto_impute(df)` in the generated code would otherwise be lost.
    namespace = dict(globals())  # exposes pd, np and the helper functions
    # Standard-library modules the prompt promises; setdefault keeps any name
    # the notebook has already bound.
    namespace.setdefault("math", math)
    namespace.setdefault("re", re)
    namespace.setdefault("datetime", datetime)
    # NOTE(review): the prompt also advertises sklearn's `preprocessing` and
    # `impute`, which this function does not import. Confirm they are
    # provided elsewhere or drop them from the prompt.
    working_df = df.copy()
    namespace["df"] = working_df

    # SECURITY: this executes LLM-generated code with full interpreter
    # privileges. Only run it in a trusted or sandboxed environment.
    try:
        exec(generated_code, namespace)
    except Exception as e:
        print(f"Error: {e}")
        print(f"Generated Code:{generated_code}")
        return df

    result = namespace.get("df")
    # If the code rebound `df` to something that is not a DataFrame (e.g. the
    # None returned by an inplace= call), keep the in-place edits instead.
    return result if isinstance(result, pd.DataFrame) else working_df

# **Testing**

In [31]:
# Enter CSV filename from "datasets" folder
dataset_name = "Life Expectancy Data.csv"

# Build CSV path (to avoid import errors)
load_dotenv()
# Fall back to the current working directory when PROJECT_ROOT is unset or
# empty, instead of failing with a bare KeyError before the path is built.
PROJECT_ROOT = Path(os.environ.get("PROJECT_ROOT") or Path.cwd())
path = PROJECT_ROOT / "datasets" / dataset_name

# Keep an untouched copy of the loaded data next to the frame used in tests.
df = pd.read_csv(path)
test_df = df.copy()

In [None]:
# test_df = missing_vals(df, "Analyse missing values")

In [None]:
# test_df['Population'].isnull().sum()