In [1]:
import numpy as np
import pandas as pd

# Load the homework dataset. When the CSV is not present, fall back to a
# seeded synthetic sample so the rest of the notebook can still run.
try:
    df = pd.read_csv("data/raw/outliers_homework.csv")
except FileNotFoundError:
    # Seed only on the fallback path so the generated sample is reproducible.
    np.random.seed(42)
    df = pd.DataFrame({
        "x": np.random.normal(50, 10, 100),
        "y": np.random.normal(30, 5, 100)
    })

df.head()

Unnamed: 0,x,y
0,54.967142,22.923146
1,48.617357,27.896773
2,56.476885,28.286427
3,65.230299,25.988614
4,47.658466,29.193571


In [2]:
def detect_outliers_iqr(series, multiplier=1.5):
    """
    Detect outliers using the IQR method.

    A value is flagged when it is strictly below Q1 - multiplier * IQR or
    strictly above Q3 + multiplier * IQR, where IQR = Q3 - Q1.

    Parameters:
        series (pd.Series): A numeric column.
        multiplier (float): Fence width in IQR units, default = 1.5
    Returns:
        pd.Series: Boolean mask where True = outlier.
    """
    Q1 = series.quantile(0.25)
    Q3 = series.quantile(0.75)
    IQR = Q3 - Q1
    # Fences sit `multiplier` IQRs outside the quartiles.
    lower = Q1 - multiplier * IQR
    upper = Q3 + multiplier * IQR
    return (series < lower) | (series > upper)

In [None]:
def detect_outliers_zscore(series, threshold=3.0):
    """
    Flag values whose absolute Z-score is greater than a cutoff.

    Parameters:
        series (pd.Series): A numeric column.
        threshold (float): Z-score cutoff, default = 3.0
    Returns:
        pd.Series: Boolean mask where True = outlier.
    """
    # Express each value as its distance from the mean in units of the std.
    centered = series - series.mean()
    standardized = centered / series.std()
    return standardized.abs() > threshold

In [5]:
def winsorize_series(series, lower=0.05, upper=0.95):
    """
    Cap a series at its own lower and upper quantile values.

    Parameters:
        series (pd.Series): Column to winsorize.
        lower (float): Quantile used as the floor, default = 0.05
        upper (float): Quantile used as the ceiling, default = 0.95
    Returns:
        pd.Series: The series with values clipped to the two quantile bounds.
    """
    floor_value, ceiling_value = (series.quantile(q) for q in (lower, upper))
    return series.clip(lower=floor_value, upper=ceiling_value)

def detect_outliers_zscore(series, threshold=3.0):
    """
    Mark outliers by comparing each value's absolute Z-score to a cutoff.

    Parameters:
        series (pd.Series): A numeric column.
        threshold (float): Z-score cutoff, default = 3.0
    Returns:
        pd.Series: Boolean mask where True = outlier.
    """
    # Single chain: subtract the mean, divide by the std, compare |z| to the cutoff.
    return series.sub(series.mean()).div(series.std()).abs().gt(threshold)

In [6]:
# Compute both masks on column "x" first, then attach them to the frame.
flags = {
    "outlier_iqr": detect_outliers_iqr(df["x"]),
    "outlier_zscore": detect_outliers_zscore(df["x"]),
}
for column, mask in flags.items():
    df[column] = mask

iqr_count = df["outlier_iqr"].sum()
zscore_count = df["outlier_zscore"].sum()
print("Number of outliers detected by IQR:", iqr_count)
print("Number of outliers detected by Z-score:", zscore_count)

Number of outliers detected by IQR: 1
Number of outliers detected by Z-score: 0


In [7]:
import pandas as pd
import numpy as np

def detect_outliers_iqr(series, multiplier=1.5):
    """
    Detect outliers using the Interquartile Range (IQR) method.

    A value is flagged when it is strictly below q1 - multiplier * iqr or
    strictly above q3 + multiplier * iqr, where iqr = q3 - q1.

    Parameters:
        series (pd.Series): A numeric column.
        multiplier (float): Fence width in IQR units, default = 1.5
    Returns:
        pd.Series: Boolean mask where True = outlier.
    """
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    # Fences sit `multiplier` IQRs outside the quartiles.
    lower_bound = q1 - multiplier * iqr
    upper_bound = q3 + multiplier * iqr
    return (series < lower_bound) | (series > upper_bound)

def detect_outliers_zscore(series, threshold=3.0):
    """
    Detect outliers by absolute Z-score.

    Returns a boolean mask that is True where the value's absolute Z-score
    (distance from the series mean divided by the series std) is greater
    than `threshold`.
    """
    mu, sigma = series.mean(), series.std()
    return ((series - mu) / sigma).abs() > threshold

def winsorize_series(series, lower=0.05, upper=0.95):
    """
    Clip a series to the values found at its `lower` and `upper` quantiles.

    Values below the lower-quantile value are raised to it and values above
    the upper-quantile value are lowered to it; the clipped series is returned.
    """
    return series.clip(
        lower=series.quantile(lower),
        upper=series.quantile(upper),
    )

In [8]:
# Example data: 100 seeded normal draws plus two hand-placed extreme values.
np.random.seed(0)
inliers = np.random.normal(50, 5, 100)
extremes = [10, 100]
data = pd.DataFrame({"x": np.concatenate([inliers, extremes])})

# Flag outliers with each detector.
data["outlier_iqr"] = detect_outliers_iqr(data["x"])
data["outlier_zscore"] = detect_outliers_zscore(data["x"])

# Store a capped copy of the column alongside the original.
data["x_winsorized"] = winsorize_series(data["x"])

In [9]:
# Tally the flags from each detector, then compare the maximum before and after capping.
iqr_total, zscore_total = (data[name].sum() for name in ("outlier_iqr", "outlier_zscore"))
raw_max, capped_max = (data[name].max() for name in ("x", "x_winsorized"))

print("Number of outliers detected by IQR:", iqr_total)
print("Number of outliers detected by Z-score:", zscore_total)
print("Original max value:", raw_max)
print("Winsorized max value:", capped_max)

Number of outliers detected by IQR: 2
Number of outliers detected by Z-score: 2
Original max value: 100.0
Winsorized max value: 59.3173680766888


In [10]:
# Summary table: one row per technique with the number of values it flagged.
methods = ["IQR", "Z-score", "Winsorization"]
flagged_counts = [
    data["outlier_iqr"].sum(),
    data["outlier_zscore"].sum(),
    # Winsorization caps values rather than flagging them, so it has no count.
    "N/A (capped instead of flagged)",
]
summary = pd.DataFrame({"Method": methods, "Outliers Detected": flagged_counts})
print(summary)

          Method                Outliers Detected
0            IQR                                2
1        Z-score                                2
2  Winsorization  N/A (capped instead of flagged)
