In [5]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

def calculate_cooks_distance(data, dependent_var, independent_vars):
    """
    Calculates Cook's distance for every observation of an OLS regression.

    An ordinary least squares model of ``dependent_var`` on ``independent_vars``
    is fitted with statsmodels, and Cook's distance is read from the fitted
    model's influence statistics.

    :param data: pandas DataFrame, the entire dataset
    :param dependent_var: string, the name of the dependent variable column in the dataset
    :param independent_vars: list of strings, the names of the independent variable columns in the dataset
    :return: numpy array, Cook's distance for each data point, in the row order of ``data``
    :raises KeyError: if a requested column is not present in ``data`` (raised by the pandas column lookup)
    """
    # Split the dataset into the design matrix and the response.
    X = data[independent_vars]
    y = data[dependent_var]

    # add_constant prepends an intercept column; per the statsmodels docs its
    # default (has_constant='skip') leaves X untouched if it already has one.
    model = sm.OLS(y, sm.add_constant(X))

    # Fit the model
    results = model.fit()

    # Calculate influence statistics
    influence = results.get_influence()

    # cooks_distance is a 2-tuple; only the distances are needed here. The
    # second element (p-values, per the statsmodels documentation) is discarded.
    cooks_d, _ = influence.cooks_distance

    # Coerce to ndarray so the documented return type holds even when the
    # influence object hands back a pandas Series for pandas inputs
    # (NOTE(review): version-dependent — confirm). No-op for an ndarray.
    return np.asarray(cooks_d)

# Small demonstration dataset: five observations of a single predictor `x`
# and a response `y`.
data = pd.DataFrame({'x': list(range(1, 6)), 'y': [2, 3, 5, 7, 10]})

# Regress `y` on `x` and collect Cook's distance for each of the five rows.
cooks_d = calculate_cooks_distance(
    data,
    dependent_var='y',
    independent_vars=['x'],
)

# Show the per-observation distances.
print(cooks_d)

[1.6875     0.12244898 0.0625     0.12244898 1.6875    ]


In [None]:
# Bare reference to the function (no call): running this cell only echoes the
# function object's repr. The cell has no execution count, so it has not been run.
# NOTE(review): looks like a leftover scratch cell — consider removing it.
calculate_cooks_distance