# **UTS PE**

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# statsmodels
import statsmodels.api as sm
import statsmodels.formula.api as smf

# sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# assumptions
## kolmogorov-smirnov test
from scipy import stats
from statsmodels.stats.diagnostic import het_breuschpagan
from statsmodels.stats.diagnostic import acorr_ljungbox

Metrics Template:

In [4]:
def metrics(y_test, X_test, model):
    """
    Evaluate a fitted regression model on an evaluation set.

    Computes R-squared, adjusted R-squared, a Gaussian log-likelihood, AIC and
    BIC from the model's predictions on ``X_test``.

    Parameters:
    - y_test: Array-like, observed values of the dependent variable. Values are
      matched to the predictions by position, so its index does not have to
      equal the index of ``X_test``.
    - X_test: DataFrame of independent variables for the evaluation set. Its
      shape supplies n (rows) and p (columns).
    - model: Fitted regression result exposing ``predict(X_test)`` (for example
      the object returned by a statsmodels ``fit()`` call).

    Returns:
    - Single-row DataFrame with the columns "R-squared", "Adjusted R-squared",
      "Log-Likelihood", "AIC" and "BIC".

    Raises:
    - ValueError: if ``X_test`` has no rows, or if the number of observed
      values differs from the number of predictions.
    """
    n = X_test.shape[0]  # number of observations
    p = X_test.shape[1]  # number of predictor columns
    if n == 0:
        raise ValueError("metrics() needs at least one observation in X_test")

    # Work on plain 1-D float arrays so that observed and predicted values are
    # paired by position. Subtracting two pandas objects aligns on index
    # labels instead, which yields NaN residuals (silently skipped by the sum)
    # whenever the indexes differ.
    y_true = np.asarray(y_test, dtype=float).ravel()
    y_pred = np.asarray(model.predict(X_test), dtype=float).ravel()
    if y_true.shape[0] != y_pred.shape[0]:
        raise ValueError(
            f"y_test has {y_true.shape[0]} observations but the model "
            f"returned {y_pred.shape[0]} predictions"
        )

    # Sum of squared errors and total sum of squares
    residuals = y_true - y_pred
    sse = float(np.sum(residuals ** 2))
    sst = float(np.sum((y_true - y_true.mean()) ** 2))

    # R-squared = 1 - SSE / SST. The degenerate cases follow the conventions
    # of sklearn's r2_score: undefined for fewer than two samples, and for a
    # constant target 1.0 on a perfect fit, 0.0 otherwise.
    if n < 2:
        r2 = np.nan
    elif sst == 0:
        r2 = 1.0 if sse == 0 else 0.0
    else:
        r2 = 1 - sse / sst

    # Adjusted R-squared is only defined with positive residual degrees of
    # freedom; report NaN instead of dividing by zero or a negative number.
    dof = n - p - 1
    adj_r2 = 1 - (1 - r2) * (n - 1) / dof if dof > 0 else np.nan

    # Gaussian log-likelihood evaluated at the residual variance SSE / n.
    # A perfect fit (SSE == 0) gives log(0); the result is then +inf, and the
    # floating-point warning for that case is suppressed.
    sigma2 = sse / n
    with np.errstate(divide="ignore"):
        ll = -0.5 * n * (np.log(2 * np.pi * sigma2) + 1)

    # Information criteria with k = p + 1 parameters.
    # NOTE(review): p counts every column of X_test, so a constant column
    # already present in X_test is included in p as well - confirm against the
    # caller whether k should be p rather than p + 1 in that case.
    k = p + 1
    aic = 2 * k - 2 * ll  # AIC = 2k - 2LL
    bic = np.log(n) * k - 2 * ll  # BIC = ln(n) * k - 2LL

    # Collect everything in a single-row DataFrame
    metrics_dict = {
        "R-squared": r2,
        "Adjusted R-squared": adj_r2,
        "Log-Likelihood": ll,
        "AIC": aic,
        "BIC": bic
    }
    metrics_df = pd.DataFrame(metrics_dict, index=[0])

    return metrics_df

Assumption Template:

In [3]:
from scipy.stats import kstest
from statsmodels.stats.diagnostic import het_breuschpagan, acorr_ljungbox

def test_assumptions(residuals, X_train, alpha=0.05, lags=10):
    """
    Test the regression assumptions of normality, homoskedasticity and
    no autocorrelation on a set of residuals.

    Parameters:
    - residuals: Residuals from the regression model. Must provide ``.mean()``
      and ``.std()`` (e.g. a pandas Series or a NumPy array).
    - X_train: Training data for the independent variables, passed to the
      Breusch-Pagan test as its explanatory variables.
    - alpha: Significance level used for all three conclusions. Defaults to
      0.05.
    - lags: Lag specification forwarded to ``acorr_ljungbox``. Defaults to 10.

    Returns:
    - DataFrame with columns: Assumption, Statistics, p-value, and Conclusion.
      The "No Autocorrelation" row stores lists (one entry per tested lag) in
      its Statistics and p-value cells.
    """

    # Normality Test (Kolmogorov-Smirnov) against a normal distribution whose
    # mean and standard deviation are taken from the residuals themselves.
    # NOTE(review): with parameters estimated from the same sample the plain
    # KS p-value tends to be conservative; statsmodels' `lilliefors` test is
    # the variant meant for that situation - consider whether to switch.
    ks_statistic, ks_pvalue = kstest(residuals, 'norm', args=(residuals.mean(), residuals.std()))
    ks_conclusion = "Fail to Reject H0 (Normal)" if ks_pvalue > alpha else "Reject H0 (Not Normal)"

    # Homoskedasticity Test (Breusch-Pagan); only the LM statistic and its
    # p-value are used, the F-test variant is discarded.
    # NOTE(review): statsmodels expects the explanatory matrix to contain a
    # constant column - confirm that X_train is passed in that form.
    bp_statistic, bp_pvalue, _, _ = het_breuschpagan(residuals, X_train)
    bp_conclusion = "Fail to Reject H0 (Homoskedastic)" if bp_pvalue > alpha else "Reject H0 (Heteroskedastic)"

    # No-Autocorrelation Test (Ljung-Box) for the requested lags
    lb_results = acorr_ljungbox(residuals, lags=lags, return_df=True)

    # One statistic and one p-value per tested lag
    lb_statistics = lb_results['lb_stat'].tolist()
    lb_pvalues = lb_results['lb_pvalue'].tolist()

    # Label significant results with the lag numbers reported in the result
    # index rather than with their position, so the labels stay correct when
    # `lags` is not simply 1..N.
    tested_lags = lb_results.index.tolist()
    autocorrelation_lags = [lag for lag, pval in zip(tested_lags, lb_pvalues) if pval <= alpha]
    if autocorrelation_lags:
        lb_conclusion = f"Autocorrelation exists at lag {', '.join(map(str, autocorrelation_lags))}"
    else:
        lb_conclusion = f"No Autocorrelation up to lag {max(tested_lags)}"

    # Create DataFrame
    results = pd.DataFrame({
        "Assumption": ["Normality", "Homoskedasticity", "No Autocorrelation"],
        "Statistics": [ks_statistic, bp_statistic, lb_statistics],
        "p-value": [ks_pvalue, bp_pvalue, lb_pvalues],
        "Conclusion": [ks_conclusion, bp_conclusion, lb_conclusion]
    })

    return results

In [1]:
import pandas as pd

In [26]:
# Load the workbook into a DataFrame.
# NOTE(review): `pdb` is also the name of Python's standard-library debugger
# module; the name is kept here because the following cells refer to it.
pdb = pd.read_excel('PDBB.xlsx')

In [28]:
# Display the frame exactly as it was loaded.
pdb

Unnamed: 0,PDB
0,"1967-12-31,53.3382,,"
1,"1968-12-31,64.8392,21.56,"
2,"1969-12-31,74.3422,14.66,"
3,"1970-12-31,79.4135,6.82,"
4,"1971-12-31,78.8658,-0.69,"
5,"1972-12-31,90.5121,14.77,"
6,"1973-12-31,130.4897,44.17,"
7,"1974-12-31,201.6677,54.55,"
8,"1975-12-31,232.1706,15.13,"
9,"1976-12-31,277.0508,19.33,"


In [29]:
# Split the comma-joined text in 'PDB' into four columns. 'PDB' itself is
# overwritten with the second piece, so a second run of this cell would find
# no commas left to split and the four-column assignment would fail; the guard
# makes the cell safe to re-run.
# The split pieces are strings - no numeric or date conversion happens here.
if 'Date' not in pdb.columns:
    pdb[['Date', 'PDB', 'Unwanted', 'Check']] = pdb['PDB'].str.split(',', expand=True)
pdb

Unnamed: 0,PDB,Date,Unwanted,Check
0,53.3382,1967-12-31,,
1,64.8392,1968-12-31,21.56,
2,74.3422,1969-12-31,14.66,
3,79.4135,1970-12-31,6.82,
4,78.8658,1971-12-31,-0.69,
5,90.5121,1972-12-31,14.77,
6,130.4897,1973-12-31,44.17,
7,201.6677,1974-12-31,54.55,
8,232.1706,1975-12-31,15.13,
9,277.0508,1976-12-31,19.33,
