El Niño Data Exploration

In [3]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller, kpss
from scipy import stats

def adf_test(data):
    """Return the p-value of the Augmented Dickey-Fuller test run on ``data``.

    ``adfuller`` is called with its default arguments and only the second
    element of its result (the p-value) is kept.
    """
    return adfuller(data)[1]

def kpss_test(data):
    """Return the p-value of the KPSS test run on ``data``.

    The test is run with ``regression='c'`` and ``nlags='auto'``. When
    statsmodels warns that the statistic is outside its look-up table, the
    returned p-value is only a bound on the actual one.
    """
    return kpss(data, regression='c', nlags='auto')[1]

def jarque_bera_test(data):
    """Return the p-value of the Jarque-Bera test applied to ``data``."""
    return stats.jarque_bera(data)[1]

def shapiro_wilk_test(data):
    """Return the p-value of the Shapiro-Wilk test applied to ``data``."""
    return stats.shapiro(data)[1]

# NOTE(review): absolute, user-specific path; the cell only runs where this file exists.
file_path = "/Users/jenniferzhang/Desktop/24_25/quantsc/EODHist_NQSSFB.xlsx"
df = pd.read_excel(file_path)

index_data = df['Index Value']

# Compute statistics
min_value = index_data.min()
max_value = index_data.max()
mean_value = index_data.mean()
std_dev = index_data.std()

# Perform tests (each helper returns only the p-value)
adf_p_value = adf_test(index_data)
kpss_p_value = kpss_test(index_data)
jb_p_value = jarque_bera_test(index_data)
sw_p_value = shapiro_wilk_test(index_data)

# Display results with better interpretation
print("=== Descriptive Statistics ===")
print(f"Minimum Index Value: {min_value:.2f}")
print(f"Maximum Index Value: {max_value:.2f}")
print(f"Mean Index Value: {mean_value:.2f}")
print(f"Standard Deviation: {std_dev:.2f}")

print("\n=== Stationarity Tests ===")
# The two tests have opposite null hypotheses: a small ADF p-value points to
# stationarity, a small KPSS p-value points to non-stationarity.
print(f"ADF Test p-value: {adf_p_value:.4f} -> {'Stationary' if adf_p_value < 0.05 else 'Non-stationary'}")
print(f"KPSS Test p-value: {kpss_p_value:.4f} -> {'Non-stationary' if kpss_p_value < 0.05 else 'Stationary'}")

if adf_p_value < 0.05 and kpss_p_value >= 0.05:
    print("Conclusion: The series is stationary.")
elif adf_p_value >= 0.05 and kpss_p_value < 0.05:
    print("Conclusion: The series is non-stationary.")
else:
    print("Conclusion: The results are conflicting, further analysis may be needed.")

print("\n=== Normality Tests ===")
print(f"Jarque-Bera Test p-value: {jb_p_value:.4f} -> {'Non-normal' if jb_p_value < 0.05 else 'Normal'}")
print(f"Shapiro-Wilk Test p-value: {sw_p_value:.4f} -> {'Non-normal' if sw_p_value < 0.05 else 'Normal'}")

# Both tests share the same null hypothesis (normality). Only report a firm
# conclusion when they agree; a single rejection must not be reported as
# "normally distributed".
jb_rejects = jb_p_value < 0.05
sw_rejects = sw_p_value < 0.05
if jb_rejects and sw_rejects:
    print("Conclusion: The data is not normally distributed.")
elif not jb_rejects and not sw_rejects:
    print("Conclusion: The data appears to be normally distributed.")
else:
    print("Conclusion: The results are conflicting, further analysis may be needed.")


=== Descriptive Statistics ===
Minimum Index Value: 0.00
Maximum Index Value: 1368.36
Mean Index Value: 1077.84
Standard Deviation: 136.46

=== Stationarity Tests ===
ADF Test p-value: 0.4214 -> Non-stationary
KPSS Test p-value: 0.0100 -> Non-stationary
Conclusion: The series is non-stationary.

=== Normality Tests ===
Jarque-Bera Test p-value: 0.0006 -> Non-normal
Shapiro-Wilk Test p-value: 0.0000 -> Non-normal
Conclusion: The data is not normally distributed.


look-up table. The actual p-value is smaller than the p-value returned.

  result = kpss(data, regression='c', nlags='auto')


Nasdaq US Smart Food & Beverage Index (NQSSFB) Data Exploration

In [1]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller, kpss
from scipy import stats

# Excel workbook that holds the index history
file_path = "/Users/jenniferzhang/Desktop/24_25/quantsc/EODHist_NQSSFB.xlsx"

# Load the workbook and keep the 'Index Value' column as the series to analyse
df = pd.read_excel(file_path)
index_data = df['Index Value']

# Descriptive statistics of the series
min_value, max_value = index_data.min(), index_data.max()
mean_value, std_dev = index_data.mean(), index_data.std()

# Stationarity Tests
def adf_test(data):
    """Run the Augmented Dickey-Fuller (ADF) test and return its p-value.

    H0: the series has a unit root, i.e. it is not stationary.
    """
    _statistic, p_value, *_details = adfuller(data)
    return p_value

def kpss_test(data):
    """Run the Kwiatkowski-Phillips-Schmidt-Shin (KPSS) test and return its p-value.

    H0: the series is stationary. The test is run with ``regression='c'`` and
    ``nlags='auto'``. When statsmodels warns that the statistic is outside its
    look-up table, the returned p-value is only a bound on the actual one.
    """
    _statistic, p_value, *_details = kpss(data, regression='c', nlags='auto')
    return p_value

# Normality Tests
def jarque_bera_test(data):
    """Run the Jarque-Bera (JB) normality test and return its p-value.

    H0: the data follows a normal distribution.
    """
    jb_result = stats.jarque_bera(data)
    return jb_result[1]

def shapiro_wilk_test(data):
    """Run the Shapiro-Wilk (SW) normality test and return its p-value.

    H0: the data follows a normal distribution.
    """
    sw_result = stats.shapiro(data)
    return sw_result[1]

# Run the two stationarity tests and the two normality tests on the series
adf_p_value, kpss_p_value = adf_test(index_data), kpss_test(index_data)
jb_p_value, sw_p_value = jarque_bera_test(index_data), shapiro_wilk_test(index_data)

# Assemble the whole report first, then emit it in one go
report_lines = [
    "=== Descriptive Statistics ===",
    f"Minimum Index Value: {min_value:.4f}",
    f"Maximum Index Value: {max_value:.4f}",
    f"Mean Index Value: {mean_value:.4f}",
    f"Standard Deviation: {std_dev:.4f}",
    "\n=== Stationarity Tests ===",
    f"ADF Test p-value: {adf_p_value:.4f} (p < 0.05 suggests stationarity)",
    f"KPSS Test p-value: {kpss_p_value:.4f} (p < 0.05 suggests non-stationarity)",
    "\n=== Normality Tests ===",
    f"Jarque-Bera Test p-value: {jb_p_value:.4f} (p < 0.05 suggests non-normality)",
    f"Shapiro-Wilk Test p-value: {sw_p_value:.4f} (p < 0.05 suggests non-normality)",
]
print("\n".join(report_lines))


=== Descriptive Statistics ===
Minimum Index Value: 0.0000
Maximum Index Value: 1368.3632
Mean Index Value: 1077.8388
Standard Deviation: 136.4647

=== Stationarity Tests ===
ADF Test p-value: 0.4214 (p < 0.05 suggests stationarity)
KPSS Test p-value: 0.0100 (p < 0.05 suggests non-stationarity)

=== Normality Tests ===
Jarque-Bera Test p-value: 0.0006 (p < 0.05 suggests non-normality)
Shapiro-Wilk Test p-value: 0.0000 (p < 0.05 suggests non-normality)


look-up table. The actual p-value is smaller than the p-value returned.

  result = kpss(data, regression='c', nlags='auto')


MSCI Agriculture & Food Chain Index (M2WO0AGF) Data Exploration

In [None]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller, kpss
from scipy import stats

file_path = "/Users/jenniferzhang/Desktop/24_25/quantsc/MSCI.xlsx"

# Load the workbook; the analysis below uses its 'Index Value' column
df = pd.read_excel(file_path)
index_data = df['Index Value']

# Descriptive statistics of the series
std_dev = index_data.std()
mean_value = index_data.mean()
max_value = index_data.max()
min_value = index_data.min()

# Stationarity Tests - ADF (Augmented Dickey-Fuller) Test
def adf_test(data):
    """Return the p-value reported by ``adfuller`` (default settings) for ``data``."""
    adf_output = adfuller(data)
    p_value = adf_output[1]
    return p_value

# KPSS Test (Kwiatkowski-Phillips-Schmidt-Shin)
def kpss_test(data):
    """Return the p-value reported by ``kpss`` for ``data``.

    The test is run with ``regression='c'`` and ``nlags='auto'``.
    """
    kpss_output = kpss(data, regression='c', nlags='auto')
    p_value = kpss_output[1]
    return p_value

# Normality Tests - Jarque-Bera Test (JB)
def jarque_bera_test(data):
    """Return the Jarque-Bera p-value for ``data``; the test statistic is discarded."""
    p_value = stats.jarque_bera(data)[1]
    return p_value

# Shapiro-Wilk Test (SW test)
def shapiro_wilk_test(data):
    """Return the Shapiro-Wilk p-value for ``data``; the test statistic is discarded."""
    p_value = stats.shapiro(data)[1]
    return p_value

# Run the stationarity tests (ADF, KPSS) and the normality tests (JB, SW)
adf_p_value, kpss_p_value = adf_test(index_data), kpss_test(index_data)
jb_p_value, sw_p_value = jarque_bera_test(index_data), shapiro_wilk_test(index_data)

# Print the raw values, one per line, grouped by section
print(
    f"Min: {min_value}",
    f"Max: {max_value}",
    f"Mean: {mean_value}",
    f"Standard Deviation: {std_dev}",
    "\nStationarity Tests:",
    f"ADF test p-value: {adf_p_value}",
    f"KPSS test p-value: {kpss_p_value}",
    "\nNormality Tests:",
    f"Jarque-Bera test p-value: {jb_p_value}",
    f"Shapiro-Wilk test p-value: {sw_p_value}",
    sep="\n",
)


S&P Food & Beverage Select Industry Index (SPSIFBN) Data Exploration

In [7]:
import pandas as pd
import numpy as np
from statsmodels.tsa.stattools import adfuller, kpss
from scipy.stats import jarque_bera, shapiro

file_path = "/Users/jenniferzhang/Desktop/24_25/quantsc/INDEX_XX_S&P.csv"

# Read the CSV file
df = pd.read_csv(file_path)

# Parse the 'Date' column (month/day/year) into datetimes
df['Date'] = pd.to_datetime(df['Date'], format='%m/%d/%Y')

# Strip thousands separators from the price columns and convert them to numbers
numeric_columns = ['Open', 'High', 'Low', 'Close']
df[numeric_columns] = df[numeric_columns].replace(',', '', regex=True).apply(pd.to_numeric)

# Drop rows with a missing price, then put the rows in chronological order.
# The ADF/KPSS tests run on df['Close'] read row order as time order, so the
# frame has to be oldest-first. This is a no-op if the file already is, and
# the stable sort keeps rows that share a date in their original file order.
df = df.dropna(subset=numeric_columns).sort_values('Date', kind='mergesort')

# Compute Basic Statistics: one column of [min, max, mean, std] per price column,
# in the same order as the "Metric" labels
summary_stats = {"Metric": ["Min", "Max", "Mean", "Standard Deviation"]}
summary_stats.update({
    column: [df[column].min(), df[column].max(), df[column].mean(), df[column].std()]
    for column in numeric_columns
})

# Convert summary stats to DataFrame
stats_df = pd.DataFrame(summary_stats)

# Stationarity Tests (ADF Test, KPSS Test)
def stationarity_tests(series):
    """Run the ADF and KPSS tests on ``series`` and return both p-values.

    Returns:
        A ``(adf_pvalue, kpss_pvalue)`` tuple. ``kpss_pvalue`` is ``np.nan``
        when ``kpss`` raises ``ValueError``; a warning is printed in that case.
    """
    # Augmented Dickey-Fuller test, lag length chosen by AIC
    adf_pvalue = adfuller(series, autolag='AIC')[1]

    # Kwiatkowski-Phillips-Schmidt-Shin test around a constant
    try:
        kpss_pvalue = kpss(series, regression='c', nlags='auto')[1]
    except ValueError:
        # KPSS failed: report it and signal the missing result with NaN
        print("Warning: KPSS test could not be performed due to insufficient data.")
        kpss_pvalue = np.nan

    return adf_pvalue, kpss_pvalue

# Perform stationarity tests on the Close price
adf_pval, kpss_pval = stationarity_tests(df['Close'])

# Normality Tests (Jarque-Bera, Shapiro-Wilk) on the Close price
jb_pval = jarque_bera(df['Close'])[1]  # Jarque-Bera test p-value
sw_pval = shapiro(df['Close'])[1]  # Shapiro-Wilk test p-value


def _interpret_pvalue(p_value, below_msg, at_or_above_msg):
    """Pick the interpretation text for a p-value at the 0.05 level.

    A NaN p-value (e.g. the KPSS fallback from ``stationarity_tests``) compares
    False against 0.05 and would otherwise be reported with ``at_or_above_msg``;
    it is reported as unavailable instead.
    """
    if np.isnan(p_value):
        return 'test result unavailable'
    return below_msg if p_value < 0.05 else at_or_above_msg


# Display Results
print("===== Summary Statistics =====")
print(stats_df)
print("\n===== Stationarity Tests =====")
print(f"ADF Test p-value: {adf_pval:.5f} ({_interpret_pvalue(adf_pval, 'p < 0.05 suggests stationarity', 'p >= 0.05 suggests non-stationarity')})")
print(f"KPSS Test p-value: {kpss_pval:.5f} ({_interpret_pvalue(kpss_pval, 'p < 0.05 suggests non-stationarity', 'p >= 0.05 suggests stationarity')})")
print("\n===== Normality Tests =====")
print(f"Jarque-Bera Test p-value: {jb_pval:.5f} ({_interpret_pvalue(jb_pval, 'p < 0.05 suggests non-normality', 'p >= 0.05 suggests normality')})")
print(f"Shapiro-Wilk Test p-value: {sw_pval:.5f} ({_interpret_pvalue(sw_pval, 'p < 0.05 suggests non-normality', 'p >= 0.05 suggests normality')})")


===== Summary Statistics =====
               Metric         Open         High          Low        Close
0                 Min  6972.130000  6972.130000  6972.130000  6972.130000
1                 Max  7819.550000  7819.550000  7819.550000  7819.550000
2                Mean  7387.671195  7387.671195  7387.671195  7387.670717
3  Standard Deviation   182.608980   182.608980   182.608980   182.608776

===== Stationarity Tests =====
ADF Test p-value: 0.08056 (p >= 0.05 suggests non-stationarity)
KPSS Test p-value: 0.09945 (p >= 0.05 suggests stationarity)

===== Normality Tests =====
Jarque-Bera Test p-value: 0.11749 (p >= 0.05 suggests normality)
Shapiro-Wilk Test p-value: 0.01960 (p < 0.05 suggests non-normality)
