In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt

# Load the data: headerless, whitespace-delimited text files.
# `sep=r"\s+"` is the documented equivalent of `delim_whitespace=True`, which
# is deprecated as of pandas 2.2 and raises a FutureWarning there.
x_train = pd.read_csv("x_train.txt", header=None, sep=r"\s+")
# Selecting column 0 turns the single-column frame into a Series.
y_train = pd.read_csv("y_train.txt", header=None, sep=r"\s+")[0]
x_test = pd.read_csv("x_test.txt", header=None, sep=r"\s+")

In [16]:
from scipy.stats import jarque_bera, shapiro

# Column layouts of the two result tables.
jb_columns = ["Variable", "JB Statistic", "JB P-Value"]
shapiro_columns = ["Variable", "Shapiro Statistic", "Shapiro P-Value"]


def _count_above_05(results, p_value_column, first_rows=None):
    """Count, per column, the rows of `results` whose p-value exceeds 0.05.

    Rows failing the comparison are blanked to NaN by `where`, so `count()`
    only sees the rows that pass. When `first_rows` is given, only that many
    leading rows are taken into account.
    """
    passing = results.where(results[p_value_column] > 0.05)
    if first_rows is not None:
        passing = passing[:first_rows]
    return passing.count()


# One (column label, statistic, p-value) record per column of x_train,
# collected separately for the Jarque-Bera and the Shapiro-Wilk test.
jb_results = []
shapiro_results = []
for feature in x_train.columns:
    values = x_train[feature]
    jb_results.append((feature, *jarque_bera(values)))
    shapiro_results.append((feature, *shapiro(values)))

jb_df = pd.DataFrame(jb_results, columns=jb_columns)
shapiro_df = pd.DataFrame(shapiro_results, columns=shapiro_columns)

print("Jarque-Bera Test Results")
print(jb_df.head())

print("\nShapiro-Wilk Test Results")
print(shapiro_df.head())

# Number of rows with p-value > 0.05, taken over every row of each table.
jb_p_value_above_05 = _count_above_05(jb_df, "JB P-Value")
print(jb_p_value_above_05)
shapiro_p_value_above_05 = _count_above_05(shapiro_df, "Shapiro P-Value")
print(shapiro_p_value_above_05)

# Persist the full per-column results.
jb_df.to_csv("jarque_bera_results.csv", index=False)
shapiro_df.to_csv("shapiro_wilk_results.csv", index=False)

# Same counts restricted to the first 200 rows. Note that this rebinds the
# two *_above_05 names, so after the cell they hold the restricted counts.
jb_p_value_above_05 = _count_above_05(jb_df, "JB P-Value", first_rows=200)
print(jb_p_value_above_05)
shapiro_p_value_above_05 = _count_above_05(
    shapiro_df, "Shapiro P-Value", first_rows=200
)
print(shapiro_p_value_above_05)

Jarque-Bera Test Results
   Variable  JB Statistic  JB P-Value
0         0      2.561659    0.277807
1         1      0.977533    0.613383
2         2      4.504056    0.105186
3         3      1.365886    0.505128
4         4      0.159712    0.923249

Shapiro-Wilk Test Results
   Variable  Shapiro Statistic  Shapiro P-Value
0         0           0.999594         0.400143
1         1           0.999748         0.841449
2         2           0.999528         0.259479
3         3           0.999654         0.568069
4         4           0.999638         0.521750
Variable        190
JB Statistic    190
JB P-Value      190
dtype: int64
Variable             191
Shapiro Statistic    191
Shapiro P-Value      191
dtype: int64
Variable        190
JB Statistic    190
JB P-Value      190
dtype: int64
Variable             191
Shapiro Statistic    191
Shapiro P-Value      191
dtype: int64
