In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import chi2, SelectKBest
import numpy as np

# Load the Titanic dataset (a common dataset in ML libraries or easily found online)
# For this example, we create a simplified, representative DataFrame.
data = {
    'Survived': np.random.choice([0, 1], size=1000, p=[0.6, 0.4]), # Target: 0=No, 1=Yes
    'Pclass': np.random.choice([1, 2, 3], size=1000, p=[0.2, 0.2, 0.6]), # Categorical Feature
    'Sex': np.random.choice(['male', 'female'], size=1000, p=[0.65, 0.35]), # Categorical Feature
    'Embarked': np.random.choice(['S', 'C', 'Q', 'Unknown'], size=1000, p=[0.7, 0.15, 0.1, 0.05]), # Categorical Feature
    'Deck_Level': np.random.choice(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'Noise'], size=1000, p=[0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.3]), # Mostly Noise/Irrelevant
}
df = pd.DataFrame(data)

# Separate features (X) and target (y)
X = df.drop('Survived', axis=1)
y = df['Survived']

# 2. Encode Categorical Data (Required for chi2)
# The features are all categorical strings and must be converted to numerical integers.
le = LabelEncoder()

X_encoded = X.copy()
for column in X_encoded.columns:
    X_encoded[column] = le.fit_transform(X_encoded[column])

# The target 'Survived' is already 0/1 (integer)
y_encoded = y.values 

print("Encoded Features (First 5 Rows):\n", X_encoded.head())
print("-" * 70)

Encoded Features (First 5 Rows):
    Pclass  Sex  Embarked  Deck_Level
0       2    0         2           5
1       2    0         2           0
2       2    0         2           7
3       0    1         2           1
4       0    1         2           4
----------------------------------------------------------------------


In [2]:
# Apply SelectKBest with chi2 as the scoring function
selector = SelectKBest(score_func=chi2, k='all')
selector.fit(X_encoded, y_encoded)

# Get the scores and p-values
chi2_scores = selector.scores_
p_values = selector.pvalues_

# 4. Summarize and Rank the Features
feature_scores = pd.DataFrame({
    'Feature': X.columns,
    'Chi2_Score': chi2_scores,
    'P_Value': p_values
})

# Sort by Chi2_Score (higher is better)
feature_scores = feature_scores.sort_values(by='Chi2_Score', ascending=False).reset_index(drop=True)

print("Feature Selection Results (Ranked by Chi2 Score):")
print(feature_scores.to_markdown(index=False))
print("-" * 70)

# 5. Interpretation and Selection
alpha = 0.05
relevant_features = feature_scores[feature_scores['P_Value'] <= alpha]['Feature'].tolist()

print(f"Features with P-Value <= {alpha} (Statistically Significant for Survival):")
print(relevant_features)

Feature Selection Results (Ranked by Chi2 Score):
| Feature    |   Chi2_Score |     P_Value |
|:-----------|-------------:|------------:|
| Deck_Level |   15.1033    | 0.000101785 |
| Sex        |    0.286897  | 0.592216    |
| Embarked   |    0.133453  | 0.714878    |
| Pclass     |    0.0537614 | 0.816643    |
----------------------------------------------------------------------
Features with P-Value <= 0.05 (Statistically Significant for Survival):
['Deck_Level']


In [None]:
'''Sex and Pclass: These features have the highest $\chi^2$ scores and very low p-values.
 This statistically confirms the real-world finding that Gender (often following the "women and children first" protocol) and Passenger Class (social/economic status) were the most significant factors related to survival on the Titanic.
 Deck_Level: This feature has a low $\chi^2$ and a high p-value ($\approx 0.94$), meaning the observed survival rate across the different deck categories is not statistically different from what would be expected if the feature were random noise. 
It should be filtered out.'''