# Importing dataset and libraries



In [2]:
from functools import partial

import numpy as np
import pandas as pd

from sklearn.datasets import load_breast_cancer
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


In [3]:
# Load the scikit-learn Wine dataset.
# `X` is the feature matrix, `y` holds the class labels, and `feature_names`
# is the list of column names used to label selected features further down.
data = load_wine()
X = data.data
y = data.target
feature_names = data.feature_names

In [4]:
# Wrap the raw feature matrix in a labelled DataFrame and preview the first five rows.
df = pd.DataFrame(data=X, columns=feature_names)
df.head(5)

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


# Splitting data

In [5]:
# Hold out 20% of the samples for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)


# Chi-square

In [6]:
# The chi-square score function requires non-negative inputs, so rescale the
# features with min/max statistics learned from the training split only.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [8]:
# Keep the 10 features with the highest chi-square scores; the selector is
# fitted on the training split and then applied unchanged to the test split.
chi2_selector = SelectKBest(chi2, k=10).fit(X_train_scaled, y_train)
X_train_chi2 = chi2_selector.transform(X_train_scaled)
X_test_chi2 = chi2_selector.transform(X_test_scaled)

# feature_names is a plain list, so wrap it in an array before boolean masking.
chi2_mask = chi2_selector.get_support()
chi2_features = np.asarray(feature_names)[chi2_mask]
print("Selected Chi-Square Features:\n", chi2_features)

Selected Chi-Square Features:
 ['alcohol' 'malic_acid' 'total_phenols' 'flavanoids'
 'nonflavanoid_phenols' 'proanthocyanins' 'color_intensity' 'hue'
 'od280/od315_of_diluted_wines' 'proline']


# Information gain

In [11]:
# mutual_info_classif adds small random noise to continuous features, so its
# scores (and therefore the selected features) can change between runs unless
# the seed is pinned. Bind random_state so the selection is reproducible.
ig_score_func = partial(mutual_info_classif, random_state=42)

# Keep the 10 features with the highest mutual information with the target;
# the selector is fitted on the training split and reused for the test split.
ig_selector = SelectKBest(score_func=ig_score_func, k=10)
X_train_ig = ig_selector.fit_transform(X_train, y_train)
X_test_ig = ig_selector.transform(X_test)

# feature_names is a plain list, so convert it to an array for boolean masking.
ig_features = np.array(feature_names)[ig_selector.get_support()]
print("Selected Information Gain Features:\n", ig_features)

Selected Information Gain Features:
 ['alcohol' 'malic_acid' 'alcalinity_of_ash' 'total_phenols' 'flavanoids'
 'proanthocyanins' 'color_intensity' 'hue' 'od280/od315_of_diluted_wines'
 'proline']


# PCA

In [12]:

# Standardize features using the mean and standard deviation of the training
# split only; the same fitted scaler is then applied to the test split.
std_scaler = StandardScaler().fit(X_train)
X_train_std = std_scaler.transform(X_train)
X_test_std = std_scaler.transform(X_test)


In [13]:
# Learn 10 principal components from the standardized training data, then
# project the test data with the same fitted components.
pca = PCA(n_components=10)
X_train_pca = pca.fit_transform(X_train_std)
X_test_pca = pca.transform(X_test_std)

# Report the per-component share of variance and the cumulative share retained.
variance_ratio = pca.explained_variance_ratio_
print("Explained Variance Ratio:", variance_ratio)
print("Total Variance Explained:", variance_ratio.sum())


Explained Variance Ratio: [0.35900066 0.18691934 0.11606557 0.07371716 0.0665386  0.04854582
 0.04195042 0.02683922 0.0234746  0.01889734]
Total Variance Explained: 0.9619487172898263


# t-SNE

In [14]:
# Embed the standardized training data in two dimensions with a fixed seed.
# The embedding is stored in X_tsne and is not fed to any classifier in the cells shown here.
tsne = TSNE(random_state=42, n_components=2)
X_tsne = tsne.fit_transform(X_train_std)


In [15]:
# Baseline: a random forest trained on all original (unscaled) features.
# fit() returns the estimator itself, so `rf` is the fitted model.
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

y_pred = rf.predict(X_test)
baseline_acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Baseline Accuracy:", baseline_acc)


Baseline Accuracy: 1.0


# Checking accuracy

In [16]:
# Refit the shared `rf` estimator on the chi-square-selected features and score it.
# NOTE: this refits `rf` in place, so after this cell it no longer holds the previous fit.
y_pred_chi2 = rf.fit(X_train_chi2, y_train).predict(X_test_chi2)

chi2_acc = accuracy_score(y_true=y_test, y_pred=y_pred_chi2)
print("Chi-Square Accuracy:", chi2_acc)


Chi-Square Accuracy: 1.0


In [17]:
# Refit the shared `rf` estimator on the information-gain-selected features and score it.
# NOTE: this refits `rf` in place, so after this cell it no longer holds the previous fit.
y_pred_ig = rf.fit(X_train_ig, y_train).predict(X_test_ig)

ig_acc = accuracy_score(y_true=y_test, y_pred=y_pred_ig)
print("Information Gain Accuracy:", ig_acc)


Information Gain Accuracy: 1.0


In [18]:
# Refit the shared `rf` estimator on the PCA-projected features and score it.
# NOTE: this refits `rf` in place, so after this cell it no longer holds the previous fit.
y_pred_pca = rf.fit(X_train_pca, y_train).predict(X_test_pca)

pca_acc = accuracy_score(y_true=y_test, y_pred=y_pred_pca)
print("PCA Accuracy:", pca_acc)


PCA Accuracy: 1.0


In [19]:
# Collect the test accuracy of each feature-reduction approach into one table.
accuracy_by_method = {
    "All Features": baseline_acc,
    "Chi-Square": chi2_acc,
    "Information Gain": ig_acc,
    "PCA": pca_acc,
}
comparison = pd.DataFrame(
    {
        "Method": list(accuracy_by_method.keys()),
        "Accuracy": list(accuracy_by_method.values()),
    }
)

print(comparison)


             Method  Accuracy
0      All Features       1.0
1        Chi-Square       1.0
2  Information Gain       1.0
3               PCA       1.0


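Feature selection using Chi-Square and Information Gain reduced the number of input features from 13 to 10 while matching the baseline: every configuration reached a test accuracy of 1.0. Information Gain and Chi-Square therefore performed equally well on this split; in general, Information Gain (mutual information) can also capture non-linear relationships between features and the target variable, but these results do not show a difference between the two. PCA successfully reduced dimensionality to 10 components and retained most of the variance (about 96%), with no drop in accuracy; its trade-off is a loss of interpretability, because each component is a combination of the original features. t-SNE is useful for visualizing class separation but was not suitable for direct model training. Because all methods achieve a perfect score on a small held-out test set (20% of the data), the comparison should be read with caution; cross-validation would give a more reliable ranking. Overall, feature selection methods preserved interpretability, while PCA helped in reducing dimensionality efficiently.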