**Dummy Dataset**

In [1]:
import pandas as pd
import numpy as np

# Build a reproducible synthetic table. All four columns are drawn from the
# same seeded global NumPy stream, so the draws must stay in this order
# (Age, Category, Income, Target); reordering them would change the data.
np.random.seed(0)
N_ROWS = 250
data = pd.DataFrame(dict(
    Age=np.random.randint(18, 65, size=N_ROWS),            # integers in [18, 65)
    Category=np.random.choice(list('ABCD'), size=N_ROWS),  # one of 'A'..'D'
    Income=np.random.randint(30000, 100000, size=N_ROWS),  # integers in [30000, 100000)
    Target=np.random.choice([0, 1], size=N_ROWS),          # binary label
))

# Preview the first rows.
data.head()


Unnamed: 0,Age,Category,Income,Target
0,62,D,50177,0
1,18,B,69920,1
2,21,B,85082,0
3,21,B,52036,1
4,57,A,68212,0


**Handling Missing Values**

In [2]:
# Inject missing values to demonstrate imputation, then fill them back in.
#
# 'Age' and 'Income' are created as integer columns, which cannot hold NaN.
# Cast them to float up front so the NaN assignments below do not depend on
# pandas silently upcasting the column (deprecated since pandas 2.1).
data = data.astype({'Age': 'float64', 'Income': 'float64'})

# Blank out 20 distinct 'Age' entries and 15 distinct 'Income' entries;
# replace=False guarantees no row label is picked twice for the same column.
data.loc[np.random.choice(data.index, 20, replace=False), 'Age'] = np.nan
data.loc[np.random.choice(data.index, 15, replace=False), 'Income'] = np.nan

# Mean imputation: every gap is replaced by the mean of the observed values
# of its own column (Series.mean skips NaN by default).
data = data.fillna({
    'Age': data['Age'].mean(),
    'Income': data['Income'].mean(),
})


**Removing Duplicates**

In [3]:
# Drop rows that exactly duplicate an earlier row across all columns, keeping
# the first occurrence. Rebinding `data` instead of using inplace=True avoids
# mutating a frame that other references may still point to and keeps the
# step chainable.
data = data.drop_duplicates()

**Outlier Detection and Removal**

In [6]:
from scipy import stats

# Keep only rows whose 'Age' AND 'Income' are both within 3 standard
# deviations of their column mean. zscore() standardizes each column
# separately; .all(axis=1) requires every checked column of a row to pass.
z_scores = np.abs(stats.zscore(data[['Age', 'Income']]))
within_three_sigma = (z_scores < 3).all(axis=1)

# .copy() makes the filtered frame independent of the frame it was sliced
# from, so the column assignments in the scaling cells write to it
# unambiguously (no SettingWithCopyWarning when the mask removes rows).
data = data[within_three_sigma].copy()

**Normalization**

In [7]:
from sklearn.preprocessing import MinMaxScaler

# Min-max normalization of 'Income': learn the column's minimum and maximum,
# then linearly rescale so they map to 0 and 1 (the scaler's default range).
# The double brackets pass a 2-D, single-column frame, which the scaler expects.
scaler = MinMaxScaler()
scaler.fit(data[['Income']])
data[['Income']] = scaler.transform(data[['Income']])


**Standardization (StandardScaler)**

In [8]:
from sklearn.preprocessing import StandardScaler

# Standardize 'Age' to zero mean and unit variance. Note that `scaler` is
# rebound here, so the MinMaxScaler fitted on 'Income' is no longer reachable
# under that name. The result contains negative values by construction.
scaler = StandardScaler().fit(data[['Age']])
data[['Age']] = scaler.transform(data[['Age']])


**Encoding Categorical Variables**

In [9]:
# One-hot encode 'Category': the column is replaced by one indicator column
# per category value, named 'Category_<value>'. dtype=int pins the indicators
# to 0/1 integers; the default dummy dtype differs between pandas versions
# (uint8 before 2.0, bool from 2.0 on), and explicit integers keep the
# numeric steps that follow independent of the installed version.
data = pd.get_dummies(data, columns=['Category'], dtype=int)


**Principal Component Analysis (PCA)**

In [10]:
from sklearn.decomposition import PCA

# Project every feature column (all columns except the 'Target' label) onto
# the first two principal components.
feature_frame = data.drop(columns='Target')
pca = PCA(n_components=2)
reduced_data = pca.fit_transform(feature_frame)


**Feature Selection (Chi-Square Test)**

In [12]:
from sklearn.feature_selection import SelectKBest, chi2

# chi2 only accepts non-negative inputs, so the candidates are limited to
# 'Income' and the 'Category_*' indicator columns; 'Age' is left out.
category_columns = [name for name in data.columns if name.startswith('Category_')]
X_non_negative = data[['Income'] + category_columns]
y = data['Target']

# Score each candidate against the target and keep the two highest-scoring.
selector = SelectKBest(chi2, k=2)
X_selected = selector.fit_transform(X_non_negative, y)
selected_mask = selector.get_support()
selected_features_chi2 = X_non_negative.columns[selected_mask]

print("Selected Features (Chi-Square):", selected_features_chi2)


Selected Features (Chi-Square): Index(['Category_B', 'Category_D'], dtype='object')


In [13]:
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import MinMaxScaler

# Chi-square selection over ALL feature columns.
#
# chi2 raises "ValueError: Input X must be non-negative." on negative inputs,
# and the standardized 'Age' column contains them. Every feature is therefore
# rescaled to [0, 1] on a separate frame used only for scoring; `data` and
# `X` keep their values.
X = data.drop('Target', axis=1)
y = data['Target']
X_chi2 = pd.DataFrame(
    MinMaxScaler().fit_transform(X),
    columns=X.columns,
    index=X.index,
)

# Keep the two features with the highest chi-square score against the target.
selector = SelectKBest(chi2, k=2)
X_selected = selector.fit_transform(X_chi2, y)
selected_features_chi2 = X.columns[selector.get_support()]

print("Selected Features (Chi-Square, all features):", selected_features_chi2)