# In [None]:
# Feature Engineering and Data Preparation for Machine Learning in Python

# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split

# Toy dataset: two numeric columns (each containing one missing value),
# one categorical feature and a Yes/No target column.
data = {}
data['Age'] = [25, 30, 35, np.nan, 40]
data['Salary'] = [50000, 60000, 70000, 80000, np.nan]
data['City'] = ['New York', 'London', 'Paris', 'London', 'Paris']
data['Purchased'] = ['No', 'Yes', 'No', 'Yes', 'No']

# Build the working DataFrame; the dict keys become the column names.
df = pd.DataFrame(data)

# Show the raw data before any preprocessing is applied.
print("Original Dataset:", df, sep="\n")

# Step 1: Handle Missing Values
# Replace every NaN in the numeric columns with the mean of its own column.
# The imputer is fitted once on both columns rather than being refitted for
# each column in turn: the per-column means are the same, but the fitted
# object now remembers the statistic of every column, so it can be reused
# later via imputer.transform(...) on unseen rows.
numeric_columns = ['Age', 'Salary']
imputer = SimpleImputer(strategy='mean')
df[numeric_columns] = imputer.fit_transform(df[numeric_columns])

print("\nAfter Handling Missing Values:")
print(df)

# Step 2: Encode Categorical Variables
# One-hot encode City: one indicator column per distinct city value.
encoder = OneHotEncoder()
cities_encoded = encoder.fit_transform(df[['City']]).toarray()  # sparse -> dense
# Reuse df's index for the encoded frame. pd.concat(axis=1) aligns rows by
# index label, so a fresh 0..n-1 index would misalign (and introduce NaN
# rows) whenever df does not carry the default index, e.g. after rows have
# been filtered or shuffled.
cities_encoded_df = pd.DataFrame(
    cities_encoded,
    columns=encoder.get_feature_names_out(['City']),
    index=df.index,
)

# Label encoding for the target variable Purchased (string labels -> integers).
label_encoder = LabelEncoder()
df['Purchased'] = label_encoder.fit_transform(df['Purchased'])

# Append the one-hot columns and drop the original City column in one step,
# without mutating the intermediate frame in place.
df = pd.concat([df, cities_encoded_df], axis=1).drop(columns='City')

print("\nAfter Encoding Categorical Variables:")
print(df)

# Step 3: Feature Scaling
# Min-max scale the Age and Salary columns; the scaler learns each column's
# minimum and maximum from df and the rescaled values overwrite the originals.
scaler = MinMaxScaler()
columns_to_scale = ['Age', 'Salary']
scaled_values = scaler.fit(df[columns_to_scale]).transform(df[columns_to_scale])
df[columns_to_scale] = scaled_values

print("\nAfter Feature Scaling:", df, sep="\n")

# Step 4: Feature Selection
# Separate the target column (Purchased) from the candidate feature columns.
y = df['Purchased']
X = df.drop(columns='Purchased')

# Score each feature against the target with f_classif and keep the best two.
selector = SelectKBest(score_func=f_classif, k=2)
selector.fit(X, y)
X_new = selector.transform(X)
# get_support() yields a boolean mask over X's columns marking the kept ones,
# which lets us recover the names of the selected features.
selected_features = X.columns[selector.get_support()]

print("\nSelected Features:", selected_features, sep="\n")

# Step 5: Split Data into Train and Test Sets
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable from run to run.
# NOTE(review): the imputer, scaler and feature selector in the earlier steps
# were all fitted on the full dataset before this split, so the held-out rows
# influenced the preprocessing. For real modelling, split first and fit the
# transformers on the training portion only.
splits = train_test_split(X_new, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = splits

print("\nTrain Test Split:")
for label, part in zip(("X_train:", "X_test:", "y_train:", "y_test:"), splits):
    print(label, part)
