In [76]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [77]:
# Read the semicolon-separated source data
df = pd.read_csv("bank-full.csv", sep=";")

# Column groups, by how each column is treated downstream
numerical_features = [
    'age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous',
]
categorical_features = [
    'job', 'marital', 'education', 'contact', 'month', 'poutcome',
]
binary_features = ['default', 'housing', 'loan', 'y']

# One sub-population per value of "marital": married, single, divorced
pop_married = df.loc[df['marital'].eq('married')]
pop_single = df.loc[df['marital'].eq('single')]
pop_divorced = df.loc[df['marital'].eq('divorced')]

# Function to label-encode the categorical and binary feature columns
def preprocess_data(df):
    """Return a label-encoded copy of ``df``.

    Every column listed in the module-level ``categorical_features`` and
    ``binary_features`` that is present in ``df`` is replaced by the integer
    codes produced by ``LabelEncoder``. Columns not in those lists are left
    untouched, and the input frame itself is never modified.

    Each column is fit on the rows of ``df`` only, so the code assigned to a
    given category depends on which categories occur in this particular frame.

    Parameters:
        df: DataFrame to encode.

    Returns:
        A new DataFrame with the same columns as ``df``.
    """
    encoded = df.copy()  # work on a copy so the caller's frame is untouched
    for column in categorical_features + binary_features:
        if column in encoded.columns:
            # Assign by label rather than through ``.iloc``: positional
            # assignment writes into the existing column in place on recent
            # pandas, which would leave the integer codes in an object-dtype
            # column. Label assignment replaces the column outright.
            encoded[column] = LabelEncoder().fit_transform(encoded[column])
    return encoded

# Label-encode each population, then hold out 10% of it for validation
train_married, valid_married = train_test_split(
    preprocess_data(pop_married), test_size=0.1, random_state=42
)
train_single, valid_single = train_test_split(
    preprocess_data(pop_single), test_size=0.1, random_state=42
)
train_divorced, valid_divorced = train_test_split(
    preprocess_data(pop_divorced), test_size=0.1, random_state=42
)

# Write every split to its own CSV file (row index omitted)
for csv_path, split_frame in (
    ('train_married.csv', train_married),
    ('valid_married.csv', valid_married),
    ('train_single.csv', train_single),
    ('valid_single.csv', valid_single),
    ('train_divorced.csv', train_divorced),
    ('valid_divorced.csv', valid_divorced),
):
    split_frame.to_csv(csv_path, index=False)



In [78]:
# Work on a copy so the raw DataFrame stays untouched
df_processed = df.copy()

# Label-encode every categorical/binary column that exists in the frame
label_encoder = LabelEncoder()
for column in categorical_features + binary_features:
    if column not in df_processed.columns:
        continue
    df_processed[column] = label_encoder.fit_transform(df_processed[column])

# Inputs are every column except the target 'y'
X = df_processed.drop(columns='y')
y = df_processed['y']

# Standardize the inputs before PCA
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Fit PCA on the standardized inputs
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

# Loadings of the first principal component, one value per input column
loading_values = pca.components_[0]

# Rank the input columns by the absolute size of their first-component loading
feature_importance = (
    pd.Series(loading_values, index=X.columns)
    .abs()
    .sort_values(ascending=False)
)

# Keep the top-ranked half of the input columns (rounded down)
num_features_to_keep = X.shape[1] // 2
top_features_pca = feature_importance.iloc[:num_features_to_keep].index.tolist()

# The target column is always kept alongside the selected inputs
selected_features = [*top_features_pca, 'y']

# Reload the per-population training and validation splits from disk
train_married = pd.read_csv('train_married.csv')
valid_married = pd.read_csv('valid_married.csv')
train_single = pd.read_csv('train_single.csv')
valid_single = pd.read_csv('valid_single.csv')
train_divorced = pd.read_csv('train_divorced.csv')
valid_divorced = pd.read_csv('valid_divorced.csv')

# Function to filter and save reduced datasets
def filter_and_save(df_train, df_valid, selected_features, output_train_file, output_valid_file):
    """Write column-reduced copies of a train/validation pair to CSV.

    Parameters:
        df_train: training DataFrame.
        df_valid: validation DataFrame.
        selected_features: column labels to keep, in output order.
        output_train_file: destination for the reduced training set.
        output_valid_file: destination for the reduced validation set.

    Both files are written without the row index. Nothing is returned.
    """
    # Select from both frames before writing anything, so a column missing
    # from either one raises before any file is touched.
    reduced_frames = [frame[selected_features] for frame in (df_train, df_valid)]
    destinations = (output_train_file, output_valid_file)

    for reduced, destination in zip(reduced_frames, destinations):
        reduced.to_csv(destination, index=False)

# Write the reduced train/validation files for each marital status
filter_and_save(
    df_train=train_married,
    df_valid=valid_married,
    selected_features=selected_features,
    output_train_file='train_married_reduced.csv',
    output_valid_file='valid_married_reduced.csv',
)
filter_and_save(
    df_train=train_single,
    df_valid=valid_single,
    selected_features=selected_features,
    output_train_file='train_single_reduced.csv',
    output_valid_file='valid_single_reduced.csv',
)
filter_and_save(
    df_train=train_divorced,
    df_valid=valid_divorced,
    selected_features=selected_features,
    output_train_file='train_divorced_reduced.csv',
    output_valid_file='valid_divorced_reduced.csv',
)