In [24]:
import warnings
# Hide every DeprecationWarning for the rest of the session.
# NOTE(review): the filter is not restricted by module or message, so it also
# hides deprecations from pandas / scikit-learn that announce upcoming
# behaviour changes — consider narrowing it with `module=` or `message=`.
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [25]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# Read the semicolon-delimited source file into a single frame
df = pd.read_csv("bank-full.csv", delimiter=";")

# Column groups: `numerical_features` is what reduce_dimension runs PCA on,
# while `categorical_features` and `binary_features` are the columns that
# preprocess_data label-encodes.
numerical_features = [
    'age',
    'balance',
    'day',
    'duration',
    'campaign',
    'pdays',
    'previous',
]
categorical_features = [
    'job',
    'marital',
    'education',
    'contact',
    'month',
    'poutcome',
]
binary_features = [
    'default',
    'housing',
    'loan',
    'y',
]

# One sub-frame per value of "marital" (married / single / divorced);
# each keeps the row labels of the full frame.
pop_married = df.loc[df['marital'].eq('married')]
pop_single = df.loc[df['marital'].eq('single')]
pop_divorced = df.loc[df['marital'].eq('divorced')]

# Function to preprocess categorical features
def preprocess_data(df):
    """Return a copy of ``df`` with categorical and binary columns integer-encoded.

    Every column named in the module-level ``categorical_features`` and
    ``binary_features`` lists that is present in ``df`` is replaced by the
    codes returned from ``LabelEncoder.fit_transform``. Listed columns that
    are missing from ``df`` are skipped. The caller's frame is not modified.

    The encoder is fitted on the frame passed in, so the category-to-code
    mapping depends on which values occur in that frame: codes produced by
    separate calls are only comparable when the same values are present.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the raw columns.

    Returns
    -------
    pandas.DataFrame
        Encoded copy of ``df``.
    """
    encoded = df.copy()  # work on a copy so the input frame stays untouched
    for column in categorical_features + binary_features:
        if column not in encoded.columns:
            continue
        # Replace the column by label rather than writing through
        # ``.iloc[:, i] = codes``. The positional form triggers a deprecation
        # warning on pandas 1.5 and, from pandas 2.0 on, writes into the
        # existing object-dtype column, so the codes would not end up stored
        # as an integer column.
        encoded[column] = LabelEncoder().fit_transform(encoded[column])
    return encoded


# Label-encode each population, then hold out 10% of its rows for validation.
# The same seed is used for all three splits.
split_kwargs = {'test_size': 0.1, 'random_state': 42}
train_married, valid_married = train_test_split(preprocess_data(pop_married), **split_kwargs)
train_single, valid_single = train_test_split(preprocess_data(pop_single), **split_kwargs)
train_divorced, valid_divorced = train_test_split(preprocess_data(pop_divorced), **split_kwargs)

# Write every split to its own CSV file; the row index is not written.
for csv_name, split in (
    ('train_married.csv', train_married),
    ('valid_married.csv', valid_married),
    ('train_single.csv', train_single),
    ('valid_single.csv', valid_single),
    ('train_divorced.csv', train_divorced),
    ('valid_divorced.csv', valid_divorced),
):
    split.to_csv(csv_name, index=False)



In [26]:
from sklearn.decomposition import PCA

# Define a function for dimension reduction using PCA
def reduce_dimension(df, n_components):
    """Swap the numerical columns of ``df`` for their PCA projection.

    A new ``PCA`` is fitted on ``df[numerical_features]`` on every call, so
    the projection is specific to the frame passed in. Two frames reduced by
    separate calls (for example a training and a validation split) are
    therefore projected onto different components.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column listed in the module-level
        ``numerical_features``.
    n_components
        Forwarded unchanged to ``PCA(n_components=...)``.

    Returns
    -------
    pandas.DataFrame
        The columns of ``df`` other than ``numerical_features``, followed by
        the component columns, which carry the integer labels ``0, 1, ...``.
        The result has the same rows and the same index as ``df``.
    """
    pca = PCA(n_components=n_components)
    reduced_features = pca.fit_transform(df[numerical_features])
    # Give the component frame the index of ``df``. A bare DataFrame would be
    # labelled 0..n-1, while frames coming out of train_test_split keep their
    # shuffled original labels; the axis=1 concat aligns on labels, so a
    # mismatch would produce a NaN-padded union of rows instead of a
    # row-for-row join.
    df_reduced = pd.DataFrame(reduced_features, index=df.index)
    return pd.concat([df.drop(columns=numerical_features), df_reduced], axis=1)

# Keep half as many components as there are numerical features (floor division)
n_components = len(numerical_features) // 2

# reduce_dimension labels the component columns 0 .. n_components - 1
pca_columns = range(n_components)

# Reduce each split, then discard the component columns again.
# NOTE(review): reduce_dimension already removes the numerical columns, so
# after this drop the `*_reduced` frames hold no numerical information at
# all — confirm that dropping the PCA output is really intended.
train_married_reduced = reduce_dimension(train_married, n_components).drop(columns=pca_columns)
valid_married_reduced = reduce_dimension(valid_married, n_components).drop(columns=pca_columns)

train_single_reduced = reduce_dimension(train_single, n_components).drop(columns=pca_columns)
valid_single_reduced = reduce_dimension(valid_single, n_components).drop(columns=pca_columns)

train_divorced_reduced = reduce_dimension(train_divorced, n_components).drop(columns=pca_columns)
valid_divorced_reduced = reduce_dimension(valid_divorced, n_components).drop(columns=pca_columns)

# Write every reduced split to its own CSV file; the row index is not written.
for csv_name, reduced in (
    ('train_married_reduced.csv', train_married_reduced),
    ('valid_married_reduced.csv', valid_married_reduced),
    ('train_single_reduced.csv', train_single_reduced),
    ('valid_single_reduced.csv', valid_single_reduced),
    ('train_divorced_reduced.csv', train_divorced_reduced),
    ('valid_divorced_reduced.csv', valid_divorced_reduced),
):
    reduced.to_csv(csv_name, index=False)


