# In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# Read the semicolon-separated bank marketing file into one frame.
df = pd.read_csv("bank-full.csv", sep=";")

# Build one sub-frame per marital status (married, single, divorced).
# Each sub-frame keeps the rows in the order they appear in df.
pop_married, pop_single, pop_divorced = (
    df[df['marital'] == status]
    for status in ('married', 'single', 'divorced')
)

# Stack the populations in the order married -> single -> divorced, then keep
# only the two predictor columns and the response column.
new_df = pd.concat([pop_married, pop_single, pop_divorced])[
    ['marital', 'education', 'y']
]

# Replace the response "y" and the marital status with integer codes.
# Both encoders are fit on the full frame, i.e. before any rows are removed
# for an unknown education level. The fitted encoders stay available under
# their own names.
label_encoder_y = LabelEncoder()
label_encoder_marital = LabelEncoder()
new_df = new_df.assign(
    y=label_encoder_y.fit_transform(new_df['y']),
    marital=label_encoder_marital.fit_transform(new_df['marital']),
)

# Discard rows whose education level is recorded as 'unknown'.
new_df = new_df.loc[new_df['education'].ne('unknown')]

# Encode education last, so 'unknown' is never one of the fitted classes.
label_encoder_education = LabelEncoder()
new_df = new_df.assign(
    education=label_encoder_education.fit_transform(new_df['education'])
)

# Separate the predictors (X) from the response (y).
X = new_df.loc[:, ['marital', 'education']]
y = new_df.loc[:, 'y']

# Hold out 10% of the rows for validation with a fixed seed.
# NOTE: stratify=X stratifies on both feature columns together (marital and
# education), not on marital status alone.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
    stratify=X,
)

# Re-attach the response to each feature subset so that every output file
# holds the columns marital, education, y.
train_data = pd.concat((X_train, y_train), axis="columns")
valid_data = pd.concat((X_valid, y_valid), axis="columns")

# Write both subsets to CSV without the index column.
for frame, path in ((train_data, 'train_data.csv'), (valid_data, 'valid_data.csv')):
    frame.to_csv(path, index=False)
