In [1]:
import pandas as pd

# Input is the semicolon-separated bank file; the output copy is written next to it
file_path = './bank.csv'
cleaned_file_path = './bank_cleaned.csv'

# Parse the raw file, splitting fields on ';'
data = pd.read_csv(file_path, sep=';')

# Show a header line followed by the first rows, to confirm the columns were split correctly
print("Data before cleaning:", data.head(), sep="\n")

# Write the frame back out without the row index. No rows or values are modified
# here: the saved copy differs from the input only in using pandas' default
# comma separator, so later cells can read it with a plain read_csv call.
data.to_csv(cleaned_file_path, index=False)
print(f"Cleaned data has been saved to {cleaned_file_path}")


Data before cleaning:
   age          job  marital  education default  balance housing loan  \
0   30   unemployed  married    primary      no     1787      no   no   
1   33     services  married  secondary      no     4789     yes  yes   
2   35   management   single   tertiary      no     1350     yes   no   
3   30   management  married   tertiary      no     1476     yes  yes   
4   59  blue-collar  married  secondary      no        0     yes   no   

    contact  day month  duration  campaign  pdays  previous poutcome   y  
0  cellular   19   oct        79         1     -1         0  unknown  no  
1  cellular   11   may       220         1    339         4  failure  no  
2  cellular   16   apr       185         1    330         1  failure  no  
3   unknown    3   jun       199         4     -1         0  unknown  no  
4   unknown    5   may       226         1     -1         0  unknown  no  
Cleaned data has been saved to ./bank_cleaned.csv


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

# Reload the comma-separated copy written by the first cell
data = pd.read_csv(cleaned_file_path)

# Replace every object-typed (string) column with integer codes. Each fitted
# encoder is kept under its column name so the identical mapping can be
# re-applied to new rows later. The target column 'y' is a string column too,
# so it is encoded here along with the features.
categorical_columns = data.select_dtypes(include=['object']).columns
label_encoders = {}
for name in categorical_columns:
    encoder = LabelEncoder()
    data[name] = encoder.fit_transform(data[name])
    label_encoders[name] = encoder

# 'y' is the target; every remaining column is a feature
y = data['y']
X = data.drop(columns=['y'])

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a decision tree on the training portion (seeded for reproducible tie-breaking)
clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Score the held-out rows
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Report overall accuracy and the per-class precision/recall/F1 table
print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)

Accuracy: 0.8718232044198895
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.92      0.93       807
           1       0.42      0.47      0.44        98

    accuracy                           0.87       905
   macro avg       0.68      0.70      0.68       905
weighted avg       0.88      0.87      0.88       905



# Predict for a new customer

In [None]:
# Raw attributes of a single customer to score. String values must be
# categories the training data contained, because they are encoded below with
# the encoders fitted during training (an unseen value makes transform() raise).
new_customer = {
    'age': 30,
    'job': 'admin.',  # Needs to be encoded as in training
    'marital': 'married',
    'education': 'secondary',
    'default': 'no',
    'balance': 1500,
    'housing': 'yes',
    'loan': 'no',
    'contact': 'cellular',
    'day': 15,
    'month': 'jul',
    'duration': 200,
    'campaign': 2,
    'pdays': -1,
    'previous': 0,
    'poutcome': 'unknown'
}

# Convert the new data into a one-row DataFrame
new_customer_df = pd.DataFrame([new_customer])

# Encode the categorical fields using the same LabelEncoders used for training;
# numeric fields have no encoder and pass through unchanged
for column in new_customer_df.columns:
    if column in label_encoders:
        new_customer_df[column] = label_encoders[column].transform(new_customer_df[column])

# Put the features in exactly the column order the classifier was fitted on,
# instead of relying on the dict above being typed in that order. A missing
# feature raises a KeyError here rather than producing a misaligned input.
new_customer_df = new_customer_df[X.columns]

# Predict whether the customer will purchase the product/service
prediction = clf.predict(new_customer_df)

# Map the predicted integer code back to the original target label rather
# than assuming that the encoder assigned code 1 to 'yes'
predicted_label = label_encoders['y'].inverse_transform(prediction)[0]
purchase_decision = 'Yes' if predicted_label == 'yes' else 'No'

print(f"The prediction for this customer is: {purchase_decision}")

The prediction for this customer is: No
