In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# 1. Data Preprocessing:
• Load the dataset into a Pandas DataFrame.
• Convert categorical variables into dummy variables.
• Handle missing values if any.
• Convert the target variable subscribed into a binary format (1 for yes, 0 for no

In [2]:
# Load the dataset into a Pandas DataFrame
df = pd.read_csv("bank_customers.csv")

# Display the first few rows of the DataFrame to verify that it has been loaded correctly
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,subscribed
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [None]:
# Assuming df is your DataFrame containing the data
# Check for missing values
missing_values = df.isnull().sum()
print("Missing Values:\n", missing_values)

# Handle missing values by filling them with a specific value or dropping them
# For example, filling missing values with the mean of each column:
df.fillna(df.mean(), inplace=True)

# Alternatively, you can drop rows with missing values:
# df.dropna(inplace=True)

# Display the DataFrame after handling missing values
print("DataFrame after handling missing values:\n", df)

Missing Values:
               0
job           0
marital       0
education     0
default       0
balance       0
housing       0
loan          0
contact       0
day           0
month         0
duration      0
campaign      0
pdays         0
previous      0
poutcome      0
subscribed    0
dtype: int64
DataFrame after handling missing values:
                     job  marital  education default  balance housing loan  \
0      58    management  married   tertiary      no     2143     yes   no   
1      44    technician   single  secondary      no       29     yes   no   
2      33  entrepreneur  married  secondary      no        2     yes  yes   
3      47   blue-collar  married    unknown      no     1506     yes   no   
4      33       unknown   single    unknown      no        1      no   no   
...    ..           ...      ...        ...     ...      ...     ...  ...   
42634  21       student   single  secondary      no     2488      no   no   
42635  87       retired  married    prim

  df.fillna(df.mean(), inplace=True)


In [None]:
# Display the column names in the DataFrame
print(df.columns)


Index([' ', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'subscribed'],
      dtype='object')


In [None]:
# Convert the target variable to binary format
df['subscribed'] = df['subscribed'].map({'yes': 1, 'no': 0})

# Display the DataFrame after converting the target variable
print(df)

                    job  marital  education default  balance housing loan  \
0      58    management  married   tertiary      no     2143     yes   no   
1      44    technician   single  secondary      no       29     yes   no   
2      33  entrepreneur  married  secondary      no        2     yes  yes   
3      47   blue-collar  married    unknown      no     1506     yes   no   
4      33       unknown   single    unknown      no        1      no   no   
...    ..           ...      ...        ...     ...      ...     ...  ...   
42634  21       student   single  secondary      no     2488      no   no   
42635  87       retired  married    primary      no     2190      no   no   
42636  34   blue-collar  married    primary      no     6718      no   no   
42637  22       student   single  secondary      no      254      no   no   
42638  32    management   single   tertiary      no     1962      no   no   

         contact  day month  duration  campaign  pdays  previous poutcome  

# 2. Feature Selection:
• Decide which features to include in the model. You might exclude highly correlated features to
avoid multicollinearity

In [None]:
# Calculate the correlation matrix
correlation_matrix = df.corr()

# Print the correlation matrix
print("Correlation Matrix:")
print(correlation_matrix)

# Select features based on correlation with the target variable
target_correlation = correlation_matrix['subscribed'].abs().sort_values(ascending=False)
print("\nCorrelation with target variable:")
print(target_correlation)

# Select features based on correlation with each other
# For example, you can set a threshold for correlation coefficient (e.g., 0.5) and exclude features that have a correlation coefficient above this threshold with other features
threshold = 0.5
highly_correlated_features = set()
for i in range(len(correlation_matrix.columns)):
    for j in range(i):
        if abs(correlation_matrix.iloc[i, j]) > threshold:
            colname = correlation_matrix.columns[i]
            highly_correlated_features.add(colname)
print("\nHighly correlated features to exclude:")
print(highly_correlated_features)


Correlation Matrix:
                       balance       day  duration  campaign     pdays  \
            1.000000  0.088716 -0.009869 -0.016529  0.008296 -0.044165   
balance     0.088716  1.000000  0.004696  0.016721 -0.013358 -0.006797   
day        -0.009869  0.004696  1.000000 -0.032557  0.165906 -0.101391   
duration   -0.016529  0.016721 -0.032557  1.000000 -0.085640 -0.010812   
campaign    0.008296 -0.013358  0.165906 -0.085640  1.000000 -0.079981   
pdays      -0.044165 -0.006797 -0.101391 -0.010812 -0.079981  1.000000   
previous   -0.016279  0.005194 -0.048445 -0.007730 -0.024374  0.450869   
subscribed  0.009129  0.046826 -0.024916  0.417468 -0.058095  0.033657   

            previous  subscribed  
           -0.016279    0.009129  
balance     0.005194    0.046826  
day        -0.048445   -0.024916  
duration   -0.007730    0.417468  
campaign   -0.024374   -0.058095  
pdays       0.450869    0.033657  
previous    1.000000    0.033238  
subscribed  0.033238    1.000000 

  correlation_matrix = df.corr()


# 3. Data Splitting:
• Split the dataset into training and testing sets (typically a 70-30 or 80-20 split)

In [None]:
# Assuming df is your DataFrame containing the data and X contains the features while y contains the target variable
# Split the dataset into features (X) and target variable (y)
X = df.drop('subscribed', axis=1)  # Features
y = df['subscribed']  # Target variable

# Split the dataset into training and testing sets (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the shapes of the training and testing sets to verify the split
print("Training set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape, y_test.shape)


Training set shape: (34111, 16) (34111,)
Testing set shape: (8528, 16) (8528,)


#4. Model Training:
• Train a logistic regression model on the training set

In [None]:
# Convert categorical variables into dummy variables
X_train_encoded = pd.get_dummies(X_train, drop_first=True)

# Initialize the logistic regression model
logistic_model = LogisticRegression()

# Train the logistic regression model on the training set
logistic_model.fit(X_train_encoded, y_train)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


#5. Model Evaluation:
• Evaluate the model's performance on the testing set using metrics such as accuracy, precision,
recall, F1-score, and the confusion matrix.

In [None]:
# Assuming you have already trained the logistic regression model (logistic_model) and have the testing set (X_test, y_test)
# Convert categorical variables into dummy variables for the testing set if necessary
X_test_encoded = pd.get_dummies(X_test, drop_first=True)

# Make predictions on the testing set
y_pred = logistic_model.predict(X_test_encoded)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Calculate precision
precision = precision_score(y_test, y_pred)

# Calculate recall
recall = recall_score(y_test, y_pred)

# Calculate F1-score
f1 = f1_score(y_test, y_pred)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Print the evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:\n", conf_matrix)


Accuracy: 0.9129924953095685
Precision: 0.602112676056338
Recall: 0.21375
F1 Score: 0.3154981549815498
Confusion Matrix:
 [[7615  113]
 [ 629  171]]


#6. Conclusion:
• Summarize the model's performance and discuss any insights or implications for the bank's
marketing strategies.

In [None]:
# Summarize the model's performance
print("Model Performance Summary:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:\n", conf_matrix)

# Discuss insights or implications for the bank's marketing strategies
print("\nInsights and Implications:")
# Example insights:
print("- The model achieved an accuracy of {:.2f}%, indicating its overall effectiveness in predicting term deposit subscriptions.".format(accuracy * 100))
print("- Precision of {:.2f}% suggests that when the model predicts a term deposit subscription, it is correct {:.2f}% of the time.".format(precision * 100, precision * 100))
print("- Recall of {:.2f}% indicates the proportion of actual term deposit subscriptions that were correctly predicted by the model.".format(recall * 100))
print("- F1 Score of {:.2f}% provides a balance between precision and recall, which is important for this classification task.".format(f1 * 100))
print("- The confusion matrix provides insights into the model's performance across different classes (true positives, true negatives, false positives, false negatives).")


Model Performance Summary:
Accuracy: 0.9129924953095685
Precision: 0.602112676056338
Recall: 0.21375
F1 Score: 0.3154981549815498
Confusion Matrix:
 [[7615  113]
 [ 629  171]]

Insights and Implications:
- The model achieved an accuracy of 91.30%, indicating its overall effectiveness in predicting term deposit subscriptions.
- Precision of 60.21% suggests that when the model predicts a term deposit subscription, it is correct 60.21% of the time.
- Recall of 21.38% indicates the proportion of actual term deposit subscriptions that were correctly predicted by the model.
- F1 Score of 31.55% provides a balance between precision and recall, which is important for this classification task.
- The confusion matrix provides insights into the model's performance across different classes (true positives, true negatives, false positives, false negatives).
