### Dataset

In [42]:
# Import libraries
import pandas as pd
import numpy as np
import zipfile
import os

from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [1]:
# Download the dataset
data = "https://archive.ics.uci.edu/static/public/222/bank+marketing.zip"

In [2]:
# Save the file in the current directory
!wget $data -O data_week_2.zip

--2024-10-13 19:09:50--  https://archive.ics.uci.edu/static/public/222/bank+marketing.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: 'data_week_2.zip'

     0K .......... .......... .......... .......... ..........  128K
    50K .......... .......... .......... .......... ..........  262K
   100K .......... .......... .......... .......... .......... 3.47M
   150K .......... .......... .......... .......... .......... 2.72M
   200K .......... .......... .......... .......... ..........  280K
   250K .......... .......... .......... .......... .......... 2.37M
   300K .......... .......... .......... .......... ..........  309K
   350K .......... .......... .......... .......... .......... 2.02M
   400K .......... .......... .......... .......... .......... 2.68M
   450K .......... .......... ..

In [5]:
# Unzip the file
with zipfile.ZipFile('data_week_2.zip', 'r') as zip_ref:
    zip_ref.extractall('extracted_files')

In [6]:
# List the files in the extracted folder to find the CSV file
extracted_folder = 'extracted_files'
extracted_files = os.listdir(extracted_folder)
print(extracted_files)

['bank-additional', 'bank-additional.zip', 'bank-full.csv', 'bank-names.txt', 'bank.csv', 'bank.zip', '__MACOSX']


We need to take bank/bank-full.csv file from the downloaded zip-file.

In [9]:
# Path to the CSV file
csv_file_path = os.path.join(extracted_folder, 'bank-full.csv')

# Load the CSV file into a DataFrame
df = pd.read_csv(csv_file_path, sep=';')  # or use encoding='latin1' if needed
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [10]:
# Check the column names
df.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y'],
      dtype='object')

### Features

In [11]:
# Select the specified columns
columns_to_use = [
    'age', 'job', 'marital', 'education', 'balance', 'housing',
    'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
    'previous', 'poutcome', 'y'
]

df_selected = df[columns_to_use]
df_selected.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,no


### Data preparation

In [12]:
# Check for missing values
df_selected.isnull().sum()


age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

No missing values

In [15]:
# Check the data types
df_selected.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 15 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   balance    45211 non-null  int64 
 5   housing    45211 non-null  object
 6   contact    45211 non-null  object
 7   day        45211 non-null  int64 
 8   month      45211 non-null  object
 9   duration   45211 non-null  int64 
 10  campaign   45211 non-null  int64 
 11  pdays      45211 non-null  int64 
 12  previous   45211 non-null  int64 
 13  poutcome   45211 non-null  object
 14  y          45211 non-null  object
dtypes: int64(7), object(8)
memory usage: 5.2+ MB


## Question 1

In [13]:
# Find the most frequent observation (mode) for the 'education' column
education_mode = df_selected['education'].mode()[0]
print(f"The most frequent observation (mode) for 'education' is: {education_mode}")

The most frequent observation (mode) for 'education' is: secondary


## Question 2

In [16]:
# Select numerical features
numerical_features = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

# Extract the numerical columns
df_numerical = df_selected[numerical_features]

# Compute the correlation matrix
correlation_matrix = df_numerical.corr()

# Display the correlation matrix
correlation_matrix

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.097783,-0.00912,-0.004648,0.00476,-0.023758,0.001288
balance,0.097783,1.0,0.004503,0.02156,-0.014578,0.003435,0.016674
day,-0.00912,0.004503,1.0,-0.030206,0.16249,-0.093044,-0.05171
duration,-0.004648,0.02156,-0.030206,1.0,-0.08457,-0.001565,0.001203
campaign,0.00476,-0.014578,0.16249,-0.08457,1.0,-0.088628,-0.032855
pdays,-0.023758,0.003435,-0.093044,-0.001565,-0.088628,1.0,0.45482
previous,0.001288,0.016674,-0.05171,0.001203,-0.032855,0.45482,1.0


In [18]:
# Find the pair of features with the highest correlation
correlation_matrix_unstacked = correlation_matrix.unstack()
correlation_matrix_unstacked = correlation_matrix_unstacked[correlation_matrix_unstacked != 1.0]
max_correlation_pair = correlation_matrix_unstacked.idxmax()
max_correlation_value = correlation_matrix_unstacked.max()
print(f"The two features with the highest correlation are: {max_correlation_pair} with a correlation of {max_correlation_value}")

The two features with the highest correlation are: ('pdays', 'previous') with a correlation of 0.4548196354805043


### Target encoding

In [19]:
# Check the distribution of the target variable
df_selected['y'].value_counts()

y
no     39922
yes     5289
Name: count, dtype: int64

In [20]:
# Replace 'yes' with 1 and 'no' with 0 in the target column 'y'
df_selected['y'] = df_selected['y'].replace({'yes': 1, 'no': 0})

  df_selected['y'] = df_selected['y'].replace({'yes': 1, 'no': 0})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected['y'] = df_selected['y'].replace({'yes': 1, 'no': 0})


In [21]:
# Verify the transformation
df_selected['y'].value_counts()

y
0    39922
1     5289
Name: count, dtype: int64

### Split the data

In [23]:
# Separate features and target variable
X = df_selected.drop(columns=['y'])  
y = df_selected['y']

# Split the data: 60% train and 40% temp (which will be split further into val/test)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)

# Now split the temporary set into validation and test sets (50% each of 40% = 20% each)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Output the sizes of the resulting datasets
print(f"Training set: {X_train.shape}, {y_train.shape}")
print(f"Validation set: {X_val.shape}, {y_val.shape}")
print(f"Test set: {X_test.shape}, {y_test.shape}")

Training set: (27126, 14), (27126,)
Validation set: (9042, 14), (9042,)
Test set: (9043, 14), (9043,)


## Question 3

In [39]:
# Select categorical variables from the training set
categorical_columns = ['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome']
X_train_categorical = X_train[categorical_columns]

In [40]:
# Define the mutual information function
def mutual_info_churn_score(series):
    return mutual_info_score(series, y_train)

In [41]:
# Compute the mutual information score for each categorical variable
mi = X_train_categorical.apply(mutual_info_churn_score)
mi.sort_values(ascending=False)

poutcome     0.029403
month        0.024780
contact      0.014214
housing      0.009464
job          0.007910
education    0.002570
marital      0.002201
dtype: float64

## Question 4

In [43]:
# One-hot encoding for training data
X_train_encoded = pd.get_dummies(X_train, columns=categorical_columns, drop_first=True)

# One-hot encoding for validation data
X_val_encoded = pd.get_dummies(X_val, columns=categorical_columns, drop_first=True)

# Ensure the validation set has the same columns as the training set by aligning them
X_val_encoded = X_val_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)

In [44]:
# Train Logistic Regression Model
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train_encoded, y_train)

# Predict on the validation set
y_val_pred = model.predict(X_val_encoded)

In [45]:
# Calculate accuracy on the validation set
accuracy = accuracy_score(y_val, y_val_pred)

# Round the accuracy to 2 decimal places and print it
accuracy_rounded = round(accuracy, 2)
print(f"Validation Accuracy: {accuracy_rounded}")

Validation Accuracy: 0.9


## Question 5

In [46]:
baseline_accuracy = accuracy

In [51]:
# Feature Elimination - Train model without each feature and calculate accuracy
features = ['age', 'balance', 'marital', 'previous']
accuracy_diffs = {}

for feature in features:
    if feature in categorical_columns:
        # Find all one-hot encoded columns related to the categorical feature
        columns_to_drop = [col for col in X_train_encoded.columns if col.startswith(feature)]
    else:
        # Drop the numerical feature directly
        columns_to_drop = [feature]

    # Remove the selected feature (or its related one-hot columns) from the training and validation sets
    X_train_without_feature = X_train_encoded.drop(columns=columns_to_drop)
    X_val_without_feature = X_val_encoded.drop(columns=columns_to_drop)
    
    # Train the model without this feature
    model.fit(X_train_without_feature, y_train)
    
    # Predict on the validation set and calculate accuracy
    y_val_pred = model.predict(X_val_without_feature)
    accuracy_without_feature = accuracy_score(y_val, y_val_pred)
    
    # Calculate the accuracy difference
    accuracy_diff = baseline_accuracy - accuracy_without_feature
    accuracy_diffs[feature] = accuracy_diff
    
    # Print the difference for this feature
    print(f"Difference in accuracy when excluding '{feature}': {accuracy_diff:.4f}")

# Find the feature with the smallest accuracy difference
least_useful_feature = min(accuracy_diffs, key=accuracy_diffs.get)
print(f"\nThe feature with the smallest accuracy difference is: {least_useful_feature}")

Difference in accuracy when excluding 'age': 0.0001
Difference in accuracy when excluding 'balance': -0.0003
Difference in accuracy when excluding 'marital': 0.0001
Difference in accuracy when excluding 'previous': -0.0002

The feature with the smallest accuracy difference is: balance


## Question 6

In [52]:
# Define the values of C to try
C_values = [0.01, 0.1, 1, 10, 100]

# Train a Logistic Regression model for each value of C and calculate validation accuracy
best_accuracy = 0
best_C = None
accuracies = {}

for C in C_values:
    # Train the logistic regression model with the current value of C
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model.fit(X_train_encoded, y_train)
    
    # Predict on the validation set
    y_val_pred = model.predict(X_val_encoded)
    
    # Calculate accuracy and round it to 3 decimal places
    accuracy = accuracy_score(y_val, y_val_pred)
    rounded_accuracy = round(accuracy, 3)
    
    # Store accuracy for this C
    accuracies[C] = rounded_accuracy
    
    # Update the best C if this is the best accuracy
    if rounded_accuracy > best_accuracy:
        best_accuracy = rounded_accuracy
        best_C = C

    # Print accuracy for this C
    print(f"C={C}, Validation Accuracy={rounded_accuracy:.3f}")

# Print the best C and its accuracy
print(f"\nThe best C value is: {best_C} with accuracy: {best_accuracy:.3f}")

C=0.01, Validation Accuracy=0.898
C=0.1, Validation Accuracy=0.900
C=1, Validation Accuracy=0.900
C=10, Validation Accuracy=0.901
C=100, Validation Accuracy=0.901

The best C value is: 10 with accuracy: 0.901
