In [1]:
import pandas as pd
import numpy as np

In [8]:
df = pd.read_csv('bank-full.csv', delimiter=';')

In [9]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [10]:
df.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y'],
      dtype='object')

In [11]:
columns = ['age', 'job', 'marital', 'education', 'balance', 'housing', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'y']
df = df[columns]

In [12]:
df.isnull().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

## Q1

In [13]:
education_mode = df['education'].mode()[0]
education_mode

'secondary'

## Q2

In [14]:
numerical_columns = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
numerical_df = df[numerical_columns]

In [17]:
correlation_matrix = numerical_df.corr()
print(correlation_matrix)

               age   balance       day  duration  campaign     pdays  previous
age       1.000000  0.097783 -0.009120 -0.004648  0.004760 -0.023758  0.001288
balance   0.097783  1.000000  0.004503  0.021560 -0.014578  0.003435  0.016674
day      -0.009120  0.004503  1.000000 -0.030206  0.162490 -0.093044 -0.051710
duration -0.004648  0.021560 -0.030206  1.000000 -0.084570 -0.001565  0.001203
campaign  0.004760 -0.014578  0.162490 -0.084570  1.000000 -0.088628 -0.032855
pdays    -0.023758  0.003435 -0.093044 -0.001565 -0.088628  1.000000  0.454820
previous  0.001288  0.016674 -0.051710  0.001203 -0.032855  0.454820  1.000000


In [16]:
correlation_unstacked = correlation_matrix.unstack()
sorted_correlation = correlation_unstacked.sort_values(ascending=False)
sorted_correlation = sorted_correlation[sorted_correlation < 1]
biggest_correlation = sorted_correlation.head(1)
print(biggest_correlation)

previous  pdays    0.45482
dtype: float64


## Q3

In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import mutual_info_classif


df['y'] = df['y'].map({'yes': 1, 'no': 0})

# Split the data into train, validation, and test sets (60/20/20) using random seed 42
df_train, df_temp = train_test_split(df, test_size=0.4, random_state=42)
df_val, df_test = train_test_split(df_temp, test_size=0.5, random_state=42)

# Define categorical columns
categorical_columns = ['contact', 'education', 'housing', 'poutcome']

# Encode categorical variables in the training set
le = LabelEncoder()
for col in categorical_columns:
    df_train[col] = le.fit_transform(df_train[col])

# Define X (features) and y (target) for the training set
X_train = df_train[categorical_columns]
y_train = df_train['y']

# Calculate mutual information scores between y and categorical features in the training set
mi_scores = mutual_info_classif(X_train, y_train, discrete_features=True)

# Create a DataFrame to display feature names and mutual information scores
mi_df = pd.DataFrame({'Feature': categorical_columns, 'Mutual Information': mi_scores})

# Round the mutual information scores to 2 decimal places
mi_df['Mutual Information'] = mi_df['Mutual Information'].round(2)

# Sort the DataFrame by mutual information scores in descending order
mi_df = mi_df.sort_values(by='Mutual Information', ascending=False)

# Print the feature with the highest mutual information score
print(mi_df)

     Feature  Mutual Information
3   poutcome                0.03
0    contact                0.01
2    housing                0.01
1  education                0.00


In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


# Define features and target for training and validation datasets
X_train = df_train.drop('y', axis=1)
y_train = df_train['y']
X_val = df_val.drop('y', axis=1)
y_val = df_val['y']

# Create a pipeline for preprocessing and model fitting
pipeline = Pipeline(steps=[
    ('ohe', OneHotEncoder(handle_unknown='ignore')),  # One-hot encoding
    ('log_reg', LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42))  # Logistic regression
])

# Fit the model on the training dataset
pipeline.fit(X_train, y_train)

# Predict on the validation dataset
y_val_pred = pipeline.predict(X_val)

# Calculate accuracy on the validation dataset
accuracy = accuracy_score(y_val, y_val_pred)

# Print the rounded accuracy
print(f'Validation Accuracy: {round(accuracy, 2)}')

Validation Accuracy: 0.89


In [21]:
# Define features and target for training dataset
X_train = df_train.drop('y', axis=1)
y_train = df_train['y']

# One-hot encode the features
ohe = OneHotEncoder(handle_unknown='ignore')
X_train_encoded = ohe.fit_transform(X_train)

# Train the logistic regression model with all features
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train_encoded, y_train)

# Calculate original accuracy on validation set
X_val = df_val.drop('y', axis=1)
y_val = df_val['y']
X_val_encoded = ohe.transform(X_val)
original_accuracy = accuracy_score(y_val, model.predict(X_val_encoded))

# Features to evaluate
features = ['age', 'balance', 'marital', 'previous']

# Dictionary to store accuracy differences
accuracy_differences = {}

# Evaluate each feature by excluding it
for feature in features:
    # Drop the feature from training set
    X_train_excluded = X_train.drop(feature, axis=1)
    X_train_encoded_excluded = ohe.fit_transform(X_train_excluded)
    
    # Train the model without the excluded feature
    model.fit(X_train_encoded_excluded, y_train)
    
    # Calculate accuracy on validation set
    X_val_excluded = X_val.drop(feature, axis=1)
    X_val_encoded_excluded = ohe.transform(X_val_excluded)
    accuracy_without_feature = accuracy_score(y_val, model.predict(X_val_encoded_excluded))
    
    # Calculate the difference and store it
    accuracy_differences[feature] = original_accuracy - accuracy_without_feature

# Identify the feature with the smallest difference
least_useful_feature = min(accuracy_differences, key=accuracy_differences.get)

# Print the results
print("Accuracy Differences:", accuracy_differences)
print("Least useful feature:", least_useful_feature)

Accuracy Differences: {'age': 0.000774165007741745, 'balance': 0.0014377350143773837, 'marital': 0.0013271400132713884, 'previous': 0.0017695200176952586}
Least useful feature: age


In [22]:
# One-hot encode the features
ohe = OneHotEncoder(handle_unknown='ignore')
X_train_encoded = ohe.fit_transform(X_train)
X_val_encoded = ohe.transform(X_val)

# List of C values to try
C_values = [0.01, 0.1, 1, 10, 100]

# Dictionary to store accuracy for each C value
accuracy_results = {}

# Train logistic regression for each value of C
for C in C_values:
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model.fit(X_train_encoded, y_train)
    y_val_pred = model.predict(X_val_encoded)
    accuracy = accuracy_score(y_val, y_val_pred)
    accuracy_results[C] = round(accuracy, 3)  # Round to 3 decimal digits

# Identify the best C value based on accuracy
best_C = max(accuracy_results, key=accuracy_results.get)

# Print the results
print("Accuracy Results:", accuracy_results)
print("Best C value:", best_C)

Accuracy Results: {0.01: 0.885, 0.1: 0.889, 1: 0.89, 10: 0.886, 100: 0.883}
Best C value: 1
