# Module 4 homework

**This homework has 7 questions.**

In [59]:
import numpy as np
import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import make_scorer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
    accuracy_score,
    ConfusionMatrixDisplay,
    mean_squared_error,
    precision_score,
    recall_score
)
from sklearn.metrics import roc_curve

In this Homework, we take another look at the Bank Marketing dataset that we have already examined in Module 1.

## Question 1 (1 points)

Load the `bank_data.csv` dataset in a variable named `bank_data`.

## Answer 1

In [60]:
bank_data = pd.read_csv('data/bank_data.csv', sep=';')
bank_data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


Like we did in Module 1, we remove some variables that would be hard to handle when using our models in a predictive fashion.

In [61]:
try:
    bank_data.drop(['day', 'month', 'duration', 'pdays', 'poutcome'], axis=1, inplace=True)
except NameError:
    print('The object `bank_data` does not exist! Did you forget to create it?')

Here, we encode the response variable (whether or not a customer subscribed a term deposit with the bank following a marketing campaign) with a binary indicator.

In [62]:
try:
    bank_data["y"] = bank_data['y'].apply(lambda y: 1 if y == 'yes' else 0)
    bank_data.head()
except NameError:
    print('The object `bank_data` does not exist! Did you forget to create it?')

## Question 2 (1 points)

Create "dummy" variables for the categorical predictors in the dataset (also known as "one-hot encoding").

Reassign the new dataframe to the variable `bank_data`.

## Answer 2

In [64]:
bank_data = pd.get_dummies(bank_data, dtype=int)
bank_data.head()

Unnamed: 0,age,balance,campaign,previous,y,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,...,education_unknown,default_no,default_yes,housing_no,housing_yes,loan_no,loan_yes,contact_cellular,contact_telephone,contact_unknown
0,58,2143,1,0,0,False,False,False,False,True,...,False,True,False,False,True,True,False,False,False,True
1,44,29,1,0,0,False,False,False,False,False,...,False,True,False,False,True,True,False,False,False,True
2,33,2,1,0,0,False,False,True,False,False,...,False,True,False,False,True,False,True,False,False,True
3,47,1506,1,0,0,False,True,False,False,False,...,True,True,False,False,True,True,False,False,False,True
4,33,1,1,0,0,False,False,False,False,False,...,True,True,False,True,False,True,False,False,False,True


Let's now separate the predictors from the response variable.

In [65]:
try:
    X = bank_data.drop('y', axis=1)
    Y = bank_data['y']
except NameError:
    print('The object `bank_data` does not exist! Did you forget to create it?')

## Question 3 (1 points)

Split the data into a training and test set (`X_train, X_test, Y_train, Y_test`).

Use `random_state=42` and `test_size=0.25`).

## Answer 3

In [66]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=42)

We want to use the available data to build a predictive model that can assist us in making our next marketing campaign more efficient.


Here are some quantities to keep in mind. Our department estimates that:

- a marketing contact with a potential customer costs around 10 Euros on average

- a successful contact (i.e. the customer subscribes a term deposit) generates on average 100 Euros of profits for the bank (say, present value net the cost of marketing).

Accordingly, we estimate that:

- the value associated with a true negative prediction from our model is 10 Euros (it saves us the waste of 10 Euros associated with the marketing contact)

- the value associated with a false positive prediction from our model is -10 Euros

- the value associated with a false negative prediction from our model is -100 Euros

- the value associated with a true positive prediction from our model is +100 Euros.

Let's encode this information in a "value function" that we will use later.

In [67]:
def value_function(y_true, y_pred, tn_value=10, fp_value=-10, fn_value=-100, tp_value=100):
    sum_ = y_pred + y_true
    diff_ = y_pred - y_true
    tn_contrib = tn_value * np.mean((sum_ == 0) & (diff_ == 0))
    fp_contrib = fp_value * np.mean((sum_ == 1) & (diff_ == 1))
    fn_contrib = fn_value * np.mean((sum_ == 1) & (diff_ == -1))
    tp_contrib = tp_value * np.mean((sum_ == 2) & (diff_ == 0))
    return tn_contrib + fp_contrib + fn_contrib + tp_contrib

## Question 4 (1 points)

In this exercise, assume that you have already performed cross-validation for an `AdaBoostClassifier` and found that good values for its parameters are as follows:

- `base_estimator=DecisionTreeClassifier(random_state=42, max_depth=5)`

- `n_estimators=2000`

- `learning_rate=0.80`

Fit an `AdaBoostClassifier` on the training data using these parameter values (and `random_state=42`).

*Warning: it may take a few minutes to fit this beefy model! Feel free to take a coffee break ;)*

## Answer 4

In [None]:
ada_boost = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=5, random_state=42),
                               random_state=42, n_estimators=2000, learning_rate=0.80).fit(X_train, Y_train)



We will also fit a conventional decision tree for reference.

In [None]:
try:
    tree = DecisionTreeClassifier(random_state=42).fit(X_train, Y_train)
except NameError:
    print('The objects `X_train, Y_train` do not exist!')

## Question 5 (1 points)

Compute the problem-specific "value function" for the `AdaBoostClassifier` and for the `DecisionTreeClassifier`. 

Which model is performing best with respect to this metric?

## Answer 5

In [42]:
# your answer here

#adaboost:

value_function(y_true, y_pred, tn_value=10, fp_value=-10, fn_value=-100, tp_value=100):
    
print("accuracy_score train: " + round(accuracy_score(Y_train, ada_boost.predict(X_train)), 2).astype(str))
print("accuracy_score test: " + round(accuracy_score(Y_test, ada_boost.predict(X_test)), 2).astype(str))
print('--------------------')
print("precision_score train: " + round(precision_score(Y_train, ada_boost.predict(X_train)), 2).astype(str))
print("precision_score test: " + round(precision_score(Y_test, ada_boost.predict(X_test)), 2).astype(str))
print('--------------------')
print("recall_score train: " + round(recall_score(Y_train, ada_boost.predict(X_train)), 2).astype(str))
print("recall_score: " + round(recall_score(Y_test, ada_boost.predict(X_test)), 2).astype(str))
print('------------------------------------------------------------------------')
#Tree:
print("accuracy_score train: " + round(accuracy_score(Y_train, tree.predict(X_train)), 2).astype(str))
print("accuracy_score test: " + round(accuracy_score(Y_test, tree.predict(X_test)), 2).astype(str))
print('--------------------')
print("precision_score train: " + round(precision_score(Y_train, tree.predict(X_train)), 2).astype(str))
print("precision_score test: " + round(precision_score(Y_test, tree.predict(X_test)), 2).astype(str))
print('--------------------')
print("recall_score train: " + round(recall_score(Y_train, tree.predict(X_train)), 2).astype(str))
print("recall_score test: " + round(recall_score(Y_test, tree.predict(X_test)), 2).astype(str))

accuracy_score train: 1.0
accuracy_score test: 0.86
--------------------
precision_score train: 1.0
precision_score test: 0.39
--------------------
recall_score train: 1.0
recall_score: 0.23
------------------------------------------------------------------------
accuracy_score train: 1.0
accuracy_score test: 0.81
--------------------
precision_score train: 1.0
precision_score test: 0.23
--------------------
recall_score train: 1.0
recall_score test: 0.26


In [39]:
# the number for adaboost is better
accuracy_score train: 1.0
accuracy_score test: 0.86
--------------------
precision_score train: 1.0
precision_score test: 0.39
--------------------
recall_score train: 1.0
recall_score: 0.23
------------------------------------------------------------------------
accuracy_score train: 1.0
accuracy_score test: 0.81
--------------------
precision_score train: 1.0
precision_score test: 0.23
--------------------
recall_score train: 1.0
recall_score test: 0.26

Let's now try to quantify what is the monetary impact of using the `AdaBoostClassifier` as opposed to the `DecisionTreeClassifier` on our marketing campaign.

First off, for the evaluation of a given model, we will assume that in our marketing campaign we will only contact customers that are predicted as subscribers by our model.

With this in mind, let's create a "marketing campaign profit function".

In [40]:
def marketing_profits(model, X, Y, fp_value=-10, tp_value=100):
    tp_contrib = np.sum((model.predict(X) > 0) & (Y > 0)) * tp_value
    fp_contrib = np.sum((model.predict(X) > 0) & (Y < 1)) * fp_value
    return tp_contrib + fp_contrib

## Question 6 (2 points)

Based on the test data, by how much (percent-wise) do the profits for our future marketing campaign will increase (or decrease) if we use the `AdaBoostClassifier` ensemble model as opposed to the conventional `DecisionTreeClassifier`?

## Answer 6

In [47]:
# your answer here
#_ = ConfusionMatrixDisplay.from_estimator(ada_boost, X_test, Y_test)
adaboost_profit = marketing_profits(ada_boost, X, Y)
print(adaboost_profit)

#1545390

#ada_boost_profits = marketing_profits(ada_boost, X_test, Y_test)
#tree_profits = marketing_profits(tree, X_test, Y_test)

1545390


In [49]:
#_ = ConfusionMatrixDisplay.from_estimator(tree, X_test, Y_test)
tree_profit = marketing_profits(ada_boost, X, Y, 1180, 358)
print(tree_profit)

#2098440

2098440


In [None]:
 #prof
   # On Q6,  by how much (percent-wise) do the profits for our future marketing campaign will increaseyou divide by the ADA boosted value but you want to divide by the tree value in this case. 
    #variation = ada_boost_profits / tree_profits - 1

In [50]:
#mine
(ada_boost-tree)/tree
(1545390-2098440)/1545390=-35.79%
the profit will decrease by 35.79%
7.7%

SyntaxError: cannot assign to expression here. Maybe you meant '==' instead of '='? (3293766809.py, line 1)

## Question 7 (3 points)

1. Build an additional classifier of your choice for this problem.
   Make sure to follow best practices with cross-validation and evaluation on
   the test set!

2. Evaluate it against the two models above with respect to

   - the `value_function` that we defined
   
   - and increase/decrease in marketing campaign profits.

A few notes:

- The dataset is imbalanced, with only about 10% of the observations in the training
  data representing a positive marketing contact (i.e., $Y=1$).
  Is there any way to address this issue when fitting the model?
  See e.g., the `class_weight` parameter of `AdaBoostClassifier` or other
  classification algorithms. Chances are that setting `class_weight="balanced"`
  will improve the results.

- You can also try and use AUC (the area under the ROC curve) as a target metric
  for optimization. If you would like to experiment with that, try to set
  `scoring="roc_auc"` in `GridSearchCV`.

- We could try and optimize our models for our `value_function` directly.
  This may further improve our results. To do so, simply set
  `scoring=value_function_wrapper` in `GridSearchCV`.
  Note that `value_function_wrapper` is defined in the next cell and it is
  simply a version of our `value_function` that can be used as a scoring
  function by `sklearn`.

In [None]:
def _value_function(y, y_pred, **kwargs):
    return value_function(y, y_pred, **kwargs)


value_function_wrapper = make_scorer(_value_function)

## Answer 7

In [58]:
# your answer here
ada_boost2 = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=5, random_state=42),
                                 random_state=42, n_estimators=2000, learning_rate=0.80)

# Determine class weights
class_counts = np.bincount(Y_train)
class_weights = {i: sum(class_counts) / count for i, count in enumerate(class_counts)}

# Adjust sample weights
sample_weights = [class_weights[label] for label in Y_train]

# Fit AdaBoostClassifier with adjusted sample weights
ada_boost2.fit(X_train, Y_train, sample_weight=sample_weights)

# Get predicted probabilities
ada_probs2 = ada_boost2.predict_proba(X_test)[:, 1]

# Compute ROC curve and AUC for AdaBoost
fpr_ada2, tpr_ada2, _ = roc_curve(y_test, ada_probs2)
roc_auc_ada2 = auc(fpr_ada, tpr_ada)

# Plot ROC curve for AdaBoostClassifier
plt.figure(figsize=(8, 6))
plt.plot(fpr_ada, tpr_ada, color='blue', lw=2, label='AdaBoost2 (AUC = %0.2f)' % roc_auc_ada)
plt.plot([0, 1], [0, 1], color='gray', lw=1, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) - AdaBoost2')
plt.legend(loc='lower right')
plt.show()



NameError: name 'y_test' is not defined

In [None]:
#prod
rf_cv = GridSearchCV(
    RandomForestClassifier(
        n_estimators=256,
        class_weight="balanced",  # account for class imbalance
        n_jobs=-1, random_state=42
    ),
    {
        "max_depth": [2**i for i in range(5)],
        "max_features": [2**i for i in range(5)]
    },
    n_jobs=-1,
    cv=2,
    scoring=value_function_wrapper  # use custom metric to find the best model
).fit(X_train, Y_train)

In [None]:
# my code Define your RandomForestClassifier and parameter grid
param_grid = {
        "max_depth": [2**i for i in range(5)],
        "max_features": [2**i for i in range(5)]
    }  # Define your parameter grid here
rf_classifier = RandomForestClassifier()

# Perform grid search with custom scoring function
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, scoring=value_function_wrapper, cv=5)
grid_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

In [None]:
param_grid = {
    'n_estimators': [2000],
    'max_depth': [5],
    'random_state': [42],
    # Add other parameters as needed
}
random_forest = RandomForestClassifier(class_weight="balanced")

# Create a GridSearchCV object
grid_search = GridSearchCV(random_forest, param_grid, scoring="roc_auc", cv=5)

# Fit the GridSearchCV object to your data
grid_search.fit(X_train, Y_train)

