In [4]:
%matplotlib inline

In [68]:
import pandas as pd

from nose.tools import *
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import f1_score, classification_report

In [None]:
# YOUR CODE HERE
raise NotImplementedError()

In [None]:
np.random.seed(1234)

# Model Training and Improvement Lab
## Comparing and selecting models

### 1. Read the data (1 point)
Like in the previous lab, you need to read the Portuguese bank dataset [here](https://archive.ics.uci.edu/ml/machine-learning-databases/00222/). It has been provided for you in the `data` folder.

Read the dataset using `pandas` (you can use the library with the alias `pd`). Save it in the `bank_data` variable.

In [11]:
bank_data = pd.read_csv('data/bank.csv', sep=";")

In [14]:
bank_data.sample(5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
1451,29,services,married,secondary,no,3748,no,no,cellular,28,may,322,3,-1,0,unknown,no
4362,40,technician,married,secondary,no,697,no,no,cellular,24,jun,220,2,219,2,other,no
4105,39,entrepreneur,married,secondary,no,0,no,no,cellular,21,nov,108,7,-1,0,unknown,no
3371,49,unemployed,married,tertiary,no,-471,yes,no,cellular,20,nov,77,3,185,7,failure,no
3483,40,management,married,tertiary,no,7780,no,no,telephone,4,may,364,1,355,1,other,no


In [13]:
# From now on, all test cells might contain hidden tests. If you follow the instructions correctly, 
# your solution will be graded with maximum points
assert_is_not_none(bank_data)

### 2. Preprocess the data (1 point)
Separate explanatory features from labels. Save all features (16 columns total) in the variable `bank_features`. Save the labels (corresponding to the `y` column) in the `bank_labels` variable. Rewrite the labels to be `0` and `1` instead of `no` and `yes`: `bank_labels` should be a numeric column.

In [15]:
bank_data.dtypes

age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

In [17]:
bank_features, bank_labels = [bank_data.drop("y", axis=1), bank_data.y]

In [20]:
bank_labels.sample(5)

259     yes
3522     no
1444     no
1754    yes
85       no
Name: y, dtype: object

In [30]:
bank_labels = bank_labels.replace('yes', 1)
bank_labels = bank_labels.replace('no', 0)

In [31]:
bank_labels.unique()

array([0, 1], dtype=int64)

In [18]:
assert_is_not_none(bank_features)
assert_is_not_none(bank_labels)

### 3. Get indicator variables (1 point)
Get indicator (dummy) variables for all categorical columns in `bank_features`. Overwrite the `bank_features` variable to store the new data.

In [32]:
bank_features = pd.get_dummies(bank_features)

In [33]:
assert_equal(bank_features.shape, (4521, 51))

### 4. Split the data (1 point)
Split the data into training and testing set, with 70% of the data for training. Because the output labels are not equaly distributed, use stratification based on the `bank_labels`.

In [35]:
bank_features_train, bank_features_test, bank_labels_train, bank_labels_test = train_test_split(
                bank_features, bank_labels, train_size=0.7, stratify=bank_labels)

In [36]:
assert_is_not_none(bank_features_train)
assert_is_not_none(bank_labels_train)
assert_is_not_none(bank_features_test)
assert_is_not_none(bank_labels_test)

### 5. Train a baseline algorithm (1 point)
Train a logistic regression using the training data. Use 1 000 000 (`1e6`) as the value of C. Score it using the testing data. Save the score in the `baseline_score` variable. You should see a fairly high score.

In [47]:
model = LogisticRegression(C=1e6, solver='liblinear')
model.fit(bank_features_train, bank_labels_train)
baseline_score = model.score(bank_features_test, bank_labels_test)

In [48]:
baseline_score

0.899042004421518

In [49]:
assert_is_not_none(model)
assert_greater(baseline_score, 0.7)

### 6. Select a better score (2 points)
As you alrady saw, the positive examples are very few. If you aren't convinced, just check the counts.

We know that the default scoring (accuracy) isn't correct in this case. Better measures would be precision and recall. However, we only want one number. Evaluate the algorithm once again, using a standard scoring method which combines precision and recall. Overwrite the `baseline_score` variable.

Don't forget to score the model on the testing data only.

In [59]:
new_bank_labels_test = model.predict(bank_features_test)

In [64]:
report = classification_report(bank_labels_test, new_bank_labels_test)
print(report)

              precision    recall  f1-score   support

           0       0.91      0.98      0.94      1201
           1       0.64      0.28      0.39       156

    accuracy                           0.90      1357
   macro avg       0.78      0.63      0.67      1357
weighted avg       0.88      0.90      0.88      1357



In [65]:
baseline_score = f1_score(bank_labels_test, new_bank_labels_test)
print(baseline_score)

0.3911111111111111


In [66]:
assert_less(baseline_score, 0.7)

### 7. Tune your model (2 points)
Fine-tune the `C` and `max_iter` parameters.

Use full grid search with the following values:
* `C`: 0.0001, 0.01, 0.1, 1, 10, 100, 10000
* `max_iter`: 50, 100, 300, 1000
* `fit_itercept`: True, False

Save the grid search result in the `grid_search` variable. Don't forget to use the better scoring model that you obtained in the previous task.

In [67]:
mapper = {
    "C": [0.0001, 0.01, 0.1, 1, 10, 100, 10000],
    "max_iter": [50, 100, 300, 1000],
    "fit_intercept": [True, False]
}

In [75]:
grid_search = GridSearchCV(LogisticRegression(solver = 'liblinear'), mapper)

In [76]:
grid_search.fit(bank_features_train, bank_labels_train)



GridSearchCV(estimator=LogisticRegression(solver='liblinear'),
             param_grid={'C': [0.0001, 0.01, 0.1, 1, 10, 100, 10000],
                         'fit_intercept': [True, False],
                         'max_iter': [50, 100, 300, 1000]})

In [83]:
assert_is_not_none(grid_search)
assert_is_not_none(grid_search.best_estimator_)

### 8. Compare scores (1 point)
Use the best estimator from your grid search. Score it using the function from problem 6. Save your answer in `tuned_score`.

In [84]:
grid_search.best_estimator_

LogisticRegression(C=100, max_iter=50, solver='liblinear')

In [89]:
best_result = grid_search.best_estimator_.predict(bank_features_test)

In [93]:
tuned_score = f1_score(bank_labels_test, best_result)

In [94]:
print(tuned_score)

0.38392857142857145


In [95]:
print(baseline_score - tuned_score)

0.007182539682539624


Hmmmm, it seems we have not obtained a better algorithm, even the opposite (the difference is marginal and depends on the random initialization of the cross-validation datasets).

We can, of course, do a lot more things to improve our model's performance, such as normalizing the data, feature selection and feature engineering, trying out different aspects, e.g. polynomial terms, RANSAC; even boosting (we'll talk about this later). However, we'll stop at this point.

What can we conclude? It seems that this is close to the best performance we can get out of this algorithm, given these data points.

We can try improving (cleaning) our dataset, selecting features, etc. but we most likely need a better algorithm. In the next labs, we're going to explore that.