# Homework 04 - Evaluation
The goal of this homework is to predict whether a client has subscribed to a term deposit, and to evaluate the model using alternative metrics.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

## Dataset
This homework will be using the Bank Marketing dataset.

In [2]:
# One-time dataset download; uncomment these shell commands on the first run.
#!mkdir data
#!wget -O data/bank_marketing.zip https://archive.ics.uci.edu/static/public/222/bank+marketing.zip
#!unzip -d data data/bank_marketing.zip 
#!unzip -d data data/bank.zip

# The file is semicolon-separated; column names are normalised to
# lower-case snake_case right after loading.
bm_df = (
    pd.read_csv("data/bank-full.csv", sep=';')
    .rename(columns=lambda name: name.lower().replace(' ', '_'))
)
bm_df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


## Data preparation
Split the data into train/val/test (60%/20%/20%) with `random_state=1`. The assignment does not mention checking for missing values.

In [3]:
# Columns used for the rest of the homework (features plus the target `y`).
cols = [
    'age',
    'job',
    'marital',
    'education',
    'balance',
    'housing',
    'contact',
    'day',
    'month',
    'duration',
    'campaign',
    'pdays',
    'previous',
    'poutcome',
    'y'
]

# Restrict the frame to `cols` *before* splitting; otherwise the columns left
# out of the list (e.g. `default`, `loan`) silently end up as model features.
bm_subset_df = bm_df[cols]

# 20% is held out for test; 0.25 of the remaining 80% gives a 20% validation
# share, which leaves 60% for training.
full_train_df, test_df = train_test_split(bm_subset_df, test_size=0.2, random_state=1)
train_df, val_df = train_test_split(full_train_df, test_size=0.25, random_state=1)
(len(train_df), len(val_df), len(test_df))

(27126, 9042, 9043)

## Question 1
Which numerical variable has the highest AUC? `balance`, `day`, `duration`, or `previous`?

In [4]:
# Score every numerical column by using its raw values as the prediction for `y`.
# `select_dtypes(include='number')` picks numeric columns explicitly instead of
# "anything that is not object", which would also match non-numeric dtypes.
numeric_cols = train_df.select_dtypes(include='number').columns

feature_importance = []

for num_col in numeric_cols:
    # Named `auc` rather than `ras` so that re-running this cell cannot
    # overwrite the `ras()` helper function defined further down.
    auc = roc_auc_score(train_df.y, train_df[num_col])
    if auc < 0.5:
        # The feature is negatively correlated with the target: invert it.
        auc = roc_auc_score(train_df.y, -train_df[num_col])
    feature_importance.append((num_col, auc))

pd.DataFrame(feature_importance, columns=['feature', 'roc_auc_score']).sort_values(by='roc_auc_score', ascending=False)

Unnamed: 0,feature,roc_auc_score
3,duration,0.8147
6,previous,0.598565
5,pdays,0.590128
1,balance,0.588831
4,campaign,0.571454
2,day,0.525958
0,age,0.512186


## Question 2
What is the AUC of the model on the validation dataset?

In [5]:
def train(tr_df, v_df):
    """Fit a logistic regression on ``tr_df`` and score the rows of ``v_df``.

    Both frames must contain a ``y`` column holding ``'yes'`` / ``'no'``;
    every other column is used as a feature and vectorised with a
    ``DictVectorizer``.

    Parameters
    ----------
    tr_df : pandas.DataFrame
        Rows used to fit the vectorizer and the model.
    v_df : pandas.DataFrame
        Rows to score with the fitted model.

    Returns
    -------
    tuple
        ``(val_y, pred_y)``: the 0/1-encoded targets of ``v_df`` and the
        predicted probability of the positive class for each of its rows.
    """
    target_map = {'yes': 1, 'no': 0}

    train_dict = tr_df.drop('y', axis=1).to_dict(orient='records')
    train_y = tr_df.y.map(target_map).values

    val_dict = v_df.drop('y', axis=1).to_dict(orient='records')
    val_y = v_df.y.map(target_map).values

    dv = DictVectorizer(sparse=False)
    train_X = dv.fit_transform(train_dict)
    # Only *transform* the validation records: refitting the vectorizer here
    # would derive a new feature layout from the validation data, so columns
    # could be reordered or dropped relative to what the model was trained on.
    val_X = dv.transform(val_dict)

    model = LogisticRegression(
        solver='liblinear',
        C=1.0,
        max_iter=1000)
    model.fit(train_X, train_y)

    # Column 1 of predict_proba is the probability of the positive class.
    pred_y = model.predict_proba(val_X)[:, 1]

    return val_y, pred_y

In [6]:
def ras(tr_df, v_df):
    """Return the ROC AUC obtained by training on ``tr_df`` and scoring ``v_df``.

    Delegates to ``train`` for fitting and prediction, then passes the
    resulting ``(targets, predicted probabilities)`` pair to ``roc_auc_score``.
    """
    return roc_auc_score(*train(tr_df, v_df))

In [7]:
# AUC of the model trained on `train_df`, evaluated on `val_df`, to three decimals.
round(ras(train_df, val_df), ndigits=3)

0.901