# Homework 04 - Evaluation
The goal of this homework is to determine if a client has subscribed a term deposit or not, using alternative methods to evaluate the model.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

## Dataset
This homework will be using the Bank Marketing dataset.

In [2]:
bm_url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv"
bm_df = pd.read_csv(bm_url)
bm_df.columns = bm_df.columns.str.lower().str.replace(' ', '_')
bm_df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


## Data preparation
Split the data into train/val/test (60%/20%/20%) with a `random_state=1`.

In [3]:
categorical_cols = list(bm_df.dtypes[bm_df.dtypes == 'object'].index)
numeric_cols = [col for col in list(bm_df.dtypes.index) if col not in categorical_cols and col != 'converted']
(categorical_cols, numeric_cols)

(['lead_source', 'industry', 'employment_status', 'location'],
 ['number_of_courses_viewed',
  'annual_income',
  'interaction_count',
  'lead_score'])

In [4]:
bm_df.isna().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [5]:
bm_df[categorical_cols] = bm_df[categorical_cols].fillna('NA')
bm_df[numeric_cols] = bm_df[numeric_cols].fillna(0.0)

In [6]:
bm_df.isna().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [7]:
full_train_df, test_df = train_test_split(bm_df, test_size=0.2, random_state=1)
train_df, val_df = train_test_split(full_train_df, test_size=0.25, random_state=1)
(len(train_df), len(val_df), len(test_df))

(876, 293, 293)

## Question 1
Which numerical variable has the highest AUC? `balance`, `day`, `duration`, or `previous`?

In [8]:
feature_importance = []

for num_col in train_df.dtypes[train_df.dtypes != 'object'].index:
    ras = roc_auc_score(train_df.converted, train_df[num_col])
    if ras < 0.5:
        ras = roc_auc_score(train_df.converted, -train_df[num_col])
    feature_importance.append(
        (num_col, ras))

pd.DataFrame(feature_importance, columns=['feature', 'roc_auc_score']).sort_values(by='roc_auc_score', ascending=False)

Unnamed: 0,feature,roc_auc_score
4,converted,1.0
0,number_of_courses_viewed,0.763568
2,interaction_count,0.73827
3,lead_score,0.614499
1,annual_income,0.551958


## Question 2
What is the AUC of the model on the validation dataset?

In [9]:
def train(tr_df, v_df):
    train_dict = tr_df.drop('converted', axis=1).to_dict(orient='records')
    train_y = tr_df.converted.values

    val_dict = v_df.drop('converted', axis=1).to_dict(orient='records')
    val_y = v_df.converted.values
    
    dv = DictVectorizer(sparse=False)
    train_X = dv.fit_transform(train_dict)
    val_X = dv.fit_transform(val_dict)

    model = LogisticRegression(
        solver='liblinear',
        C=1.0,
        max_iter=1000)
    model.fit(train_X, train_y)

    pred_y = model.predict_proba(val_X)[:, 1]

    return val_y, pred_y

In [10]:
def ras(tr_df, v_df):
    val_y, pred_y = train(tr_df, v_df)
    return roc_auc_score(val_y, pred_y)

In [11]:
round(ras(train_df, val_df), 3)

0.817