# Homework 03 - Classification
The goal of this homework is to predict whether a client has signed up for the platform.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

## Dataset
This homework will use the Course Lead Scoring dataset (`course_lead_scoring.csv`).

In [2]:
# Read the lead-scoring CSV straight from its raw GitHub URL.
bm_url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv"
bm_df = pd.read_csv(bm_url)
# Normalise every header: lower-case it and swap spaces for underscores.
bm_df = bm_df.rename(columns=lambda name: name.lower().replace(' ', '_'))
bm_df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


## Data preparation
Check for missing values.  Replace missing categorical values with `NA` and missing numeric values with `0.0`.

In [3]:
# Split the feature columns by dtype.
# Categorical = every non-numeric column. Selecting on "not a number" instead of
# comparing dtypes with 'object' keeps this correct when pandas loads text with a
# dedicated string dtype rather than object (in which case `dtypes == 'object'`
# would silently return no categorical columns at all).
categorical_cols = list(bm_df.select_dtypes(exclude='number').columns)
# Numeric = everything else, minus the `converted` target, which is not a feature.
numeric_cols = [
    col
    for col in bm_df.columns
    if col not in categorical_cols and col != 'converted'
]
(categorical_cols, numeric_cols)

(['lead_source', 'industry', 'employment_status', 'location'],
 ['number_of_courses_viewed',
  'annual_income',
  'interaction_count',
  'lead_score'])

In [4]:
# Number of missing entries in each column before any imputation.
bm_df.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [5]:
# Missing categories become the literal label 'NA'; missing numbers become 0.0.
categorical_filled = bm_df[categorical_cols].fillna('NA')
numeric_filled = bm_df[numeric_cols].fillna(0.0)

# Write the imputed columns back into the working frame.
bm_df[categorical_cols] = categorical_filled
bm_df[numeric_cols] = numeric_filled

In [6]:
# Re-count missing entries to confirm the imputation left none behind.
bm_df.isnull().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

## Question 1
What is the most frequent observation for the column `industry`?


In [7]:
# mode() returns a Series (there can be ties); take its first entry.
industry_modes = bm_df['industry'].mode()
industry_modes.iloc[0]

'retail'

## Question 2
Create the correlation matrix for the numerical features of your dataset.
What are the two features that have the biggest correlation?

In [8]:
# Absolute pairwise correlations between the numeric features. Subtracting the
# identity matrix removes each feature's self-correlation from the diagonal so
# the largest remaining entry is the strongest cross-feature correlation.
abs_corr = bm_df[numeric_cols].corr().abs()
abs_corr - np.identity(len(numeric_cols))

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score
number_of_courses_viewed,0.0,0.00977,0.023565,0.004879
annual_income,0.00977,0.0,0.027036,0.01561
interaction_count,0.023565,0.027036,0.0,0.009888
lead_score,0.004879,0.01561,0.009888,0.0


## Split the data

In [9]:
# Two-stage split with a fixed seed: 20% of the rows are held out for test,
# then 25% of the remaining rows are held out for validation.
split_seed = 42
full_train_df, test_df = train_test_split(bm_df, test_size=0.2, random_state=split_seed)
train_df, val_df = train_test_split(full_train_df, test_size=0.25, random_state=split_seed)
len(train_df), len(val_df), len(test_df)

(876, 293, 293)

In [10]:
# Separate the `converted` target from each split. pop() returns the column and
# removes it from the frame in one step, so the label is no longer available
# as a feature afterwards.
full_train_y = full_train_df.pop('converted')
test_y = test_df.pop('converted')
train_y = train_df.pop('converted')
val_y = val_df.pop('converted')

## Question 3
Which variable has the highest mutual information score?

In [11]:
# Mutual information between each categorical feature and the training target,
# rounded to two decimals and listed from most to least informative.
mi_scores = pd.Series({
    col: mutual_info_score(train_df[col], train_y)
    for col in categorical_cols
})
mi_scores.map(lambda score: round(score, 2)).sort_values(ascending=False)

lead_source          0.04
industry             0.01
employment_status    0.01
location             0.00
dtype: float64

## Question 4
Train a logistic regression with one-hot encoding.
For repeatability, use the following parameters:
```py
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
```
What is the accuracy?

In [12]:
def train_log_reg(tr_df, tr_y, v_df, v_y, cols, C=1.0):
    """Fit a logistic regression on `cols` and return its validation accuracy.

    Parameters
    ----------
    tr_df : DataFrame holding the training features.
    tr_y : training labels, aligned row-for-row with `tr_df`.
    v_df : DataFrame holding the validation features.
    v_y : validation labels, aligned row-for-row with `v_df`.
    cols : list of column names to use as features.
    C : value passed through as the `C` parameter of LogisticRegression.

    Returns
    -------
    The fraction of validation rows whose label equals the thresholded
    prediction (probability >= 0.5 counts as a positive prediction).
    """
    def as_records(frame):
        # DictVectorizer consumes one {column: value} dict per row.
        return frame[cols].to_dict(orient='records')

    # Fit the encoder on the training rows only, then reuse it for validation
    # so both matrices share the same feature layout.
    vectorizer = DictVectorizer(sparse=False)
    X_fit = vectorizer.fit_transform(as_records(tr_df))
    X_eval = vectorizer.transform(as_records(v_df))

    classifier = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    classifier.fit(X_fit, tr_y)

    # Second predict_proba column; presumably P(converted == 1) for a 0/1 target.
    positive_proba = classifier.predict_proba(X_eval)[:, 1]
    is_correct = v_y == (positive_proba >= 0.5)

    return is_correct.mean()

In [13]:
# Validation accuracy of the model trained on every feature, to two decimals.
baseline_accuracy = train_log_reg(
    train_df, train_y, val_df, val_y, categorical_cols + numeric_cols)
round(baseline_accuracy, 2)

np.float64(0.7)

## Question 5
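Find the least useful feature using the _feature elimination_ technique: retrain the model with each feature removed in turn and compare its validation accuracy with that of the model trained on all features.
Which feature has the smallest difference in accuracy?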

In [14]:
cols = categorical_cols + numeric_cols

# Baseline: validation accuracy with every feature included.
orig_score = train_log_reg(train_df, train_y, val_df, val_y, cols)

# Leave one feature out at a time, retrain, and record how far the validation
# accuracy moves (in absolute terms) from the baseline.
results = [
    (
        dropped,
        abs(orig_score - train_log_reg(
            train_df, train_y, val_df, val_y,
            [kept for kept in cols if kept != dropped])),
    )
    for dropped in cols
]

pd.DataFrame(results, columns=['excluded_feature', 'score']).sort_values(by='score')


Unnamed: 0,excluded_feature,score
1,industry,0.0
2,employment_status,0.003413
0,lead_source,0.003413
7,lead_score,0.006826
3,location,0.010239
4,number_of_courses_viewed,0.143345
6,interaction_count,0.143345
5,annual_income,0.153584


## Question 6
Which of the `C` values leads to the best accuracy on the validation set?

In [15]:
cols = categorical_cols + numeric_cols
c_vals = [0.01, 0.1, 1, 10, 100]

# One (C, validation accuracy rounded to 3 decimals) pair per candidate value.
results = [
    (c, round(train_log_reg(train_df, train_y, val_df, val_y, cols, C=c), 3))
    for c in c_vals
]

pd.DataFrame(results, columns=['C', 'score']).sort_values(by='score', ascending=False)


Unnamed: 0,C,score
0,0.01,0.7
1,0.1,0.7
2,1.0,0.7
3,10.0,0.7
4,100.0,0.7
