In [5]:
import math # for sqrt...
import pandas as pd
import numpy as np
import matplotlib as mpl
import seaborn as sb
from sklearn.model_selection import train_test_split # for train test split
from sklearn.metrics import mutual_info_score # for mutual information score
from sklearn.metrics import accuracy_score # for accuracy score
from sklearn.metrics import mean_squared_error # for q6
from sklearn.feature_extraction import DictVectorizer # for one-hot encoding
from sklearn.linear_model import LogisticRegression # for log reg
from sklearn.linear_model import Ridge # for q6
import pickle

from sklearn.metrics import roc_auc_score # for hw4 q1

In [2]:
df = pd.read_csv("AER_credit_card_data.csv")
# The goal of this homework is to inspect the output of different evaluation metrics 
# by creating a classification model (target column card).

In [3]:
# Preparation
# Create the target variable by mapping yes to 1 and no to 0.
df['card'] = df['card'].map(dict(yes=1, no=0)) # <-- target
df['owner'] = df['owner'].map(dict(yes=1, no=0))
df['selfemp'] = df['selfemp'].map(dict(yes=1, no=0))
# df.head()

Unnamed: 0,card,reports,age,income,share,expenditure,owner,selfemp,dependents,months,majorcards,active
0,1,0,37.66667,4.52,0.03327,124.9833,1,0,3,54,1,12
1,1,0,33.25,2.42,0.005217,9.854167,0,0,3,34,1,13
2,1,0,33.66667,4.5,0.004156,15.0,1,0,4,58,1,5
3,1,0,30.5,2.54,0.065214,137.8692,0,0,0,25,1,7
4,1,0,32.16667,9.7867,0.067051,546.5033,1,0,2,64,1,5


In [4]:
# Split the dataset into 3 parts: train/validation/test with 60%/20%/20% distribution. 
# Use train_test_split funciton for that with random_state=1.
X_train_valid, X_test, y_train_valid, y_test = train_test_split(
    df.drop('card', axis = 1),
    df[['card']],
    train_size = 0.8,
    test_size = 0.2,
    random_state = 42)
# print(df.shape)
# print()
# print(X_train_valid.shape)
# print(y_train_valid.shape)
# print()
# print(X_test.shape)
# print(y_test.shape)

X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_valid,
    y_train_valid,
    train_size = 0.75,
    test_size = 0.25,
    shuffle = False)
# print(X_train.shape)
# print(y_train.shape)
# print()
# print(X_valid.shape)
# print(y_valid.shape)

In [10]:
# Question 1
# ROC AUC could also be used to evaluate feature importance of numerical variables.
# Let's do that

# For each numerical variable, use it as score and compute AUC with the card variable.
# Use the training dataset for that.
# If your AUC is < 0.5, invert this variable by putting "-" in front
roc_auc_reports = roc_auc_score(y_train, -X_train[['reports']])
print(roc_auc_reports)

roc_auc_dependents = roc_auc_score(y_train, -X_train[['dependents']])
print(roc_auc_dependents)

roc_auc_active = roc_auc_score(y_train, X_train[['active']])
print(roc_auc_active)

roc_auc_share = roc_auc_score(y_train, X_train[['share']])
print(roc_auc_share)

# numerical variables: reports, age, income, share, expenditure, dependents, months, active
# relevant for question: reports, dependents, active, share

# (e.g. -df_train['expenditure'])
# AUC can go below 0.5 if the variable is negatively correlated with the target varialble. You can change the direction of the correlation by negating this variable - then negative correlation becomes positive.

# Which numerical variable (among the following 4) has the highest AUC?
# reports
# dependents
# active
# --> share <--

0.7119481870960674
0.5229214590864278
0.5951363785737758
0.9876446854346954


In [None]:
# Training the model
# From now on, use these columns only:
# ["reports", "age", "income", "share", "expenditure", "dependents", "months", "majorcards", "active", "owner", "selfemp"]

# Apply one-hot-encoding using DictVectorizer and train the logistic regression with these parameters:
# LogisticRegression(solver='liblinear', C=1.0, max_iter=1000)

In [None]:
# Question 2
# What's the AUC of this model on the validation dataset? (round to 3 digits)
# 0.615
# 0.515
# 0.715
# 0.995

In [None]:
# Question 3
# Now let's compute precision and recall for our model.
# Evaluate the model on all thresholds from 0.0 to 1.0 with step 0.01
# For each threshold, compute precision and recall
# Plot them
# At which threshold precision and recall curves intersect?
# 0.1
# 0.3
# 0.6
# 0.8

In [None]:
# Question 4
# Precision and recall are conflicting - when one grows, the other goes down. That's why they are often combined into the F1 score - a metrics that takes into account both
# This is the formula for computing F1:
# F1 = 2 * P * R / (P + R)
# Where P is precision and R is recall.
# Let's compute F1 for all thresholds from 0.0 to 1.0 with increment 0.01
# At which threshold F1 is maximal?
# 0.1
# 0.4
# 0.6
# 0.7

In [None]:
# Question 5
# Use the KFold class from Scikit-Learn to evaluate our model on 5 different folds:
# KFold(n_splits=5, shuffle=True, random_state=1)
# Iterate over different folds of df_full_train
# Split the data into train and validation
# Train the model on train with these parameters: LogisticRegression(solver='liblinear', C=1.0, max_iter=1000)
# Use AUC to evaluate the model on validation
# How large is standard devidation of the AUC scores across different folds?
# 0.003
# 0.014
# 0.09
# 0.24

In [None]:
# Question 6
# Now let's use 5-Fold cross-validation to find the best parameter C
# Iterate over the following C values: [0.01, 0.1, 1, 10]
# Initialize KFold with the same parameters as previously
# Use these parametes for the model: LogisticRegression(solver='liblinear', C=C, max_iter=1000)
# Compute the mean score as well as the std (round the mean and std to 3 decimal digits)
# Which C leads to the best mean score?
# 0.01
# 0.1
# 1
# 10
# If you have ties, select the score with the lowest std. If you still have ties, select the smallest C