# Introduction: Logistic Regression

The purpose of this notebook is to gain an understanding of the simple method of logistic regression. Logistic regression is generally considered the simplest method of classification, but can still be a powerful and explainable algorithm.

In [1]:
# Standard Data Science Helpers
import numpy as np
import pandas as pd
import scipy

import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
init_notebook_mode(connected=True)

import cufflinks as cf
cf.set_config_file(world_readable=True, theme="pearl")
cf.go_offline(connected=True)

# Extra options
pd.options.display.max_rows = 10
pd.options.display.max_columns = 25
# Show all code cells outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [2]:
original_data = pd.read_csv('data/adult_income.csv')

def process_data(data):
    """
    Process the adult income dataset
    """
    data = data.copy()
    # Replace missing values
    data = data.replace({' ?': np.nan})
    
    # Code gender
    data['female'] = data['sex'].replace({' Male': 0, ' Female': 1})
    # Code target
    data['target'] = data['target'].replace({' >50K': 1, ' <=50K': 0})
    # Create single column for capital wealth
    data['capital'] = data['capital_gain'] - data['capital_loss']
    to_drop = ['country', 'education', 'sex', 
           'capital_gain', 'capital_loss', 
           'working_class',
          'race', 'occupation']
    # Remove excess columns
    data = data.drop(columns=to_drop)
    data = pd.get_dummies(data)
    return data

data = process_data(original_data)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 20 columns):
age                                      32561 non-null int64
weighting                                32561 non-null int64
education_num                            32561 non-null int64
hours_per_week_work                      32561 non-null int64
target                                   32561 non-null int64
female                                   32561 non-null int64
capital                                  32561 non-null int64
marital_status_ Divorced                 32561 non-null uint8
marital_status_ Married-AF-spouse        32561 non-null uint8
marital_status_ Married-civ-spouse       32561 non-null uint8
marital_status_ Married-spouse-absent    32561 non-null uint8
marital_status_ Never-married            32561 non-null uint8
marital_status_ Separated                32561 non-null uint8
marital_status_ Widowed                  32561 non-null uint8
relationship_ Husband  

In [3]:
corrs = data.corr()
corrs['target']

age                             0.234037
weighting                      -0.009463
education_num                   0.335154
hours_per_week_work             0.229689
target                          1.000000
                                  ...   
relationship_ Not-in-family    -0.188497
relationship_ Other-relative   -0.083716
relationship_ Own-child        -0.228532
relationship_ Unmarried        -0.142857
relationship_ Wife              0.123264
Name: target, Length: 20, dtype: float64

In [4]:
import plotly.figure_factory as ff
from plotly.offline import iplot

# Correlation Heatmap
iplot(ff.create_annotated_heatmap(corrs.iloc[:10, :10].round(3).values, x=list(corrs.iloc[:10, :10].columns), 
                                  y=list(corrs.iloc[:10, :10].index), annotation_text=corrs.iloc[:10, :10].round(3).values))


In [5]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split

# Features and target
X = data.copy()
y = X.pop('target')

# Split into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# Create the model
model = LogisticRegressionCV(Cs=10, cv = 3, scoring='roc_auc', n_jobs=-1, verbose=1, random_state=42)
model.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    2.3s finished


LogisticRegressionCV(Cs=10, class_weight=None, cv=3, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=-1, penalty='l2', random_state=42,
           refit=True, scoring='roc_auc', solver='lbfgs', tol=0.0001,
           verbose=1)

## Metrics

In [6]:
from sklearn.metrics import f1_score, roc_auc_score

def evaluate(model, X_test, y_test):
    """
    Test a model on a few classification metrics.
    """
    # Predictions and probabilities
    predictions = model.predict(X_test)
    probabilities = model.predict_proba(X_test)[:, 1]
    roc_auc = roc_auc_score(y_test, probabilities)
    f1_value = f1_score(y_test, predictions)
    accuracy = np.mean(predictions == y_test)
    
    # Get a baseline
    base_accuracy = np.mean(y_test == 0)
    print(f'ROC AUC: {roc_auc:.4f}')
    print(f'F1 Score: {f1_value:.4f}')
    print(f'Accuracy: {100 * accuracy:.2f}%')
    print(f'Baseline Accuracy: {100 * base_accuracy:.2f}%')
    
evaluate(model, X_test, y_test)

ROC AUC: 0.5500
F1 Score: 0.3103
Accuracy: 79.59%
Baseline Accuracy: 75.97%


# Model Outputs

In [7]:
probability = model.predict_proba(X_test)[:, 1]
log_odds = model.decision_function(X_test)
classes = model.predict(X_test)

In [8]:
yhat = pd.DataFrame(dict(probability=probability, log_odds=log_odds, classes=classes))
yhat.describe()

Unnamed: 0,probability,log_odds,classes
count,13025.0,13025.0,13025.0
mean,0.278616,-0.891544,0.055585
std,0.148823,2.021135,0.229128
min,0.002541,-5.972546,0.0
25%,0.201376,-1.377718,0.0
50%,0.252283,-1.086475,0.0
75%,0.317065,-0.767294,0.0
max,1.0,24.621157,1.0


# Converting Between Log Odds and Probabilities

In [9]:
log_odds = yhat['log_odds']

# Exponentiate the log odds
odds_ratio = np.exp(log_odds)

# Calculate the probabilitiy
probability = odds_ratio / (1 + odds_ratio)

# Sanity check
np.allclose(probability, yhat['probability'].values)

True

# Visualization of Log Odds and Probabilities

In [10]:
log_odds = yhat['log_odds']

# Apply sigmoid function by hand
probability = 1 / (1 + np.exp(-log_odds))

# Check
np.allclose(probability, yhat['probability'].values)

True

In [11]:
yhat['odds_ratio'] = odds_ratio
yhat['classes'] = yhat['classes'].map({0: '< 50K', 1: '>= 50K'})

In [12]:
yhat.sample(1000).iplot(x='log_odds', y='probability', 
           yTitle='Probability', xrange=(-5, 5), xTitle='Log Odds',
           title='Probability vs Log Odds', categories='classes')

In [13]:
yhat.sample(1000).iplot(x='log_odds', y='odds_ratio', 
           yTitle='odds_ratio', xrange=(-2, 4), yrange=(0,15),
           title='Odds Ratio vs Log Odds', categories='classes')

In [14]:
yhat.sample(1000).iplot(x='odds_ratio', y='probability', xTitle='Odds Ratio', yTitle='Probability', 
                        title='Probability vs Odds Ratio', 
                        mode='markers', xrange=(0, 10))

In [15]:
yhat.sort_values('probability').sample(1000)

Unnamed: 0,probability,log_odds,classes,odds_ratio
12289,0.333226,-0.693630,< 50K,0.499759
7075,0.321340,-0.747621,< 50K,0.473492
714,0.289100,-0.899762,< 50K,0.406667
760,0.303536,-0.830516,< 50K,0.435824
12278,0.343228,-0.648944,< 50K,0.522598
...,...,...,...,...
4207,0.182416,-1.500063,< 50K,0.223116
3001,0.305110,-0.823083,< 50K,0.439076
7006,0.163736,-1.630690,< 50K,0.195794
5065,0.241372,-1.145171,< 50K,0.318170


In [16]:
yhat.sort_values('probability').sample(1000).iplot(x='probability', mode='markers',
                                                   yrange=(0, 10), xTitle='Probability', yTitle='Odds',
                                                   title="Odds and Log Odds vs Probability")

In [17]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
_=model.fit(X_train, y_train)
evaluate(model, X_test, y_test)

ROC AUC: 0.8700
F1 Score: 0.6188
Accuracy: 83.32%
Baseline Accuracy: 75.97%


In [18]:
model = RandomForestClassifier(n_estimators=100, max_depth=20, random_state=50)
_ = model.fit(X_train, y_train)
evaluate(model, X_test, y_test)

ROC AUC: 0.9052
F1 Score: 0.6649
Accuracy: 85.53%
Baseline Accuracy: 75.97%
