https://github.com/jem1031/pandas-pipelines-custom-transformers

In [1]:
import pandas as pd
import numpy as np

In [2]:
# SET UP
from sklearn.model_selection import train_test_split

# Load the permit records from the local data folder
# source: https://data.seattle.gov/Permitting/Special-Events-Permits/dm95-f8w5
data_folder = './data/'
data_file = 'Special_Events_Permits_2016.csv'
csv_path = data_folder + data_file
df = pd.read_csv(csv_path)

# Hold out 25% of the rows as test data.
# test_size=0.25 spells out train_test_split's default; the fixed
# random_state makes the split repeatable across runs.
df_train, df_test = train_test_split(
    df,
    test_size=0.25,
    random_state=4321,
)

# Preview the first training rows (last expression of the cell is displayed)
df_train.head()


Unnamed: 0,application_date,permit_status,permit_type,event_category,event_sub_category,name_of_event,year_month_app,event_start_date,event_end_date,event_location_park,event_location_neighborhood,council_district,precinct,organization,attendance
497,04/25/2016 12:00:00 AM,Complete,Special Event,Community,,Rainier Valley Heritage Festival,S16AU260,08/13/2016 12:00:00 AM,08/13/2016 12:00:00 AM,,Rainier Valley,2,South,Rainier Chamber Foundation,3020.0
303,08/16/2016 12:00:00 AM,Complete,Special Event,Community,,West Seattle Hometown Holidays,S16DE438,12/03/2016 12:00:00 AM,12/03/2016 12:00:00 AM,,West Seattle,1,West,West Seattle Junction Association,770.0
39,02/10/2016 12:00:00 AM,Complete,Special Event,Commercial,,NFFTY 2016 Opening Night,S16AP122,04/28/2016 12:00:00 AM,04/28/2016 12:00:00 AM,,Downtown,7,West,The Talented Youth (AKA NFFTY),750.0
364,08/03/2016 12:00:00 AM,Complete,Special Event,Free Speech,,Carpenters 4 Carpenters Rally and March to Tru...,S16AU421,08/04/2016 12:00:00 AM,08/04/2016 12:00:00 AM,,Belltown,7,West,Carpenters 4 Carpenters,250.0
410,06/13/2016 12:00:00 AM,Complete,Charter Vessel,,,Argosy Cruises - Celebrations,CV16JY344,07/04/2016 12:00:00 AM,07/04/2016 12:00:00 AM,,Wallingford,4,North,Argosy Cruises,149.0


In [3]:
# SIMPLE MODEL
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# Target: 1 when permit_status is 'Complete', 0 for anything else
y_train, y_test = (
    np.where(frame['permit_status'] == 'Complete', 1, 0)
    for frame in (df_train, df_test)
)

# Only predictor: attendance, with missing values replaced by 0
X_train_1, X_test_1 = (
    frame[['attendance']].fillna(value=0)
    for frame in (df_train, df_test)
)

# Train the classifier, then score the rows it was trained on
model_1 = LogisticRegression(random_state=5678).fit(X_train_1, y_train)
y_pred_train_1 = model_1.predict(X_train_1)
p_pred_train_1 = model_1.predict_proba(X_train_1)[:, 1]  # probability of class 1

# Evaluate on the held-out rows
# Reference point: give every test row the same score (the training-set
# positive rate). A constant score cannot rank rows, hence AUC 0.5.
p_baseline_test = len(y_test) * [y_train.mean()]
auc_baseline = roc_auc_score(y_test, p_baseline_test)
print(auc_baseline)  # 0.5

# Single-feature model on the same test rows
y_pred_test_1 = model_1.predict(X_test_1)
p_pred_test_1 = model_1.predict_proba(X_test_1)[:, 1]
auc_test_1 = roc_auc_score(y_test, p_pred_test_1)
print(auc_test_1)  # 0.576553672316


0.5
0.5765536723163842


In [4]:
# MODEL W/PIPELINE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline

from custom_transformers import ColumnExtractor, DFStandardScaler, DFFeatureUnion
from custom_transformers import DummyTransformer, Log1pTransformer, ZeroFillTransformer

# Group columns by type of preprocessing needed
OUTCOME = 'permit_status'
CAT_FEATS = [
    'permit_type', 'event_category', 'event_sub_category',
    'event_location_park', 'event_location_neighborhood']
NUM_FEATS = ['attendance']

# Preprocessing with a Pipeline
# Two branches each start from the raw frame, are combined by DFFeatureUnion,
# and the combined output is passed through DFStandardScaler.
# NOTE(review): every transformer here comes from the local
# custom_transformers module, which is not shown in this notebook. Judging by
# name only, the categorical branch dummy-encodes CAT_FEATS and the numeric
# branch zero-fills then applies log1p to NUM_FEATS -- confirm in that module.
pipeline = Pipeline([
    ('features', DFFeatureUnion([
        ('categoricals', Pipeline([
            ('extract', ColumnExtractor(CAT_FEATS)),
            ('dummy', DummyTransformer())
        ])),
        ('numerics', Pipeline([
            ('extract', ColumnExtractor(NUM_FEATS)),
            ('zero_fill', ZeroFillTransformer()),
            ('log', Log1pTransformer())
        ]))
    ])),
    ('scale', DFStandardScaler())
])
# Fit the preprocessing on the training rows only, then apply that same
# fitted pipeline to both splits.
pipeline.fit(df_train)
X_train_2 = pipeline.transform(df_train)
X_test_2 = pipeline.transform(df_test)

# Fit model
# y_train / y_test are the 0/1 outcome arrays built in the SIMPLE MODEL cell.
model_2 = LogisticRegression(random_state=5678)
model_2.fit(X_train_2, y_train)
y_pred_train_2 = model_2.predict(X_train_2)
p_pred_train_2 = model_2.predict_proba(X_train_2)[:, 1]  # probability of class 1

# Evaluate model
p_pred_test_2 = model_2.predict_proba(X_test_2)[:, 1]
auc_test_2 = roc_auc_score(y_test, p_pred_test_2)
# The value below is the output recorded for this cell.
print(auc_test_2)  # 0.7016949152542373


0.7016949152542373


In [7]:
# MODEL W/EVEN MORE FEATURES
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline

from custom_transformers import ColumnExtractor, DFStandardScaler, DFFeatureUnion, DFImputer
from custom_transformers import DummyTransformer, Log1pTransformer, ZeroFillTransformer
from custom_transformers import DateFormatter, DateDiffer, MultiEncoder

# Column groups, one per kind of preprocessing
OUTCOME = 'permit_status'
# Declared for reference; not fed into pipeline3 below
NEAR_UNIQUE_FEATS = ['name_of_event', 'year_month_app', 'organization']
CAT_FEATS = [
    'permit_type',
    'event_category',
    'event_sub_category',
    'event_location_park',
    'event_location_neighborhood',
]
MULTI_FEATS = ['council_district', 'precinct']
#DATE_FEATS = ['application_date', 'event_start_date', 'event_end_date']
NUM_FEATS = ['attendance']

# One small Pipeline per column group; they are combined further down.
# The date branch is left disabled, together with DATE_FEATS above.
#dates_branch = Pipeline([
#    ('extract', ColumnExtractor(DATE_FEATS)),
#    ('to_date', DateFormatter()),
#    ('diffs', DateDiffer()),
#    ('mid_fill', DFImputer(strategy='median'))
#])
categoricals_branch = Pipeline([
    ('extract', ColumnExtractor(CAT_FEATS)),
    ('dummy', DummyTransformer()),
])
# Values in these columns are split on ';' by MultiEncoder
multi_labels_branch = Pipeline([
    ('extract', ColumnExtractor(MULTI_FEATS)),
    ('multi_dummy', MultiEncoder(sep=';')),
])
numerics_branch = Pipeline([
    ('extract', ColumnExtractor(NUM_FEATS)),
    ('zero_fill', ZeroFillTransformer()),
    ('log', Log1pTransformer()),
])

# Combine the branches, then scale the combined output
pipeline3 = Pipeline([
    ('features', DFFeatureUnion([
        #('dates', dates_branch),
        ('categoricals', categoricals_branch),
        ('multi_labels', multi_labels_branch),
        ('numerics', numerics_branch),
    ])),
    ('scale', DFStandardScaler()),
])

# Fit on the training rows only, then transform both splits with the fitted pipeline
pipeline3.fit(df_train)
X_train_3, X_test_3 = (
    pipeline3.transform(frame) for frame in (df_train, df_test)
)

# Train the classifier and score the training rows
model_3 = LogisticRegression(random_state=5678).fit(X_train_3, y_train)
y_pred_train_3 = model_3.predict(X_train_3)
p_pred_train_3 = model_3.predict_proba(X_train_3)[:, 1]  # probability of class 1

# Score the held-out rows
p_pred_test_3 = model_3.predict_proba(X_test_3)[:, 1]
auc_test_3 = roc_auc_score(y_test, p_pred_test_3)
print(auc_test_3)  # 0.6790960451977401  # too many features -> starting to overfit


0.6790960451977401
