Getting Started | Data Prep | Data Exploration | **Preprocessing** | Model Tuning | Final Model

*For preprocessing, we will:*
* Create a Preprocessor object to process data
* Complete feature engineering and value imputation (if needed)
* Encode categorical variables

In [1]:
import pandas as pd
import numpy as np

In [2]:
# adding the modules directory to path
import sys
sys.path.insert(0, '../modules')

# reading in functions
from helpers import read_in_dataset, get_percent, filter_duplicates, get_data

## Read in Data

In [3]:
# Load the training split through the project helper and unpack its two
# return values as features (train_X) and target (train_y).
# NOTE(review): presumably both are indexed by Opportunity Number -- confirm
# against helpers.get_data.
train_X, train_y = get_data('train')

In [6]:
# Sanity check for lengths and opportunity numbers: features and target must
# have the same number of rows and the same set of index labels.
# NOTE(review): comparing sets ignores label order and duplicate labels, so
# this does not prove the two objects are aligned row by row.
assert len(train_X) == len(train_y)
assert set(train_X.index) == set(train_y.index)

## Create Initial Preprocessor Object 

In [13]:
class Preprocessor(object):
    '''
    Minimal preprocessor that removes a configurable set of columns.

    Parameters
    ----------
    cols_to_filter : list-like of column labels, or None
        Columns removed from the feature frame by ``transform``.
        ``None`` (the default) means no columns are removed.
    '''

    def __init__(self, cols_to_filter=None):
        self.cols_to_filter = cols_to_filter

    def fit(self, X_features, y_target=None):
        '''
        Learn information from the training set to transform the test set.

        Nothing is learned in this version; ``self`` is returned so that
        calls can be chained.
        '''
        return self

    def transform(self, X_features, y_target=None):
        '''
        Transform the training or test data based on info learned in fit step.

        Returns a new DataFrame without the columns in ``cols_to_filter``;
        ``X_features`` itself is left untouched. A label in
        ``cols_to_filter`` that is absent from ``X_features`` makes
        ``DataFrame.drop`` raise ``KeyError``.
        '''
        # DataFrame.drop raises when it is handed labels=None, so the default
        # configuration has to be handled explicitly as "nothing to drop".
        if self.cols_to_filter is None:
            return X_features.copy()

        return X_features.drop(self.cols_to_filter, axis=1)

In [15]:
# Smoke test of the column filter: drop 'Competitor Type' and preview the
# first rows of the result.
p = Preprocessor(cols_to_filter=['Competitor Type'])
p.transform(train_X).head()

Unnamed: 0_level_0,Supplies Subgroup,Supplies Group,Region,Route To Market,Elapsed Days In Sales Stage,Sales Stage Change Count,Total Days Identified Through Closing,Total Days Identified Through Qualified,Opportunity Amount USD,Client Size By Revenue,Client Size By Employee Count,Revenue From Client Past Two Years,Ratio Days Identified To Total Days,Ratio Days Validated To Total Days,Ratio Days Qualified To Total Days,Deal Size Category
Opportunity Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
6748356,Batteries & Accessories,Car Accessories,Northwest,Fields Sales,91,2,3,3,20000,4,4,0,1.0,0.0,0.0,2
9608239,Exterior Accessories,Car Accessories,Northeast,Reseller,12,2,5,5,8000,1,1,0,0.0,1.0,0.0,1
8139132,Garage & Car Care,Car Accessories,Pacific,Reseller,35,3,7,7,9300,1,1,0,0.0,1.0,0.0,1
8423006,Motorcycle Parts,Performance & Non-auto,Northwest,Reseller,7,7,20,20,26315,1,1,0,0.0,0.172589,0.827411,3
8104406,Motorcycle Parts,Performance & Non-auto,Midwest,Reseller,26,2,17,17,9202,1,1,0,0.0,1.0,0.0,1


## Encoding Categorical/Discrete Features

In [26]:
class Preprocessor(object):
    '''
    Column filter and one-hot encoder with a fixed output schema.

    ``fit`` records which columns are categorical (object dtype) and the full
    list of columns produced by one-hot encoding the training data.
    ``transform`` applies the same encoding to any frame and aligns the
    result to exactly those columns, in the same order.

    Parameters
    ----------
    cols_to_filter : list-like of column labels, or None
        Columns removed before encoding. ``None`` (the default) means no
        columns are removed.

    Attributes
    ----------
    was_fit : bool
        True once ``fit`` has completed successfully.
    categorical_features : pandas.Index
        Object-dtype columns found in the filtered training data.
    col_names : pandas.Index
        Columns of the one-hot encoded training data; the output schema.
    '''

    def __init__(self, cols_to_filter=None):
        self.cols_to_filter = cols_to_filter
        self.was_fit = False

    def _filter_columns(self, X_features):
        '''Return a new frame without the columns in ``cols_to_filter``.'''
        # DataFrame.drop raises when it is handed labels=None, so the default
        # configuration has to be handled explicitly as "nothing to drop".
        if self.cols_to_filter is None:
            return X_features.copy()
        return X_features.drop(self.cols_to_filter, axis=1)

    def fit(self, X_features, y_target=None):
        '''
        Learn information from the training set to transform the test set.

        Stores the categorical column names and the encoded column layout,
        then returns ``self``. ``y_target`` is accepted for API symmetry and
        is not used.
        '''
        # Filter columns
        X_features_new = self._filter_columns(X_features)

        # Encode categorical variables
        self.categorical_features = X_features_new.dtypes[X_features_new.dtypes == 'object'].index
        self.col_names = pd.get_dummies(
            X_features_new, columns=self.categorical_features
        ).columns

        # Only mark the object as fitted once everything above succeeded, so
        # a failed fit cannot leave a half-initialised preprocessor usable.
        self.was_fit = True

        return self

    def transform(self, X_features, y_target=None):
        '''
        Transform the training or test data based on info learned in fit step.

        Returns a new DataFrame whose columns are exactly ``col_names``.
        Dummy columns for categories that are missing from ``X_features`` are
        added as all zeros, dummy columns for categories that were not seen
        during ``fit`` are discarded, and remaining nulls are replaced by -1.

        Raises
        ------
        RuntimeError
            If ``fit`` has not been called.
        '''
        if not self.was_fit:
            raise RuntimeError("Need to fit preprocessor first")

        # Filter columns
        X_features_new = self._filter_columns(X_features)

        # Encode categorical variables
        X_features_new = pd.get_dummies(X_features_new, columns=self.categorical_features)

        # Align to the training layout: fill_value only applies to columns
        # that had to be created, existing nulls are handled below.
        X_features_new = X_features_new.reindex(columns=self.col_names, fill_value=0)

        # Fill Null Values
        X_features_new = X_features_new.fillna(-1)

        return X_features_new

    def fit_transform(self, X_features, y_target=None):
        '''
        fit and transform for sklearn pipeline
        '''

        return self.fit(X_features, y_target).transform(X_features, y_target)

In [27]:
# Fit on the training features so that the categorical columns and the
# encoded column layout are learned from the training data.
p = Preprocessor(cols_to_filter=['Competitor Type'])
p.fit(train_X)

<__main__.Preprocessor instance at 0x118e5b830>

In [28]:
# Apply the fitted preprocessor to the training features; train_X itself is
# not reassigned.
train_X_transformed = p.transform(train_X)

In [31]:
# Preview the first rows of the encoded frame.
train_X_transformed.head()

Unnamed: 0_level_0,Elapsed Days In Sales Stage,Sales Stage Change Count,Total Days Identified Through Closing,Total Days Identified Through Qualified,Opportunity Amount USD,Client Size By Revenue,Client Size By Employee Count,Revenue From Client Past Two Years,Ratio Days Identified To Total Days,Ratio Days Validated To Total Days,...,Region_Northeast,Region_Northwest,Region_Pacific,Region_Southeast,Region_Southwest,Route To Market_Fields Sales,Route To Market_Other,Route To Market_Reseller,Route To Market_Telecoverage,Route To Market_Telesales
Opportunity Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6748356,91,2,3,3,20000,4,4,0,1.0,0.0,...,0,1,0,0,0,1,0,0,0,0
9608239,12,2,5,5,8000,1,1,0,0.0,1.0,...,1,0,0,0,0,0,0,1,0,0
8139132,35,3,7,7,9300,1,1,0,0.0,1.0,...,0,0,1,0,0,0,0,1,0,0
8423006,7,7,20,20,26315,1,1,0,0.0,0.172589,...,0,1,0,0,0,0,0,1,0,0
8104406,26,2,17,17,9202,1,1,0,0.0,1.0,...,0,0,0,0,0,0,0,1,0,0


In [34]:
# The transformed frame must not contain a null in any column.
assert not train_X_transformed.isna().any().any()