## Basic Imports

In [190]:
import numpy as np
import pandas as pd
import datetime as dt
import os
import sys

from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

import matplotlib.pyplot as plt
%matplotlib inline

## Loading Data

**Note:** looks like the training set has 40,000 data-points and the test set has 10,000 data-points

In [191]:
DATA_PATH = os.path.join("../data/")


def load_churn_data(path=DATA_PATH, train=True):
    """Read one of the churn CSV files into a DataFrame.

    Parameters
    ----------
    path : str
        Directory holding ``churn_train.csv`` and ``churn_test.csv``.
    train : bool
        Truthy selects the training file, falsy selects the test file.

    Returns
    -------
    pandas.DataFrame
        Parsed contents of the selected CSV file.
    """
    file_name = "churn_train.csv" if train else "churn_test.csv"
    return pd.read_csv(os.path.join(path, file_name))

In [192]:
# Read the raw train and test sets once and keep them untouched ...
churn_train_orig = load_churn_data(path=DATA_PATH, train=True)
churn_test_orig = load_churn_data(path=DATA_PATH, train=False)

# ... and do all further work on independent copies.
churn_train = churn_train_orig.copy()
churn_test = churn_test_orig.copy()

In [193]:
# Column dtypes and non-null counts of the training set (shows which columns have gaps).
churn_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 12 columns):
avg_dist                  40000 non-null float64
avg_rating_by_driver      39838 non-null float64
avg_rating_of_driver      33472 non-null float64
avg_surge                 40000 non-null float64
city                      40000 non-null object
last_trip_date            40000 non-null object
phone                     39681 non-null object
signup_date               40000 non-null object
surge_pct                 40000 non-null float64
trips_in_first_30_days    40000 non-null int64
luxury_car_user           40000 non-null bool
weekday_pct               40000 non-null float64
dtypes: bool(1), float64(6), int64(1), object(4)
memory usage: 3.4+ MB


In [194]:
# Same overview for the test set, to compare schema and missing values with the training set.
churn_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 12 columns):
avg_dist                  10000 non-null float64
avg_rating_by_driver      9961 non-null float64
avg_rating_of_driver      8406 non-null float64
avg_surge                 10000 non-null float64
city                      10000 non-null object
last_trip_date            10000 non-null object
phone                     9923 non-null object
signup_date               10000 non-null object
surge_pct                 10000 non-null float64
trips_in_first_30_days    10000 non-null int64
luxury_car_user           10000 non-null bool
weekday_pct               10000 non-null float64
dtypes: bool(1), float64(6), int64(1), object(4)
memory usage: 869.2+ KB


## Pipeline Custom Transformer Classes

In [252]:
# Column groups of the raw churn data; each group is fed to its own sub-pipeline.
boolean_attributes = ['luxury_car_user']

datetime_attributes = ["last_trip_date", "signup_date"]

categorical_attributes = ["city", "phone"]

numerical_attributes = [
    "avg_dist",
    "avg_rating_by_driver",
    "avg_rating_of_driver",
    "avg_surge",
    "surge_pct",
    "trips_in_first_30_days",
    "weekday_pct",
]

# Labels that imputerConverter puts back on its output. They are built from
# the attribute lists themselves so that they always match, in content and in
# order, the columns picked by dataFrameSelector -- deriving them by dropping
# columns from churn_train would silently depend on the frame's column order
# and would pick up any extra column the frame happens to contain.
numerical_indices = pd.Index(numerical_attributes)
bool_indices = pd.Index(boolean_attributes)

### Dataframe Selector Class
this class will take in a dataframe X and return only the selected attributes of X

In [253]:
class dataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that keeps only a chosen subset of DataFrame columns."""

    def __init__(self, selected_attributes):
        # Column label(s) used to index the incoming frame in transform().
        self.selected_attributes = selected_attributes

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X[selected_attributes]``; X must be a pandas DataFrame."""
        assert isinstance(X, pd.DataFrame)
        wanted = self.selected_attributes
        return X[wanted]

### Feature Engineering Class
this class will take in a dataframe X and return X after selectively adding newly engineered features

In [254]:
class featureEngineering(BaseEstimator, TransformerMixin):
    """Append optional indicator features derived from existing columns.

    Parameters
    ----------
    add_weak_user : bool
        Add ``weak_user``: 1 when ``trips_in_first_30_days <= 2``, else 0.
    add_power_user : bool
        Add ``power_user``: 1 when ``trips_in_first_30_days >= 5``, else 0.
    add_extreme_dist : bool
        Add ``extreme_dist``: 0 when ``1.8 < avg_dist <= 10``, else 1.
    """

    def __init__(self, add_weak_user=True, add_power_user=True, add_extreme_dist=True):
        self.add_power_user = add_power_user
        self.add_weak_user = add_weak_user
        self.add_extreme_dist = add_extreme_dist

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of X with the enabled indicator columns appended.

        The input frame is never modified. New columns are appended in the
        order ``weak_user``, ``power_user``, ``extreme_dist``.
        """
        assert isinstance(X, pd.DataFrame)
        # Work on a copy: assigning a column directly on X would write into the
        # caller's frame whenever both user flags are switched off.
        X = X.copy()

        if self.add_weak_user:
            X['weak_user'] = (X['trips_in_first_30_days'] <= 2).astype(int)

        if self.add_power_user:
            X['power_user'] = (X['trips_in_first_30_days'] >= 5).astype(int)

        if self.add_extreme_dist:
            bins = [0, 1.8, 10, 100]
            # labels=False yields the bin number: 0 for (0, 1.8], 1 for (1.8, 10],
            # 2 for (10, 100]; values outside every bin (e.g. exactly 0) become
            # NaN, and NaN != 1 is True, so they are flagged as extreme as well.
            avg_distance_split = pd.cut(X['avg_dist'], bins=bins, labels=False)
            X['extreme_dist'] = (avg_distance_split != 1).astype(int)

        return X

### Imputer Converter Class
this class will take in a dataframe X and return X with its missing values imputed

In [255]:
class imputerConverter(BaseEstimator, TransformerMixin):
    """Median-impute missing values and return the result as a DataFrame.

    Parameters
    ----------
    attribute_indices : sequence of labels
        Column labels given to the imputed output frame; it must contain one
        label per column that the imputer returns.

    Notes
    -----
    ``sklearn.preprocessing.Imputer`` was superseded by
    ``sklearn.impute.SimpleImputer`` in later scikit-learn releases; it is
    kept here because it is the class this notebook imports.
    """

    def __init__(self, attribute_indices):
        self.attribute_indices = attribute_indices

    def fit(self, X, y=None):
        """Learn the per-column medians from X.

        The medians are learned here, not in transform(), so that data passed
        to transform() later (e.g. the test set) is filled with the statistics
        of the data the pipeline was fitted on instead of its own.
        """
        self.imputer_ = Imputer(strategy='median').fit(X)
        return self

    def transform(self, X):
        """Fill missing values of X using the medians learned in fit().

        Returns a new DataFrame with a fresh default index whose columns are
        labelled with ``attribute_indices``.
        """
        assert isinstance(X, pd.DataFrame)
        imputer = getattr(self, 'imputer_', None)
        if imputer is None:
            # Backward-compatible fallback for direct transform() calls on an
            # unfitted instance: use the medians of X itself.
            imputer = Imputer(strategy='median').fit(X)
        return pd.DataFrame(imputer.transform(X), columns=self.attribute_indices)

### Type Converter Class
this class will take in a dataframe and return a dataframe after converting the types of certain columns

In [256]:
class typeConverter(BaseEstimator, TransformerMixin):
    """Cast every column of a DataFrame to int or to datetime.

    ``convert_to_int`` takes precedence: ``convert_to_datetime`` is only
    looked at when ``convert_to_int`` is falsy. With both flags off the
    input frame is passed through untouched.
    """

    def __init__(self, convert_to_int=True, convert_to_datetime=False):
        self.convert_to_int = convert_to_int
        self.convert_to_datetime = convert_to_datetime

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return X converted according to the configured flags."""
        assert isinstance(X, pd.DataFrame)

        if self.convert_to_int:
            return X.astype(int)

        if self.convert_to_datetime:
            # errors='coerce' turns unparseable entries into NaT instead of raising.
            return X.apply(pd.to_datetime, errors='coerce')

        return X

### Output Adder Class
this class will take in a dataframe and return the dataframe after adding the target output column

In [257]:
class outputAdder(BaseEstimator, TransformerMixin):
    """Append the binary ``churn`` target derived from ``last_trip_date``.

    Parameters
    ----------
    end_date : str or datetime-like
        Reference date from which inactivity is measured; anything
        accepted by ``pd.to_datetime``.
    churn_window : str or timedelta-like
        Minimum inactivity that counts as churn; anything accepted by
        ``pd.Timedelta``.
    """

    def __init__(self, end_date='2014-07-01', churn_window='30 days'):
        self.end_date = end_date
        self.churn_window = churn_window

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a new frame equal to X plus an integer ``churn`` column.

        ``churn`` is 1 when ``end_date - last_trip_date >= churn_window`` and
        0 otherwise. X must hold ``last_trip_date`` as datetimes and is not
        modified.
        """
        assert isinstance(X, pd.DataFrame)
        end_date = pd.to_datetime(self.end_date)
        delta = pd.Timedelta(self.churn_window)

        inactive_for = end_date - X['last_trip_date']
        # assign() builds a new frame, so the caller's DataFrame stays untouched.
        return X.assign(churn=(inactive_for >= delta).astype(int))

### Get Dummies Class
this class will take in a dataframe and return a dummified dataframe of all categorical features

In [258]:
class getDummies(BaseEstimator, TransformerMixin):
    """One-hot encode selected columns with a layout learned during fit().

    Parameters
    ----------
    dummy_cols : list of column labels
        Columns to encode; passed to ``pd.get_dummies`` as ``columns``.
    """

    def __init__(self, dummy_cols = []):
        self.dummy_cols = dummy_cols

    def fit(self, X, y=None):
        """Remember the category values of every column to be encoded.

        Fixing the categories here keeps the number, order and names of the
        dummy columns (and the baseline removed by ``drop_first``) identical
        for every frame transformed later, even one that lacks some category
        or contains an unseen one.
        """
        self.categories_ = None
        if isinstance(X, pd.DataFrame) and self.dummy_cols is not None:
            # astype('category') derives the categories the same way
            # pd.get_dummies does for a plain object column.
            self.categories_ = {
                col: X[col].astype('category').cat.categories
                for col in self.dummy_cols
            }
        return self

    def transform(self, X):
        """Return a dummified copy of X; X itself is not modified.

        Values that were not seen during fit() are treated as missing and
        therefore get all-zero dummy columns. Without a prior fit() the
        categories are taken from X itself.
        """
        assert isinstance(X, pd.DataFrame)
        categories = getattr(self, 'categories_', None)
        if categories:
            X = X.copy()
            for col, cats in categories.items():
                X[col] = pd.Categorical(X[col], categories=cats)
        return pd.get_dummies(data=X, columns=self.dummy_cols, drop_first=True)

### Feature Dropper Class
this class will take in a dataframe and return the dataframe after dropping a few selected features

In [259]:
class featureDropper(BaseEstimator, TransformerMixin):
    """Remove a fixed list of columns from a DataFrame.

    Parameters
    ----------
    drop_list : list of column labels
        Columns to remove. The default list is only read, never modified.
    """

    def __init__(self, drop_list = []):
        self.drop_list = drop_list

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a new frame without the ``drop_list`` columns.

        The input frame is left untouched (no in-place drop), so transforming
        the same frame twice does not fail on already-removed columns.
        """
        assert isinstance(X, pd.DataFrame)
        return X.drop(self.drop_list, axis=1)

## Building Final Pipeline

In [260]:
# Date branch: parse the date columns, derive the churn label from them and
# then discard the raw dates, so only `churn` leaves this pipeline.
datetime_pipe = Pipeline([
    ('selector', dataFrameSelector(datetime_attributes)),
    ('type converter', typeConverter(convert_to_int=False, convert_to_datetime=True)),
    ('output adder', outputAdder()),
    ('feature dropper', featureDropper(datetime_attributes)),
])

In [261]:
# Numerical branch: median-impute, append the engineered indicator columns,
# then standardise every resulting column.
numerical_pipe = Pipeline([
    ('selector', dataFrameSelector(numerical_attributes)),
    ('imputor', imputerConverter(numerical_indices)),
    ('feature engineer', featureEngineering()),
    ('scale', StandardScaler()),
])

In [262]:
# Boolean branch: impute, then cast the flag column to integers.
bool_pipe = Pipeline([
    ('selector', dataFrameSelector(boolean_attributes)),
    ('imputor', imputerConverter(bool_indices)),
    ('type converter', typeConverter(convert_to_int=True, convert_to_datetime=False)),
])

In [263]:
# Categorical branch: one-hot encode the selected columns.
categorical_pipe = Pipeline([
    ('selector', dataFrameSelector(categorical_attributes)),
    ('dummify', getDummies(categorical_attributes)),
])

In [267]:
# Run the four branches on the same input and stack their outputs side by
# side, in the order in which the branches are listed here.
pre_process_pipe = FeatureUnion(
    transformer_list=[
        ('num pipe', numerical_pipe),
        ('bool pipe', bool_pipe),
        ('date pipe', datetime_pipe),
        ('cat pipe', categorical_pipe),
    ]
)

In [268]:
# Fit every branch on the training frame and stack the branch outputs column-wise.
# NOTE(review): the 'date pipe' branch emits the `churn` column, so the target is part
# of this matrix -- presumably it is split off again before a model is trained.
train_prepared = pre_process_pipe.fit_transform(churn_train)

In [269]:
# Check the container type of the union's output (column labels are not carried by an array).
type(train_prepared)

numpy.ndarray

In [271]:
# First prepared row; columns follow the branch order: numerical + engineered, bool, churn, dummies.
train_prepared[0,:]

array([ 0.20124404,  0.49545414,  0.61004852, -0.33699489, -0.44256269,
       -0.59841402,  1.05490722,  0.59540643, -0.39471778, -0.63169744,
        0.        ,  1.        ,  0.        ,  0.        ,  0.        ])