In [1]:
import pandas as pd

In [2]:
import sys

# Put the parent directory on the import path (presumably where the `src`
# package imported further down lives -- confirm against the repo layout).
# Guarded so that re-running this cell does not keep appending duplicates.
if "../" not in sys.path:
    sys.path.append("../")

In [3]:
# Standard Imports
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import pickle

# Transformers
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler

# Modeling Evaluation
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score,f1_score, confusion_matrix, classification_report
from IPython.display import display, Markdown

# Pipelines
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer

# Machine Learning
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [5]:
%load_ext autoreload
%autoreload 2

from src.pipeline import *

with open('../models/final_model_no_pipeline.pkl', 'rb') as file:
    final_model = pickle.load(file)

final_data_binner = Pipeline(steps=[
    ('bin_capital_gain', BinCapital(col_name = 'capital_gain')),
    ('bin_capital_loss', BinCapital(col_name = 'capital_loss')),
    ('bin_age', BinAge()),
    ('hot_encode', HotEncodeMerge())])


final_data_prep = Pipeline(steps=[
    ('validate_data', ValidateAndRenameColumns()),
    ("data_binning", final_data_binner)])

final_pipeline = Pipeline(steps=[('prep',final_data_prep), ('model', final_model)])

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
# Read the validation split from disk. The frame is loaded as-is: no index
# column is designated and no columns are renamed or dropped at this point.
val = pd.read_csv(filepath_or_buffer='../data/validation.csv')

In [7]:
# Sanity check: run the validator over the validation frame and print the
# resulting column names, non-null counts and dtypes via DataFrame.info().
check = ValidateAndRenameColumns()
check.transform(val).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3256 entries, 0 to 3255
Data columns (total 11 columns):
age               3256 non-null int64
workclass         3256 non-null object
education         3256 non-null object
education_num     3256 non-null object
marital_status    3256 non-null object
occupation        3256 non-null object
relationship      3256 non-null object
capital_gain      3256 non-null int64
capital_loss      3256 non-null int64
hours_per_week    3256 non-null int64
native_country    3256 non-null object
dtypes: int64(4), object(7)
memory usage: 279.9+ KB


In [26]:
class ValidateAndRenameColumns(BaseEstimator):
    """Validate input records and return them with normalized column names.

    Accepts either a single record as a ``dict`` or a ``pandas.DataFrame``.
    Column names / keys are normalized (lower-cased, stripped, spaces and
    hyphens replaced by underscores), the presence of every entry in
    ``needed_columns`` is checked, and each needed column is cast to the type
    listed in ``needed_types``.

    The caller's object is never modified; all work happens on a copy.

    Attributes set during validation:
        type: ``dict``, ``pd.DataFrame`` or ``None`` (unsupported input).
        proper_type: whether the input was a dict or a DataFrame.
        proper_columns: whether every needed column was present.
        missing_keys: needed columns absent from the input (empty if none).
    """

    def __init__(self):
        # Columns returned by transform(), in this order (normalized names).
        self.needed_columns = ['age', 'workclass', 'education', 'education_num',
                               'marital_status', 'occupation', 'relationship',
                               'capital_gain', 'capital_loss', 'hours_per_week',
                               'native_country']
        # Target type per column; note education_num is cast to str, like the
        # other text columns.
        self.needed_types = {'age': 'int',
                             'workclass': 'str',
                             'education': 'str',
                             'education_num': 'str',
                             'marital_status': 'str',
                             'occupation': 'str',
                             'relationship': 'str',
                             'capital_gain': 'int',
                             'capital_loss': 'int',
                             'hours_per_week': 'int',
                             'native_country': 'str'}

    def fit(self, X, y=None):
        """Validate ``X`` and return ``self``; nothing is learned from the data.

        Raises:
            TypeError: if ``X`` is neither a dict nor a DataFrame.
            ValueError: if any needed column is missing.
        """
        self._validate(X)
        return self

    def transform(self, X):
        """Return a new DataFrame holding only ``needed_columns``.

        Args:
            X: a dict (one record) or a pandas DataFrame.

        Returns:
            pandas.DataFrame with the needed columns, normalized names and
            cast types. A dict input yields a one-row frame.

        Raises:
            TypeError: if ``X`` is neither a dict nor a DataFrame.
            ValueError: if any needed column is missing.
        """
        data = self._validate(X)
        if isinstance(data, dict):
            # A single record becomes a one-row frame with the default index.
            data = pd.DataFrame([data])
        return data[self.needed_columns]

    def _validate(self, X):
        """Run every check and return a normalized, type-cast copy of ``X``."""
        self.proper_columns = False
        self.missing_keys = []
        self._check_object_type(X)
        # Fail fast: the remaining steps assume a dict or a DataFrame.
        if not self.proper_type:
            raise TypeError('Data must be formatted as a Dictionary or a pandas DataFrame')
        data = self._reformat(X)
        self._check_columns(data)
        # Report missing columns before casting, otherwise the cast would
        # surface them as an unhelpful KeyError.
        if not self.proper_columns:
            raise ValueError("The following features are missing. {}".format(self.missing_keys))
        return self._check_column_types(data)

    @staticmethod
    def _normalize_name(name):
        """Lower-case and strip ``name``; spaces and hyphens become underscores."""
        return str(name).lower().strip().replace(' ', '_').replace('-', '_')

    def _reformat(self, X):
        """Return a copy of ``X`` whose keys / column names are normalized."""
        if isinstance(X, dict):
            # Build a new dict rather than popping keys while iterating.
            return {self._normalize_name(key): value for key, value in X.items()}
        data = X.copy()
        data.columns = [self._normalize_name(column) for column in data.columns]
        return data

    def _check_columns(self, X):
        """Record which needed columns are missing from ``X``.

        Sets ``missing_keys`` and ``proper_columns``. ``in`` tests dict keys
        and DataFrame column labels alike, so one code path serves both.
        """
        self.missing_keys = [column for column in self.needed_columns if column not in X]
        self.proper_columns = not self.missing_keys

    def _check_column_types(self, X):
        """Return a copy of ``X`` with every needed column cast to its type."""
        if isinstance(X, dict):
            converted = dict(X)
            for column in self.needed_columns:
                caster = str if self.needed_types[column] == 'str' else int
                converted[column] = caster(X[column])
            return converted
        # DataFrame.astype with a column -> dtype mapping returns a new frame.
        return X.astype({column: self.needed_types[column] for column in self.needed_columns})

    def _check_object_type(self, X):
        """Record whether ``X`` is a supported container (dict or DataFrame)."""
        if isinstance(X, dict):
            self.type = dict
            self.proper_type = True
        elif isinstance(X, pd.DataFrame):
            self.type = pd.DataFrame
            self.proper_type = True
        else:
            # Always define ``type`` so later reads cannot raise AttributeError.
            self.type = None
            self.proper_type = False

In [27]:
# Create a fresh validator instance; after the class cell above has run, the
# name resolves to the notebook-defined ValidateAndRenameColumns.
check = ValidateAndRenameColumns()

In [28]:
# Validate the validation frame and print the schema summary of the result.
check.transform(val).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3256 entries, 0 to 3255
Data columns (total 11 columns):
age               3256 non-null int64
workclass         3256 non-null object
education         3256 non-null object
education_num     3256 non-null object
marital_status    3256 non-null object
occupation        3256 non-null object
relationship      3256 non-null object
capital_gain      3256 non-null int64
capital_loss      3256 non-null int64
hours_per_week    3256 non-null int64
native_country    3256 non-null object
dtypes: int64(4), object(7)
memory usage: 279.9+ KB


In [13]:
# Preview only the first rows instead of rendering the whole frame.
val.head(10)

Unnamed: 0.1,Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,19094,21,Private,131473,Some-college,10,Never-married,Sales,Own-child,Asian-Pac-Islander,Male,0,0,20,Vietnam,<=50K
1,23338,21,Private,283969,HS-grad,9,Never-married,Craft-repair,Own-child,White,Male,0,0,40,Mexico,<=50K
2,18638,36,Private,469056,HS-grad,9,Divorced,Sales,Unmarried,Black,Female,0,0,25,United-States,<=50K
3,31363,27,Self-emp-not-inc,365110,Bachelors,13,Never-married,Prof-specialty,Not-in-family,White,Female,0,0,20,United-States,<=50K
4,20305,44,Federal-gov,113597,Some-college,10,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,55,United-States,>50K
5,27093,18,Private,334676,HS-grad,9,Never-married,Handlers-cleaners,Own-child,White,Male,0,0,25,United-States,<=50K
6,25068,55,Federal-gov,146477,Some-college,10,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,45,United-States,<=50K
7,5550,29,Private,132874,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K
8,27842,24,Private,227594,Some-college,10,Never-married,Sales,Own-child,White,Female,0,0,20,United-States,<=50K
9,6293,37,Private,333651,HS-grad,9,Never-married,Adm-clerical,Not-in-family,White,Male,0,0,42,United-States,<=50K
