In [3]:
import pandas as pd
from sqlalchemy import create_engine
import sys

%load_ext autoreload
%autoreload 2

sys.path.insert(1, '../src/')
pd.set_option("display.max_columns", 999)

import get_data
import train
import predict

from guara.modeling.supervised_modelz import *
from guara.feature_engineering.pipeline_modules import *
from guara.feature_engineering.window_features import *
from guara.feature_selection.feature_selection import *


import utils

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Get Data

In [2]:
# Load the 'marketing' example dataset through the project helper and
# discard its 'ID' column before any further processing.
df = (
    utils.import_s3_example_dataset('marketing')
    .drop(columns='ID')
)
# Preview the first rows.
df.head()

Unnamed: 0,Education,Marital_Status,Income,Kidhome,Teenhome,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,Complain,Response,idade,dias_cliente,education_n,parceiro,pessoas_casa,renda_per_capta,compra_total_ano,porcentagem_compras_gold,gasto_renda,total_purchases,disc_purchases,num_compras_ano,percent_disc_compra,ticket_medio,conversao_site,cupons_aceitados
0,Graduation,Single,10.970592,0,0,58,6.455199,4.488636,6.304449,5.153292,4.488636,4.488636,1.386294,8,2.397895,4,7,0,1,63,2675,2,0,1,10.970592,764.5,0.055959,0.013064,22,1.386294,3.001869,0.136364,5.543906,0.012736,0
1,Graduation,Single,10.743869,1,1,38,2.484907,0.693147,1.94591,1.098612,0.693147,1.94591,1.098612,1,0.693147,2,5,0,0,66,2125,2,0,3,9.6453,10.5,0.251314,0.000227,4,1.098612,0.687059,0.5,2.790093,0.00282,0
2,Graduation,Together,11.179046,0,0,26,6.056784,3.912023,4.85203,4.718499,3.091042,3.7612,0.693147,8,1.098612,10,4,0,0,55,2324,2,1,2,10.485913,367.0,0.055643,0.005112,20,0.693147,3.141136,0.05,4.7693,0.02549,0
3,Graduation,Together,10.190432,1,0,26,2.484907,1.609438,3.044522,2.397895,1.386294,1.791759,1.098612,2,0.0,4,6,0,0,36,2151,2,1,3,9.091895,24.0,0.099091,0.0009,6,1.098612,1.018131,0.333333,3.201632,0.004638,0
4,PhD,Married,10.973254,1,0,94,5.159055,3.78419,4.779123,3.850148,3.332205,2.772589,1.791759,5,1.386294,6,5,0,0,39,2173,4,1,3,9.874676,203.5,0.036192,0.003485,14,1.791759,2.351588,0.357143,4.472065,0.013711,0


In [3]:
# (rows, columns) of the loaded frame.
df.shape

(2216, 35)

# Split data into train and validation sets

In [4]:
# Number of leading rows used for training; every remaining row is held out
# for validation. Kept in one place so both slices always agree.
TRAIN_SIZE = 1500

# Positional, order-preserving split (no shuffling). .copy() detaches each
# part from df so later transformations do not touch the original frame.
Xtrain = df.iloc[:TRAIN_SIZE].copy()
Xval = df.iloc[TRAIN_SIZE:].copy()

# Every row of df must land in exactly one of the two splits.
assert len(Xtrain) + len(Xval) == len(df)

print(Xtrain.shape)
print(Xval.shape)

(1500, 35)
(716, 35)


In [5]:
# Column names grouped by the kind of dtype they are declared as.
# The five lists are handed to ApplyDtypes in the next cell.

# Columns declared as floating point.
float_columns = [
    'Income',
    'MntWines',
    'MntFruits',
    'MntMeatProducts',
    'MntFishProducts',
    'MntSweetProducts',
    'MntGoldProds',
    'NumDealsPurchases',
    'NumCatalogPurchases',
    'renda_per_capta',
    'compra_total_ano',
    'porcentagem_compras_gold',
    'gasto_renda',
    'disc_purchases',
    'num_compras_ano',
    'percent_disc_compra',
    'ticket_medio',
    'conversao_site',
]

# Columns declared as integer.
int_columns = [
    'Kidhome',
    'Teenhome',
    'Recency',
    'NumWebPurchases',
    'NumStorePurchases',
    'NumWebVisitsMonth',
    'Complain',
    'idade',
    'dias_cliente',
    'education_n',
    'parceiro',
    'pessoas_casa',
    'total_purchases',
    'cupons_aceitados',
]

# Columns declared as binary.
binary_columns = ['Response']

# Columns declared as categorical.
categorical_columns = ['Education', 'Marital_Status']

# No datetime columns in this dataset.
datetime_columns = []

In [6]:
# Build the dtype transformer from the five column groups. They are passed
# positionally, in this order: float, int, binary, categorical, datetime.
dtypes_ = ApplyDtypes(
    float_columns,
    int_columns,
    binary_columns,
    categorical_columns,
    datetime_columns,
)

# Fit on the training split only, then reuse the fitted transformer on the
# validation split.
Xtrain = dtypes_.fit_transform(Xtrain)
Xval = dtypes_.transform(Xval)


----------- ApplyDtypes



# Validate data with Great Expectations

In [7]:
import great_expectations as ge

In [9]:
# Fit ValidateDF on the training split.
# NOTE(review): the returned object is used in later cells as a fitted
# validator (.transform, .errors_on_validation, .validation) - confirm that
# fit_transform returns the validator itself rather than a DataFrame.
exp = ValidateDF().fit_transform(Xtrain)


----------- ValidateDF



In [26]:
# Check the validation split against the validator fitted on Xtrain; this
# call prints the "Validations Report" summary.
# NOTE(review): Xval is rebound to the result, so re-running this cell feeds
# the previous output back in as input.
Xval = exp.transform(Xval)


----- Validations Report -----

Validation success: False

evaluated_expectations: 137.00
successful_expectations: 127.00
unsuccessful_expectations: 10.00
success_percent: 92.70


In [25]:
# Tabular view of the expectations that failed in the validation run.
exp.errors_on_validation

Unnamed: 0,success,column,expectation,expectation_values,element_count,unexpected_percent,unexpected_percent_total,missing_percent,unexpected_percent_nonmissing,observed_value
6,False,Income,expect_column_max_to_be_between,"{'min_value': 7.456454555176209, 'max_value': ...",716.0,,,,,13.410046
22,False,MntWines,expect_column_max_to_be_between,"{'min_value': 0.0, 'max_value': 7.30854279753919}",716.0,,,,,7.309212
31,False,MntMeatProducts,expect_column_min_to_be_between,"{'min_value': 0.6931471805599453, 'max_value':...",716.0,,,,,0.0
38,False,MntSweetProducts,expect_column_max_to_be_between,"{'min_value': 0.0, 'max_value': 5.293304824724...",716.0,,,,,5.572154
42,False,MntGoldProds,expect_column_max_to_be_between,"{'min_value': 0.0, 'max_value': 5.572154032177...",716.0,,,,,5.774552
50,False,NumWebPurchases,expect_column_max_to_be_between,"{'min_value': 0, 'max_value': 11}",716.0,,,,,27.0
95,False,renda_per_capta,expect_column_max_to_be_between,"{'min_value': 6.705231014870663, 'max_value': ...",716.0,,,,,12.311437
100,False,compra_total_ano,expect_column_min_to_be_between,"{'min_value': 2.5, 'max_value': 1245.5}",716.0,,,,,2.0
103,False,porcentagem_compras_gold,expect_column_max_to_be_between,"{'min_value': 0.0, 'max_value': 1.212048218331...",716.0,,,,,2.245736
131,False,conversao_site,expect_column_max_to_be_between,"{'min_value': 0.0, 'max_value': 0.083073573050...",716.0,,,,,0.333813


In [42]:
# Collect the raw result entries whose 'success' flag equals False.
# The explicit equality test is kept on purpose: `not result['success']`
# would additionally match None or 0.
falses = [
    result
    for result in exp.validation['results']
    if result['success'] == False  # noqa: E712
]

In [43]:
# Inspect the raw failed-expectation entries (expectation config, observed
# result and exception info for each one).
falses

[{'meta': {},
  'expectation_config': {'expectation_type': 'expect_column_max_to_be_between',
   'kwargs': {'column': 'Income',
    'min_value': 7.456454555176209,
    'max_value': 11.997805391362945},
   'meta': {}},
  'result': {'observed_value': 13.410045949855984,
   'element_count': 716,
   'missing_count': None,
   'missing_percent': None},
  'exception_info': {'raised_exception': False,
   'exception_message': None,
   'exception_traceback': None},
  'success': False},
 {'meta': {},
  'expectation_config': {'expectation_type': 'expect_column_max_to_be_between',
   'kwargs': {'column': 'MntWines',
    'min_value': 0.0,
    'max_value': 7.30854279753919},
   'meta': {}},
  'result': {'observed_value': 7.309212365692763,
   'element_count': 716,
   'missing_count': None,
   'missing_percent': None},
  'exception_info': {'raised_exception': False,
   'exception_message': None,
   'exception_traceback': None},
  'success': False},
 {'meta': {},
  'expectation_config': {'expectation_t