# MLBox AutoML applied to raw data analysis

In [2]:
import pandas as pd
import numpy as np
import os
import os.path
import sys
import tempfile
import shutil

import mlflow
import tempfile

In [3]:
from mlbox.preprocessing import *
from mlbox.optimisation import *
from mlbox.prediction import *

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [8]:
# Put the parent directory on the import path; the project-local `utils`
# package is presumably located there (confirm against the repo layout).
sys.path.append('..')

from utils.kaggle import get_global_parameters
from utils.mlflow_experiments import (
    retrieve_artifacts,
    extract_run_data_for_experiment,
)

# Project-wide settings; the 'PROJ_DIR' entry is read when building DATA_DIR.
global_parms = get_global_parameters()

In [9]:
# Show the loaded project parameters.
global_parms

{'PROJ_DIR': '/opt/project'}

In [25]:
# Scratch directory for the sampled CSV files; removed in the clean-up cell.
TMPDIR = tempfile.mkdtemp()

## Retrieve data

In [26]:
# Directory holding the raw, combined train/test pickles inside the project tree.
DATA_DIR = os.path.join(global_parms['PROJ_DIR'], 'data', 'raw')

# Fraction of each raw dataset that is exported for this analysis.
SAMPLE_FRAC = 0.2


def _export_sample(pickle_name, csv_name, seed, frac=SAMPLE_FRAC):
    """Write a seeded random sample of a raw pickle to a CSV file in TMPDIR.

    Parameters
    ----------
    pickle_name : str
        File name of the pickled pandas object inside ``DATA_DIR``.
    csv_name : str
        File name of the CSV to create inside ``TMPDIR``.
    seed : int
        ``random_state`` passed to ``sample`` so the same rows are drawn
        on every run.
    frac : float, optional
        Fraction of rows to keep (defaults to ``SAMPLE_FRAC``).

    Returns
    -------
    None
        The sample is written to disk; nothing is returned, so calling this
        as the last statement of a cell produces no output.
    """
    # NOTE: unpickling can execute arbitrary code; only load pickles that were
    # produced by this project.
    full_frame = pd.read_pickle(os.path.join(DATA_DIR, pickle_name))
    full_frame.sample(frac=frac, random_state=seed).to_csv(
        os.path.join(TMPDIR, csv_name), index=False
    )


# Train and test use different seeds, as in the original analysis.
_export_sample('train_combined.pkl', 'train.csv', seed=13)
_export_sample('test_combined.pkl', 'test.csv', seed=31)

In [27]:
# Sanity check: list the exported files. os.listdir returns entries in
# arbitrary order, so sort them to keep the displayed output stable.
sorted(os.listdir(TMPDIR))

['test.csv', 'train.csv']

In [28]:
# CSV files handed to the MLBox reader: train first, test second.
paths = [os.path.join(TMPDIR, csv_name) for csv_name in ('train.csv', 'test.csv')]

In [31]:
# Load the sampled train/test CSVs with MLBox's Reader, using 'isFraud' as the
# target column; 'mlbox_save' is the output directory handed to the reader.
rd = Reader(sep=',', to_path='mlbox_save')
# NOTE(review): despite its name, `df` is whatever train_test_split returns
# (presumably a dict-like bundle of train/test/target, not a single DataFrame) -- confirm.
df = rd.train_test_split(paths, 'isFraud')



reading csv : train.csv ...
cleaning data ...
CPU time: 18.621618509292603 seconds

reading csv : test.csv ...
cleaning data ...
CPU time: 14.525218963623047 seconds

> Number of common features : 433

gathering and crunching for train and test datasets ...
reindexing for train and test datasets ...
dropping training duplicates ...
dropping constant variables on training set ...

> Number of categorical features: 31
> Number of numerical features: 402
> Number of training samples : 118108
> Number of test samples : 101338

> Top sparse features (% missing values on train set):
id_24    99.2
id_25    99.1
id_21    99.1
id_07    99.1
id_08    99.1
dtype: float64

> Task : classification
0.0    113972
1.0      4136
Name: isFraud, dtype: int64

encoding target ...


In [33]:
# Compute per-feature drift between the train and test sets; the drift
# coefficients are dumped into 'mlbox_save'.
dft = Drift_thresholder(to_path='mlbox_save')
# NOTE(review): `df` is overwritten with the transformed result, so re-running
# this cell feeds already-processed data back in; re-run the Reader cell first.
# This step is slow (~285 s CPU in the recorded run).
df = dft.fit_transform(df)


computing drifts ...
CPU time: 285.5420751571655 seconds

> Top 10 drifts

('id_31', 0.34081502685385967)
('id_13', 0.22482702038054025)
('D15', 0.1861408064503085)
('D11', 0.16513318587023518)
('D10', 0.14922871084693767)
('D4', 0.14561317888526437)
('V78', 0.14073041630073302)
('V45', 0.13843295619502527)
('V38', 0.1381913654663225)
('V77', 0.13807240612125993)

> Deleted variables : []
> Drift coefficients dumped into directory : mlbox_save


In [35]:
%%javascript
// Ask the kernel to define a Python variable `notebookName` holding this notebook's file name.
// NOTE(review): relies on the `IPython.notebook` JS object — presumably only present in the
// classic Notebook front end; confirm before running elsewhere.
IPython.notebook.kernel.execute(`notebookName = '${IPython.notebook.notebook_name}'`);

<IPython.core.display.Javascript object>

In [37]:
# save sample as mlflow artifact
# Log the MLBox output directory (drift coefficients) as MLflow artifacts.
#
# `mlflow.set_experiment` activates the experiment. Depending on the MLflow
# release it returns None (older) or an Experiment object (newer), not an
# experiment id, so its return value must not be forwarded to
# `start_run(experiment_id=...)`. The run is created in the active experiment.
mlflow.set_experiment('eda')

with mlflow.start_run(run_name='mlbox_drift_analysis') as run:
    # Actual id of the experiment the run was created in.
    experiment_id = run.info.experiment_id
    # `notebookName` is injected asynchronously by the %%javascript cell and may
    # be missing (e.g. under "Run All" or a front end without `IPython.notebook`).
    # Fall back to a placeholder rather than failing the whole logging step.
    mlflow.log_param('notebook_name', globals().get('notebookName', 'unknown'))
    mlflow.log_artifacts('mlbox_save')


## Clean-up

In [38]:
# Remove the scratch directory holding the sampled CSVs. The existence check
# keeps the cell re-runnable: a second execution is a no-op instead of raising
# FileNotFoundError, while genuine removal errors still surface.
if os.path.isdir(TMPDIR):
    shutil.rmtree(TMPDIR)

In [None]:
# Record the pandas version used for this run.
pd.__version__