In [1]:
# to handle datasets
import pandas as pd

# Common imports
import sys
import re
from pathlib import Path

# automated exploratory analysis
from pandas_profiling import ProfileReport

### Import project modules

In [2]:
# Add the parent directory to the module search path
# (presumably where the tb_analysis package lives — confirm against the repo layout)
sys.path.append('../')
# Load the "autoreload" extension so edited modules can be picked up without restarting the kernel
%load_ext autoreload
# Mode 2: reload modules before executing code, so new changes get loaded
%autoreload 2
# Project configuration object and directory constants
from tb_analysis.config.core import config, PACKAGE_ROOT, DATA_DIR, IMAGES_DIR, TRAINED_MODEL_DIR
# Project submodules, imported under short aliases
from tb_analysis import preprocessors as pp, train as tr, rule_generation as cr, evaluate as ev
# Package version string
from tb_analysis import __version__ as _version

## Import data

In [3]:
# Location of the raw data file, as named in the project configuration
raw_data_path = Path(f"{DATA_DIR}/{config.app_config.data_file}")
raw = pd.read_csv(raw_data_path)

# (number of rows, number of columns) of the raw data
print(raw.shape)

(204, 802)


## Rename input variables to prototype-specific LCA

In [4]:
# Data dictionary of LCA input variables: one raw variable name and one label per row
dict_path = Path(f"{DATA_DIR}/orig_scale_dict.csv")
data_dict = pd.read_csv(dict_path)
# Lookup table {variable_name: variable_label}
mapping_dict = {
    name: label
    for name, label in zip(data_dict.variable_names, data_dict.variable_labels)
}

In [5]:
# Relabel the dataframe columns via the mapping dict; columns absent from it keep their names
data = raw.rename(columns=mapping_dict)

In [6]:
# Find all variables in the dataframe whose name ends with the "_orig_scl" suffix
pattern = re.compile(r"_orig_scl$")
orig_scale_vars = [x for x in data.columns.values.tolist() if pattern.search(x)]

# New scale vars have the same name as the orig scale variables, minus the trailing
# "_orig_scl" suffix. The anchored pattern is used for stripping so that only the
# suffix is removed; str.replace would also delete any "_orig_scl" occurring
# earlier in a name and produce a column name that does not exist.
new_scale_vars = [pattern.sub("", s) for s in orig_scale_vars]

# Original-scale names first, followed by their new-scale counterparts
selected_vars = orig_scale_vars + new_scale_vars

In [7]:
# Keep only the selected LCA input variables, convert every column to type
# 'category', and sort the column names in alphabetical order.
# Written as one chain that returns a new frame at each step: no column-by-column
# assignment, no inplace mutation, and no `copy` keyword (pandas deprecates it
# for astype; the resulting values are the same without it).
data = (
    data.filter(items=selected_vars)
    .astype('category')
    .sort_index(axis=1)
)

## AutoEDA

In [8]:
# Destination of the HTML profiling report
report_path = Path(f"{IMAGES_DIR}/input_var_lca_orig_scale.html")
# Build the report in minimal mode and export it to the file above
profile = ProfileReport(data, minimal=True)
profile.to_file(output_file=report_path)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]