<img src="https://www.northwestern.edu/brand/images/wordmark/wordmark-vert.gif" alt="Northwestern Logo" width="500" align="left">

# Overview 

Test the `src.generate_features.drop_na()` function with its arguments given directly as keyword arguments and as a configuration dictionary expanded with `**`.

## Imports and setup

In [1]:
# must go first
%matplotlib inline
%config InlineBackend.figure_format='retina'

# Reloads functions each time so you can edit a script 
# and not need to restart the kernel
%load_ext autoreload
%autoreload 2

# plotting
import matplotlib as mpl
from matplotlib import pyplot as plt
import seaborn as sns
sns.set_context("poster", font_scale=1.3)
import folium

import sys
import os
import datetime

sns.set()
sns.set_context('poster', font_scale=1.3)
sns.set_style("white")

import warnings
warnings.filterwarnings('ignore')
import logging 

# basic wrangling
import numpy as np
import yaml
import json
import re
import pandas as pd

# eda tools
import pivottablejs
import missingno as msno

# Notebook-wide matplotlib overrides: larger fonts, a wider default canvas,
# and a muted grey/blue palette.
mpl_update = {
    # Typeface and base text appearance
    "font.family": "sans-serif",
    "font.sans-serif": "Tahoma",
    "font.size": 16,
    "text.color": "#677385",
    # Figure canvas
    "figure.figsize": [12.0, 8.0],
    # Axis titles, labels, and tick labels
    "axes.titlesize": 20,
    "axes.labelsize": 20,
    "axes.labelcolor": "#677385",
    "xtick.labelsize": 14,
    "ytick.labelsize": 14,
    # Default line appearance
    "lines.color": "#0055A7",
    "lines.linewidth": 3,
}
mpl.rcParams.update(mpl_update)

In [2]:
# Create helper functions for specifying paths and appending
# directories with relevant python source code.
# This is a lot at the top of your notebook but if you get the jupyter
# extension for collapsing headings, you can always have this and the
# imports collapsed

root_dir = os.curdir
max_nest = 10  # arbitrary, 3 would probably suffice
nest = 0
found_src = "src" in os.listdir(root_dir)
while not found_src and nest < max_nest:
    # Look up the directory structure for a src directory
    root_dir = os.path.join(os.pardir, root_dir)
    nest += 1
    found_src = "src" in os.listdir(root_dir)

# If you don't find the src directory, the root directory is this directory.
# The decision is based on `found_src` rather than on `nest`, so a `src`
# directory located exactly `max_nest` levels up is still honoured.
root_dir = os.path.abspath(root_dir if found_src else os.curdir)

# Add the root directory to be able to import from src, etc.
# Guarded so that re-running this cell does not keep growing sys.path.
if root_dir not in sys.path:
    sys.path.append(root_dir)

# Get the source directory to access python packages/scripts within it.
# Always defined (None when no src directory was found) so later cells
# can test it instead of hitting a NameError.
src_dir = os.path.join(root_dir, "src") if found_src else None


def _subdir_or_curdir(parent, name):
    """Return `parent/name` if it is an existing directory, else `os.curdir`."""
    candidate = os.path.join(parent, name)
    return candidate if os.path.isdir(candidate) else os.curdir


# If data or figures directory don't exist in project directory,
# they will be saved to this directory
data_dir = _subdir_or_curdir(root_dir, "data")
external_data_dir = _subdir_or_curdir(data_dir, "external")
figure_dir = _subdir_or_curdir(root_dir, "figures")
models_dir = _subdir_or_curdir(root_dir, "models")
config_dir = _subdir_or_curdir(root_dir, "config")


# Prepends the directory path for specifying paths to data or figures
# dataplus("data.csv") -> "/Users/cmawer/project/data/data.csv"
# figplus("cool.png") -> "/Users/cmawer/project/figures/cool.png"
def dataplus(x):
    """Return the path of `x` inside the data directory."""
    return os.path.join(data_dir, x)


def dataextplus(x):
    """Return the path of `x` inside the external data directory."""
    return os.path.join(external_data_dir, x)


def figplus(x):
    """Return the path of `x` inside the figures directory."""
    return os.path.join(figure_dir, x)


def modelsplus(x):
    """Return the path of `x` inside the models directory."""
    return os.path.join(models_dir, x)


def configplus(x):
    """Return the path of `x` inside the config directory."""
    return os.path.join(config_dir, x)


# Prepends the date to a string (e.g. to save dated files)
# dateplus("cool-figure.png") -> "2018-12-05-cool-figure.png"
# `now` is computed once, when this cell runs.
now = datetime.datetime.now().strftime("%Y-%m-%d")


def dateplus(x):
    """Return `x` prefixed with the date (YYYY-MM-DD) on which this cell ran."""
    return "%s-%s" % (now, x)

In [3]:
# Import from project src 
from src import load_data as ld
from src import generate_features as gf 
from src.helpers.helpers import Timer

## Load config YAML

In [4]:
# Parse the test configuration. `yaml.safe_load` is used instead of a bare
# `yaml.load(f)`: calling `load` without a Loader is deprecated, and
# safe_load only builds plain Python objects (dicts, lists, strings,
# numbers, booleans, None), which is all this config needs.
with open(configplus("test_model_config.yml"), "r") as f:
    config = yaml.safe_load(f)

In [5]:
config.keys()

dict_keys(['model', 'load_data', 'generate_features', 'train_model', 'score_model', 'evaluate_model'])

## Load data

In [6]:
df = pd.read_csv(dataplus("sample/music_data_combined.csv"), index_col=0)

In [7]:
df.head()

Unnamed: 0,artist.hotttnesss,artist.id,artist.name,artist_mbtags,artist_mbtags_count,bars_confidence,bars_start,beats_confidence,beats_start,duration,...,start_of_fade_out,tatums_confidence,tatums_start,tempo,terms,terms_freq,time_signature,time_signature_confidence,title,year
0,0.401998,ARD7TVE1187B99BFB1,Casual,,0.0,0.643,0.58521,0.834,0.58521,218.93179,...,218.932,0.779,0.28519,92.198,hip hop,1.0,4.0,0.778,I Didn't Mean To,0
1,0.4175,ARMJAGH1187FB546F3,The Box Tops,classic pop and rock,1.0,0.007,0.71054,1.0,0.20627,148.03546,...,137.915,0.969,0.20627,121.274,blue-eyed soul,1.0,4.0,0.384,Soul Deep,1969
2,0.343428,ARKRRTF1187B9984DA,Sonora Santanera,,0.0,0.98,0.73152,0.98,0.73152,177.47546,...,172.304,0.482,0.42132,100.07,salsa,1.0,1.0,0.0,Amor De Cabaret,0
3,0.454231,AR7G5I41187FB4CE6C,Adam Ant,uk,1.0,0.017,1.30621,0.809,0.81002,233.40363,...,217.124,0.601,0.56254,119.293,pop rock,0.988584,4.0,0.0,Something Girls,1982
4,0.401724,ARXR32B1187FB57099,Gob,,0.0,0.175,1.06368,0.883,0.13576,209.60608,...,198.699,1.0,0.13576,129.738,pop punk,0.887288,4.0,0.562,Face the Ashes,2007


## Test `generate_features.drop_na()`

### Configurations

In [8]:
config["generate_features"]

{'make_categorical': {'columns': 'terms',
  'terms': {'load_column_as_list': {'path': 'data/auxiliary/genres.csv',
    'header': None,
    'column': 0},
   'one_hot_encode': True}},
 'bin_values': {'columns': ['key', 'artist.hotttnesss'], 'quartiles': [4, 2]},
 'drop_na': {'columns': 'song.hotttnesss'},
 'choose_features': {'features_to_use': ['key',
   'beats_start',
   'bars_start',
   'duration',
   'terms',
   'loudness'],
  'target': 'song.hotttnesss'},
 'save_features': 'test/model/test/music_processed.csv'}

In [9]:
config["generate_features"]["drop_na"]

{'columns': 'song.hotttnesss'}

### Function help

In [10]:
help(gf.drop_na)

Help on function drop_na in module src.generate_features:

drop_na(df, columns=None)
    Drops rows of dataframe where there are null values in the columns given.
    
    Args:
        df (:py:class:`pandas.DataFrame`): DataFrame containing data
        columns (str or list of str, optional): Name of column or list of columns for which to drop rows
            that contain nulls. If None, the original dataframe will be returned.
    
    Returns:
        df (:py:class:`pandas.DataFrame`): DataFrame containing only data for which no nulls existed in the columns



### Providing the `columns` argument directly

In [11]:
dfA = gf.drop_na(df, columns="song.hotttnesss")

4351 values were dropped from the dataset because of missing values


### What if we provide the dictionary containing the arguments for `generate_features.drop_na()`?

In [12]:
dfB = gf.drop_na(df, **config["generate_features"]["drop_na"])

4351 values were dropped from the dataset because of missing values


In [13]:
dfB.equals(dfA)

True

It works! That's because

```python
dfB = gf.drop_na(df, **config["generate_features"]["drop_na"])
```

is equivalent to executing

```python
dfB = gf.drop_na(df, columns="song.hotttnesss")
```

The `y = func(x, **dictionary)` pattern expands each key-value pair in the dictionary into a keyword argument, as if you had written `key=val`. 

So for 

```python
dictionary = {"key1": "val1", "key2": "val2", "key3": "val3"}
```

executing `y = func(x, **dictionary)` is the same as: 

```python
y = func(x, key1="val1", key3="val3", key2="val2")
```

(or any ordering of `key1`, `key2`, and `key3`)

### What if `columns` wasn't a keyword argument?

In [14]:
def drop_na_test(df, columns):
    """Drops rows of dataframe where there are null values in the columns given.

    Same logic as `generate_features.drop_na`, except that `columns` is a
    required positional argument instead of a keyword argument with a default.

    Args:
        df (:py:class:`pandas.DataFrame`): DataFrame containing data
        columns (str or list of str): Name of column or list of columns for which to drop rows that contain nulls.
            If None is passed explicitly, the original dataframe is returned unchanged.

    Returns:
        df (:py:class:`pandas.DataFrame`): DataFrame containing only data for which no nulls existed in the columns
    """
    logger = logging.getLogger(__name__)

    if columns is None:
        logger.warning(
            "No columns provided for drop_na, original dataframe being returned"
        )
        return df

    # isinstance (rather than an exact type comparison) also wraps str
    # subclasses in a list; otherwise such a name would be iterated
    # character by character in the loop below.
    if isinstance(columns, str):
        columns = [columns]

    num_nas = df[columns].isna().sum()
    for col in columns:
        logger.info("There were %i missing %s values", num_nas.loc[col], col)

    df_len = len(df)
    # dropna returns a new frame, so the caller's dataframe is not modified
    df = df.dropna(subset=columns)
    logger.warning(
        "%i values were dropped from the dataset because of missing values",
        df_len - len(df))

    return df

In [15]:
dfC = drop_na_test(df, **config["generate_features"]["drop_na"])

4351 values were dropped from the dataset because of missing values


In [16]:
dfC.equals(dfB)

True

As long as all required arguments exist in the dictionary that is expanded in the function call (and it contains no keys the function does not accept), the function will work as desired.  

### Why not use `**kwargs`? 

In [17]:
def drop_na_testB(df, **kwargs):
    """Drops rows of dataframe where there are null values in the columns given.

    NOTE: this version is intentionally incomplete. It is kept to demonstrate
    that values passed through `**kwargs` are not available as local names:
    `columns` is never read from `kwargs`, so calling this function fails.

    Args:
        df (:py:class:`pandas.DataFrame`): DataFrame containing data
        **kwargs: Should include `columns`

    Returns:
        df (:py:class:`pandas.DataFrame`): DataFrame containing only data for which no nulls existed in the columns

    Raises:
        UnboundLocalError: Always, because `columns` is referenced before it is assigned.
    """

    logger = logging.getLogger(__name__)
    # `columns` is assigned further down, so Python treats it as a local
    # variable of this function; reading it here, before any assignment,
    # raises UnboundLocalError. It is never pulled out of `kwargs`.
    if columns is not None:
        columns = [columns] if type(columns) == str else columns
        num_nas = df[columns].isna().sum()
        for col in columns:
            logger.info("There were %i missing %s values", num_nas.loc[col],
                        col)
        df_len = len(df)
        df = df.dropna(subset=columns)
        logger.warning(
            "%i values were dropped from the dataset because of missing values",
            df_len - len(df))
    else:
        logger.warning(
            "No columns provided for drop_na, original dataframe being returned"
        )

    return df

In [18]:
dfD = drop_na_testB(df, **config["generate_features"]["drop_na"])

UnboundLocalError: local variable 'columns' referenced before assignment

Whoops, you actually need to pull out `columns` from the `kwargs` dictionary:

In [19]:
def drop_na_testB(df, **kwargs):
    """Drops rows of dataframe where there are null values in the columns given.

    Variant of `drop_na` that receives its options through `**kwargs` and
    pulls `columns` out of that dictionary explicitly.

    Args:
        df (:py:class:`pandas.DataFrame`): DataFrame containing data
        **kwargs: Should include `columns` (str or list of str), the name of the column or list of columns
            for which to drop rows that contain nulls. If `columns` is missing or None, the original
            dataframe is returned. Any other keyword is ignored.

    Returns:
        df (:py:class:`pandas.DataFrame`): DataFrame containing only data for which no nulls existed in the columns
    """

    logger = logging.getLogger(__name__)

    # .get() treats an omitted `columns` the same as columns=None, so the
    # "No columns provided" branch is reached instead of raising a KeyError.
    columns = kwargs.get("columns")
    if columns is None:
        logger.warning(
            "No columns provided for drop_na, original dataframe being returned"
        )
        return df

    # isinstance (rather than an exact type comparison) also wraps str
    # subclasses in a list; otherwise such a name would be iterated
    # character by character in the loop below.
    if isinstance(columns, str):
        columns = [columns]

    num_nas = df[columns].isna().sum()
    for col in columns:
        logger.info("There were %i missing %s values", num_nas.loc[col], col)

    df_len = len(df)
    # dropna returns a new frame, so the caller's dataframe is not modified
    df = df.dropna(subset=columns)
    logger.warning(
        "%i values were dropped from the dataset because of missing values",
        df_len - len(df))

    return df

In [20]:
dfD = drop_na_testB(df, **config["generate_features"]["drop_na"])

4351 values were dropped from the dataset because of missing values


In [21]:
dfD.equals(dfA)

True

They are equal, but it is not clear to anyone using the function what the expected arguments are, and `columns` has to be pulled out of the `kwargs` dictionary before it can be used. 

# Appendix

## Watermark 
For full reproducibility of results, use exact data extraction as defined at top of notebook and ensure that the environment is exactly as follows: 

In [22]:
# ! pip install watermark
%load_ext watermark
%watermark -v -m --iversions -g

json        2.0.9
numpy       1.15.1
matplotlib  2.2.3
seaborn     0.9.0
folium      0.7.0
logging     0.5.1.2
yaml        3.13
re          2.2.1
pandas      0.23.4
missingno   0.4.1
CPython 3.6.7
IPython 7.2.0

compiler   : GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)
system     : Darwin
release    : 18.2.0
machine    : x86_64
processor  : i386
CPU cores  : 12
interpreter: 64bit
Git hash   : c83a5448fb6bf0d75a7cf573ab3eb359679f93f2
