# Data Science coding habits and useful tips

Notes on how to better organize my analyses and code

refs: 
* https://www.thoughtworks.com/insights/blog/coding-habits-data-scientists
* https://github.com/zedr/clean-code-python


In [1]:
import numpy as np
import pandas as pd

from scipy import stats

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline 

import IPython
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


In [2]:
!pwd
!ls images

/home/leandroohf/Documents/leandro/ds_pragmatic_programming
add_data.png		     no_data_pipeline.png
biasvariance.png	     non-linear_and_linear_decision_edge.png
data_frame.png		     notebook_vs_code.png
hig_bias_low_variance.png    onehot.png
iris_petal_sepal.png	     pathlib_cheatsheet_p1.png
irr_error.png		     pivot-table-datasheet.png
layers.jpeg		     refactor_notebooks.png
learning_rate2.png	     resampling.png
learning_rate.png	     smote.png
loss_learning_rate.png	     split-apply-combine.png
low_high_var.png	     tomek.png
minibatch_learning_rate.png  with_data_pipeline.png
neuron_ANN.png


## Useful tips

### Get started with a few rows

Work with a small sample of the data to speed things up, or to run things locally before moving to a more powerful machine:
1. code the model
1. develop the data-cleaning code


In [None]:
# Read only the first 150 rows (nrows=150) so the rest of the work can be prototyped on a small sample
df = pd.read_csv('data/FremontBridge.csv', index_col='Date', parse_dates=True, nrows=150)
df.shape

### Report progress for long processing

https://medium.com/modern-nlp/10-great-ml-practices-for-python-developers-b089eefc18fc

https://github.com/tqdm/tqdm

tqdm also works in bash

run in bash
```sh
seq 9999999 | tqdm --bytes | wc -l

# compress folder with a lot of files
tar -zcf - docs/ | tqdm --bytes --total `du -sb docs/ | cut -f1` > backup.tgz
 ```


In [5]:
from tqdm import tqdm, tnrange
from time import sleep
import time

text = ""

# Wrapping the iterable in tqdm() prints a progress bar while the loop runs
for char in tqdm(["a", "b", "c", "d"]):
    time.sleep(0.25)  # stand-in for slow work
    text = text + char

100%|██████████| 4/4 [00:01<00:00,  3.97it/s]


In [9]:
# Nested progress bars: one for the outer loop plus one for each run of the inner loop
for i in tnrange(3, desc='1st loop'):
    for j in tnrange(10, desc='2nd loop'):
        sleep(0.03)  # stand-in for slow work

HBox(children=(IntProgress(value=0, description='1st loop', max=3, style=ProgressStyle(description_width='init…

HBox(children=(IntProgress(value=0, description='2nd loop', max=10, style=ProgressStyle(description_width='ini…




HBox(children=(IntProgress(value=0, description='2nd loop', max=10, style=ProgressStyle(description_width='ini…




HBox(children=(IntProgress(value=0, description='2nd loop', max=10, style=ProgressStyle(description_width='ini…





In [10]:
# Register tqdm with pandas; this is what makes `progress_apply` (used below) available
tqdm.pandas()

# read all data
bikes = pd.read_csv('data/FremontBridge.csv', index_col='Date', parse_dates=True)

bikes.shape

bikes.head()

# Same call shape as .apply(), but a progress bar is shown while the column is processed
bikes['col'] = bikes['Fremont Bridge East Sidewalk'].progress_apply(lambda x: x**2)

  from pandas import Panel


(56160, 2)

Unnamed: 0_level_0,Fremont Bridge East Sidewalk,Fremont Bridge West Sidewalk
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-02-28 23:00:00,5.0,6.0
2019-02-28 22:00:00,4.0,27.0
2019-02-28 21:00:00,16.0,32.0
2019-02-28 20:00:00,16.0,50.0
2019-02-28 19:00:00,39.0,59.0


100%|██████████| 56160/56160 [00:00<00:00, 1072290.11it/s]


### Set all seeds for reproducibility


In [11]:
def set_seed(args):
    """Seed every random number generator in use so that runs are reproducible.

    Args:
        args: object exposing a ``seed`` attribute; the same value is used
            for all generators.
    """
    # Imported locally so the function does not rely on a module-level import.
    import random

    seed = args.seed

    random.seed(seed)
    np.random.seed(seed)
    # NOTE(review): assumes TensorFlow is imported as `tf` elsewhere — confirm.
    tf.random.set_seed(seed)

    # PyTorch equivalent, kept for reference:
    # torch.manual_seed(seed)
    # if args.n_gpu > 0:
    #     torch.cuda.manual_seed_all(seed)

### Do not leave machines running in the cloud. Save money

Some of our experiments can run for hours. It's difficult to keep track of them and shut down the cloud instance when they are done. We need to shut it down in order to save money.

* Put this code at the end of a long process to shut down the machine after a few minutes or an hour



In [13]:
import os

def _run_command(cmd):
    
    return os.system(cmd)
    
## TODO: Send me an email or slack message 
# slack: https://keestalkstech.com/2019/10/simple-python-code-to-send-message-to-slack-channel-without-packages/
# https://julien.danjou.info/sending-emails-in-python-tutorial-code-examples/
# https://www.freecodecamp.org/news/send-emails-using-code-4fcea9df63f/
def shutdown(seconds=0, os='linux'):
    """Schedule a system shutdown after a delay. Useful for stopping EC2 instances to save costs.

    Args:
        seconds: delay before the shutdown, in seconds. The Linux
            ``shutdown`` command takes its delay in whole minutes, so the
            value is rounded up to the next minute on that platform.
        os: target operating system, ``'linux'`` or ``'windows'``. Any other
            value results in no command being run.

    Returns:
        None.
    """
    # NOTE: the `os` parameter shadows the `os` module inside this function;
    # the name is kept so existing keyword callers keep working.
    if os == 'linux':
        # Ceiling division: seconds -> whole minutes for the "+m" time form.
        minutes = int(-(-seconds // 60))
        _run_command('sudo shutdown -h +%d' % minutes)

    elif os == 'windows':
        # Windows `shutdown -t` takes the delay directly in seconds.
        _run_command('shutdown -s -t %s' % seconds)

### Create and save model and report 

This is so common that it makes sense to have code for that


In [15]:
import json
import os

## TODO: personalize this code from: https://medium.com/modern-nlp/10-great-ml-practices-for-python-developers-b089eefc18fc

from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, f1_score, fbeta_score)

def get_metrics(y, y_pred, beta=2, average_method='macro', y_encoder=None):
    """Compute a dictionary of classification metrics.

    Args:
        y: true labels.
        y_pred: predicted labels.
        beta: ``beta`` value forwarded to ``fbeta_score``.
        average_method: ``average`` value forwarded to ``f1_score`` and
            ``fbeta_score``.
        y_encoder: optional encoder; when given, its ``inverse_transform`` is
            applied to both ``y`` and ``y_pred`` before scoring.

    Returns:
        dict with the keys ``accuracy``, ``f1_score_macro`` and
        ``fbeta_score_macro`` (each rounded to 4 decimals), ``report`` (the
        classification report as a dict) and ``report_csv`` (the text report
        with ``\\n`` replaced by ``\\r\\n``).
    """
    if y_encoder is not None:
        y = y_encoder.inverse_transform(y)
        y_pred = y_encoder.inverse_transform(y_pred)

    return {
        'accuracy': round(accuracy_score(y, y_pred), 4),
        'f1_score_macro': round(f1_score(y, y_pred, average=average_method), 4),
        # beta is passed by keyword: scikit-learn declares it keyword-only.
        'fbeta_score_macro': round(
            fbeta_score(y, y_pred, beta=beta, average=average_method), 4),
        'report': classification_report(y, y_pred, output_dict=True),
        'report_csv': classification_report(y, y_pred, output_dict=False).replace('\n', '\r\n')
    }


# TODO:
# call this first:
# file_name = get_filename(fname, desc, author)
# returns: 2020-04-uuid-author-description
def save_metrics(metrics: dict, model_directory, file_name):
    """Write the text report and the remaining metrics to ``model_directory``.

    Two files are produced: ``<file_name>_report.txt`` and
    ``<file_name>_metrics.json``.

    Args:
        metrics: metrics dictionary; must contain a ``'report_csv'`` entry.
            The dictionary itself is left untouched.
        model_directory: directory the two files are written to.
        file_name: common prefix for both output files.
    """
    report_path = os.path.join(model_directory, file_name + '_report.txt')

    # NOTE(review): classification_report_to_csv is not defined in the visible
    # part of this file — presumably it writes the report text to the path.
    classification_report_to_csv(metrics['report_csv'], report_path)

    # Leave the text report out of the JSON without mutating the caller's dict.
    json_metrics = {key: value for key, value in metrics.items()
                    if key != 'report_csv'}

    metrics_path = os.path.join(model_directory, file_name + '_metrics.json')

    # The context manager makes sure the file is flushed and closed.
    with open(metrics_path, 'w') as metrics_file:
        json.dump(json_metrics, metrics_file, indent=4)

## Jupyter notebook analyses workflow


A two-folder workflow for working with Jupyter notebooks as a data scientist. Once you have a model, refactor the code as soon as possible so it can go into a dev/production environment. See the images below.


1. dev/ quickly test your ideas (do not worry too much about code QA)

    * dev code should be fast to write, so do not worry too much about code quality
    * keep notebooks as short as possible. Do not try to address many questions in one notebook  
    
1. report/notebook/presentation: 

    * keep only important results and move to report folder (publish) 

    * refactor important result (code quality matters in this phase) 

1. Notebooks conventions

    * Notebooks contents 

        * top: What are the main questions and goals. Do not try to answer many questions at once in a notebook
        * top: Main conclusions and findings 
        * top: small plan of what to try next 

    * Names: Ex: 2019-12-14-lhof-short_description.ipynb 
    * Try to keep notebooks short. **Large notebooks are hard to understand and maintain**.  

```sh

# search notebooks by dates 
ls 2019-12-1*.ipynb

# search by authors
ls 2019*-lhof-*.ipynb

# search by keyword in description
ls 2019*_keyword*.ipynb


# search a keyword in jupyter notebooks contents
grep --include=\*.ipynb -rnw '.' -e "lstm"

``` 

1. As soon as you have a model, move all the code out of the notebooks to implement an API, ETLs, etc.

    * Refactor again and pay attention to code quality, error handling, etc. Try to follow coding best practices 
    

<div style="clear:both">
<img src="images/notebook_vs_code.png" style="float:left" width="400" align="left"/> 
</div>

<br><br><br><br>

<br>

<div style="clear:both">
<img src="images/refactor_notebooks.png" style="float:left" width="600" align="left"/>  
</div>


<br><br><br>

<div style="clear:both">


## Notes about best coding practices in jupyter notebooks


###  keep code clean

    * Don't expose your internals (keep implementation details hidden). Functions and classes are good for that
        * Ex: categorize_column, encode_label or split_train_n_test
    * Avoid print statements 
        * Ex: even glorified print statements such as df.head(), df.describe(), df.plot()  

    * Good variable names: Variable names should reveal intent


```python
loans = pd.read_csv('loans.csv')

monthly_loans = loans.groupby(['month']).sum()
monthly_loans_in_december = filter_loans(monthly_loans, month=12)

```
    
    * Avoid comments  


```python
## BAD

# Check to see if employee is eligible for full benefits
if (employee.flags and HOURLY_FLAG) and (employee.age > 65):
    # do something

## Better
if employee.isEligibleForBenefits():
    # do something
    
```

    * Avoid mental mapping

```python

# Bad
seq = ('Austin', 'New York', 'San Francisco')

for item in seq:
    do_stuff()
    do_some_other_stuff()
    # ...
    # Wait, what's `item` for again?
    dispatch(item)

# Good
locations = ('Austin', 'New York', 'San Francisco')

for location in locations:
    do_stuff()
    do_some_other_stuff()
    # ...
    dispatch(location)
```



### Use code abstraction (functions and classes)

    * Use functions to keep code “DRY” (Don’t Repeat Yourself)
    * Functions should do one thing
    * Function names are verbs
    * Fewer arguments (try to keep to 2 or 3)

```python
def create_menu(title, body, button_text, cancellable):
    # ...

## Good
class Menu:
    def __init__(self, config: dict):
        title = config["title"]
        body = config["body"]
        # ...

menu = Menu(
    {
        "title": "My Menu",
        "body": "Something about my menu",
        "button_text": "OK",
        "cancellable": False
    }
)

## Also Good 
class MenuConfig:
    """A configuration for the Menu.

    Attributes:
        title: The title of the Menu.
        body: The body of the Menu.
        button_text: The text for the button label.
        cancellable: Can it be cancelled?
    """
    title: str
    body: str
    button_text: str
    cancellable: bool = False


def create_menu(config: MenuConfig):
    """Create a menu from the values held by ``config``."""
    title = config.title
    body = config.body
    # ...


# Create an instance: assigning on the class itself would overwrite the
# defaults shared by every MenuConfig.
config = MenuConfig()
config.title = "My delicious menu"
config.body = "A description of the various items on the menu"
config.button_text = "Order now!"
# The instance attribute overrides the default class attribute.
config.cancellable = True

create_menu(config)
 
```

    
    * Class names are nouns and method names are verbs
    
    
    * Use default arguments instead of short circuiting or conditionals
    
```python

def create_micro_brewery(name):
    name = "Hipster Brew Co." if name is None else name
    slug = hashlib.sha1(name.encode()).hexdigest()
    # etc.
    
## Better
def create_micro_brewery(name: str = "Hipster Brew Co."):
    slug = hashlib.sha1(name.encode()).hexdigest()
    # etc.

```
    
Gains because of the use of functions

* Readability 

    * You focus on the what while reading the code instead of the how. 

* Testability (not really sure; it only makes sense when developing the back-end code or an API) 

    * we can easily write a unit test for it. 

* Reusability
</div>



In [None]:
# Load phone_data.csv, then show its column summary and its first rows
data = pd.read_csv('./data/phone_data.csv')
data.info()
data.head()

In [None]:
import utils 
import functools

# # Shameless stolen from the comments of 
# # https://www.thoughtworks.com/insights/blog/coding-habits-data-scientists
# def compose(*functions):
    
#     return functools.reduce(lambda f, g: lambda x: f(g(x)), functions, lambda x: x)

# Examples of common functions
# The implementations are only illustrations

def encode_column(df: pd.DataFrame, col_name: str):
    """Add a ``<col_name>_enc`` column holding the source values plus ``'_enc'``.

    The frame is modified in place and the same object is returned.

    Args:
        df: frame containing ``col_name``.
        col_name: name of the column to derive the new column from.

    Returns:
        The frame that was passed in, with the extra column added.
    """
    suffix = '_enc'
    encoded_values = df[col_name] + suffix
    df[col_name + suffix] = encoded_values

    return df


def add_categorical_column(df):
    """Add a ``cat`` column that mirrors the ``network`` column.

    The frame is modified in place and the same object is returned.
    """
    source_column = 'network'
    target_column = 'cat'

    df[target_column] = df[source_column]
    return df

def convert_to_minutes(df):
    """Divide the ``duration`` column by 60 and store the result back.

    The frame is modified in place and the same object is returned.
    """
    original_duration = df['duration']
    df['duration'] = original_duration / 60.00
    return df


def split_features_and_labels(df):
    """Separate the ``duration`` column from the remaining columns.

    Returns:
        tuple: ``(X, y)`` where ``X`` holds every column except ``duration``
        and ``y`` is the ``duration`` column.
    """
    # NOTE: scikit-learn's splitting helpers could be used instead; this
    # hand-written version is enough to express the idea.
    label_column = 'duration'
    is_feature = df.columns != label_column

    return df.iloc[:, is_feature], df[label_column]


In [None]:
## Good example
# Each preparation step is a named function, applied one after another.
# NOTE(review): the helpers defined above assign into the frame they receive and
# return that same object, so `df` and `data` are the same DataFrame here.

df = encode_column(data, col_name='item')
df = add_categorical_column(df)
df = convert_to_minutes(df)
X,y = split_features_and_labels(df)

## Better example
# The whole preparation is built once as a single callable and then applied.
# NOTE(review): presumably utils.compose chains the given functions and
# utils.function_with_args pre-binds keyword arguments — utils is not shown here; confirm.
# NOTE(review): `data` was already modified in place by the "Good example" above, so
# running both in one session applies the steps a second time (e.g. duration is
# divided by 60 again), assuming compose applies every step.
prepare_data = utils.compose(utils.function_with_args(encode_column, col_name='item'),
                       add_categorical_column,
                       convert_to_minutes
                      )

data_pre = prepare_data(data)
X, y = split_features_and_labels(data_pre)

In [None]:
# Dimensions of the feature frame returned by split_features_and_labels
X.shape

In [None]:
# Show the first rows of the input frame and of both pipeline results for comparison
data.head()

data_pre.head()
df.head()