# Data Science coding habits and useful tips

Notes about how to organize better my analyses and code 

refs: 
* https://www.thoughtworks.com/insights/blog/coding-habits-data-scientists
* https://github.com/zedr/clean-code-python


In [1]:
import numpy as np
import pandas as pd

from scipy import stats

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline 

import IPython
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


In [2]:
!pwd
!ls images

/home/leandroohf/Documents/leandro/ds_pragmatic_programming
add_data.png		     neuron_ANN.png
baby1.jpeg		     no_data_pipeline.png
biasvariance.png	     non-linear_and_linear_decision_edge.png
data_frame.png		     notebook_vs_code.png
final_image.png		     onehot.png
hig_bias_low_variance.png    pathlib_cheatsheet_p1.png
iris_petal_sepal.png	     pivot-table-datasheet.png
irr_error.png		     refactor_notebooks.png
layers.jpeg		     resampling.png
learning_rate2.png	     smote.png
learning_rate.png	     split-apply-combine.png
loss_learning_rate.png	     tomek.png
low_high_var.png	     with_data_pipeline.png
mandrill_colour.png	     xfiles.jpeg
minibatch_learning_rate.png


## Useful tips

### Get start with few rows 

Work with small data for speedup things or run things local before move to a powerfull machine
1. code the model
1. dev cleaning data 


In [None]:
df = pd.read_csv('data/FremontBridge.csv', index_col='Date', parse_dates=True, nrows=150)
df.shape

### Report progress for long processing

https://medium.com/modern-nlp/10-great-ml-practices-for-python-developers-b089eefc18fc

https://github.com/tqdm/tqdm

tqdm also works in bash

run in bash
```sh
seq 9999999 | tqdm --bytes | wc -l

# compress folder with a lot of files
tar -zcf - docs/ | tqdm --bytes --total `du -sb docs/ | cut -f1` > backup.tgz
 ```


In [5]:
from tqdm import tqdm, tnrange
from time import sleep
import time

text = ""

for char in tqdm(["a", "b", "c", "d"]):
    time.sleep(0.25)
    text = text + char

100%|██████████| 4/4 [00:01<00:00,  3.97it/s]


In [9]:
for i in tnrange(3, desc='1st loop'):
    for j in tnrange(10, desc='2nd loop'):
        sleep(0.03)

HBox(children=(IntProgress(value=0, description='1st loop', max=3, style=ProgressStyle(description_width='init…

HBox(children=(IntProgress(value=0, description='2nd loop', max=10, style=ProgressStyle(description_width='ini…




HBox(children=(IntProgress(value=0, description='2nd loop', max=10, style=ProgressStyle(description_width='ini…




HBox(children=(IntProgress(value=0, description='2nd loop', max=10, style=ProgressStyle(description_width='ini…





In [10]:
# To activate tqdm in a notebook  !?
tqdm.pandas()

# read all data
bikes = pd.read_csv('data/FremontBridge.csv', index_col='Date', parse_dates=True)

bikes.shape

bikes.head()

bikes['col'] = bikes['Fremont Bridge East Sidewalk'].progress_apply(lambda x: x**2)

  from pandas import Panel


(56160, 2)

Unnamed: 0_level_0,Fremont Bridge East Sidewalk,Fremont Bridge West Sidewalk
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-02-28 23:00:00,5.0,6.0
2019-02-28 22:00:00,4.0,27.0
2019-02-28 21:00:00,16.0,32.0
2019-02-28 20:00:00,16.0,50.0
2019-02-28 19:00:00,39.0,59.0


100%|██████████| 56160/56160 [00:00<00:00, 1072290.11it/s]


### Set all seed for reproducibility


In [11]:
def set_seed(args):
    
    random.seed(args.seed)
    np.random.seed(args.seed)
    tf.random.set_seed(seed)
    
    #torch.manual_seed(args.seed) 
#     if args.n_gpu > 0:
#         torch.cuda.manual_seed_all(args.seed)

### Do not leave machines running in the cloud. Save money

Some of our experiments can run for hours. It's difficult to keep a track of it and shutdown the cloud instance when it's done. We need to shutdown in oreder to save money

* Put this code at the end of long process to shutdown the machine after few minutes or 1 hour



In [13]:
import os

def _run_command(cmd):
    
    return os.system(cmd)
    
## TODO: Send me an email or slack message 
# slack: https://keestalkstech.com/2019/10/simple-python-code-to-send-message-to-slack-channel-without-packages/
# https://julien.danjou.info/sending-emails-in-python-tutorial-code-examples/
# https://www.freecodecamp.org/news/send-emails-using-code-4fcea9df63f/
def shutdown(seconds=0, os='linux'):
    """Shutdown system after seconds given. Useful for shutting EC2 to save costs."""
    
    if os == 'linux':
    
        run_command('sudo shutdown -h -t sec %s' % seconds)
    
    elif os == 'windows':
        
        run_command('shutdown -s -t %s' % seconds)

### Create and save model and report 

This is so common that makes sense to have a code for taht


In [15]:
import json
import os

## TODO: personalize this code from: https://medium.com/modern-nlp/10-great-ml-practices-for-python-developers-b089eefc18fc

from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, f1_score, fbeta_score)

def get_metrics(y, y_pred, beta=2, average_method='macro', y_encoder=None):
    
    if y_encoder:

        y = y_encoder.inverse_transform(y)
        y_pred = y_encoder.inverse_transform(y_pred)

    return {
        'accuracy': round(accuracy_score(y, y_pred), 4),
        'f1_score_macro': round(f1_score(y, y_pred, average=average_method), 4),
        'fbeta_score_macro': round(fbeta_score(y, y_pred, beta, average=average_method), 4),
        'report': classification_report(y, y_pred, output_dict=True),
        'report_csv': classification_report(y, y_pred, output_dict=False).replace('\n','\r\n')
    }


# TODO:
# you call before:
# file_name = get_filename(fname, desc, author)
# returns: 2020-04-uuid-author-descrition
def save_metrics(metrics: dict, model_directory, file_name):
    
    path = os.path.join(model_directory, file_name + '_report.txt')
    
    classification_report_to_csv(metrics['report_csv'], path)
    
    metrics.pop('report_csv')
    
    path = os.path.join(model_directory, file_name + '_metrics.json')
    
    json.dump(metrics, open(path, 'w'), indent=4)

## Jupyter notebook analyses workflow




2 folders workflow for working with jupyter notebook as data scientist. Once you have a model refactors as soon as possible the code to put in dev/production environment. See images bellow.


1. dev/ quick test your ideas (do worry too much about code QA )

    * dev code should be fast so, do not worry too much about code quality
    * keep notebooks short as posisible. Do not try to address many questions in one notebook  
    
1. report/notebook/presentation: 

    * keep only important results and move to report folder (publish) 

    * refactor important result (code quality matters in this phase) 

1. Notebooks conventions

    * Notebooks contents 

        * top: What are the mains questions and goals. Do not try to answer many questions at once in a notebook
        * top: Main conclusions and finds 
        * top: small plan what to try next 

    * Names: Ex: 2019-12-14-lhof-short_description.ipynb 
    * Try to keep notebooks short. **Large notebooks is hard to understand and mantain**.  

```sh

# search notebooks by dates 
ls 2019-12-1*.ipynb

# search by authors
ls 2019*-lhof-*.ipynb

# search by keyword in description
ls 2019*_keyword*.ipynb


# search a keyword in jupyter notebooks contents
grep --include=\*.ipynb -rnw '.' -e "lstm"

``` 

1. As soon as possible you have a model, move all the code from notebooks to implement an API or ETLS and etc 

    * Refactor again and pay attention on code quality. Error handle and etc. Try to follow coding best practices 
    

<div style="clear:both">
<img src="images/notebook_vs_code.png" style="float:left" width="400" align="left"/> 
</div>

<br><br><br><br>

<br>

<div style="clear:both">
<img src="images/refactor_notebooks.png" style="float:left" width="600" align="left"/>  
</div>


<br><br><br>

<div style="clear:both">


## Notes about best coding practices in jupyter notebooks


###  keep code clean



    * Don't expose your internals (Keep implementation details hidden). function and class are good for that
        * Ex: categorize_column, encode_label or split_train_n_test
    * Avoid print statements 
        * Ex: even glorified print statements such as df.head(), df.describe(), df.plot()  

    * Good variables name: Variable names should reveal intent


```python
loans = pd.read_csv('loans.csv')

monthly_loans = loans.groupby(['month']).sum()
monthly_loans_in_december = filter_loans(monthly_loans, month=12)

```
    
    * Avoid comments  


```python
## BAD

# Check to see if employee is eligible for full benefits
if (employee.flags and HOURLY_FLAG) and (employee.age > 65):
    # do something

## Better
if employee.isEligibleForBenefits():
    # do something
    
```

    * Avoid mental map

```python

# Bad
seq = ('Austin', 'New York', 'San Francisco')

for item in seq:
    do_stuff()
    do_some_other_stuff()
    # ...
    # Wait, what's `item` for again?
    dispatch(item)

# Good
locations = ('Austin', 'New York', 'San Francisco')

for location in locations:
    do_stuff()
    do_some_other_stuff()
    # ...
    dispatch(location)
```


### Use code abstracting (Functions and class)



    * Use functions to keep code “DRY” (Don’t Repeat Yourself)
    * Functions should do one thing
    * functions name are verbs
    * Fewer arguments (try to keep 2 or 3)

```python
def create_menu(title, body, button_text, cancellable):
    # ...

## Good
class Menu:
    def __init__(self, config: dict):
        title = config["title"]
        body = config["body"]
        # ...

menu = Menu(
    {
        "title": "My Menu",
        "body": "Something about my menu",
        "button_text": "OK",
        "cancellable": False
    }
)

## Also Good 
class MenuConfig:
    """A configuration for the Menu.

    Attributes:
        title: The title of the Menu.
        body: The body of the Menu.
        button_text: The text for the button label.
        cancellable: Can it be cancelled?
    """
    title: str
    body: str
    button_text: str
    cancellable: bool = False


def create_menu(config: MenuConfig):
    title = config.title
    body = config.body
    # ...


config = MenuConfig
config.title = "My delicious menu"
config.body = "A description of the various items on the menu"
config.button_text = "Order now!"
# The instance attribute overrides the default class attribute.
config.cancellable = True

create_menu(config)
 
```

    
    * class names are Noums and methods verbs
    
    
    * Use default arguments instead of short circuiting or conditionals
    
```python

def create_micro_brewery(name):
    name = "Hipster Brew Co." if name is None else name
    slug = hashlib.sha1(name.encode()).hexdigest()
    # etc.
    
## Better
def create_micro_brewery(name: str = "Hipster Brew Co."):
    slug = hashlib.sha1(name.encode()).hexdigest()
    # etc.

```
    
Gains because of the use of functions

* Readability 

    * Is focusing what while reading code instead of how. 

* Tetability (not realy sure. Only make sense when developyn the backe end code or API) 

    * we can easily write a unit test for it. 

* Resuability
</div>



In [6]:
data = pd.read_csv('./data/phone_data.csv')
data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 830 entries, 0 to 829
Data columns (total 7 columns):
index           830 non-null int64
date            830 non-null object
duration        830 non-null float64
item            830 non-null object
month           830 non-null object
network         830 non-null object
network_type    830 non-null object
dtypes: float64(1), int64(1), object(5)
memory usage: 45.5+ KB


Unnamed: 0,index,date,duration,item,month,network,network_type
0,0,15/10/14 06:58,34.429,data,2014-11,data,data
1,1,15/10/14 06:58,13.0,call,2014-11,Vodafone,mobile
2,2,15/10/14 14:46,23.0,call,2014-11,Meteor,mobile
3,3,15/10/14 14:48,4.0,call,2014-11,Tesco,mobile
4,4,15/10/14 17:27,4.0,call,2014-11,Tesco,mobile


In [7]:
import utils 
import functools

# # Shameless stolen from the comments of 
# # https://www.thoughtworks.com/insights/blog/coding-habits-data-scientists
# def compose(*functions):
    
#     return functools.reduce(lambda f, g: lambda x: f(g(x)), functions, lambda x: x)

# Examples of common functions
# The implementation is only ilustrations

def encode_column(df: pd.DataFrame, col_name: str):
    
    col_name_out = col_name + '_enc'
    df[col_name_out] = df[col_name] + '_enc'
    
    
    return df


def add_categorical_column(df):
    
    df['cat'] = df['network']
    
    
    return df

def convert_to_minutes(df):
    
    df['duration'] = df['duration'] /60.00
    
    return df


def split_features_and_labels(df):
    
    # XXX: You can use split_train form scikit learn.
    # But the exampleas is enough to express the ideas
    y = df['duration']
    X = df.iloc[:, df.columns != 'duration']

    
    return X,y


In [8]:
## Good example

df = encode_column(data, col_name='item')
df = add_categorical_column(df)
df = convert_to_minutes(df)
X,y = split_features_and_labels(df)

## Better example
prepare_data = utils.compose(utils.function_with_args(encode_column, col_name='item'),
                       add_categorical_column,
                       convert_to_minutes
                      )

data_pre = prepare_data(data)
X, y = split_features_and_labels(data_pre)

In [9]:
X.shape

(830, 8)

In [26]:
data.head()

data_pre.head()
df.head()

Unnamed: 0,index,date,duration,item,month,network,network_type,item_enc,cat
0,0,15/10/14 06:58,0.009564,data,2014-11,data,data,data_enc,data
1,1,15/10/14 06:58,0.003611,call,2014-11,Vodafone,mobile,call_enc,Vodafone
2,2,15/10/14 14:46,0.006389,call,2014-11,Meteor,mobile,call_enc,Meteor
3,3,15/10/14 14:48,0.001111,call,2014-11,Tesco,mobile,call_enc,Tesco
4,4,15/10/14 17:27,0.001111,call,2014-11,Tesco,mobile,call_enc,Tesco


Unnamed: 0,index,date,duration,item,month,network,network_type,item_enc,cat
0,0,15/10/14 06:58,0.009564,data,2014-11,data,data,data_enc,data
1,1,15/10/14 06:58,0.003611,call,2014-11,Vodafone,mobile,call_enc,Vodafone
2,2,15/10/14 14:46,0.006389,call,2014-11,Meteor,mobile,call_enc,Meteor
3,3,15/10/14 14:48,0.001111,call,2014-11,Tesco,mobile,call_enc,Tesco
4,4,15/10/14 17:27,0.001111,call,2014-11,Tesco,mobile,call_enc,Tesco


Unnamed: 0,index,date,duration,item,month,network,network_type,item_enc,cat
0,0,15/10/14 06:58,0.009564,data,2014-11,data,data,data_enc,data
1,1,15/10/14 06:58,0.003611,call,2014-11,Vodafone,mobile,call_enc,Vodafone
2,2,15/10/14 14:46,0.006389,call,2014-11,Meteor,mobile,call_enc,Meteor
3,3,15/10/14 14:48,0.001111,call,2014-11,Tesco,mobile,call_enc,Tesco
4,4,15/10/14 17:27,0.001111,call,2014-11,Tesco,mobile,call_enc,Tesco


## Use chaining methods  and pipe (alternative to functioning programming )

ref:
* https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.pipe.html
* https://towardsdatascience.com/the-unreasonable-effectiveness-of-method-chaining-in-pandas-15c2109e3c69


In [None]:
df = encode_column(data, col_name='item')
df = add_categorical_column(df)
df = convert_to_minutes(df)
X,y = split_features_and_labels(df)

In [27]:
# You need to comment the pipe lines to remove debbug code 
def csnap(df, fn=lambda x: x.shape, msg=None):
    """ Custom Help function to print things in method chaining.
        Returns back the df to further use in chaining.
    """
    if msg:
        print(msg)
    display(fn(df))
    return df


def setcols(df, fn=lambda x: x.columns.map('_'.join), cols=None):
    """Sets the column of the data frame to the passed column list.
    """
    if cols:
        df.columns = cols
    else:
        df.columns = fn(df)
    return df

In [47]:
# Pretty cool
data_pre = \
    (data.rename(columns={"item": "item2"})
                .pipe(csnap,msg="My debug message.Inspecting shapes ") # debbug
                .pipe(encode_column,col_name='item2')
                # used to assign new columns to a DataFrame
                .assign(item2=lambda r: r.item2 + '_enc') # encode_column(data, col_name='item')
                # np.where Return elements chosen from x or y depending on condition. Lik inline if else for numpy arrays
                .assign(duration2=lambda r: np.where(r.duration > 0.003611, 1, 0))
                .pipe(csnap) # debbug
                # Query the columns of a DataFrame with a boolean expression
                #.query("item = call")
                .pipe(csnap) # debbug
                .sort_values("duration", ascending=False)
                .reset_index(drop=True)
                .loc[:, ["index", "date", "item2", "network", "duration","duration2"]]
                .pipe(csnap, lambda r: r.sample(5), msg="Inspecting 5 random rows")
    )

My debug message.Inspecting shapes 


(830, 9)

(830, 11)

(830, 11)

Inspecting 5 random rows


Unnamed: 0,index,date,item2,network,duration,duration2
689,538,07/01/15 09:28,sms_enc,Vodafone,0.000278,0
534,800,02/03/15 14:53,call_enc,voicemail,0.000556,0
358,182,08/11/14 06:58,data_enc,data,0.009564,1
507,574,12/01/15 18:23,call_enc,Three,0.001111,0
419,695,06/02/15 18:39,call_enc,Three,0.006389,1


In [48]:
data_pre.shape
data_pre.head(11)

(830, 6)

Unnamed: 0,index,date,item2,network,duration,duration2
0,816,04/03/15 12:29,call_enc,landline,2.924444,1
1,742,17/02/15 19:09,call_enc,Three,0.646667,1
2,252,19/11/14 18:56,call_enc,Three,0.588889,1
3,59,23/10/14 08:34,call_enc,landline,0.538889,1
4,648,25/01/15 16:55,call_enc,Three,0.5175,1
5,398,17/12/14 18:08,call_enc,Vodafone,0.516389,1
6,31,18/10/14 13:10,call_enc,Three,0.476111,1
7,809,03/03/15 14:34,call_enc,Vodafone,0.368056,1
8,548,08/01/15 20:31,call_enc,Three,0.346389,1
9,105,31/10/14 13:27,call_enc,Tesco,0.342778,1


## Logging chain pipes func for debuging

In [66]:
import logging
import sys

logger = logging.getLogger()
fhandler = logging.FileHandler(filename="mylog.log", mode="a")
formatter = logging.Formatter(
    "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
fhandler.setFormatter(formatter)
logger.addHandler(fhandler)
logger.addHandler(logging.StreamHandler()) # Add stdout as well
logger.setLevel(logging.DEBUG)

# Using logging and the func below you can change logger level to INFO to avoid getting debug 
# information during production. Much easy than comment lines with cnap code
def lsnap(df, fn=lambda x: x.shape, msg=None):
    """ Custom Help function to print things in method chaining.
        Returns back the df to further use in chaining.
    """
    global logging
    if msg:
        print(msg)
    logging.debug(f"{fn(df)}")
    return df

In [67]:
from sklearn.datasets import load_iris

data = load_iris()
iris = pd.DataFrame(data.data, columns=data.feature_names)


iris_pre = \
(
    iris.pipe(lsnap, lambda x: x.head(), msg="Before")
    .pipe(
        setcols,
        fn=lambda x: x.columns.str.lower()
        .str.replace(r"\(cm\)", "")
        .str.strip()
        .str.replace(" ", "_"),
    )
    .pipe(lsnap, lambda x: x.head(), msg="After")
)

   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                5.1               3.5                1.4               0.2
1                4.9               3.0                1.4               0.2
2                4.7               3.2                1.3               0.2
3                4.6               3.1                1.5               0.2
4                5.0               3.6                1.4               0.2
   sepal_length  sepal_width  petal_length  petal_width
0           5.1          3.5           1.4          0.2
1           4.9          3.0           1.4          0.2
2           4.7          3.2           1.3          0.2
3           4.6          3.1           1.5          0.2
4           5.0          3.6           1.4          0.2


Before
After


In [68]:
!cat mylog.log

2020-05-08 20:11:03,777 - root - DEBUG -    sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                5.1               3.5                1.4               0.2
1                4.9               3.0                1.4               0.2
2                4.7               3.2                1.3               0.2
3                4.6               3.1                1.5               0.2
4                5.0               3.6                1.4               0.2
2020-05-08 20:11:03,777 - root - DEBUG -    sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                5.1               3.5                1.4               0.2
1                4.9               3.0                1.4               0.2
2                4.7               3.2                1.3               0.2
3                4.6               3.1                1.5               0.2
4                5.0               3.6                1.4              

## ETL, Batches and APIs

ref:
* https://medium.com/bcggamma/data-science-python-best-practices-fdb16fdedf82
* https://medium.com/bcggamma/welcome-to-the-big-leagues-b9038648054f

### Convert you notebooks in python script 



* The model developed can be:
    1. run as batch script 
    1. option to be imported and execute inside anoteh python script

* Use docopt for build command line interfaces

* It is good to have the ability to run the script with different condiguration file or parameters **WHITHOUT CHANGE CODE**. For condiguration see next sections


Example: main.py 

```python
#!/usr/bin/env python3

""" Extract salient region of the image
Usage:
    main.py [options] <img> <out>

Options:
    --map-thr=N           Threshold value to be used in roi map. Default use otsu        [default: None]

"""

import os, sys
import numpy as np
import cv2

from notebooks.utils import print_7_numbers 

from docopt import docopt
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def execute(img_path, out_path, thr=None):

    logger.info(f'Reading image: {img_path}')
      
    pass 

if __name__ == '__main__':

    args = docopt(__doc__, version='1.0')

    img_path = args['<img>']
    out_path = args['<out>']
    thr = None if args['--map-thr'] == 'None' else int(args['--map-thr'])

    execute(img_path, out_path, thr=thr)

    logger.info(f'DONE!')
```


### Ensure portability by using environment variables


* variables related to dev, staging and production enviroment shoud be enviroment variables

This make easy portability and deployments

* **db user and password should be enviroment variable**
* To avoid type the paswword all the time or any env variable , put all env variables in a file ".env" in your home folder

Example: ~/.env

```sh
DB_HOST=localhost
DB_USER=root
DB_PASS=s1mpl3
```

### Configuration files 


Use yaml for configurations for **multi-languages projects**. Also when it is important to run the script using different configuration files  you just need to pass a new config2.yaml file as argument of the python script.

This allow applications with more than one languages (voicedoubles utilizes dotnet, bash and python) utilize a single config.yaml file. 

**If your applications is not multi-language, write the only one config.py** Is simple and it is less files in the projects

Example: config.yml
```yaml
base:
  path:
    data: "/shared/mydata.csv"
    logs: "logs"
    log_config: "adv_ds_logging.yml"
  rf_params:
    n_estimators: 400

dev:
  path_db: "sqlite:////some_dir/dev.db"
  
prod:
  path_db: "sqlite:////some_other_dir/prod.db"
```

* box library can be an option: Allows cfg.base.path.data instead of cfg["base"]["path"]["data"]


```sh
conda install -c conda-forge python-box
```


Ex: config.py that utilizes an yaml file
```python

import yaml
from box import Box

with open("config.yml", "r") as ymlfile:
     cfg = Box(yaml.safe_load(ymlfile))


print(cfg.base.path.data)
```

In the python script you can do

```python
import pandas as pd
from config import cfg

df = pd.read_csv(cfg.base.path.data)

```


In [8]:
# Example of single config.py (for only python applications)
from box import Box

CONFIG = {'prod': {'path': 'main.csv', 'usr': 'me'}, 'dev': {'path': 'second.csv', 'usr': 'you'}, 'n_est': 1}

cfg = Box(CONFIG)

print(f"prod path: {cfg.prod.path} and dev path: {cfg.dev.path}")

prod path: main.csv and dev path: second.csv


## Better logging 



* use loggers for debug messages and not print statments anymore 
* use logger configuration yaml file
* load the logger in the config.py file that reads config.yaml and reads also the log_config.yaml file 

```yaml

version: 1
# Set to False to get log messages from external packages you're using
disable_existing_loggers: False

# Formatters that are assigned to outputs ("handlers") below
formatters:
    simple:
        format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"

# Outputs, one or more of which can be assigned to loggers
handlers:
    console:
        class: logging.StreamHandler
        level: DEBUG
        formatter: simple
        stream: ext://sys.stdout

    debug_file_handler:
        class: logging.handlers.RotatingFileHandler
        level: DEBUG
        formatter: simple
        filename: logs/debug.log
        maxBytes: 20485760 # 20MB
        backupCount: 10
        encoding: utf8

    info_file_handler:
        # THIS MIGH DO ROTATION FILENAME FOR ME. FEATURE I WAS LOOKING FOR
        class: logging.handlers.RotatingFileHandler 
        level: INFO
        formatter: simple
        filename: logs/info.log
        maxBytes: 10485760 # 10MB
        backupCount: 10
        encoding: utf8

# How is each logger handled? In particular, set external packages you're
# using which spam the DEBUG, to log INFO and above (below, this is done 
# for the urllib3 and s3transfer loggers)

## NOT SURE IF I UNDERSTADOO THIS NEED DOUBLE CHECK
loggers:
  urllib3:
    level: INFO
    handlers: [info_file_handler]

  s3transfer:
    level: INFO
    handlers: [info_file_handler]
    
# The root logger
root:
  level: DEBUG
  handlers: [console, debug_file_handler, info_file_handler]

```

config.py 

```python
import os
import logging.config

import yaml
from box import Box

with open("config.yml", "r") as ymlfile:
     cfg = Box(yaml.safe_load(ymlfile))

# setting logger
os.makedirs(cfg.path.logs, exist_ok=True)

if os.path.exists(cfg.path.log_config):
    with open(cfg.path.log_config, "r") as ymlfile:
        log_config = yaml.safe_load(ymlfile)

    # Set up the logger configuration
    logging.config.dictConfig(log_config)
else:
    raise FileNotFoundError(f"Log yaml configuration file not found in {cfg.path.log_config}")

```


