In [495]:
import ast
from collections.abc import Iterable, Mapping
from datetime import datetime
from itertools import cycle
import os
import pandas as pd
from pathlib import Path
import re
import subprocess

from htools.meta import fallback

In [512]:
# This is slightly outdated - see cli.py for the final version 
# (and documentation).
class ReadmeUpdater:
    
    time_fmt = '%Y-%m-%d %H:%M:%S'
    readme_id_start = '\n---\nStart of auto-generated file data.<br/>'
    readme_id_end = '\n<br/>End of auto-generated file data. Do not add ' \
                    'anything below this.\n'
    readme_regex = readme_id_start + '(.|\n)*' + readme_id_end
    last_edited_cmd_fmt = 'git log -1 --pretty="format:%ct" {}'

    def __init__(self, *dirs, default='_'):
        self.dirs = [Path(d) for d in dirs]
        self.extensions = {'.py', '.ipynb'}
        self.default = default

    def update_dirs(self, *dirs):
        for dir_ in dirs or self.dirs:
            file_df = self._parse_dir_files(dir_)
            self.update_readme(dir_/'README.md', file_df)

    def _parse_dir_files(self, dir_):
        files = []
        for path in dir_.iterdir():
            if path.suffix not in self.extensions: continue
            stats = path.stat()
            files.append({
                'File': path.parts[-1],
                'Summary': self.parse_file(path) or self.default,
                'Last Modified': self.last_modified_date(path),
                'Size': self.readable_file_size(stats.st_size)
            })
        return pd.DataFrame(files).sort_values('File')

    def parse_file(self, path):
        return getattr(self, f'_parse_{path.suffix[1:]}')(path)
    
    def update_readme(self, path, file_df):
        path.touch()
        with open(path, 'r+') as f:
            text = f.read().split(self.readme_id_start)[0] \
                   + self._autogenerate_text(file_df)
            f.seek(0)
            f.write(text)
            
    def _autogenerate_text(self, df):
        date_str = 'Last updated: ' + datetime.now().strftime(self.time_fmt)
        autogen = (self.readme_id_start + date_str + '\n\n'
                   + df.to_html(index=False).replace('\\n', '<br/>') 
                   + self.readme_id_end)
        return autogen

    def _parse_py(self, path):
        with open(path, 'r') as f:
            tree = ast.parse(f.read())
        return ast.get_docstring(tree)

    def _parse_ipynb(self, path):
        with open(path, 'r') as f:
            cells = json.load(f)['cells'][:3]
        for cell in cells:
            if cell['cell_type'] == 'markdown' and \
                    'summary' in cell['source'][0].lower():
                # Notebook lines include newlines so we don't add them back in.
                return ''.join(cell['source'][1:]).strip()
        return ''
    
    def timestamp_to_time_str(self, time):
        return datetime.fromtimestamp(time).strftime(self.time_fmt)
    
    def last_modified_date(self, path):
        try:
            # If we're in a git repo, file edit times are changed when we pull
            # so we have to use built-in git functionality. This will fail if
            # we call the command from a different repo. I vaguely recall 
            # seeing weird git behavior inside running docker containers so I'm
            # not sure if this will work there.
            git_time = subprocess.check_output(
                self.last_edited_cmd_fmt.format(path).split()
            )
            timestamp = int(git_time.decode().strip()
                                    .replace('format:', '').replace('"', ''))

        except Exception as e:
            timestamp = path.stat().st_ctime
        return self.timestamp_to_time_str(timestamp)
    
    @staticmethod
    def readable_file_size(n_bytes):
        power = len(str(n_bytes)) - 1
        assert power < 24, 'Are you sure file is larger than a zettabyte?'

        prefix_powers =[
            (0, 'b'),
            (3, 'kb'),
            (6, 'mb'), 
            (9, 'gb'),
            (12, 'tb'),
            (15, 'pb'),
            (18, 'eb'),
            (21, 'zb'),
            (24, 'yb')
        ]
        prev_pow = 0
        prev_pre = 'b'
        for curr_pow, curr_pre in prefix_powers:
            if power < curr_pow: break
            prev_pow = curr_pow
            prev_pre = curr_pre
        return f'{(n_bytes / 10**prev_pow):.2f} {prev_pre}'

# NOTE: ACCIDENTALLY JUST UPDATED ALL v2 READMEs. Check if I want to rollback in git before doing anything else.

In [553]:
parser = ReadmeUpdater(
    '/Users/harrisonmamin/DatascienceBase/Delphi/v2-0-0/py',
    '/Users/harrisonmamin/DatascienceBase/Delphi/v2-0-0/pylib/delphi',
    '/Users/harrisonmamin/DatascienceBase/Delphi/v2-0-0/analysis'
)

In [506]:
parser.update_dirs()

/Users/harrisonmamin/DatascienceBase/Delphi/v2-0-0/py
/Users/harrisonmamin/DatascienceBase/Delphi/v2-0-0/pylib
/Users/harrisonmamin/DatascienceBase/Delphi/v2-0-0/analysis


In [466]:
df = parser._parse_dir_files(parser.dirs[0])

In [450]:
parser.update_readme(Path('../tmp.md'), df)

Last updated: 2021-03-30 18:06:40


In [420]:
df

Unnamed: 0,File,Summary,Last Modified,Created,Size
0,embeddings_app.py,_,2021-03-29 10:42:30,2021-03-29 10:42:30,12.37 kb
6,s00_sync_s3.py,Script to conveniently upload/download files f...,2021-03-29 10:42:30,2021-03-29 10:42:30,2.23 kb
10,s01_get_top_domains_entauth.py,_,2021-03-29 10:42:30,2021-03-29 10:42:30,1.78 kb
17,s02_scrape_domains.py,_,2021-03-29 10:42:30,2021-03-29 10:42:30,5.14 kb
8,s03_process_common_crawl_graph.py,_,2021-03-29 10:42:30,2021-03-29 10:42:30,3.32 kb
12,s04_analyze_embeddings.py,This CLI is for analyzing a set of trained emb...,2021-03-29 10:42:30,2021-03-29 10:42:30,4.08 kb
7,s05_sample_edgelist.py,_,2021-03-29 10:42:30,2021-03-29 10:42:30,3.20 kb
1,s06_train_collab_fastai.py,In progress: train domain embeddings using Fas...,2021-03-29 10:42:30,2021-03-29 10:42:30,2.46 kb
4,s07_train_embeddings_incendio.py,_,2021-03-29 10:42:30,2021-03-29 10:42:30,7.17 kb
3,s08_build_character_embeddings.py,_,2021-03-29 10:42:30,2021-03-29 10:42:30,3.82 kb


In [412]:
parser.timestamp_to_datetime_str(stats.st_mtime)

'2021-03-29 10:42:30'

In [403]:
# from IPython.display import HTML
# display(HTML(df.to_html(index=False)))
display(HTML(df.to_html(index=False).replace('\\n', '<br/>')))

File,Summary,Last Modified,Created,Size
embeddings_app.py,_,2021-03-29,2021-03-29,12.37 kb
s00_sync_s3.py,Script to conveniently upload/download files from S3. Examples -------- python py/s00_sync_s3.py download --all_ python py/s00_sync_s3.py download --only=['data/misc/'] python py/s00_sync_s3.py download --exclude=['data/scraped/'],2021-03-29,2021-03-29,2.23 kb
s01_get_top_domains_entauth.py,_,2021-03-29,2021-03-29,1.78 kb
s02_scrape_domains.py,_,2021-03-29,2021-03-29,5.14 kb
s03_process_common_crawl_graph.py,_,2021-03-29,2021-03-29,3.32 kb
s04_analyze_embeddings.py,"This CLI is for analyzing a set of trained embeddings, but it needs to be updated (currently uses the old version of the Embeddings class). In the mean time, analysis should be done through the embeddings dash app in this same directory.",2021-03-29,2021-03-29,4.08 kb
s05_sample_edgelist.py,_,2021-03-29,2021-03-29,3.20 kb
s06_train_collab_fastai.py,In progress: train domain embeddings using FastAI collaborative filtering.,2021-03-29,2021-03-29,2.46 kb
s07_train_embeddings_incendio.py,_,2021-03-29,2021-03-29,7.17 kb
s08_build_character_embeddings.py,_,2021-03-29,2021-03-29,3.82 kb


In [211]:
def readable_file_size(n_bytes):
    power = len(str(n_bytes)) - 1
    assert power < 24, 'Are you sure file is larger than a zettabyte?'
    
    prefix_powers =[
        (0, 'b'),
        (3, 'kb'),
        (6, 'mb'), 
        (9, 'gb'),
        (12, 'tb'),
        (15, 'pb'),
        (18, 'eb'),
        (21, 'zb'),
        (24, 'yb')
    ]
    prev_pow = 0
    prev_pre = 'b'
    for curr_pow, curr_pre in prefix_powers:
        if power < curr_pow: break
        prev_pow = curr_pow
        prev_pre = curr_pre
    return f'{(n_bytes / 10**prev_pow):.2f} {prev_pre}'

In [212]:
for length in range(1, 26):
    range_ = cycle(range(1, 10))
    curr = ''
    while len(curr) < length:
        curr += str(next(range_))
    print(length, curr, readable_file_size(int(curr)))

1 1 1.00 b
2 12 12.00 b
3 123 123.00 b
4 1234 1.23 kb
5 12345 12.35 kb
6 123456 123.46 kb
7 1234567 1.23 mb
8 12345678 12.35 mb
9 123456789 123.46 mb
10 1234567891 1.23 gb
11 12345678912 12.35 gb
12 123456789123 123.46 gb
13 1234567891234 1.23 tb
14 12345678912345 12.35 tb
15 123456789123456 123.46 tb
16 1234567891234567 1.23 pb
17 12345678912345678 12.35 pb
18 123456789123456789 123.46 pb
19 1234567891234567891 1.23 eb
20 12345678912345678912 12.35 eb
21 123456789123456789123 123.46 eb
22 1234567891234567891234 1.23 zb
23 12345678912345678912345 12.35 zb
24 123456789123456789123456 123.46 zb


AssertionError: Are you sure file is larger than a zettabyte?