# core

> Utility for automating backups of a specific file or directory

In [None]:
#| default_exp core

# Automated Backups

We want a script to back up a specific file/folder over different intervals. Specifically, it should

- Copy to some destination dir every hour (e.g. a different drive)
- Keep the last 5 backups, plus one older backup for each of a few age ranges (for example roughly a day, a week and a month old)

We can then rsync the destination dir to keep a remote backup.

In [None]:
#|export
import shutil, os, time, pprint, logging
from pathlib import Path
from fastcore.script import call_parse
from datetime import datetime, timedelta

In [None]:
!mkdir -p demo_src
!mkdir -p demo_dst
!rm -r demo_dst/*

In [None]:
!echo "content" > "demo_src/$(date +%s).txt"

## The core functionality

The plan has two main steps:

- Create a new backup
- Clean up any old backups that are no longer needed.

Step 1 is easy enough:


In [None]:
#|export
def create_backup(src, dest_dir):
    "Copy `src` (a single file or a directory) into a new timestamp-named folder inside `dest_dir`"
    source = Path(src)
    # The folder name is the backup's timestamp, e.g. 20241115_120316
    target = Path(dest_dir) / datetime.now().strftime("%Y%m%d_%H%M%S")
    if not source.is_file():
        # Anything that is not a regular file is copied as a tree; copytree creates `target` itself
        shutil.copytree(src, target)
        return
    # A single file is placed inside its own timestamped folder, keeping its name
    target.mkdir(parents=True, exist_ok=True)
    shutil.copy2(source, target / source.name)

In [None]:
create_backup('demo_src', 'demo_dst')
!ls demo_dst

20241115_120316


The harder part is the cleanup. Let's start by generating some dates to test with.

In [None]:
def generate_test_dates(num_dates, base_date):
    "Return `num_dates` timestamp strings spaced one hour apart, starting at `base_date`"
    stamps = []
    for offset in range(num_dates):
        moment = base_date + timedelta(hours=offset)
        stamps.append(moment.strftime("%Y%m%d_%H%M%S"))
    return stamps
test_dates = generate_test_dates(2400, datetime.now() - timedelta(days=100))
print(test_dates[:5], test_dates[-5:])

['20240807_120317', '20240807_130317', '20240807_140317', '20240807_150317', '20240807_160317'] ['20241115_070317', '20241115_080317', '20241115_090317', '20241115_100317', '20241115_110317']


In [None]:
# Can I get all dates < 2 months old?
[d for d in test_dates if (datetime.now() - datetime.strptime(d, '%Y%m%d_%H%M%S')).days < 60][:3]

['20240916_130317', '20240916_140317', '20240916_150317']

Now we want to grab the most recent 5, and then, for each age threshold, the oldest backup that is still younger than that threshold.

In [None]:
#| export 
def clean_dates(dates, now=None, max_ages=(2, 14, 60)):
    """Select which backup timestamps to keep.

    `dates` holds timestamp strings in `%Y%m%d_%H%M%S` format. The result contains
    the newest 5 of them plus, for every entry of `max_ages` (in days), the oldest
    timestamp whose age relative to `now` is below that many whole days.
    `now` defaults to the current time.

    Returns a sorted list without duplicates. The `dates` argument is not modified.
    Raises `ValueError` (from `strptime`) if a timestamp that needs parsing is malformed.
    """
    now = now or datetime.now()
    max_ages = tuple(max_ages)
    # Sort a copy so the caller's list is left untouched
    ordered = sorted(dates)
    # Parse each timestamp once rather than once per threshold;
    # nothing needs parsing when there are no thresholds
    ages = [(now - datetime.strptime(d, '%Y%m%d_%H%M%S')).days for d in ordered] if max_ages else []

    keep = set(ordered[-5:])  # Keep the newest 5
    for max_age in max_ages:
        # `ordered` is ascending, so the first match is the oldest one below the threshold
        oldest = next((d for d, age in zip(ordered, ages) if age < max_age), None)
        if oldest: keep.add(oldest)
    return sorted(keep)  # Duplicates already removed by the set

In [None]:
clean_dates(test_dates)

['20240916_130317',
 '20241101_130317',
 '20241113_130317',
 '20241115_070317',
 '20241115_080317',
 '20241115_090317',
 '20241115_100317',
 '20241115_110317']

Next, let's simulate time passing. Starting from the same test dates as above, each step adds an hour to `now`, appends the new timestamp to the test dates, and runs `clean_dates` on them. Once per simulated day we pretty-print the result to check it behaves as expected over a simulated month.

In [None]:
# Initialize
now = datetime.now()
test_dates = generate_test_dates(2400, now - timedelta(days=100))

# Simulate time passing
# The counter is named `hour` because it is actually used below; `_` is reserved
# for throwaway values and is also IPython's "last output" variable.
for hour in range(30 * 24):  # Simulate a month (30 days * 24 hours)
    now += timedelta(hours=1)
    test_dates.append(now.strftime("%Y%m%d_%H%M%S"))
    test_dates = clean_dates(test_dates, now)  # Clean up old dates
    if hour % 24 == 0:  # Print once a day
        print(f"\nDay {hour // 24 + 1}:")
        pprint.pprint(test_dates)


Day 1:
['20240916_140322',
 '20241101_140322',
 '20241113_140322',
 '20241115_080322',
 '20241115_090322',
 '20241115_100322',
 '20241115_110322',
 '20241115_130322']

Day 2:
['20241101_140322',
 '20241113_140322',
 '20241115_080322',
 '20241116_090322',
 '20241116_100322',
 '20241116_110322',
 '20241116_120322',
 '20241116_130322']

Day 3:
['20241101_140322',
 '20241113_140322',
 '20241117_030322',
 '20241117_090322',
 '20241117_100322',
 '20241117_110322',
 '20241117_120322',
 '20241117_130322']

Day 4:
['20241101_140322',
 '20241113_140322',
 '20241117_030322',
 '20241118_090322',
 '20241118_100322',
 '20241118_110322',
 '20241118_120322',
 '20241118_130322']

Day 5:
['20241101_140322',
 '20241113_140322',
 '20241118_220322',
 '20241119_090322',
 '20241119_100322',
 '20241119_110322',
 '20241119_120322',
 '20241119_130322']

Day 6:
['20241101_140322',
 '20241113_140322',
 '20241118_220322',
 '20241120_090322',
 '20241120_100322',
 '20241120_110322',
 '20241120_120322',
 '20241120_1

NB: Yay, it looks to be doing mostly what I want! I can collapse the output; if you're viewing this in a notebook, my apologies :)

## Turning it into a script

Now that those two pieces of functionality seem to be working, we can wrap them up as a script using fastcore's `call_parse`. The script runs the backup, cleans up old backups, and logs any errors or messages to `backup.log`.

In [None]:
#|export
@call_parse
def run_backup(
    src:str, # The source to be backed up
    dest:str, # The destination directory
    max_ages:str="2,14,60", # The max age(s) in days for the different backups
    log_file:str='backup.log' # The file that log messages are written to
):
    "Run backup and cleanup old files"

    # Set up logging
    logging.basicConfig(filename=log_file, level=logging.DEBUG,
                        format='%(asctime)s - %(levelname)s - %(message)s')
    try:
        create_backup(src, dest)
        logging.info(f"Backup created: {src} -> {dest}")
        # Keep the parsed thresholds under their own name instead of rebinding the str parameter
        age_limits = [int(age.strip()) for age in max_ages.split(',')]

        # Only directories named like a backup timestamp take part in cleanup.
        # Any other directory in `dest` would make clean_dates raise on strptime,
        # so it is skipped here and is never deleted.
        backups = []
        for d in Path(dest).iterdir():
            if not d.is_dir(): continue
            try: datetime.strptime(d.name, '%Y%m%d_%H%M%S')
            except ValueError:
                logging.debug(f"Ignoring non-backup directory: {d.name}")
                continue
            backups.append(d.name)

        to_keep = set(clean_dates(backups, max_ages=age_limits))
        for backup in backups:
            if backup not in to_keep:
                shutil.rmtree(Path(dest) / backup)
                logging.info(f"Removed old backup: {backup}")
    except Exception as e:
        # Failures are recorded in the log (with traceback) rather than propagated
        logging.error(f"Backup failed: {str(e)}", exc_info=True)

In [None]:
!ls demo_src

1731699240.txt	1731700528.txt	1731700920.txt
1731700503.txt	1731700856.txt	1731700993.txt


Testing a directory:

In [None]:
!rm -r demo_dst/*

In [None]:
run_backup('demo_src', 'demo_dst',)
!ls demo_dst

20241115_120331


In [None]:
!ls demo_dst/20241115_120331

1731699240.txt	1731700528.txt	1731700920.txt
1731700503.txt	1731700856.txt	1731700993.txt


Testing a single file:

In [None]:
!rm -r demo_dst/*

In [None]:
run_backup('demo_src/1731700503.txt', 'demo_dst')

In [None]:
!ls demo_dst

20241115_120341


In [None]:
!ls demo_dst/20241115_120341

1731700503.txt
