# Compressed files & archives

In [1]:
from datetime import datetime
import gzip
from pathlib import Path
import random
import shutil
import tarfile
import tempfile
import time

## Writing & reading gzip files

You can directly write to or read from a compressed file.

In [2]:
# Write 500 lines of random DNA symbols (80 per line plus a newline) straight
# into a gzip-compressed file in the current working directory.
gzip_file = Path.cwd().joinpath('data.gz')
with gzip.open(gzip_file, 'wb') as file:
    for _ in range(500):
        symbols = random.choices('ACGT', k=80)
        text = ''.join(symbols) + '\n'
        # the file is opened in binary mode, so the text has to be encoded
        file.write(text.encode())

In [3]:
# Size in bytes of the compressed file on disk; the data written above is
# 500 lines of 81 bytes each, i.e., 40500 bytes before compression.
gzip_file.lstat().st_size

12984

In [4]:
# Tally how often each nucleotide occurs; the file is decompressed on the fly
# while iterating over its lines.
count = dict.fromkeys('ACGT', 0)
with gzip.open(gzip_file, 'rb') as file:
    for raw_line in file:
        # lines are read as bytes in binary mode, so decode before inspecting
        text = raw_line.decode(encoding='utf8')
        for symbol in text:
            if symbol not in count:
                # skips the newline character at the end of each line
                continue
            count[symbol] += 1
for symbol, nr in count.items():
    print(f'{symbol}: {nr}')
total = sum(count.values())
print(f'total = {total}')

A: 9973
C: 10067
G: 9997
T: 9963
total = 40000


Remove the gzip file.

In [5]:
# Clean up: delete the gzip file created above.  Note that this raises
# FileNotFoundError when the file is already gone, e.g., if the cell is run twice.
gzip_file.unlink()

## Writing & reading TAR files

Generate a directory and some data files to work with.

In [6]:
# Set up a directory in the current working directory and fill it with a
# handful of files, each holding one random DNA sequence.
data_dir = Path.cwd() / 'data'
data_dir.mkdir(exist_ok=True)  # no error when the directory already exists
nr_files = 5
nr_symbols = 10000
for file_nr in range(nr_files):
    # presumably the pause ensures the files get distinct modification times,
    # which are shown later from the TAR meta-information -- TODO confirm
    time.sleep(2)
    # files are numbered starting from 1, zero-padded to two digits
    file = data_dir.joinpath(f'data_{file_nr + 1:02d}.txt')
    with file.open('w') as data_file:
        data = ''.join(random.choices('ACGT', k=nr_symbols))
        # the sequence is written as a single line without trailing newline
        data_file.write(data)

Add the data files to a compressed TAR file, and show the compression ratio.

In [7]:
# Bundle every entry of the data directory into a gzip-compressed TAR archive,
# reporting the individual sizes and the archive size relative to the input.
tar_path = Path('data.tar.gz')
data_sizes = []
with tarfile.open(tar_path, 'w:gz') as tar_file:
    # iterdir() yields the entries in arbitrary order
    for file in data_dir.iterdir():
        data_size = file.lstat().st_size
        data_sizes.append(data_size)
        print(f'{file.name}: {data_size} bytes')
        # store the member under its path relative to the working directory,
        # not under its absolute path
        member_path = file.relative_to(Path.cwd())
        tar_file.add(member_path)
total_data_size = sum(data_sizes)
# the archive size can only be determined once the archive has been closed
size = tar_path.lstat().st_size
print(f'{tar_path.name}: {size} bytes ({size/total_data_size:.2%})')

data_04.txt: 10000 bytes
data_05.txt: 10000 bytes
data_02.txt: 10000 bytes
data_03.txt: 10000 bytes
data_01.txt: 10000 bytes
data.tar.gz: 15846 bytes (31.69%)


Remove the data directory in the current working directory.

In [8]:
# Remove the data directory and everything in it; the files now live in the
# TAR archive.  The full path is passed rather than only the directory's name,
# so the directory that was created is the one removed, independent of the
# working directory at the time this cell runs.
shutil.rmtree(data_dir)

Meta-information can be retrieved from the compressed TAR file.

In [9]:
# List name, size and modification time of every member in the archive; mode
# 'r' lets tarfile detect the compression transparently.
time_format = '%Y-%m-%d %H:%M:%S'
with tarfile.open(tar_path, 'r') as tar_file:
    for tar_info in tar_file:
        # mtime is a timestamp in seconds, convert it to local date and time
        modified = datetime.fromtimestamp(tar_info.mtime)
        date = modified.strftime(time_format)
        print(f'{tar_info.name}: {tar_info.size} bytes, last modified {date}')

data/data_04.txt: 10000 bytes, last modified 2023-07-31 14:29:21
data/data_05.txt: 10000 bytes, last modified 2023-07-31 14:29:23
data/data_02.txt: 10000 bytes, last modified 2023-07-31 14:29:17
data/data_03.txt: 10000 bytes, last modified 2023-07-31 14:29:19
data/data_01.txt: 10000 bytes, last modified 2023-07-31 14:29:15


Extract and process one file at a time, so that at most one extracted file is on disk at any moment.  All files are read from the compressed TAR file.

In [10]:
# Tally the symbols over all archive members, extracting one member at a time
# so that at most one extracted file is on disk at any moment.
count = {symbol: 0 for symbol in 'ACGT'}
# Use the 'data' extraction filter when the running Python version supports
# it: it refuses members that would be extracted outside of the destination
# directory.  Older versions do not accept the argument, hence the check.
extract_options = {'filter': 'data'} if hasattr(tarfile, 'data_filter') else {}
# Extract into a private temporary directory: a fixed 'tmp' directory in the
# working directory may already exist, and removing it afterwards would delete
# its unrelated contents.  The context manager also cleans up when an error
# occurs halfway.
with tempfile.TemporaryDirectory() as tmp_name:
    tmp_dir = Path(tmp_name)
    with tarfile.open(tar_path, 'r') as tar_file:
        for tar_info in tar_file:
            # only regular files can be opened and read below
            if not tar_info.isfile():
                continue
            tar_file.extract(tar_info, path=tmp_dir, **extract_options)
            data_file = tmp_dir / tar_info.name
            with data_file.open('r') as file:
                for line in file:
                    for symbol in line:
                        if symbol in count:
                            count[symbol] += 1
            # remove the extracted file before moving on to the next member
            data_file.unlink()
for symbol, nr in count.items():
    print(f'{symbol}: {nr}')
print(f'total = {sum(count.values())}')

A: 12576
C: 12504
G: 12379
T: 12541
total = 50000


*Important security note:* do not extract an unvalidated archive: it may contain files that end up in unexpected directories when extracted. This is *especially* important when running a script with superuser privileges!

Remove the compressed TAR file.

In [11]:
# Clean up: delete the archive.  Note that this raises FileNotFoundError when
# the file is already gone, e.g., if the cell is run twice.
tar_path.unlink()