## Implement an LSM

- Store records in an in-memory LRU and periodically dump them to disk
- Search the segments for an item if it doesn't exist in the LRU
- Start a new incremental segment when the file reaches a size threshold (4 KB, for example)
- Compact the segments, retaining only the newest value for each key
- Deleting a key adds a tombstone and removes the key from the internal record

In [1]:
import itertools
import os

from collections import deque
from typing import Any, List, Dict, Tuple, Union, Generator


def take(iterable):
    """Return the next item of an iterator, or ``None`` once it is exhausted.

    ``iterable`` must already be an iterator (something ``next()`` accepts).
    An exhausted iterator and an iterator that yields ``None`` are
    indistinguishable to the caller.
    """
    # The two-argument form of next() returns the default instead of raising
    # StopIteration.
    return next(iterable, None)


class Record:
    """In-memory key/value table backed by a plain dict."""

    _internal: Dict[str, Any]

    def __init__(self):
        self._internal = dict()

    def add(self, key: str, value: Any) -> None:
        """Store ``value`` under ``key``, replacing any previous value."""
        self._internal[key] = value

    def get(self, key: str) -> Any:
        """Return the value stored under ``key``.

        Raises:
            KeyError: if ``key`` has never been added.
        """
        return self._internal[key]

    def get_sorted_items(self) -> List[Tuple[str, Any]]:
        """Return all ``(key, value)`` pairs as a list ordered by key."""
        ordered_keys = sorted(self._internal)
        return [(name, self._internal[name]) for name in ordered_keys]

    
# Alias for a generator that yields (str, str) tuples and neither accepts
# sent values nor returns one.
# NOTE(review): the segment line reader in SegmentManager yields whole text
# lines (plain str), not tuples, so this alias does not describe it
# accurately -- confirm the intended element type.
GeneratorType = Generator[Tuple[str, str], None, None]


class SegmentManager:
    """Manage the on-disk segment files of the store.

    Segments live in a single directory and are named ``segment.<version>``;
    a higher version means newer data. Every segment holds one
    ``key:value`` line per key, sorted by key.

    Limitations of the text format: a key containing ``:`` or a value
    containing a newline cannot be read back correctly.
    """

    # Paths of the known segments, oldest on the left, newest on the right.
    _segments: "deque[str]"

    def __init__(self, path='db_segments'):
        """Load the existing segments found in ``path``.

        The directory is created when it does not exist yet, so a fresh
        store can be snapshotted right away.
        """
        self._path = path
        os.makedirs(self._path, exist_ok=True)
        # Order by numeric version: a plain string sort would put
        # ``segment.10`` before ``segment.2``.
        self._segments = deque(
            sorted(self._get_file_names(), key=self._get_file_version))

    def snapshot(self, record: "Record") -> None:
        """Write every item of ``record`` to a new, newest segment.

        ``record`` only needs a ``get_sorted_items()`` method returning
        ``(key, value)`` pairs ordered by key.
        """
        segment_file = self._get_next_segment()

        with open(segment_file, 'w') as f:
            f.writelines(f'{key}:{value}\n'
                         for key, value in record.get_sorted_items())

        # Register the segment only once it has been written completely.
        self._segments.append(segment_file)

    def _get_file_names(self) -> List[str]:
        """Return the paths of all ``segment.<digits>`` files in the directory.

        Files with a non-numeric suffix (for example a leftover
        ``segment.3.tmp``) are ignored.
        """
        prefix = 'segment.'
        return [f'{self._path}/{file}'
                for file in os.listdir(self._path)
                if file.startswith(prefix)
                and file[len(prefix):].isdecimal()]

    def _get_last_file(self) -> Union[None, str]:
        """Return the newest segment path, or ``None`` when there is none."""
        if not self._segments:
            return None

        return self._segments[-1]

    def _get_file_version(self, file_name: str) -> int:
        """Return the numeric version encoded after the last ``.``."""
        return int(file_name.rsplit('.', 1)[-1])

    def _get_next_segment(self) -> str:
        """Return the path the next snapshot should be written to."""
        last_file = self._get_last_file()

        if not last_file:
            return f'{self._path}/segment.0'

        # Versions advance by 2 so that compact() can always name a merged
        # file with a version strictly between two neighbouring segments.
        segment_number = self._get_file_version(last_file) + 2

        return f'{self._path}/segment.{segment_number}'

    def _create_read_line_generator(self, file) -> Generator[str, None, None]:
        """Yield the lines of ``file`` one by one, newline included."""
        with open(file, 'r') as f:
            for line in f:
                yield line

    def _compact_2_files(self, from_1, from_2, to) -> None:
        """Merge two key-sorted segments into the file ``to``.

        ``from_1`` is the older segment and ``from_2`` the newer one. When
        both contain the same key, only the line from ``from_2`` is kept.
        ``to`` must be a different path from both inputs because it is
        truncated on open.
        """
        with open(from_1, 'r') as older, \
                open(from_2, 'r') as newer, \
                open(to, 'w') as out:
            line_1, line_2 = next(older, None), next(newer, None)

            # Classic two-way merge: only the side that was written advances.
            while line_1 is not None and line_2 is not None:
                # The key is everything before the first ':' so that values
                # may themselves contain ':'.
                key_1 = line_1.partition(':')[0]
                key_2 = line_2.partition(':')[0]

                if key_1 < key_2:
                    out.write(line_1)
                    line_1 = next(older, None)
                elif key_2 < key_1:
                    out.write(line_2)
                    line_2 = next(newer, None)
                else:
                    # Same key in both segments: the newer value wins and
                    # the older line is dropped.
                    out.write(line_2)
                    line_1 = next(older, None)
                    line_2 = next(newer, None)

            # At most one input still has lines; copy the remainder as is.
            if line_1 is not None:
                out.write(line_1)
                out.writelines(older)
            if line_2 is not None:
                out.write(line_2)
                out.writelines(newer)

    def compact(self) -> None:
        """Merge the two oldest segments into a single segment.

        Does nothing when fewer than two segments exist. The merged segment
        gets the version halfway between the two inputs, so it keeps its
        position relative to every other segment.
        """
        if len(self._segments) <= 1:
            return

        from_1, from_2 = self._segments[0], self._segments[1]
        segment_number = (self._get_file_version(from_1)
                          + self._get_file_version(from_2)) // 2
        to = f'{self._path}/segment.{segment_number}'

        # Merge into a temporary file first: for adjacent versions ``to``
        # is the same path as ``from_1`` and must not be truncated while it
        # is still being read. The ``.tmp`` suffix keeps a leftover file
        # from being loaded as a segment.
        temporary = f'{to}.tmp'
        self._compact_2_files(from_1, from_2, temporary)
        os.replace(temporary, to)

        for source in (from_1, from_2):
            if source != to:
                os.remove(source)

        self._segments.popleft()
        self._segments.popleft()
        self._segments.appendleft(to)


class Database:
    """Facade that pairs the in-memory ``Record`` with the segment files.

    Writes and reads go to the in-memory record; ``snapshot()`` and
    ``compact()`` delegate to the ``SegmentManager``.
    """

    _record: Record
    # The manager is stored as ``_segment_manager``; no ``_segments``
    # attribute is ever assigned on this class.
    _segment_manager: SegmentManager

    def __init__(self):
        self._record = Record()
        self._segment_manager = SegmentManager()

    def add(self, key: str, value: Any) -> None:
        """Store ``value`` under ``key`` in the in-memory record."""
        return self._record.add(key, value)

    def get(self, key: str) -> Any:
        """Return the value held in the in-memory record for ``key``.

        Only the in-memory record is consulted; segment files are never
        read here. A missing key raises whatever ``Record.get`` raises.
        """
        return self._record.get(key)

    def snapshot(self) -> None:
        """Persist the current in-memory record through the segment manager."""
        return self._segment_manager.snapshot(self._record)

    def compact(self) -> None:
        """Ask the segment manager to run one compaction step."""
        return self._segment_manager.compact()

In [2]:
# Create the database; its constructor builds an empty Record and a
# SegmentManager, which lists the segment files already on disk.
db = Database()

In [3]:
# First batch: add two keys, then persist the record as a segment file.
db.add("1", 10)
db.add("2", 20)
db.snapshot()

# Second batch: keys are added out of order; snapshot() writes them sorted
# by key. The record is never cleared, so this segment also repeats the
# keys "1" and "2".
db.add("4", 40)
db.add("3", 30)
db.snapshot()

# Third batch: overwrite two existing keys so that the newest segment holds
# newer values than the earlier ones.
db.add("1", 100)
db.add("3", 300)
db.snapshot()

# Merge the two oldest segment files into one.
db.compact()

In [4]:
# Compact once more: merges the two oldest remaining segments, or does
# nothing when at most one segment is left.
db.compact()