<a href="https://colab.research.google.com/github/kousiknandy/pycolab/blob/main/MapReduce_Dupfiles.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [35]:
import hashlib
import glob
from concurrent.futures import ProcessPoolExecutor
from collections import defaultdict
import os

def hash_file(filename):
    with open(filename, "rb") as f:
        digest = hashlib.file_digest(f, "sha256")
    # print(f"{os.getpid()} {filename}")
    return digest.hexdigest(), filename

def files(patt):
    """Lazily yield the regular files that match a glob pattern.

    Args:
        patt: A glob pattern. ``**`` matches any number of nested
            directories, because the search is run with ``recursive=True``.

    Yields:
        Paths of matching regular files. Directories that match the
        pattern are skipped, since they cannot be opened and hashed.
    """
    for path in glob.iglob(patt, recursive=True):
        # Wildcards match directory names too; only real files are useful
        # to a consumer that opens each path for reading.
        if os.path.isfile(path):
            yield path

def djb_hash(s, m):
    """Hash a string into a bucket index with a djb2-style multiply-by-33 hash.

    Args:
        s: String whose characters are folded into the hash via ``ord``.
        m: Modulus applied to the final accumulator (the bucket count).

    Returns:
        The accumulated hash reduced modulo ``m``.
    """
    acc = 5381
    for ch in s:
        # (acc << 5) + acc is acc * 33, the usual djb2 formulation.
        acc = (acc << 5) + acc + ord(ch)
    return acc % m

def duplicates(hashes):
    """Keep only the entries whose value holds more than one item.

    Args:
        hashes: Mapping from a key (a content digest in this file) to a
            sized collection of the items that share that key.

    Returns:
        A new ``dict`` with the entries of ``hashes`` whose collection has
        length greater than one, in their original order.
    """
    repeated = {}
    for digest, names in hashes.items():
        if len(names) < 2:
            continue
        repeated[digest] = names
    return repeated

class MapReducer:
    """Minimal map / shuffle / reduce pipeline built on process pools.

    The mapper is applied to every input file and must return a
    ``(key, value)`` pair.  Pairs are grouped by key into a fixed number of
    partitions, and the reducer is then applied to each partition.  Both
    callables are executed by ``ProcessPoolExecutor`` workers, so they,
    their arguments and their results have to be picklable.
    """

    def __init__(self, mapper, reducer, partitions=1, *,
                 map_workers=3, reduce_workers=2, chunksize=5):
        """Configure the pipeline.

        Args:
            mapper: Callable taking one file path and returning a
                ``(key, value)`` pair; the key must be a string.
            reducer: Callable taking one partition (a mapping from key to
                the list of values with that key) and returning a mapping.
            partitions: Number of partitions the mapped pairs are split into.
            map_workers: Worker processes used for the map phase.
            reduce_workers: Worker processes used for the reduce phase.
            chunksize: Number of inputs sent to a map worker per task.

        Raises:
            ValueError: If ``partitions`` is less than 1.
        """
        # Fail here rather than with an obscure ZeroDivisionError or
        # IndexError once pairs are being assigned to partitions.
        if partitions < 1:
            raise ValueError(
                f"partitions must be at least 1, got {partitions!r}"
            )
        self.mapper = mapper
        self.reducer = reducer
        self.parts = partitions
        self.map_workers = map_workers
        self.reduce_workers = reduce_workers
        self.chunksize = chunksize

    def partition(self, results):
        """Group mapped pairs by key and spread the keys over partitions.

        Args:
            results: Iterable of ``(key, value)`` pairs from the mapper.

        Returns:
            A list of ``self.parts`` ``defaultdict(list)`` objects.  Every
            key lives in exactly one of them (chosen by ``djb_hash``), so
            all values for a key reach the same reducer call.
        """
        buckets = [defaultdict(list) for _ in range(self.parts)]
        for key, value in results:
            buckets[djb_hash(key, self.parts)][key].append(value)
        return buckets

    def run(self, input_files):
        """Run map, partition and reduce, then print each reducer result.

        Args:
            input_files: Glob pattern handed to ``files`` to enumerate the
                inputs of the map phase.

        For every partition, the ``values()`` view of the mapping returned
        by the reducer is printed.  Nothing is returned.
        """
        with ProcessPoolExecutor(max_workers=self.map_workers) as executor:
            # Collect the results while the pool is still alive instead of
            # carrying a lazy iterator past the executor's shutdown.
            mapped = list(
                executor.map(
                    self.mapper, files(input_files), chunksize=self.chunksize
                )
            )
        parts = self.partition(mapped)
        with ProcessPoolExecutor(max_workers=self.reduce_workers) as executor:
            reduced = list(executor.map(self.reducer, parts))
        for res in reduced:
            print(res.values())


In [36]:
if __name__ == "__main__":
    # Process pools may re-import the main module in their workers (the
    # "spawn" start method); the guard keeps those imports from launching
    # the job again.
    # Hash every matching file, split the digests over 2 partitions and
    # print the groups of files that share a digest.
    m = MapReducer(hash_file, duplicates, 2)
    m.run("/home/dups/**/*")

44647 /home/dups/5/data3.txt
44647 /home/dups/5/data8.txt
44647 /home/dups/5/data4.txt
44647 /home/dups/5/data9.txt
44647 /home/dups/5/data2.txt
44647 /home/dups/5/data1.txt
44647 /home/dups/5/data7.txt
44647 /home/dups/5/data5.txt
44647 /home/dups/5/data6.txt
44647 /home/dups/3/data3.txt
44647 /home/dups/3/data8.txt
44647 /home/dups/3/data4.txt
44647 /home/dups/3/data9.txt
44647 /home/dups/3/data2.txt
44647 /home/dups/3/data1.txt
44647 /home/dups/3/data7.txt
44647 /home/dups/3/data5.txt
44647 /home/dups/3/data6.txt
44647 /home/dups/4/data3.txt
44647 /home/dups/4/data8.txt
44647 /home/dups/4/data4.txt
44647 /home/dups/4/data9.txt
44647 /home/dups/4/data2.txt
44647 /home/dups/4/data1.txt
44647 /home/dups/4/data7.txt
44647 /home/dups/4/data5.txt
44647 /home/dups/4/data6.txt
44647 /home/dups/1/data3.txt
44647 /home/dups/1/data8.txt
44647 /home/dups/1/data4.txt
44647 /home/dups/1/data9.txt
44647 /home/dups/1/data2.txt
44647 /home/dups/1/data1.txt
44647 /home/dups/1/data7.txt
44647 /home/du