# Copy Files with No Duplicates

The goal of this notebook is to have two modes, such that two folders can be merged without duplicates:

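1. Mark-only mode: Given a directory, hash each file using MD5 and file size, recording the result in the hash table without copying anything
2. Copy mode: Copy each file not already in the hash table to another directory, adding it to the hash table as it goes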

This requires creating four pieces of functionality:

1. Serializing and deserializing a Python dict containing the data (hash plus size)
2. Being able to iterate through files
3. Being able to hash the file in question and extract its size
4. Being able to copy files from one directory to another while maintaining their relative directory structure

## Sources

* D:\Dropbox\Project Hub\Website\KieferFlaskSite\home\scripts\file_age_directory.py
* https://stackoverflow.com/questions/8858008/how-to-move-a-file-in-python#8858026
* https://stackoverflow.com/questions/1072569/see-if-two-files-have-the-same-content-in-python
* https://stackoverflow.com/questions/5787471/md5-and-sha-2-collisions-in-python
* https://stackoverflow.com/questions/3431825/generating-an-md5-checksum-of-a-file
* https://stackoverflow.com/questions/6773584/how-is-pythons-glob-glob-ordered

## Imports

In [None]:
import hashlib
import json
import os
import shutil
import time
from pathlib import Path

## Serializing and Deserializing the Hash Table

In [None]:
def load_hash_table(hash_table_path: str) -> dict:
    """Read a previously saved hash table from a JSON file.

    Args:
        hash_table_path: Path of the JSON file to read.

    Returns:
        The decoded JSON content, or an empty dict when no file exists at
        ``hash_table_path``.
    """

    try:
        f_json = open(hash_table_path, 'r')
    except FileNotFoundError:
        # No table saved yet: start from an empty one.
        return {}

    with f_json:
        return json.load(f_json)


def save_hash_table(hash_table_path: str, hash_table: dict):
    """Saves a hash table dict to a JSON file.

    The table is serialized in memory before the file is opened, so a table
    that cannot be encoded as JSON raises without touching an existing file.

    Args:
        hash_table_path: Path of the JSON file to write; an existing file is
            overwritten.
        hash_table: The hash table to store. It is written with sorted keys
            and an indent of 4.

    Raises:
        TypeError: If the table contains something ``json`` cannot encode.
    """

    # Encode first: opening with 'w' truncates the file immediately, so a
    # failure part-way through json.dump would destroy the previous table.
    serialized = json.dumps(hash_table, indent=4, sort_keys=True)

    try:
        with open(hash_table_path, 'w') as f_json:
            f_json.write(serialized)
    except FileNotFoundError:
        # Raised by open() when the parent directory does not exist.
        print('Could not create json file.')

## Hashing Files

In [None]:
def create_md5(file_path: str) -> str:
    """Create an MD5 hash of a file's contents."""
    
    hash_md5 = hashlib.md5()
    try:
        with open(file_path, 'rb') as f_to_hash:
            for chunk in iter(lambda: f_to_hash.read(4096), b''):
                hash_md5.update(chunk)
    except FileNotFoundError:
        return 'BADHASH'
    
    return hash_md5.hexdigest()


def hash_file(hash_table: dict, file_path_object) -> bool:
    """Record a file in the hash table and report whether its content is new.

    The table key is the file's MD5 hex digest immediately followed by its
    size in bytes. Each entry stores the digest, the size (as a string) and
    the list of file names seen with that content.

    Args:
        hash_table: Table to look up and update in place.
        file_path_object: Path-like object for the file; it must convert to a
            path with ``str()`` and expose a ``name`` attribute.

    Returns:
        True when a new entry was created, False when the content was already
        present (the file name is then added to the existing entry if it is
        not listed yet).
    """

    path_text = str(file_path_object)
    size_text = str(os.path.getsize(path_text))
    base_name = file_path_object.name
    digest = create_md5(path_text)
    key = f'{digest}{size_text}'

    if key not in hash_table:
        # First time this content is seen: create its entry.
        hash_table[key] = {
            'md5': digest,
            'size': size_text,
            'names': [base_name]
        }
        return True

    # Known content: only remember the extra name it was found under.
    known_names = hash_table[key]['names']
    if base_name not in known_names:
        known_names.append(base_name)
    return False

## Moving Files

In [None]:
def move_files_relative(old_dir_path: str, new_dir_path: str, file_path: str):
    """Copies a file into another directory, keeping its relative file structure.

    Despite the name, the source file is left in place: it is copied with
    ``shutil.copyfile``. Paths that are not existing regular files are ignored.

    Args:
        old_dir_path: Directory that ``file_path`` lives under.
        new_dir_path: Directory that receives the copy.
        file_path: Path of the file to copy.

    Raises:
        ValueError: If ``file_path`` is not located inside ``old_dir_path``.
    """

    if not os.path.isfile(file_path):
        return

    # relpath strips only the leading directory. The previous str.replace
    # removed every occurrence of the directory string and produced a wrong
    # destination when old_dir_path ended with a separator.
    relative_path = os.path.relpath(file_path, old_dir_path)
    if relative_path == os.pardir or relative_path.startswith(os.pardir + os.sep):
        raise ValueError(f'{file_path} is not inside {old_dir_path}')

    new_path = os.path.join(new_dir_path, relative_path)
    parent_dir = os.path.dirname(new_path)
    if parent_dir:
        # makedirs('') raises, so only create a parent when there is one.
        os.makedirs(parent_dir, exist_ok=True)
    shutil.copyfile(file_path, new_path)

## Iterating Through Files

In [None]:
def get_earliest_age(file_path_object) -> int:
    """Return the smaller of a path's ``getmtime`` and ``getctime`` values.

    Args:
        file_path_object: Path (or path-like object) to inspect; it is
            converted with ``str()`` before use.

    Returns:
        The earlier of the two timestamps as reported by ``os.path`` (a float
        number of seconds), or 0 when the path does not exist.
    """

    path_text = str(file_path_object)
    try:
        modified = os.path.getmtime(path_text)
        changed = os.path.getctime(path_text)
    except FileNotFoundError:
        # Missing paths get the smallest possible age.
        return 0

    return min(modified, changed)

In [None]:
def hash_files_in_directory(hash_table: dict, dir_path: str):
    """Add every file found under a directory to the hash table.

    Entries are visited recursively, ordered by ``get_earliest_age``;
    directories are skipped. One line is printed per file, followed by a
    summary of totals.

    Args:
        hash_table: Table updated in place by ``hash_file``.
        dir_path: Directory to walk.
    """

    new_count = 0
    repeat_count = 0

    candidates = list(Path(dir_path).glob('**/*'))
    candidates.sort(key=get_earliest_age)

    for candidate in candidates:
        if os.path.isdir(str(candidate)):
            continue

        if not hash_file(hash_table, candidate):
            repeat_count += 1
            print(f'Hashed, Duplicate: {candidate}')
            continue

        new_count += 1
        print(f'Hashed: {candidate}')

    print(f'\n{new_count + repeat_count} total files, {new_count} new hashes, {repeat_count} duplicates')


def hash_and_copy_files_in_directory(hash_table: dict, old_dir_path: str, new_dir_path: str):
    """Hashes all files in a directory and copies the ones not seen before.

    Entries under ``old_dir_path`` are visited recursively, ordered by
    ``get_earliest_age``; directories are skipped. A file whose hash is new is
    copied to the same relative location under ``new_dir_path``. A file that
    raises ``FileNotFoundError`` while being hashed or copied is reported as a
    failure and is not left in the hash table.

    Args:
        hash_table: Table updated in place by ``hash_file``.
        old_dir_path: Directory to read files from.
        new_dir_path: Directory that receives the copies.
    """

    hashed = 0
    duplicate = 0
    failures = []

    file_paths = sorted(Path(old_dir_path).glob('**/*'), key=get_earliest_age)
    for file_path in file_paths:
        if os.path.isdir(str(file_path)):
            continue

        newly_hashed = False
        try:
            newly_hashed = hash_file(hash_table, file_path)
            if newly_hashed:
                move_files_relative(old_dir_path, new_dir_path, str(file_path))
        except FileNotFoundError:
            if newly_hashed:
                # The copy failed after hash_file inserted this file's entry.
                # That entry is the most recently added key, so popitem()
                # removes exactly it. Otherwise a later run would treat the
                # never-copied file as a duplicate and skip it.
                hash_table.popitem()
            print(f'FAILED: {file_path}')
            failures.append(str(file_path))
            continue

        if newly_hashed:
            print(f'Hashed, Moved: {file_path}')
            hashed += 1
        else:
            print(f'Hashed, Duplicate: {file_path}')
            duplicate += 1

    fail_count = len(failures)
    print(f'\n{hashed + duplicate + fail_count} total files, {hashed} moved, {duplicate} duplicates')
    if failures:
        print(f'\n{fail_count} files failed:')
        for fail in failures:
            print(f'* {fail}')

## Run

In [None]:
def mark_files():
    """Prompt for a hash table file and a directory, then hash that directory.

    The table is loaded from the given file, updated with every file in the
    source directory, and written back to the same file.
    """

    table_path = input('Hash table file: ')
    directory = input('Source directory: ')

    print('\nProcessing...\n')

    table = load_hash_table(table_path)
    hash_files_in_directory(table, directory)
    save_hash_table(table_path, table)

    print('\nDone!')


def copy_files():
    """Prompt for a hash table file and two directories, then hash and copy.

    The table is loaded from the given file, every file in the source
    directory is hashed, files not already in the table are copied to the
    destination directory, and the table is written back to the same file.
    """

    table_path = input('Hash table file: ')
    source = input('Source directory: ')
    destination = input('Destination directory: ')

    print('\nProcessing...\n')

    table = load_hash_table(table_path)
    hash_and_copy_files_in_directory(table, source, destination)
    save_hash_table(table_path, table)

    print('\nDone!')

    
def main():
    """Ask which mode to run and dispatch to it.

    The mode is matched case-insensitively. ``mark`` asks for a ``yes``
    confirmation before running, ``copy`` runs immediately, and any other
    answer calls ``debug()``.
    """

    print('Copy files with no duplicates!  Please do not use relative file paths and do not include trailing slashes.')
    choice = input('Mode (MARK, COPY, DEBUG): ').lower()
    print('')

    if choice == 'copy':
        copy_files()
    elif choice == 'mark':
        answer = input('Are you sure you want to do MARK mode, and not COPY? ')
        if answer.lower() == 'yes':
            mark_files()
    else:
        debug()

Example inputs (hash table files and directories):

```
D:\Dropbox\Project Hub\Game_and_Programming_Tutorials\Python File Manipulation\Armistice_HT.json
D:\Dropbox\Project Hub\Game_and_Programming_Tutorials\Python File Manipulation\Armistice
D:\Dropbox\Project Hub\Game_and_Programming_Tutorials\Python File Manipulation\Armistice_New

phone_backups.json
D:\Media\Phone Backups\LG G5
C:\Media\Phone Backups Temp\Deduplicated Phone Backups\LG G5
D:\Media\Phone Backups\From 32 GB SD Card
C:\Media\Phone Backups Temp\Note 9 2022-02-06
D:\Media\Phone Backups\Note 9\From SD Card 2020-11-09
C:\Media\Straggler Files 2021-03-27
```

In [None]:
def debug():
    """Placeholder for the DEBUG mode; currently does nothing.

    ``main()`` calls this for any mode other than MARK or COPY. The copy
    experiment that used to follow the early return was unreachable and
    duplicated ``move_files_relative``, so it has been removed.

    Returns:
        Always False.
    """

    return False


# Start the interactive prompt only when this file is executed directly (as a
# script or a notebook cell), not when it is imported for its functions.
if __name__ == '__main__':
    main()

In [None]:
# Scratch cell: prints the literal string 'test' (presumably a leftover sanity
# check; nothing else in this file depends on it).
print('test')