### Data backup & sync and duplicate checker script for securing your files.
You don't have to buy expensive backup software for this purpose; just use this simple script.
You can with this script:
   * check for equal, modified, and missing files between 2 folders
        * the file comparison is based on filecmp library
           * it compares file type, size, and modification time; if all of these properties are identical for both files, the files are taken to be equal (otherwise the file contents are compared).
        * you can also specify a single source file and destination file instead of directories
        * you can specify multiple directory or file pairs
   * specify which operations to perform: copy new files, update modified files, delete files that are missing from the source
     * simulate the actions first, to see that everything works as expected
   * search for duplicated files and see their sizes and their duplicates

Planned functionalities
   * exclude files, directories option

improvements
   * so far it has only been tested with a single source/destination directory pair

<font color=red>WARNING: Use this script at your own risk, check the code and understand better before you use it!</font>

In [None]:
import os
import filecmp
import shutil
from pathlib import Path
import numpy as np
import pandas as pd
from IPython.display import display

def list_files_recursively(directory: str) -> list[str]:
    """Collect the path of every file found below a directory.

    Args:
        directory (str): root directory whose tree is walked

    Returns:
        list[str]: one path per file, each built by joining the walked
            folder with the file name
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(directory)
        for name in names
    ]

def show_progress(total_c: int, act_idx: int, p_step: int = 5) -> None:
    """Print the progress of a job whose total size is known in advance.

    The current percentage is printed only when it lies less than one
    iteration's worth of progress past a multiple of ``p_step``, so the
    output is throttled instead of being emitted on every iteration.

    Args:
        total_c (int): the total number of iterations in the progress
        act_idx (int): the actual step in the progress
        p_step (int, optional): The percent value, how frequently the script
            reports progress to the user. Defaults to 5.
    """
    # An empty job has no progress to report; this also avoids the
    # division by zero below.
    if total_c <= 0:
        return

    # percentage covered by a single iteration
    step = 1/total_c * 100
    # percentage reached so far
    p = (act_idx/total_c) * 100

    if p % p_step < step:
        print(f"{round(p,2)}%")

def proc_row(df_len: int, i: int, f1: str, f2: str) -> bool:
    """Report progress for one row, then compare its two files with filecmp.

    Args:
        df_len (int): total number of rows (file pairs) being processed
        i (int): index of the row currently being processed
        f1 (str): path of the first file
        f2 (str): path of the second file

    Returns:
        bool: the result of filecmp.cmp for the two files, True when they
            are taken to be equal, False otherwise
    """
    # progress first, so the percentage is printed before the comparison runs
    show_progress(df_len, i)

    are_equal = filecmp.cmp(f1, f2)
    return are_equal

def calc_files_size(file_list_np_arr: np.ndarray) -> list[float]:
    """Return the size in megabytes of every file in the given array.

    Args:
        file_list_np_arr (np.ndarray): file paths (values of a pandas
            DataFrame column); entries may be null

    Returns:
        list[float]: one size per entry (bytes / 1024 / 1024); null entries
            yield None
    """
    sizes_mb = []
    for file_path in file_list_np_arr:
        if pd.isnull(file_path):
            # no path -> no size
            sizes_mb.append(None)
        else:
            sizes_mb.append(Path(file_path).stat().st_size/1024/1024)
    return sizes_mb

def print_unique_dirs(df_filt: pd.DataFrame, action_str:str):
    """Shows the unique folders of the files that have a particular action
       Possible actions are COPY_NEW, UPDATE, DELETE

    For DELETE the folders are taken from the 'dst' column, for every other
    action from the 'src' column. Nothing is displayed when no row has the
    requested action.

    Args:
        df_filt (pd.DataFrame): the pandas DataFrame object to filter
        action_str (str): the calculated action to do with the files
    """
    col = 'dst' if action_str == "DELETE" else 'src'
    paths = df_filt.loc[df_filt['op'] == action_str, col]
    if len(paths) > 0:
        # The folders are computed on the side instead of being written into
        # the filtered frame as a new column, which triggered pandas'
        # SettingWithCopyWarning. dict.fromkeys removes repeats while keeping
        # the order of first appearance.
        unique_dirs = list(dict.fromkeys(os.path.dirname(f) for f in paths.values))
        display(unique_dirs)

def print_files(df_filt: pd.DataFrame, action_str: str):
    """Display the file paths of all rows that carry the given action
       Possible actions are COPY_NEW, UPDATE, DELETE

    Paths are read from the 'dst' column for DELETE and from the 'src'
    column for every other action.

    Args:
        df_filt (pd.DataFrame): the pandas DataFrame object to filter
        action_str (str): the calculated action to do with the files
    """
    if action_str == "DELETE":
        path_col = 'dst'
    else:
        path_col = 'src'

    has_action = df_filt['op'] == action_str
    display(df_filt.loc[has_action, path_col].tolist())

In [None]:
########################   Data backup & sync   #############################

# --- INPUTS

# Safety switches: an operation is only printed (simulated) unless its flag
# is set to True. DO IT CAREFULLY - enabled operations change the destination!
perform_copy_new = False
perform_update = False
perform_delete = False

# Source/destination pairs to compare; several pairs may be listed
# (multiple pairs should be tested more!)
dir_file_arr = [
    {
        'src': 'D:\\sample_source',
        'dst': 'F:\\sample_source_backup',
    },
]

In [None]:
# --- COLLECT SRC, DST FILE LISTS

def _swap_root(file_paths, old_root, new_root):
    """Re-root every path from old_root to new_root (leading prefix only)."""
    # Every path comes from walking old_root, so it starts with old_root.
    # Only that prefix is exchanged: a later occurrence of the same text
    # inside the path stays untouched, and an empty list is handled fine.
    return [new_root + fp[len(old_root):] for fp in file_paths]

files_src_arr = []          # files found on the source side
files_dst_arr = []          # files found on the destination side
dst_files_src_pathes = []   # for each dst file: where it would live in src
src_files_dst_pathes = []   # for each src file: where it would live in dst

# process directory, create file list
for pathes in dir_file_arr:

    # path is directory
    if os.path.isdir(pathes['src']):
        src_act = list_files_recursively(pathes['src'])
        files_src_arr += src_act
        dst_act = list_files_recursively(pathes['dst'])
        files_dst_arr += dst_act
        dst_files_src_pathes += _swap_root(dst_act, pathes['dst'], pathes['src'])
        src_files_dst_pathes += _swap_root(src_act, pathes['src'], pathes['dst'])

    # path is file
    else:
        # The counterpart lists must grow together with the file lists,
        # otherwise the DataFrames below get columns of unequal length.
        if os.path.exists(pathes['src']):
            files_src_arr.append(pathes['src'])
            src_files_dst_pathes.append(pathes['dst'])
        if os.path.exists(pathes['dst']):
            files_dst_arr.append(pathes['dst'])
            dst_files_src_pathes.append(pathes['src'])

# calculate missing files in both src and dst:
# the outer merge keeps files that exist on one side only (the other side is null)
df_src = pd.DataFrame({'src': files_src_arr, 'src_dst': src_files_dst_pathes}, dtype="str")
df_dst = pd.DataFrame({'dst': files_dst_arr, 'dst_src': dst_files_src_pathes}, dtype="str")
df = pd.merge(df_dst,df_src, how= "outer", left_on= ["dst","dst_src"], right_on= ["src_dst","src"])

df = df[['src', 'dst','src_dst']]
df

In [None]:
# --- PROCESSING

# calc src file sizes (no size is computed for rows whose src is missing)
df['src_size'] = calc_files_size(df['src'].values)

# compare files
# Only rows that have both a src and a dst file are compared. Missing values
# are detected with pd.notna rather than an identity test against np.nan,
# which only works when the missing value is that exact object.
df_len = len(df)
df['src_dst_ident'] = [
    proc_row(df_len, idx, src, dst) if pd.notna(src) and pd.notna(dst) else False
    for idx, (src, dst) in enumerate(zip(df['src'].values, df['dst'].values))
]

# calculate file operations
# The assignments are ordered on purpose: the later, more specific ones
# (file missing on one side) override the generic NONE/UPDATE decision.
df['op'] = '-'
df.loc[df['src_dst_ident'] == True, 'op'] = 'NONE'
df.loc[df['src_dst_ident'] == False, 'op'] = 'UPDATE'
df.loc[df['dst'].isnull(), 'op'] = 'COPY_NEW'
df.loc[df['src'].isnull(), 'op'] = 'DELETE'
# a new file gets its calculated destination path as dst
df.loc[df['dst'].isnull(), 'dst'] = df['src_dst']
df.sort_values(['op', 'src'], inplace=True)

# some stats about changes
print("--------------\n\n")
print(df['op'].value_counts()) # if shows '-', something is wrong

In [None]:
# ---SHOW DIRECTORIES AND FILES WITH PARTICULAR ACTIONS

# df is passed directly: df_filt is only created further down (PERFORM ACTIONS),
# and the helpers filter on the 'op' column themselves, so the output is the same.
#print_unique_dirs(df, 'COPY_NEW')
print_files(df, 'COPY_NEW')

In [None]:
# df is passed directly: df_filt is only created further down (PERFORM ACTIONS),
# and the helpers filter on the 'op' column themselves, so the output is the same.
#print_unique_dirs(df, 'UPDATE')
print_files(df, 'UPDATE')

In [None]:
# df is passed directly: df_filt is only created further down (PERFORM ACTIONS),
# and the helpers filter on the 'op' column themselves, so the output is the same.
#print_unique_dirs(df, 'DELETE')
print_files(df, 'DELETE')

In [None]:
# ---PERFORM ACTIONS

# only rows that need an operation
df_filt = df[(df['op'] != 'NONE') & (df['op'] != '-')]

# calc total sizes
copy_new_size_mb = df_filt[df_filt['op'] == 'COPY_NEW']['src_size'].sum()
update_size_mb = df_filt[df_filt['op'] == 'UPDATE']['src_size'].sum()
# DELETE rows have no src file and therefore no src_size, so the size of the
# dst files (the ones that would be removed) is measured instead.
delete_size_mb = sum(calc_files_size(df_filt.loc[df_filt['op'] == 'DELETE', 'dst'].values), 0.0)

print(f"COPY_NEW total size {copy_new_size_mb} Mib")
print(f"UPDATE total size {update_size_mb} Mib")
print(f"DELETE total size {delete_size_mb} Mib")

print(f"{len(df_filt)} actions total")

# Every operation is always printed; it is executed only when its
# perform_* switch is enabled (otherwise this is a simulation).
for _, r in df_filt.iterrows():

    op = r['op']
    dst_dir = os.path.dirname(r['dst'])

    if op == 'COPY_NEW':

        print(f"Processing COPY_NEW {r['src']}")
        print(f"Processing COPY_NEW {r['dst']}\n")

        if perform_copy_new:
            # create the dst dir including missing parent folders;
            # it is only created when the copy really happens
            if dst_dir:
                os.makedirs(dst_dir, exist_ok=True)
            shutil.copy(r['src'], r['dst'])

    elif op == 'UPDATE':
        print(f"Processing UPDATE {r['src']}")
        print(f"Processing UPDATE {r['dst']}\n")

        if perform_update:
            os.remove(r['dst'])
            shutil.copy(r['src'], r['dst'])

    elif op == 'DELETE':
        print(f"Processing DELETE {r['dst']}\n")

        if perform_delete:
            os.remove(r['dst'])

            # delete empty folder
            if dst_dir and not os.listdir(dst_dir):
                os.rmdir(dst_dir)

In [None]:
########################   duplicate checker   #############################

# folder to scan and the minimum file size (in megabytes) worth checking
path = "D:\\Gann"
mb_limit = 1

# read file list
files_arr = list_files_recursively(path)
df = pd.DataFrame({'file': files_arr}, dtype="str")

# calc file size
df['file_size'] = calc_files_size(df['file'].values)

# filter by size: keep only the files larger than the limit
is_above_limit = df['file_size'] > mb_limit
df = df[is_above_limit]
df

In [None]:
# --- COLLECT ALL DUPLICATES INTO STRUCTURE
# Every file is compared with every other file; for each file an entry
# {'file', 'size', 'dups'} is stored, where 'dups' lists its identical files.
duplicate_arr = []
files = df['file'].values
sizes = df['file_size'].values
df_len = len(df)
total = pow(df_len, 2)
print(f"Total elements to check: {total}, Analyzing...")
for i, f1 in enumerate(files):
    if pd.isna(f1):
        continue
    duplicate_arr.append({'file': f1, 'size': sizes[i], 'dups': []})
    for j, f2 in enumerate(files):
        show_progress(total, (i * df_len) + (j+1))
        # files of different size can never be duplicates, so skip them
        # without touching the disk
        if f1 == f2 or pd.isna(f2) or sizes[i] != sizes[j]:
            continue
        # shallow=False forces a content comparison; the default (shallow)
        # mode treats files with the same type, size and modification time
        # as equal without reading them, which can report false duplicates
        if filecmp.cmp(f1, f2, shallow=False):
            duplicate_arr[-1]['dups'].append(f2)
duplicate_arr

In [None]:
# create dict -> dataframe
# keep only the files that have at least one duplicate
duplicate_arr = [entry for entry in duplicate_arr if len(entry['dups']) > 0]
df_dup = pd.DataFrame(duplicate_arr, columns=['file', 'size', 'dups'])

# number of duplicates per file
dup_counts = [len(entry['dups']) for entry in duplicate_arr]
df_dup['dup_c'] = dup_counts

# biggest files first
df_dup.sort_values(['size'], ascending=False, inplace=True)
df_dup

In [None]:
# ---SHOW RESULTS WITH SIMPLE PRINT STATEMENT
# per file: its path, its size, then every duplicate found for it
rows = zip(df_dup['file'].values, df_dup['size'].values, df_dup['dups'].values)
for file_path, file_size, duplicates in rows:
    print(f"{file_path}\n")
    print(f"{file_size}\n\n")
    for duplicate_path in duplicates:
        print(duplicate_path)
    print("------------")