In [1]:
# Imports used by the cells below, followed by mounting Google Drive.
from google.colab import drive
from pathlib import Path
from collections import defaultdict
import pandas as pd
from datetime import datetime
# Mount Google Drive at /content/drive; every folder path used later starts there.
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import glob
import os
import time
import datetime as dt


"""
Current limitations:

Does not check whether the date in a filename is a valid date; a name in the
form YYYY-DD-MM, for example, is still treated as correctly named.

Uses mtime, so files are named with the date they were last modified, not the
date they were created. This can be changed to use ctime, which names them with
the date that the file path was created. That is sometimes the date the file
was created, but if the file has been renamed or moved it is the date of the
rename/move instead.
"""

def rename_incorrect_files(path):
    """
    Renames files in a specified folder to be in the format YYYY-MM-DD Filename.

    Every file under the folder (searched recursively) whose name does not
    already start with a date-shaped prefix is renamed in place by prepending
    its last-modified date and a space.

    Files are processed on a best-effort basis: a file whose date cannot be
    read, whose target name already exists, or whose rename fails is reported
    and skipped, and the remaining files are still processed.

    Inputs: path: String of the path of the folder to rename files in.
    """
    incorrect_files = retrieve_incorrect_filenames(path)
    for file in incorrect_files:
        try:
            date_prefix = find_date_modified(file)       # can switch find_date_modified for find_date_path_created
        except OSError as error:
            print("Skipping {}: could not read its date ({})".format(file, error))
            continue
        if not date_prefix:
            # find_date_modified signals failure by returning None.
            print("Skipping {}: could not read its date".format(file))
            continue

        # Build the new name for THIS file only, so a failure on one file can
        # never reuse the target name computed for a previous file.
        folder, filename = os.path.split(file)
        new_filename = os.path.join(folder, date_prefix + " " + filename)

        # os.rename silently replaces an existing file on POSIX systems, so
        # refuse to rename onto a name that is already taken.
        if os.path.exists(new_filename):
            print("Skipping {}: {} already exists".format(file, new_filename))
            continue

        try:
            os.rename(file, new_filename)
        except OSError as error:
            print("Skipping {}: rename failed ({})".format(file, error))

def retrieve_incorrect_filenames(path):
    """
    Collects the files in a folder whose names lack the expected date prefix.

    The result is every file found under the folder minus those that the
    date-prefix search reports as correctly named.

    Inputs: path: String of the path of folder to focus on.
    Outputs: Set of paths to incorrectly named files.
    """
    every_file = retrieve_all_filenames(path)
    dated_files = retrieve_correct_filenames(path)
    return set(every_file).difference(dated_files)

def find_date_modified(path):
    """
    Finds the date a specified file was last modified.

    Input: path: String of the path to the file to focus on.
    Output: String of the date the file was last modified, in the form
            YYYY-MM-DD. The timestamp is interpreted in UTC. Returns None if
            the modification time cannot be read (for example the file does
            not exist) or cannot be converted to a date.
    """
    try:
        seconds_since_epoch = os.path.getmtime(path)
        # Timezone-aware equivalent of the deprecated datetime.utcfromtimestamp.
        modified = dt.datetime.fromtimestamp(seconds_since_epoch, tz=dt.timezone.utc)
    except (OSError, OverflowError, ValueError):
        # Best effort: callers treat None as "date unavailable".
        return None
    return modified.strftime("%Y-%m-%d")


def find_date_path_created(path):     # Currently unused
    """
    Finds the date reported by the file's ctime (os.path.getctime).

    Input: path: String of the path to the file to focus on.
    Output: String of the ctime date in the form YYYY-MM-DD. The timestamp is
            interpreted in UTC.
    Raises: OSError if the file does not exist or cannot be accessed.
    """
    seconds_since_epoch = os.path.getctime(path)
    # Timezone-aware equivalent of the deprecated datetime.utcfromtimestamp.
    created = dt.datetime.fromtimestamp(seconds_since_epoch, tz=dt.timezone.utc)
    return created.strftime("%Y-%m-%d")

def retrieve_correct_filenames(path):
    """
    Retrieves a list of paths to correctly named files in a specified folder.

    A name counts as correct when it starts with a ????-??-?? shaped prefix
    (four characters, hyphen, two characters, hyphen, two characters). The
    characters are not checked to be digits or to form a valid date. The
    search is recursive, and directories whose names match are returned too.

    Input: path: String of the path of folder to focus on.
    Output: correct_files: List of paths to correctly named files.
    """
    # Escape the folder path so characters such as [ ] * ? in folder names are
    # matched literally instead of being interpreted as glob wildcards.
    full_path = glob.escape(path) + "/**/????-??-??*"
    correct_files = glob.glob(full_path, recursive=True)
    return correct_files

def retrieve_all_filenames(path):
    """
    Retrieves a list of paths to all files in a specified folder.

    The folder is searched recursively; directories are excluded from the
    result, so only regular files (or links to them) are returned.

    Inputs: path: String of the path of folder to focus on.
    Output: all_files: List of paths to all files.
    """
    # Escape the folder path so characters such as [ ] * ? in folder names are
    # matched literally instead of being interpreted as glob wildcards.
    full_path = glob.escape(path) + "/**"
    all_files = [
        file
        for file in glob.glob(full_path, recursive=True)
        if os.path.isfile(file)
    ]
    return all_files


def check_file_path_length(path, base_depth=6):
    """
    Builds a table of every file under a folder together with its path depth.

    Inputs: path: String of the path of folder to focus on.
            base_depth: Number of leading path levels to subtract from each
                        file's ancestor count. Defaults to 6, the value that
                        was previously hard-coded.
    Output: file_path_df: DataFrame with a 'File Path' column (the file paths)
            and a 'Path Length' column (len(Path(file).parents) - base_depth).
    """
    files = retrieve_all_filenames(path)
    # Number of ancestor directories of each file, relative to base_depth.
    path_lengths = [len(Path(file).parents) - base_depth for file in files]
    file_path_df = pd.DataFrame({'File Path': files, 'Path Length': path_lengths})
    return file_path_df

In [3]:
# Top-level folders whose files should receive a date prefix, processed in order.
folders_to_rename = (
    r"/content/drive/MyDrive/CO - GGMF GCG LIVE (Official Folder)/00 Documents for deletion",
    r"/content/drive/MyDrive/CO - GGMF GCG LIVE (Official Folder)/01 Closed folders from 2017",
    r"/content/drive/MyDrive/CO - GGMF GCG LIVE (Official Folder)/02 GGMF Programme delivery",
    r"/content/drive/MyDrive/CO - GGMF GCG LIVE (Official Folder)/03 GGMF Core Delivery",
    r"/content/drive/MyDrive/CO - GGMF GCG LIVE (Official Folder)/04 GCG Directorate Services",
    r"/content/drive/MyDrive/CO - GGMF GCG LIVE (Official Folder)/05 Business Services",
    r"/content/drive/MyDrive/CO - GGMF GCG LIVE (Official Folder)/06 Team Admin",
)
for folder_to_rename in folders_to_rename:
    rename_incorrect_files(folder_to_rename)

In [4]:
# Check file path length
# Build one table per top-level folder; each table has a 'File Path' and a
# 'Path Length' column (see check_file_path_length).
first_folder = check_file_path_length(r"/content/drive/MyDrive/CO - GGMF GCG LIVE (Official Folder)/02 GGMF Programme delivery")
second_folder =  check_file_path_length(r"/content/drive/MyDrive/CO - GGMF GCG LIVE (Official Folder)/03 GGMF Core Delivery")
third_folder =  check_file_path_length(r"/content/drive/MyDrive/CO - GGMF GCG LIVE (Official Folder)/04 GCG Directorate Services")
fourth_folder =  check_file_path_length(r"/content/drive/MyDrive/CO - GGMF GCG LIVE (Official Folder)/05 Business Services")
fifth_folder = check_file_path_length(r"/content/drive/MyDrive/CO - GGMF GCG LIVE (Official Folder)/06 Team Admin")

# Stack the per-folder tables, then reduce every file path to its containing
# folder by dropping the last "/"-separated component (the file name).
all_files = pd.concat([first_folder,second_folder,third_folder,fourth_folder,fifth_folder ])
all_files['File Path'] = all_files['File Path'].str.split('/').str[:-1].str.join('/')
# Number of distinct folders that contain at least one file; used as the
# denominator of the percentage printed below.
length_unique_file_paths = len(all_files['File Path'].unique())

# Keep only rows whose 'Path Length' is greater than 6, then print the
# percentage of distinct folders that those rows cover.
all_files = all_files[all_files['Path Length']>6]
print(len(all_files['File Path'].unique())*100/length_unique_file_paths)
# IPython magic: change the working directory so the CSV below is written
# into the transfer folder.
%cd '/content/drive/My Drive/Kehinde Owoeye - Transfer Folder'
# Save the filtered rows to a CSV whose name starts with today's date.
# NOTE(review): the index is not reset after pd.concat, so the CSV's index
# column presumably restarts for each source folder - confirm this is intended.
all_files.to_csv(datetime.today().strftime('%Y-%m-%d') + ' file_paths_lengths.csv')

15.826330532212886
/content/drive/My Drive/Kehinde Owoeye - Transfer Folder
