In [None]:
import os, csv
import pandas as pd
from pprint import pprint
from pathlib import Path
from datetime import datetime

def build_file_dataframe(chosenDir, DF=None, ignoreThumbs=True):
    '''Walk chosenDir recursively and return a DataFrame with one row per file.

    Columns: "Filepath", "File", "Name", "Extension", "Filesize", "Created",
    "Modified", "Retrieved", "Error".

    - "Filesize" is the size in bytes as a string, zero-padded to 15 characters
      so that sorting the strings matches sorting the numbers.
    - "Created" is formatted from st_ctime and "Modified" from st_mtime.
    - "Retrieved" is the time the row was built.
    - "Error" is None unless the file's metadata could not be read, in which
      case placeholder values are stored and "Error" holds a message.

    chosenDir    -- directory to walk
    DF           -- optional existing DataFrame with the same columns; new rows
                    are added after it and rows whose "Filepath" is already
                    present are dropped (the first occurrence wins)
    ignoreThumbs -- when True, skip files whose name up to the first '.' is
                    exactly "Thumbs" (e.g. Thumbs.db)
    '''
    dateFormat = "%m/%d/%Y, %H:%M:%S"

    def timestamp_to_date(timestamp):
        '''Format a timestamp (seconds since the epoch) as a local-time string.'''
        return datetime.fromtimestamp(timestamp).strftime(dateFormat)

    def file_data_to_list(root, file):
        '''Return the list of column values for the file named file in root.'''
        filePath = os.path.join(root, file)
        name, dot, extension = file.rpartition('.')
        if not dot:
            # No '.' in the filename: the whole filename is the name and
            # there is no extension (rpartition puts the text in the last slot).
            name, extension = file, ''
        extension = extension.lower()
        now = datetime.now().strftime(dateFormat)

        try:
            fileStats = os.stat(filePath)
            # str.zfill returns a new string; the result has to be kept.
            fileSize = str(fileStats.st_size).zfill(15)
            fileCreationTime = timestamp_to_date(fileStats.st_ctime)
            fileModifiedTime = timestamp_to_date(fileStats.st_mtime)
            error = None
        except (OSError, OverflowError, ValueError):
            # Unreadable metadata or an unrepresentable timestamp: keep the
            # row, fill it with placeholders and flag it in the Error column.
            fileSize = "123456789".zfill(15)
            fileCreationTime = now
            fileModifiedTime = now
            error = "error getting file metadata"
        retrieved = now
        return [filePath, file, name, extension, fileSize, fileCreationTime, fileModifiedTime, retrieved, error]

    fileList = []
    for root, dirs, files in os.walk(chosenDir):
        for file in files:
            if ignoreThumbs and file.split('.')[0] == "Thumbs":
                continue
            fileList.append(file_data_to_list(root, file))

    fileDF = pd.DataFrame(fileList,
                          columns=["Filepath", "File", "Name", "Extension", "Filesize", "Created", "Modified", "Retrieved", "Error"])
    if DF is not None:
        # DataFrame.append was removed in pandas 2.0; concat keeps the same
        # row order (existing rows first) and the same index labels.
        fileDF = pd.concat([DF, fileDF])
        fileDF.drop_duplicates(subset="Filepath", keep='first', inplace=True)
    return fileDF

def remove_chars_from_str(toRemove, someChars):
    '''Strips every character found in someChars out of toRemove.
    toRemove may be a single string (a string is returned) or a list of strings
    (a list of stripped strings is returned). For any other type an error line
    is printed and None is returned.'''
    def strip_chars(text):
        # mapping a code point to None makes translate() delete that character
        return text.translate({ord(ch): None for ch in someChars})

    if type(toRemove) is list:
        return [strip_chars(item) for item in toRemove]
    if type(toRemove) is str:
        return strip_chars(toRemove)
    print("Error: Wrong Type")

def splitall(path):
    '''breaks a path into a list of its components (root or mount point, each directory name, final file name), in order'''
    pieces = []
    while True:
        head, tail = os.path.split(path)
        if head == path:
            # nothing left to strip from an absolute path: head is the root
            pieces.append(head)
            break
        if tail == path:
            # nothing left to strip from a relative path: tail is the first component
            pieces.append(tail)
            break
        pieces.append(tail)
        path = head
    # components were gathered right-to-left
    pieces.reverse()
    return pieces

def get_size(start_path = '.'):
    '''Returns the total size in bytes of all files under start_path (recursive).
    Symbolic links are not counted. A file whose size cannot be read is reported
    on stdout, together with the length of its path, and contributes nothing to
    the total.'''
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(start_path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            # skip if it is symbolic link
            if os.path.islink(fp):
                continue
            try:
                total_size += os.path.getsize(fp)
            except OSError:
                # only filesystem errors are expected here; anything else
                # (e.g. KeyboardInterrupt) should propagate
                print("Error with" + str(fp))
                pathLen = str(len(str(fp)))
                print("Path Length is " + pathLen)
    return total_size

def csv_for_use(csvDF, recordsMount, destination):
    '''Adds a "Records Drive Loc" column to csvDF, trims it and writes it to destination as CSV.

    For every row, the directory part of "Filepath" is passed together with
    recordsMount to convert_backup_path and the result is stored in the new
    column. The "Name", "Extension", "Retrieved" and "Error" columns are then
    dropped and the frame is written to destination without the index.

    NOTE: csvDF itself is modified in place; the returned frame is the same object.
    NOTE(review): convert_backup_path is not defined in the visible part of this
    file - confirm it is available before calling.
    '''
    resultsDF = csvDF

    recordsColumn = []
    for missingFile in resultsDF["Filepath"]:
        # everything except the last path component, i.e. the containing directory
        missingFileDirPath = splitall(missingFile)[:-1]
        dirLoc = os.path.join(*missingFileDirPath)
        recordsColumn.append(convert_backup_path(dirLoc, recordsMount))

    # Assign the plain list so values are placed by position. Wrapping it in
    # pd.Series would align on index labels and produce NaN / misplaced values
    # whenever the frame's index is not 0..n-1 (e.g. after filtering).
    resultsDF["Records Drive Loc"] = recordsColumn
    resultsDF.drop(["Name", "Extension", "Retrieved", "Error" ], axis = 1, inplace = True)
    resultsDF.to_csv(destination, index=False, quoting=csv.QUOTE_NONNUMERIC)
    return resultsDF

def establish_csv(defaultName, columnNamesList):
    '''Asks the user whether to use defaultName as the csv file (otherwise lets them pick one),
    makes sure that file exists in the current working directory and returns its full path.
    A file that does not exist yet is created holding only the header row given by columnNamesList.'''
    question = "Use %s?" % defaultName
    chosenName = defaultName if user_chooses_yes(question) else user_csv_choice()
    targetPath = os.path.join(os.getcwd(), chosenName)

    if os.path.isfile(targetPath):
        return targetPath

    # no such file yet: write an empty, header-only csv
    headerOnly = pd.DataFrame(columns=columnNamesList)
    headerOnly.to_csv(targetPath, index=False, quoting=csv.QUOTE_NONNUMERIC)
    return targetPath

def user_chooses_yes(promptText):
    '''keeps prompting the user with promptText until a recognised yes/no answer is typed;
    returns 'True' for yes and 'False' for no'''
    affirmative = ('yes', 'y', 'Yes', 'Y')
    negative = ('No', 'no', 'n', 'N')
    while True:
        answer = input(promptText)
        if answer in negative:
            return False
        if answer in affirmative:
            return True

In [None]:
# Root directory whose files are inventoried by the cells below.
kresgeDir = r"R:\32xx   Kresge College (College 6)\3238"
# NOTE(review): csvPath is not read anywhere in the visible cells - confirm whether it is still needed.
csvPath = ""
# One row of metadata per file under kresgeDir (Thumbs files are skipped by default).
filesDF = build_file_dataframe(kresgeDir)
# (number of files found, number of columns)
print(filesDF.shape)
#filesDF.drop_duplicates(subset=["Filepath", "Name"], keep='first', inplace=True)
    

In [None]:
#Dataframe of files with duplicate names

dupNamesCsvFile = "kresge_filename_duplicates.csv"
# keep=False marks every member of a duplicate group, not just the repeats.
# .copy() gives an independent frame, so the in-place sort/drop below do not
# act on a slice of filesDF (which raises SettingWithCopyWarning).
nameframe = filesDF[filesDF.duplicated('Name',keep=False)].copy()
# NOTE: 'Filesize' and 'Modified' are strings, so they sort lexicographically.
nameframe.sort_values(by=['File', 'Filesize', 'Modified'], inplace=True)
nameframe.drop(['Name', 'Error'], axis = 'columns', inplace = True)
# Ask which csv to use; the header list matches the columns left in nameframe.
duplicatesCsvPath = establish_csv(dupNamesCsvFile, ["Filepath", "File", "Extension", "Filesize", "Created", "Modified", "Retrieved"])
nameframe.to_csv(duplicatesCsvPath, index=False, quoting=csv.QUOTE_NONNUMERIC)

In [None]:
# Show (rows, columns) of the duplicate-name frame as the cell output.
nameframe.shape

In [None]:
def dir_attribute_list(dirPath):
    '''Returns [dirPath, dirName, filesNum, subDirsNum, size] for the directory dirPath.

    dirName    -- last component of dirPath
    filesNum   -- number of direct entries that are regular files
    subDirsNum -- number of remaining direct entries (everything that is not a file)
    size       -- total size in bytes of all files below dirPath, as reported by get_size
    '''
    # List the directory once so both counts come from the same snapshot.
    entries = os.listdir(dirPath)
    filesNum = sum(1 for f in entries if os.path.isfile(os.path.join(dirPath, f)))
    subDirsNum = len(entries) - filesNum
    dirName = splitall(dirPath)[-1]
    size = get_size(dirPath)
    return [dirPath, dirName, filesNum, subDirsNum, size]
    
# One attribute row per directory at or below kresgeDir.
dirList = []
for root, dirs, files in os.walk(kresgeDir, topdown=True):
    dirList.append(dir_attribute_list(root))

def no_duplicates(ii):
    '''Returns the items of ii with duplicates removed, keeping first-seen order.
    List items are unhashable, so they are compared through a tuple of their contents.'''
    unique = {}
    for item in ii:
        key = tuple(item) if isinstance(item, list) else item
        unique.setdefault(key, item)
    return list(unique.values())

# dict.fromkeys(dirList) would raise TypeError here because each row is a list.
dirList = no_duplicates(dirList)
len(dirList)

In [None]:
# Report every file whose full path is 260 characters or longer
# (presumably the Windows MAX_PATH limit - confirm the intended threshold).
pathTooLongCSV = "path_too_long.csv"
pathLengths = [len(ii) for ii in filesDF["Filepath"]]
# NOTE: this adds a "Path Length" column to filesDF itself.
filesDF["Path Length"] = pathLengths
# .copy() gives an independent frame, so the in-place drop/sort below do not
# act on a slice of filesDF (which raises SettingWithCopyWarning).
pathTooLongDF = filesDF[filesDF["Path Length"] >= 260].copy()
pathTooLongDF.drop(['Name', 'Error'], axis = 'columns', inplace = True)
# NOTE: 'Filesize' is a string column, so it sorts lexicographically.
pathTooLongDF.sort_values(['File', 'Filesize'], inplace = True)
# Header list matches the columns actually written, including "Path Length".
pathTooLongPath = establish_csv(pathTooLongCSV, ['Filepath', 'File', 'Extension', 'Filesize', 'Created',
       'Modified', 'Retrieved', 'Path Length'])
pathTooLongDF.to_csv(pathTooLongPath, index=False, quoting=csv.QUOTE_NONNUMERIC)
