In [2]:
import pandas as pd
import shutil
from pathlib import Path

In [3]:
# Show the full contents of every cell when a DataFrame is displayed (no truncation of long values)
pd.options.display.max_colwidth = None

---

### Moving notebooks we want to keep

Export folder used below: `/Users/robbfournier/Desktop/E2_Migrations/Elsevier/logs/STexport2/artifacts`

In [8]:
# Root of the export; both artifact folders live directly underneath it.
# Change this one line to point the notebook at a different export.
EXPORT_DIR = '/Users/robbfournier/Desktop/E2_Migrations/Elsevier/logs/STexport2'

# Kept as plain strings (not Path objects) so existing string-based uses keep working.
# source_folder: the full, unfiltered export (the original `artifacts` folder after it is renamed).
# dest_folder: the filtered folder that receives only the selected notebooks.
source_folder = EXPORT_DIR + '/artifacts_old'
dest_folder = EXPORT_DIR + '/artifacts'

In [9]:
# Rename the exported `artifacts` folder to `artifacts_old` so the filtered copy built below
# can take over the name `artifacts`, which is the folder the import tool will use.
# source/dest read backwards for this one step: here dest_folder is moved *to* source_folder;
# afterwards notebooks are copied from source_folder back into dest_folder.

# Refuse to run if a previous backup is still present, so it is never merged into or overwritten.
if Path(source_folder).is_dir():
    raise FileExistsError(
        f"'{source_folder}' already exists: delete the folder artifacts_old before re-running this step"
    )

# Nothing to rename if the export is not where it is expected to be.
if not Path(dest_folder).is_dir():
    raise FileNotFoundError(
        f"'{dest_folder}' not found: make sure to have notebooks in a folder called artifacts"
    )

shutil.move(dest_folder, source_folder)

In [10]:
# Load the tab-separated list of notebooks to keep.
# The 'notebook' column is exposed as 'source' for disambiguation.
df = pd.read_csv('keep_notebooks.csv', sep='\t').rename(columns={'notebook': 'source'})

In [11]:
# Build the copy plan: one row per notebook, with 'source' and 'dest' as Path objects.
# '.dbc' is appended to each notebook name.
#
# Leading slashes are stripped before joining. pathlib discards the left-hand folder when the
# right-hand side is absolute, and plain string concatenation would glue a name that lacks a
# leading slash straight onto the folder name. Names that start with '/' resolve to exactly
# the same paths as before.
keep_files = df['source'].map(lambda name: name.lstrip('/') + '.dbc')

keep_df = pd.DataFrame({
    'source': keep_files.map(lambda rel: Path(source_folder) / rel),
    'dest': keep_files.map(lambda rel: Path(dest_folder) / rel),
})

In [12]:
# Copy every notebook listed in keep_df from its 'source' path to its 'dest' path.
# Destination directories are created on demand.
# A failure on one notebook is printed and recorded, and the loop carries on with the rest.

failed_copies = []  # source paths that could not be copied, kept for inspection afterwards

for src, dst in zip(keep_df['source'], keep_df['dest']):
    try:
        # Check the source first so no empty destination folders are created for missing notebooks
        if not src.is_file():
            raise FileNotFoundError(f"Source notebook not found: '{src}'")
        dst.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy(src, dst)
    except OSError as e:
        # Only file-system errors are expected here; anything else should surface as a real bug
        failed_copies.append(src)
        print(e)

counter = len(failed_copies)
print(f"{counter} files failed to copy")

[Errno 2] No such file or directory: '/Users/robbfournier/Desktop/E2_Migrations/Elsevier/logs/STexport2/artifacts_old/Users/d.mcbeath@elsevier.com/E2Migration/Migrate/Groups.dbc'
[Errno 2] No such file or directory: '/Users/robbfournier/Desktop/E2_Migrations/Elsevier/logs/STexport2/artifacts_old/Users/d.mcbeath@elsevier.com/TERMite/Concordancer.dbc'
[Errno 2] No such file or directory: '/Users/robbfournier/Desktop/E2_Migrations/Elsevier/logs/STexport2/artifacts_old/rads/projects/2022_AUS_ARC/PlumX/Match_with_PlumX_Data.dbc'
[Errno 2] No such file or directory: '/Users/robbfournier/Desktop/E2_Migrations/Elsevier/logs/STexport2/artifacts_old/rads/projects/2022_AUS_ARC/TRO analysis/stats_paul.dbc'
[Errno 2] No such file or directory: '/Users/robbfournier/Desktop/E2_Migrations/Elsevier/logs/STexport2/artifacts_old/rads/projects/2022_AUS_ARC/Preliminary report (30 Sept. 2022)/ARC_Indexing_30sept.dbc'
[Errno 2] No such file or directory: '/Users/robbfournier/Desktop/E2_Migrations/Elsevier/lo

In [13]:
# Verify the copy by checking that every destination path in keep_df now exists as a file.
# Checking each expected path (instead of comparing a recursive count of .dbc files with the
# number of rows) stays correct when dest_folder also holds other .dbc files or when keep_df
# lists a notebook twice, and it records exactly which notebooks are missing.
missing_notebooks = sorted({dest for dest in keep_df['dest'] if not dest.is_file()})

# Every .dbc file currently under dest_folder (all subfolders), kept for ad-hoc inspection
all_notebooks = [x for x in Path(dest_folder).rglob('*.dbc') if x.is_file()]

if not missing_notebooks:
    print('All notebooks copied successfully')
else:
    print(f'{len(missing_notebooks)} notebooks were not copied')

13 notebooks were not copied


---

### Archiving Notebooks

In [20]:
# Archived notebooks go in an 'Archive' subfolder of the filtered artifacts folder (dest_folder).
# Because it sits inside dest_folder, any recursive search of dest_folder for .dbc files will
# also pick up the archived notebooks once they have been copied.
archive_folder = dest_folder + "/Archive"

In [16]:
# Load the tab-separated list of notebooks to archive.
# The 'notebook' column is exposed as 'source' for disambiguation.
df = pd.read_csv('archive_notebooks.csv', sep='\t').rename(columns={'notebook': 'source'})

In [17]:
# Build the archive copy plan: one row per notebook, with 'source' and 'dest' as Path objects.
# '.dbc' is appended to each notebook name.
#
# Leading slashes are stripped before joining. pathlib discards the left-hand folder when the
# right-hand side is absolute, and plain string concatenation would glue a name that lacks a
# leading slash straight onto the folder name. Names that start with '/' resolve to exactly
# the same paths as before.
archive_files = df['source'].map(lambda name: name.lstrip('/') + '.dbc')

archive_df = pd.DataFrame({
    'source': archive_files.map(lambda rel: Path(source_folder) / rel),
    'dest': archive_files.map(lambda rel: Path(archive_folder) / rel),
})

In [18]:
# Copy every notebook listed in archive_df from its 'source' path to its 'dest' path.
# Destination directories are created on demand.
# A failure on one notebook is printed and recorded, and the loop carries on with the rest.

failed_archive_copies = []  # source paths that could not be copied, kept for inspection afterwards

for src, dst in zip(archive_df['source'], archive_df['dest']):
    try:
        # Check the source first so no empty destination folders are created for missing notebooks
        if not src.is_file():
            raise FileNotFoundError(f"Source notebook not found: '{src}'")
        dst.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy(src, dst)
    except OSError as e:
        # Only file-system errors are expected here; anything else should surface as a real bug
        failed_archive_copies.append(src)
        print(e)

counter = len(failed_archive_copies)
print(f"{counter} files failed to copy")

0 files failed to copy


In [19]:
# Verify the archive copy by checking that every destination path in archive_df now exists as a file.
# Checking each expected path (instead of comparing a recursive count of .dbc files with the
# number of rows) stays correct when archive_folder also holds other .dbc files or when
# archive_df lists a notebook twice, and it records exactly which notebooks are missing.
missing_archived_notebooks = sorted({dest for dest in archive_df['dest'] if not dest.is_file()})

# Every .dbc file currently under archive_folder (all subfolders), kept for ad-hoc inspection
all_archived_notebooks = [x for x in Path(archive_folder).rglob('*.dbc') if x.is_file()]

if not missing_archived_notebooks:
    print('All archived notebooks copied successfully')
else:
    print(f'{len(missing_archived_notebooks)} notebooks were not copied')

All archived notebooks copied successfully
