In [2]:
# packages 

import os
import glob
import pandas as pd
import PyPDF2
import numpy as np

import re
import datetime

import shutil


In [3]:

# function to mkdir without overwriting folders w/ same name
def create_folder_safe(path, folder_name):
    """Create a new folder under `path` without reusing an existing one.

    If `path/folder_name` already exists, " (1)", " (2)", ... is appended to
    the original name until an unused path is found.

    Parameters
    ----------
    path : str
        Parent directory in which the folder is created.
    folder_name : str
        Desired name of the new folder.

    Returns
    -------
    str
        Path of the folder that was actually created.
    """
    base_path = os.path.join(path, folder_name)
    folder_path = base_path
    suffix = 1
    while os.path.exists(folder_path):
        # always build the candidate from the unsuffixed base path; appending to
        # the previous candidate would stack suffixes ("name (1) (2)")
        folder_path = base_path + " (" + str(suffix) + ")"
        suffix += 1

    os.makedirs(folder_path)
    return folder_path

# function for cleaning text (remove periods, slashes, etc.)
def clean_text(in_text):
    """Return `in_text` with each run of disallowed characters collapsed to '-'.

    Letters, digits, underscores and hyphens are kept; every maximal run of any
    other characters (periods, slashes, spaces, ...) becomes a single hyphen.
    """
    disallowed_run = re.compile('[^a-zA-Z0-9_-]+')
    cleaned = disallowed_run.sub('-', in_text)
    return cleaned


In [4]:
# Loop through pdfs: read each report's text, pull out the completion date,
# name, identifier and assessment name, and collect them into one data frame
# (one row per PDF).

data_folder = 'C:/Users/LIU7TV/Desktop/in_folder'
file_path_list = glob.glob(os.path.join(data_folder, '*.pdf'))


def read_pdf_text(pdf_file):
    """Return the text of every page of an open (binary-mode) PDF file as one string."""
    # PyPDF2 3.0 removed PdfFileReader / getNumPages / getPage / extractText;
    # use the renamed API when the installed version provides it and fall back
    # to the old names otherwise.
    if hasattr(PyPDF2, 'PdfReader'):
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        return ''.join(page.extract_text() for page in pdf_reader.pages)

    pdf_reader = PyPDF2.PdfFileReader(pdf_file)
    return ''.join(pdf_reader.getPage(page_num).extractText()
                   for page_num in range(pdf_reader.getNumPages()))


def parse_report_date(date_str):
    """Parse an 'm/d/yyyy' or 'm/d/yy' string into a `datetime.date`.

    Raises ValueError when the string matches neither layout.
    """
    # the date regex used below accepts 2-digit years, which "%Y" alone rejects;
    # "%y" follows the usual strptime pivot (00-68 -> 20xx, 69-99 -> 19xx)
    year_text = date_str.rsplit('/', 1)[-1]
    date_format = "%m/%d/%y" if len(year_text) == 2 else "%m/%d/%Y"
    return datetime.datetime.strptime(date_str, date_format).date()


# empty lists to make data frame
date_list = []
name_list = []
identifier_list = []
filename_list = []
assessment_list = []


# loop through PDF file paths
for file_name in file_path_list:

    filename_list.append(file_name)

    # read pdf into one string; the file is only needed while extracting text
    with open(file_name, 'rb') as pdf_file:
        text = read_pdf_text(pdf_file)

    # date
    match_date = re.search(r"Date Assessment Completed:(\d{1,2}/\d{1,2}/\d{2,4})Date Report Generated", text)
    if not match_date:
        raise Exception('Date not found in PDF')
    date = parse_report_date(match_date.group(1))
    date_list.append(date)

    # name
    match_name = re.search(r"Name:(.*?)Date of Birth", text)
    if not match_name:
        raise Exception('Name not found in PDF')
    name = match_name.group(1)
    name_split = re.split(", ", name)
    if len(name_split) == 2:
        # a two-part "A, B" value is stored as "BA" (no separator)
        name_edited = name_split[1] + name_split[0]
    else:
        name_edited = name
    name_list.append(name_edited)

    # identifier
    match_identifier = re.search(r"Identifier:(.*?)Name", text)
    if not match_identifier:
        raise Exception('Identifier not found in PDF')
    identifier = match_identifier.group(1)
    identifier_list.append(identifier)

    # assessment name
    match_assessment = re.search(r"Assessment Name:(.*?)Date Assessment Completed", text)
    if not match_assessment:
        raise Exception('Assessment list not found in PDF')
    assessment = match_assessment.group(1)
    assessment_list.append(assessment)


# one row per PDF; the lists stay aligned because any parsing failure above
# raises and aborts the loop
df_pdf = pd.DataFrame({'date': date_list, 
                       'initials': name_list, 
                       'id': identifier_list, 
                       'assessment_name': assessment_list,
                       'pdf_file_path': filename_list})

print(np.shape(df_pdf))

(0, 5)


In [5]:
# check csvs for every id: for each PDF row, create an output folder, copy the
# PDF into it, then copy every csv whose PID and AssessmentName columns contain
# that PDF's identifier and assessment name

# output folder
out_folder = 'C:/Users/LIU7TV/Desktop/out_folder'

# input folder 
data_folder = 'C:/Users/LIU7TV/Desktop/in_folder'
file_path_list = glob.glob(os.path.join(data_folder, '*.csv'))

# PID / AssessmentName columns of each csv as text, filled on first use so each
# csv is parsed once instead of once per PDF
csv_text_columns = {}


# loop through df of pdfs
for index, row in df_pdf.iterrows():

    # get info from df 
    cur_pdf_file_path = row['pdf_file_path']
    cur_id = row['id']
    cur_initials = row["initials"]
    cur_assessment = row['assessment_name']
    cur_date = row['date'].strftime("%m-%d-%Y")

    # create new folder and copy pdf over
    new_folder_name = cur_initials + "__" + cur_date + "__" + cur_id + "__" + cur_assessment
    new_folder_name = clean_text(new_folder_name)

    new_folder_path = create_folder_safe(path = out_folder, 
                                         folder_name = new_folder_name)
    shutil.copy(cur_pdf_file_path, new_folder_path)

    # loop through csv file paths
    for file_name in file_path_list:

        # get current csv (from the cache after its first read)
        if file_name not in csv_text_columns:
            cur_df = pd.read_csv(file_name)
            csv_text_columns[file_name] = (cur_df['PID'].astype(str),
                                           cur_df['AssessmentName'].astype(str))
        pid_text, assessment_text = csv_text_columns[file_name]

        # check if df matches PID and assessment from current PDF.
        # regex=False: the values are literal text, so characters such as
        # "(" or "+" must not be interpreted as a regular expression
        match_id = pid_text.str.contains(cur_id, regex=False).any()
        match_assessment = assessment_text.str.contains(cur_assessment, regex=False).any()

        if match_id and match_assessment:
            shutil.copy(file_name, new_folder_path)

     


In [6]:
# check csvs for every id, comparing assessment names with spaces removed on
# both sides: for each PDF row, create an output folder, copy the PDF into it,
# then copy every csv whose PID and AssessmentName columns match that PDF
#
# NOTE: the cell above builds the same folder names in the same out_folder, so
# running both cells creates a second, suffixed folder for every PDF.

# output folder
out_folder = 'C:/Users/LIU7TV/Desktop/out_folder'

# input folder 
data_folder = 'C:/Users/LIU7TV/Desktop/in_folder'
file_path_list = glob.glob(os.path.join(data_folder, '*.csv'))

# PID / space-stripped AssessmentName columns of each csv as text, filled on
# first use so each csv is parsed once instead of once per PDF
csv_text_columns_nospace = {}


# loop through df of pdfs
for index, row in df_pdf.iterrows():

    # get info from df 
    cur_pdf_file_path = row['pdf_file_path']
    cur_id = row['id']
    cur_initials = row["initials"]
    cur_assessment = row['assessment_name']
    cur_date = row['date'].strftime("%m-%d-%Y")

    # assessment name as it is compared against the csvs (spaces removed)
    cur_assessment_nospace = cur_assessment.replace(' ', '')

    # create new folder and copy pdf over
    new_folder_name = cur_initials + "__" + cur_date + "__" + cur_id + "__" + cur_assessment
    new_folder_name = clean_text(new_folder_name)

    new_folder_path = create_folder_safe(path = out_folder, 
                                         folder_name = new_folder_name)
    shutil.copy(cur_pdf_file_path, new_folder_path)

    # loop through csv file paths
    for file_name in file_path_list:

        # get current csv (from the cache after its first read)
        if file_name not in csv_text_columns_nospace:
            cur_df = pd.read_csv(file_name)
            # cast to str before using .str so a non-text column does not raise
            csv_text_columns_nospace[file_name] = (
                cur_df['PID'].astype(str),
                cur_df['AssessmentName'].astype(str).str.replace(' ', '', regex=False))
        pid_text, assessment_text = csv_text_columns_nospace[file_name]

        # check if df matches PID and assessment from current PDF.
        # regex=False: the values are literal text, so characters such as
        # "(" or "+" must not be interpreted as a regular expression
        match_id = pid_text.str.contains(cur_id, regex=False).any()
        match_assessment = assessment_text.str.contains(cur_assessment_nospace, regex=False).any()

        if match_id and match_assessment:
            shutil.copy(file_name, new_folder_path)

     
