In [1]:
import os
import re
import traceback

import pandas as pd
from tqdm.notebook import tqdm

from utils.dataframe import df2gsheet, db2df, get_mapping, insert_status_in_mapping_gsheet

# catalog_text_processing funcs are imported in order of processing
from utils.catalog_text_processing import (find_department_font, 
                                           get_all_departments, 
                                           departments_update_with_page_ranges, 
                                           merge_duplicate_departments, 
                                           clear_junk_departments,
                                           get_department_text_slice,
                                           get_courses,                                            
                                          )

def regex_until_but_not_including(rgx):
    """Return a regex fragment that consumes characters up to the next match of `rgx`.

    The fragment is `(?:(?!rgx).)*`: it advances one character at a time while a
    negative lookahead confirms that `rgx` does not match at the current position,
    so the text matching `rgx` itself is never consumed.
    """
    return rf"(?:(?!{rgx}).)*"


# Non-greedy "anything" fragment, concatenated into the per-catalog course regex.
regex_everything_not_greedy = '.*?'

# DEBUG enables verbose prints of intermediate values in the scraping loop;
# TRACE includes formatted tracebacks in the reported error statuses.
DEBUG = False
TRACE = True

df_mapping = get_mapping()

# One boolean mask per selection criterion, combined below with `&`.
flagged_for_scraping = (df_mapping.scrape_me == 'TRUE') | (df_mapping.scrape_me)
has_first_department = df_mapping.first_department_name != ''
has_course_id_regex = df_mapping.regex_pattern_course_id != ''
not_done_yet = df_mapping.status != 'DONE'

df_mapping = df_mapping[
    flagged_for_scraping
    & has_first_department
    & has_course_id_regex
    & not_done_yet
    # Optional narrowing filters (currently disabled):
    # & (df_mapping.college == 'UCLA')
    # & (df_mapping.filename == 'UCLA_2017–18.pdf')
]

In [2]:
def _sheet_name(pdf_filename):
    """Return `pdf_filename` without a trailing '.pdf' extension.

    `str.strip('.pdf')` treats its argument as a *set of characters* and trims
    them from both ends (e.g. 'Stanford.pdf' -> 'Stanfor'), so the suffix is
    removed explicitly instead. Names that do not end in '.pdf' are returned
    unchanged.
    """
    suffix = '.pdf'
    if pdf_filename.endswith(suffix):
        return pdf_filename[:-len(suffix)]
    return pdf_filename


# For every catalog selected in `df_mapping`:
#   1. load the per-page data stored for the catalog,
#   2. detect the department-heading font on the first department page,
#   3. collect departments and their page ranges, then the courses of each one,
#   4. upload the courses and departments tables to Google Sheets,
#   5. record the outcome in the mapping sheet ('DONE' only if nothing failed).
for row in tqdm(df_mapping.itertuples(), total=len(df_mapping)):
    filename = row.filename
    college = row.college
    first_department_name = row.first_department_name
    avoid_matching_department_name = (row.avoid_first_department_name
                                      if row.avoid_first_department_name else None)

    # A course entry is: the course id, then (lazily) anything, then everything
    # up to - but not including - the next course id.
    regex_course_pattern = (row.regex_pattern_course_id
                            + regex_everything_not_greedy
                            + regex_until_but_not_including(row.regex_pattern_course_id))

    # A blank or non-numeric page cell must only fail this catalog, not the whole batch.
    try:
        first_page = int(row.first_department_page)
        last_page = int(row.last_department_page)
    except (TypeError, ValueError):
        insert_status_in_mapping_gsheet(
            f"ERROR ~ {filename} - invalid first/last department page", college, filename)
        continue

    if DEBUG:
        print(filename, college, regex_course_pattern, first_page, last_page,
              first_department_name, avoid_matching_department_name)

    # `except Exception` (rather than a bare `except:`) keeps Ctrl-C / kernel
    # interrupts working while still reporting every ordinary failure.
    try:
        data = db2df(college=college, filename=filename).pdfminer_detailed.iat[0]
    except Exception:
        insert_status_in_mapping_gsheet(f"ERROR ~ {filename} can't get db data", college, filename)
        continue

    try:
        # Indexing [0] raises IndexError when the first department page is missing.
        first_page_data = [page for page in data if page['page'] == first_page][0]
        # NOTE(review): this list is built but not used further down in this cell.
        pages_data_with_courses = [page for page in data
                                   if first_page <= page['page'] <= last_page]
    except Exception:
        insert_status_in_mapping_gsheet(
            f"ERROR ~ {filename} - can't find first page, or page range", college, filename)
        continue

    if DEBUG:
        print(filename, college, regex_course_pattern, first_page, last_page,
              first_department_name, avoid_matching_department_name, first_page_data)

    # Pre-bind so the error handler below never reads unbound names or values
    # left over from the previous catalog.
    department_font_sizes = department_font_family = paragraph_font_sizes = None
    # Set as soon as any error status is written, so it is not overwritten by 'DONE'.
    had_error = False

    try:
        (department_font_sizes,
         department_font_family,
         paragraph_font_sizes) = find_department_font(page=first_page_data,
                                                      department_name=first_department_name,
                                                      avoid=avoid_matching_department_name)
        if DEBUG:
            print(department_font_sizes, department_font_family, paragraph_font_sizes)

        departments = get_all_departments(data, department_font_sizes, department_font_family)
        # Sentinel entry (no name) carrying the last department page.
        departments.append((None, last_page))
        departments = departments_update_with_page_ranges(departments)
        departments = merge_duplicate_departments(departments)
        departments = clear_junk_departments(departments)
        if DEBUG:
            print(departments)

        department_courses = []
        for department_slice in get_department_text_slice(data, departments):
            # The slicer reports a failure by yielding the exception in place of the text.
            if isinstance(department_slice[1], Exception):
                had_error = True
                insert_status_in_mapping_gsheet(
                    f'ERROR ~ {filename} - while slicing text: {department_slice}', college, filename)
                continue
            try:
                for course in get_courses(department_slice[1], regex_course_pattern):
                    department_courses.append((department_slice[0], course))
            except Exception:
                had_error = True
                insert_status_in_mapping_gsheet(
                    f'ERROR ~ {filename} - while scraping course: {department_slice}', college, filename)
                if TRACE:
                    print(traceback.format_exc())

        df_dc = pd.DataFrame(department_courses, columns=['department_name', 'course'])
        df_dep = pd.DataFrame(departments, columns=['department_name', 'pages'])

    except Exception as e:
        trace_text = traceback.format_exc() if TRACE else ''
        debug_info = ((department_font_sizes, department_font_family, paragraph_font_sizes)
                      if DEBUG else '')
        insert_status_in_mapping_gsheet(
            f'ERROR ~ {filename} - MAIN BLOCK:  {e} {trace_text} {debug_info}', college, filename)
        continue

    # Both uploads are attempted independently; a failure of one is reported
    # without preventing the other.
    sheet_name = _sheet_name(filename)
    uploads = (
        (df_dc, sheet_name, 'courses'),
        (df_dep, sheet_name + '_Departments', 'departments'),
    )
    for frame, target_sheet, label in uploads:
        try:
            df2gsheet(frame, target_sheet)
        except Exception:
            had_error = True
            trace_text = traceback.format_exc() if TRACE else ''
            print(college, filename)
            insert_status_in_mapping_gsheet(
                f'ERROR ~ {filename} - uploading {label}: {trace_text}', college, filename)

    # Rows with status 'DONE' are filtered out of later runs, so only mark the
    # catalog done when no error status was recorded for it above.
    if not had_error:
        insert_status_in_mapping_gsheet('DONE', college, filename)

HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

3999
                           department_name  \
6                             Architecture   
31  Education: Physical Activity and Ed Se   
32  Education: Physical Activity and Ed Se   
35                       Engineering Graph   
46                                 History   
64                                   Music   
65                     Nuclear Engineering   
69                                  Psycho   
73                             Social Work   

                                               course  
6   \n202. Admission to AHR, who have completed 20...  
31  \n140. Travel and subsistence costs for activi...  
32  \n709. Student must receive a B- or higher to ...  
35  \n110. GEC arts and hums, cultures and ideas c...  
46  \n132. GEC historical survey course.\nHistory ...  
64  \n150. GEC arts and hums vpa course. VPA Admis...  
65  \n541. Not open to students with credit for Me...  
69  \n245. Not open to students with less than a g...  
73  \nH694. Open only to soc w