# Multi-File, Multi-Line Notebook

### Disclaimer:
This code uses the multi-line strategy mentioned in the "NLP_Multi-Line" notebook, so go there to get context. This notebook will probably work on OCR where all information is combined on one line -- but here the matches can extend over multiple lines, so make sure your regex does what you expect in multi-line mode. Also keep in mind that this notebook doesn't have an option to combine groups of consecutive lines into one line, like the original NLP notebook does. This code is experimental, so feel free to bug me (Sarah) if something isn't working (or just improve it yourself if you want).

## When / How to Use

This notebook allows you to automatically fill columns with certain values, and change those values when you switch to a new file. For example, if you have different files from different years, you can fill in the year column with 1921 for the 1921 file, 1922 for the 1922 file, and so on. You can also fill in columns that remain the same between all files, like university, state, Chetty tier, etc., so you don't have to do that manually later.

Like before, I have the code you need to change labeled with comments in all caps. Most of the steps are the same as the multi-line notebook. You can make a copy of the notebook and alter those parts to work with your school.

# Imports

In [1]:
import os
import re
import csv
import sys
# import unicodedata
# from unidecode import unidecode

### Reads all lines instead of splicing them apart

In [2]:
def pre_process(fname):
    with open(fname, encoding='utf-8') as fin:
        lines = fin.read()
    return lines

# Function that makes lists of people

In [3]:
def collect(text):
    """Collects all names and returns a formatted list of matches from text<str>."""  
    if type(text) is not str:
        raise NotImplementedError

    # This loop looks through lines (from above) and appends each match to a list, formatted the way we want
    raw_matches = []
    non_matches = []
    
    # Find all matches in the text and put it in a list
    # raw_matches = people_re.findall(text, re.MULTILINE)
    
    for match in people_re.finditer(text, re.MULTILINE):
        raw_matches.append(match)
    
    # Add everything that isn't in a match to non_matches (make substrings from the gaps in-between matches)
    prev_end_index = 0
    length = len(raw_matches)
    for i in range(length):
        currentMatch = raw_matches[i]
        
        # Get start and end values
        matchSpan = currentMatch.span()
        start = matchSpan[0]
        end = matchSpan[1]
        
        non_matches.append(text[prev_end_index:start])
        prev_end_index = end
    # add last bit (last match to end of string)
    non_matches.append(text[prev_end_index:-1])
    
    
    return raw_matches, non_matches


# Define the regex and which school to search
Go to https://pythex.org/ to test your regex.

In [4]:
# PUT REGULAR EXPRESSION HERE, (1/8)
people_re = re.compile(r'(^(?P<studentName>[a-zA-Z -\'’é]{7,}([A-Z]\.)?[a-zA-Z -]+(, Jr\.?)?)(, \dd)?(\n\n[0-9]+[A-Za-z. -]+)?\n\n(?P<city>[a-zA-Z./ ]+)(, (?P<state>[a-zA-Z./ ]+))?)', flags=re.MULTILINE)

# PUT NAME OF SCHOOL HERE (aka the name of the folder), (2/8)
source = "YaleTemp"

# Name of the csv file to write to
target = f"NLP_Output_{source}.csv"

# This is where the non-matched lines will be written
chk_file = 'check.csv'

os.chdir(r'..\output\University CSVs\{}'.format(source))

# Run this just to make sure that if you rerun the code,
# it makes a new file instead of appending

In [5]:
# Update target file name so that we aren't appending to an existing file
    
if os.path.exists(target):
    i = 1
    name, ext = target.split('.')
    while os.path.exists(f'{name}_{i}.{ext}'):
        i += 1
    target = f'{name}_{i}.{ext}'

if os.path.exists(chk_file):
    i = 1
    name, ext = chk_file.split('.')
    while os.path.exists(f'{name}_{i}.{ext}'):
        i += 1
    chk_file = f'{name}_{i}.{ext}'

# Main function -- create rows and variables for each row, then output
If you interrupt this cell or the previous one and the next run gives an OS error, restart the kernel and then try again.

In [6]:
os.chdir(r'..\..\University Text Files\{}'.format(source))

# RENAME FILES SO THEY WILL BE READ IN THE ORDER YOU WANT (ex. 0_Yale_1923_Seniors, 1_Yale_1923_Juniors, etc.) (3/8)


# LIST INITIAL HARDCODED COLUMN VALUES HERE (i.e. the values you want for the first file, index 0) (4/8)
standing = ''
year = '1921'
school = 'Yale University'
school_state = 'Connecticut'
chetty_tier = '1'
class_year = ''

file_name = '1921 College' # this variable is only used on the debugging row to distinguish files in the output; comment out if you want

index = 0;
for txt in [i for i in os.listdir() if i[-4:] == '.txt']:
    
    # Change column values based on which file it is
    
    # CHANGE THE VARIABLES FOR THE NEW COLUMN OUTPUT FOR EACH FILE (5/8)
    if (index == 1):
        file_name = '1922 College'
        year = '1922'
        class_year = '' 
    elif (index == 2):
        file_name = '1923 College'
        year = '1923'
        class_year = ''
    elif (index == 3):
        file_name = '1924 College'
        year = '1924'
        class_year = ''
    elif (index == 4):
        file_name = '1925 College'
        year = '1925'
        class_year = ''
    # elif (index == #), and so on for each file
    
    
    # This is where the work happens. Uses the collect() function.
    print(f'finding names in {txt}...')
    
    result, check = collect(pre_process(txt))
    num = len(result)
    print(f'found {num} names. Writing to file {target}...')
    os.chdir(r'..\..\University CSVs\{}'.format(source))

    # Write matches to the target file (.csv)
    
    with open(target, 'a', newline='', encoding='utf-8-sig') as fout:
        writer = csv.writer(fout)
        
        
        # PUT COLUMN HEADINGS HERE -- WILL ONLY BE AT TOP OF PAGE (6/8)
        if (index == 0):
            writer.writerow(['Name', 'City' , 'State', 'Standing', 'Year', 'Class_Year', 'School', 'School_State', 'Chetty_Tier'])
            
        # Output a line at the top of each file so you can tell them apart (for debugging, feel free to comment out)
        writer.writerow(['FILE:', file_name])
        

        # Output each match in a row
        for match in result:
            # USE THIS SYNTAX TO MAKE VARIABLES FOR YOUR NAMED GROUPS, (7/8)
            # variable_name = match.group('namedGroupInExpression')
            name = match.group('studentName')
            city = match.group('city')
            state = match.group('state')
            
            
            # PUT VARIABLE NAMES IN ORDER HERE TO OUTPUT THEM IN A ROW (8/8)
            writer.writerow([name, city, state, standing, year, class_year, school, school_state, chetty_tier])
            

        # Total names found in file
        writer.writerow(['Names found:', num])
        index = index + 1;

    # Write non-matches to the check file
    with open(chk_file, 'a', newline='', encoding='utf-8-sig') as fout:
        writer = csv.writer(fout)
        for i in check:
            try:
                for line in i.split('\n'):
                    writer.writerow([line])
            except UnicodeEncodeError as e:
                writer.writerow([e])

    os.chdir(r'..\..\University Text Files\{}'.format(source))

finding names in YaleTemp_1921.txt...
found 953 names. Writing to file NLP_Output_YaleTemp_8.csv...
finding names in YaleTemp_1922.txt...
found 984 names. Writing to file NLP_Output_YaleTemp_8.csv...
finding names in YaleTemp_1923.txt...
found 1098 names. Writing to file NLP_Output_YaleTemp_8.csv...
finding names in YaleTemp_1924.txt...
found 1157 names. Writing to file NLP_Output_YaleTemp_8.csv...
finding names in YaleTemp_1925.txt...
found 1333 names. Writing to file NLP_Output_YaleTemp_8.csv...
