## Turning scraped text on New York pharmacies into structured data

This notebook turns data scraped from the Department of Education’s Office of the Professions [online verification search engine](http://www.op.nysed.gov/opsearches.htm#rx), currently saved as text files, into structured data. It uses regular expressions to parse the text, loads the parsed records into pandas DataFrames and saves the result as a CSV.

In [1]:
import pandas as pd
import re
import numpy as np
from os import listdir
from tqdm import tqdm

In [2]:
%load_ext jupyternotify

<IPython.core.display.Javascript object>

In [3]:
def _first_match(pattern, text, flags=0):
    """Return the first match of `pattern` in `text`, stripped of surrounding whitespace.

    Raises IndexError when the pattern does not match anywhere in `text`.
    """
    return re.findall(pattern, text, flags=flags)[0].strip()


def extract_data(text):
    """Parse one scraped pharmacy record into a flat dictionary.

    Parameters
    ----------
    text : str
        The raw text of a single record, containing labelled fields such as
        ``Type :``, ``Legal Name :`` and ``Registration No :``.

    Returns
    -------
    dict
        Always contains the keys ``type``, ``legal_name``, ``trade_name``,
        ``street_address``, ``registration_number``, ``date_first_registered``,
        ``registration_begins``, ``registration_ends``, ``establishment_status``,
        ``supervisor_no``, ``supervisor_name``, ``successor_no`` and
        ``successor_name``. The last five are ``np.nan`` when the record does
        not contain them.

    Raises
    ------
    IndexError
        If any of the first eight (required) fields cannot be found in `text`.
    """
    pharmacy = {}
    pharmacy['type'] = _first_match(r"(?<=Type\s:).+(?=Legal)", text)
    pharmacy['legal_name'] = _first_match(r"(?<=Legal Name\s:).+(?=Trade)", text)
    pharmacy['trade_name'] = _first_match(r"(?<=Trade Name\s:).+(?=Street)", text)

    # The address can span several lines, so it is matched with DOTALL and
    # then collapsed onto a single line.
    raw_address = _first_match(r"(?<=Street Address\s:).+(?=Registration No)", text, flags=re.DOTALL)
    address = ' '.join(line.strip() for line in raw_address.split('\n'))
    address = address.replace('\xa0', '').replace('\r', '')
    # Squeeze runs of whitespace before the "<city>, NY" part and before digits.
    address = re.sub(r"\s+(?=\w+, NY)", ' ', address)
    address = re.sub(r"\s+(?=\d+)", ' ', address)
    pharmacy['street_address'] = address

    pharmacy['registration_number'] = _first_match(r"(?<=Registration No\s:).+(?=Date)", text)
    pharmacy['date_first_registered'] = _first_match(r"(?<=Date First Registered\s:).+(?=Registration)", text)
    pharmacy['registration_begins'] = _first_match(r"(?<=Registration Begins\s:).+(?=Registered)", text)
    pharmacy['registration_ends'] = _first_match(r"(?<=Registered through\s:)\s+\d\d\/\d\d\/\d\d", text)

    # Optional fields: a missing field shows up as IndexError from
    # _first_match. Catching only that keeps genuine errors visible.
    try:
        pharmacy['establishment_status'] = _first_match(r"(?<=Establishment Status\s:).+(?=Successor)", text)
    except IndexError:
        pharmacy['establishment_status'] = np.nan

    try:
        supervisor_no = _first_match(r"(?<=Supervisor\s:).+(?=\n)", text)
        # The name is whatever follows the supervisor number in the text.
        supervisor_name = _first_match(r"(?<=" + re.escape(supervisor_no) + r").+", text)
    except IndexError:
        supervisor_no = np.nan
        supervisor_name = np.nan
    pharmacy['supervisor_no'] = supervisor_no
    pharmacy['supervisor_name'] = supervisor_name

    try:
        successor_no = _first_match(r"(?<=Successor\s:).+\d\d\d\d\d", text, flags=re.DOTALL)
        successor_name = _first_match(r"(?<=" + re.escape(successor_no) + r").+", text, flags=re.DOTALL)
    except IndexError:
        # No usable "<number> <name>" pair. Reset the number so a partial
        # match is not kept, then check for the literal "NONE" successor.
        successor_no = np.nan
        try:
            successor_name = _first_match(r"(?<=Successor\s:).+NONE", text)
        except IndexError:
            successor_name = np.nan
    pharmacy['successor_no'] = successor_no
    pharmacy['successor_name'] = successor_name
    return pharmacy

In [4]:
def convert_datetimes(df):
    """Return a copy of `df` with its date columns parsed into datetimes.

    The placeholder string ``'Not on file'`` is replaced with ``np.nan``
    throughout the frame first, so it becomes ``NaT`` in the date columns.
    Each of ``date_registered``, ``date_first_registered``,
    ``registration_begins`` and ``registration_ends`` that is present in the
    frame is converted with ``pd.to_datetime``; absent columns are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame of pharmacy records with date columns stored as strings.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is not modified.
    """
    # `date_registered` is kept for backward compatibility; the parser in
    # this notebook emits `date_first_registered`.
    date_columns = (
        'date_registered',
        'date_first_registered',
        'registration_begins',
        'registration_ends',
    )
    df = df.replace('Not on file', np.nan)
    for column in date_columns:
        if column in df.columns:
            # Each column is parsed from itself. Previously `registration_ends`
            # was overwritten with the parsed `registration_begins` values.
            df[column] = pd.to_datetime(df[column])
    return df

In [5]:
def extract_batch(filename):
    """Parse every pharmacy record in one scraped batch file.

    The file is expected to hold records separated by a line containing only
    ``*****``. Blank or whitespace-only chunks (for example the remainder
    after the final separator) are skipped.

    Parameters
    ----------
    filename : str
        Path of the text file to read.

    Returns
    -------
    pandas.DataFrame
        One row per record, built from the dictionaries returned by
        `extract_data`. Date fields are left as strings.
    """
    # The context manager closes the file even if a record fails to parse.
    with open(filename, "r") as batch_file:
        records = batch_file.read().split('\n*****\n')
    pharmacy_list = [extract_data(record) for record in records if record.strip()]
    return pd.DataFrame(pharmacy_list)

In [21]:
# Directory holding the scraped batch files, relative to the notebook's
# working directory (the loader below reads from this same folder), rather
# than a user-specific absolute path.
path = 'pharmacy-data'
# Only names that end in .txt, so files such as 'x.txt.bak' are not picked up.
filenames = [f for f in listdir(path) if f.endswith('.txt')]

In [22]:
filenames

['batch_m_to_0_3.txt',
 'batch_m_to_0_2.txt',
 'batch_m_to_0_1.txt',
 'batch_c_and_l.txt',
 'batch_d_to_l.txt',
 'batch_a_to_c.txt',
 'batch_remaining.txt',
 'batch_d_to_l_2.txt']

In [23]:
# Parse each batch file from the directory that was listed into `filenames`,
# so the listing and the reads cannot point at different folders.
all_dfs_alpha = [extract_batch(path + '/' + name) for name in tqdm(filenames)]

# ignore_index gives the combined frame unique row labels instead of
# repeating each batch's own 0..n-1 index.
all_pharmacies_alpha = pd.concat(all_dfs_alpha, ignore_index=True)

100%|█████████████████████████████████████████████| 8/8 [00:02<00:00,  3.02it/s]


In [24]:
# Drop rows that are exact duplicates, then write the result to disk
# without the row index.
unique_pharmacies = all_pharmacies_alpha.drop_duplicates()
unique_pharmacies.to_csv('pharmacy-data-scrape-6-22.csv', index=False)

In [25]:
all_pharmacies_alpha.head()

Unnamed: 0,type,legal_name,trade_name,street_address,registration_number,date_first_registered,registration_begins,registration_ends,establishment_status,supervisor_no,supervisor_name,successor_no,successor_name
0,PHARMACY,"RIVERSIDE DRUG MART, INC.",,"288 ONTARIO ST. BUFFALO, NY 14207-0000",16282,11/16/78,Not on file,04/10/98,TRANSFER,,,23642.0,FAY'S INCORPORATED/ ECKERD DRUGS #5019
1,PHARMACY,RIVERSIDE PHARMACY INC.,,"2920 8TH AVENUE NEW YORK, NY 10039-0000",26816,11/15/04,11/01/10,12/23/10,TRANSFER,,,30384.0,RIVERSIDE SRX INC./ RIVERSIDE PHARMACY
2,PHARMACY,RIVERSIDE REMEDIES RX CORP.,,"39 LOWER MAIN ST. CALLICOON, NY 12723-0000",33078,10/08/14,10/01/20,09/30/23,,57762.0,,,
3,PHARMACY,RIVERSIDE RX INC.,,"5906B RIVERDALE AVE. BRONX, NY 10471-0000",36561,05/31/18,05/31/18,09/05/19,DISCONTINUED,,,,
4,PHARMACY,RIVERSIDE SRX INC.,RIVERSIDE PHARMACY,"2920 FREDERICK DOUGLAS BLVD. NEW YORK, NY 1003...",30384,12/23/10,12/01/19,11/30/22,,,,,


In [26]:
all_pharmacies_alpha.shape

(15370, 13)

In [27]:
all_pharmacies_alpha.drop_duplicates().shape

(14906, 13)