In [1]:
import numpy as np
import pandas as pd

from address_normalize.paths import DATA_DIR

# Get matches
match_df = pd.read_csv(DATA_DIR / 'processed' / 'pending_phase_df.csv')


# Get raw data
raw_dir = DATA_DIR / 'raw'

files = list(raw_dir.glob('Master Project Address List*.xlsx'))
files.sort()

file_path = files[-1]

df_e911 = pd.read_excel(file_path, sheet_name='E911 Data', dtype={'GRAND LIST ZIPCODE': str})
data = pd.read_excel(file_path, sheet_name=['Phase_II', 'Phase_III'], dtype={'ZIP': str})
pending_df = pd.concat([data['Phase_II'], data['Phase_III']])

In [2]:
print(match_df.shape)
match_df.head()

(7323, 3)


Unnamed: 0,address,score,e911_address
0,160 green st,1.0,160 GREEN St
1,26 thomas cir,0.333333,26 THOMAS CIRCLE
2,22 sunset dr,1.0,22 SUNSET Dr
3,99 griswold ln,0.333333,99 BURKE Ln
4,411 mountainview dr,1.0,411 MOUNTAINVIEW DR


In [3]:
pending_df.head()

Unnamed: 0,REF,ESITEID,Address,Town,ZIP,PON Boundary,Status,Service Speed,Eligible,LAT,LONG,Build Phase
0,1,264978,160 GREEN ST,VERGENNES,5491,LCC VERGENNES-3,Served,Served 100/20,,44.161107,-73.245427,Phase II
1,2,265443,26 THOMAS CIR,VERGENNES,5491,LCC VERGENNES-2,Served,Served 100/20,,44.156948,-73.245608,Phase II
2,3,265008,22 SUNSET DR,VERGENNES,5491,LCC VERGENNES-3,Served,Served 100/20,,44.160163,-73.245543,Phase II
3,4,184727,99 GRISWOLD LN,ORWELL,5760,LCC ORWELL-5,Underserved,Served 4/1,Eligible,43.815731,-73.247273,Phase II
4,5,139558,411 MOUNTAINVIEW DR,LEICESTER,5733,LCC LEICESTER-5,Served,Served 100/20,,43.878902,-73.070688,Phase II


In [4]:
# Create a dictionary of the matches from pending_phase to e911
address_dict = dict(zip(match_df.address, zip(match_df.e911_address, match_df.score)))
print(len(address_dict))

7259


In [5]:
pending_df.columns

Index(['REF', 'ESITEID', 'Address', 'Town', 'ZIP', 'PON Boundary', 'Status',
       'Service Speed', 'Eligible', 'LAT', 'LONG', 'Build Phase'],
      dtype='object')

In [6]:
check_cols = [
    'OWNER 1',
    'OWNER 2',
    'GRAND LIST ADDRESS ',
    'CAT',
    'E911 ADDRESS',
    'GRAND LIST = ADDRESS'
]

for col in check_cols:
    if col not in pending_df.columns:
        print(f"Adding col: {col}")
        pending_df[col] = None

Adding col: OWNER 1
Adding col: OWNER 2
Adding col: GRAND LIST ADDRESS 
Adding col: CAT
Adding col: E911 ADDRESS
Adding col: GRAND LIST = ADDRESS


In [7]:
def update_row(row: pd.Series):
    recorded_address = row['Address'].lower()
    recorded_zip = row['ZIP']
    if (recorded_address in address_dict.keys()):
        e911_row = df_e911[
            (df_e911['E911 ADDRESS'] == address_dict[recorded_address][0]) &
            (recorded_zip == df_e911['GRAND LIST ZIPCODE'])
        ]
        if len(e911_row) > 1:
            # print(row['Zip Code'], e911_row['GRAND LIST ZIPCODE'], e911_row)
            # We will update with the first as it looks like they are all duplicate names
            e911_row = e911_row.iloc[0]
            # raise Exception('More than 1 match')

        if len(e911_row) == 1:
            # e911_row = e911_row.squeeze()
            # print(recorded_address, recorded_zip)
            # print(e911_row)
            row['OWNER 1'] = e911_row['OWNER 1'].values[0]
            row['OWNER 2'] = e911_row['OWNER 2'].values[0]
            row['GRAND LIST ADDRESS '] = e911_row['GRAND LIST ADDRESS '].values[0]
            row['CAT'] = e911_row['CAT'].values[0]
            row['E911 ADDRESS'] = e911_row['E911 ADDRESS'].values[0]
            if address_dict[recorded_address][1] == 1:
                row['GRAND LIST = ADDRESS'] = 'Y'
            else:
                row['GRAND LIST = ADDRESS'] = 'T'

    out = (row['OWNER 1'], row['OWNER 2'], row['GRAND LIST ADDRESS '], row['CAT'], row['E911 ADDRESS'], row['GRAND LIST = ADDRESS'])

    return out

In [8]:
len(pending_df[~(pending_df['GRAND LIST = ADDRESS'] == 'Y')])

7465

In [9]:
temp_df = pending_df.copy()

columns_to_replace = ['OWNER 1', 'OWNER 2', 'GRAND LIST ADDRESS ', 'CAT', 'E911 ADDRESS', 'GRAND LIST = ADDRESS']
temp_df[columns_to_replace] = temp_df.apply(update_row, axis=1, result_type="expand")

In [10]:
temp_df.head().T

Unnamed: 0,0,1,2,3,4
REF,1,2,3,4,5
ESITEID,264978,265443,265008,184727,139558
Address,160 GREEN ST,26 THOMAS CIR,22 SUNSET DR,99 GRISWOLD LN,411 MOUNTAINVIEW DR
Town,VERGENNES,VERGENNES,VERGENNES,ORWELL,LEICESTER
ZIP,05491,05491,05491,05760,05733
PON Boundary,LCC VERGENNES-3,LCC VERGENNES-2,LCC VERGENNES-3,LCC ORWELL-5,LCC LEICESTER-5
Status,Served,Served,Served,Underserved,Served
Service Speed,Served 100/20,Served 100/20,Served 100/20,Served 4/1,Served 100/20
Eligible,,,,Eligible,
LAT,44.161107,44.156948,44.160163,43.815731,43.878902


In [11]:
len(temp_df[temp_df['GRAND LIST = ADDRESS'] == 'T'])

995

In [12]:
temp_df[temp_df['GRAND LIST = ADDRESS'] == 'T']

Unnamed: 0,REF,ESITEID,Address,Town,ZIP,PON Boundary,Status,Service Speed,Eligible,LAT,LONG,Build Phase,OWNER 1,OWNER 2,GRAND LIST ADDRESS,CAT,E911 ADDRESS,GRAND LIST = ADDRESS
1,2,265443,26 THOMAS CIR,VERGENNES,05491,LCC VERGENNES-2,Served,Served 100/20,,44.156948,-73.245608,Phase II,FULTON LYLE AND DEBRA,,26 THOMAS CIRCLE,Residential-1,26 THOMAS CIRCLE,T
8,9,107258,5592 US ROUTE 7,FERRISBURGH,05456,LCC VERGENNES-9,Served,Served 100/20,,44.239412,-73.230508,Phase II,MASIELLO VALERIE M,,5592 US RT 7,Residential-1,5592 ROUTE 7,T
20,21,107882,258 TUPPERS XING,FERRISBURGH,05456,LCC VERGENNES-12,Served,Served 100/20,,44.190279,-73.252499,Phase II,STEADY IRENE,,258 TUPPERS CROSSING,Mobile Home/la,258 TUPPERS CROSSING,T
22,23,264645,8 SHORT ST,VERGENNES,05491,LCC VERGENNES-13,Served,Served 100/20,,44.167094,-73.250334,Phase II,LINCOLN GERONIMO LLC,,1341 ARNOLD BAY ROAD,Residential-1,8 E St,T
37,38,216626,720 WEST SHORE RD,SALISBURY,05769,LCC LEICESTER-7,Served,Served 100/20,,43.910131,-73.088466,Phase II,RYAN AMEY,,PO BOX 23,Seasonal-1,720 W SHORE RD,T
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3567,3568,202130,5041 VT ROUTE 125,RIPTON,05766,LCC RIPTON-2,Underserved,Served 4/1,Eligible,43.947051,-72.979557,Phase III,SCOTT PAULA,,PO BOX 180,Residential-1,5041 ROUTE 125,T
3591,3592,201962,905 PEDDLER BRIDGE RD,RIPTON,05766,NODE RIPTON-1,Underserved,Served 4/1,Eligible,43.981130,-73.021019,Phase III,HEPPELL JONATHAN,HEPPELL KAREN,PO BOX 54,Residential-2,905 PEDDLERS BRIDGE RD,T
3600,3601,202044,1613 VT ROUTE 125,RIPTON,05766,NODE RIPTON-1,Served,Served 25/3,,43.972266,-73.031442,Phase III,GOMBOSI EILEEN,,PO BOX 25,Residential-1,1613 ROUTE 125,T
3603,3604,155694,231 COURT ST,MIDDLEBURY,05753,LCC MIDDLEBURY-4,Served,Served 100/20,,44.003449,-73.154981,Phase III,MWM LLC,,227 CREEK ROAD,Commercial,231 COUROUTE St,T


In [13]:
temp_df.to_csv(DATA_DIR / 'processed' / 'temp_pending_phase.csv', index=False)