In [64]:
import json
import yaml

In [3]:
import numpy as np
import scipy.stats as scs
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [4]:
import googlemaps
from sklearn.pipeline import make_pipeline

In [5]:
from frag_tools import (DropColumns,
                        AddressLatLong,
                        CurrentMakeDummies,
                        Binarize,
                        ChangeTypes,
                        custom_zip_cleaning)

In [6]:
# Load the raw transformer dataset; the file is semicolon-delimited (sep=';').
# NOTE(review): absolute, machine-specific path - this cell will not run on
# another machine without editing it.
df = pd.read_csv('/mnt/c/Users/kurtrm/Downloads/Predictive_Maintenance_Transformer_Overload_PA.csv', sep=';')

In [7]:
df.head()

Unnamed: 0,VegMgmt,PMLate,MilesFromOcean,Manufacturer,WaterExposure,MultipleConnects,Storm,AssetType,Repairs,AvgRepairCost,...,AssetCity,AssetState,AssetZip,locationID,Latitude1,Longitude1,Overloads,Latitude,Longitude,Status
0,Yes,N,0-30,Other,No,Yes,No,1-Phase Pole Transformer,Rebuild+3,788513,...,STOCKBRIDGE,GA,30281,ADK,518.781,1.766.461,Below 100%,518781,1766461,1
1,Yes,N,0-30,Schneider Electric,No,No,No,1-Phase Pole Transformer,Rebuild+2,788513,...,MARRIETTA,GA,30068,AKK,569.386,1.541.825,Below 100%,569386,1541825,1
2,Yes,N,0-30,Other,No,No,No,1-Phase Pole Transformer,Original,55000,...,NORCROSS,GA,30091,Z13,609.047,1.614.225,Above 150%,609047,1614225,0
3,Yes,N,0-30,Other,No,Yes,No,1-Phase Pole Transformer,Original,788513,...,STOCKBRIDGE,GA,30281,AKI,609.028,1.612.306,Above 150%,609028,1612306,0
4,Yes,N,0-30,Siemens,No,No,No,1-Phase Pole Transformer,Original,788513,...,JONESBORO,GA,30238,AUK,626.8,1.646.600,Above 150%,6268,16466,0


In [8]:
location_info = df[['AssetLocation', 'AssetCity', 'AssetState', 'AssetZip']]

In [9]:
# One "location, city, state, zip" string per row; str.join consumes the row's
# values directly, so every value must already be a string.
joined_series = location_info.apply(", ".join, axis=1)

In [179]:
joined_series.tolist();

In [11]:
# Preview the frame without the location/coordinate columns. The result is only
# displayed: there is no assignment and no inplace=True, so `df` is unchanged.
# NOTE(review): 'Latitude1' is listed twice and 'Longitude1' is not listed, so
# 'Longitude1' is still present in the output - confirm whether that was intended.
df.drop(['AssetState', 'MilesFromOcean', 'AssetLocation', 'AssetZip', 'locationID', 'Latitude1', 'Latitude1', 'Latitude', 'Longitude'], axis=1)

Unnamed: 0,VegMgmt,PMLate,Manufacturer,WaterExposure,MultipleConnects,Storm,AssetType,Repairs,AvgRepairCost,Age,AssetId,AssetCity,Longitude1,Overloads,Status
0,Yes,N,Other,No,Yes,No,1-Phase Pole Transformer,Rebuild+3,788513,703,CE01059,STOCKBRIDGE,1.766.461,Below 100%,1
1,Yes,N,Schneider Electric,No,No,No,1-Phase Pole Transformer,Rebuild+2,788513,703,JU02620,MARRIETTA,1.541.825,Below 100%,1
2,Yes,N,Other,No,No,No,1-Phase Pole Transformer,Original,55000,703,QD01302,NORCROSS,1.614.225,Above 150%,0
3,Yes,N,Other,No,Yes,No,1-Phase Pole Transformer,Original,788513,703,RC00547,STOCKBRIDGE,1.612.306,Above 150%,0
4,Yes,N,Siemens,No,No,No,1-Phase Pole Transformer,Original,788513,703,GO01571,JONESBORO,1.646.600,Above 150%,0
5,Yes,N,Schneider Electric,No,No,No,1-Phase Pole Transformer,Original,788513,703,PV03821,ATLANTA,1.586.178,Above 150%,0
6,Yes,N,Siemens,No,No,No,1-Phase Pole Transformer,Original,788513,703,BG00035,ATLANTA,1.585.972,Above 150%,0
7,Yes,N,GE,Yes,No,No,DF-series Transformer,Rebuild+1,77000,679,BI00850,ACWORTH,1.585.578,Below 100%,1
8,Yes,N,Other,No,Yes,No,DF-series Transformer,Original,83287,679,PH00338,REX,1.526.222,Below 100%,1
9,Yes,N,Other,No,Yes,No,DF-series Transformer,Rebuild+2,83287,679,DU01362,DECATUR,1.578.575,Below 100%,1


## Column Descriptions

- VegMgmt: Vegetation Management plans help reduce herbicide use and maintenance costs. Vegetation, if left alone, will grow out of control and block visibility.
- PMLate: Plant Maintenance Late – equipment that had overdue maintenance schedule.
- MilesFromOcean: This column makes no sense in relation to the AssetLocation, City, and Zip.
- Manufacturer: The name of the manufacturer of the transformer.
- WaterExposure: Whether a given transformer has been documented as having been exposed to water.
- MultipleConnects: Whether the transformer supplies more than one home.
- Storm: Whether the transformer has been subjected to stormy weather.
- AssetType: Type of the transformer.
- Repairs: How many repairs/refurbishments a transformer has received.
- AvgRepairCost: Average repair cost for the transformer. These units seem high.
- Age: In years.
- Overloads: Status of being overloaded.

In [12]:
# 'Age' is stored as text; replace ',' with '.' (presumably a decimal comma),
# cast each value to float, and show summary statistics. `df` is not modified.
df['Age'].apply(lambda x: float(x.replace(',', '.'))).describe()

count    1716.000000
mean       13.438287
std        15.996203
min         3.000000
25%         4.500000
50%         6.600000
75%        12.900000
max        80.000000
Name: Age, dtype: float64

In [13]:
df['Overloads'].value_counts()

Below 100%    950
100-120%      663
120-150%       79
Above 150%     24
Name: Overloads, dtype: int64

In [14]:
df['Status'].value_counts()

1    950
0    766
Name: Status, dtype: int64

In [180]:
# Per-city sum of Status divided by the per-city row count, highest first.
# Status only takes the values 0 and 1, so this is the share of rows with
# Status == 1 in each city. The trailing ';' suppresses the cell output.
(df.groupby('AssetCity')['Status'].sum() / df['AssetCity'].value_counts()).sort_values(ascending=False);

In [181]:
df['AssetCity'].value_counts();

In [18]:
# test1 = gmaps.geocode(', '.join(location_info.iloc[0].tolist()))

In [19]:
joined_list = joined_series.tolist()

In [20]:
# The live geocoding call is kept commented out for reference; the results are
# read from a JSON file instead (presumably the one written by the json.dump
# cell further down, which targets the same path).
# geocoded = [gmaps.geocode(address) for address in joined_list]
with open('../src/static/data/geocoded_address.json', 'r') as f:
    geocoded = json.load(f)

In [110]:
def state_verification(geocodes, state='Georgia'):
    """
    Return the component names of every geocoded address outside `state`.

    geocodes: one entry per looked-up address; each entry is the list of
        candidate results for that address. Only the first candidate is
        inspected, through the 'long_name' of its 'address_components'.
    state: long name that must appear among those components for the
        address to count as verified (defaults to 'Georgia').

    Returns a list with one list of long names per address that failed the
    check. An address with no candidates at all also fails and contributes
    an empty list.
    """
    flagged = []
    for candidates in geocodes:
        # An empty candidate list would make candidates[0] raise IndexError;
        # report such an address as unverified instead of crashing.
        components = candidates[0]['address_components'] if candidates else []
        long_names = [component['long_name'] for component in components]
        if state not in long_names:
            flagged.append(long_names)
    return flagged

In [111]:
state_verification(geocoded)

[['Windham', 'Windham County', 'Connecticut', 'United States']]

In [21]:
address_check = [location[0]['formatted_address'] for location in geocoded]

In [22]:
address_check[0].split(',')

['371 Vista Creek Dr', ' Stockbridge', ' GA 30281', ' USA']

In [24]:
address_check[373], joined_list[373]

('982 Smith St, Clarkston, GA 30021, USA',
 '982 SMITH ST, Clarkston, GA, 30021')

In [25]:
all('GA' in a for a in address_check)

False

In [26]:
all(a[:4] == b[:4] for a, b in zip(address_check, joined_list))

False

In [27]:
# Positions at which the first four characters of the geocoded address differ
# from the first four characters of the address built from the data.
not_equal = [position
             for position, (geocoded_address, source_address)
             in enumerate(zip(address_check, joined_list))
             if geocoded_address[:4] != source_address[:4]]
# Number of mismatching positions.
sum_not_equal = len(not_equal)

In [183]:
df.iloc[not_equal][['AssetLocation', 'AssetCity', 'AssetState', 'AssetZip']];

In [31]:
not_adresses = [address_check[i] for i in not_equal]

In [184]:
list(zip(range(len(not_adresses)), not_adresses));

In [33]:
bad_addresses = [0, 1, 2, 4, 9, 10, 11, 13, 14, 15, 16, 18, 19, 20, 21,
                 22, 24, 25, 27, 29, 32, 35, 36, 37, 38, 40, 41, 42,
                 46, 49, 50, 51, 52, 54, 55, 59]

In [51]:
# Address columns of the rows that failed the prefix check, narrowed to the
# hand-picked positions in `bad_addresses`. reset_index() keeps each row's
# original `df` label in an 'index' column.
bad_addresses_df = df.iloc[not_equal][['AssetLocation', 'AssetCity', 'AssetState', 'AssetZip']].reset_index().iloc[bad_addresses]
# Later cells read and edit `bad_copy`, but no cell defines it, so the notebook
# fails on a fresh kernel. Its displayed rows have exactly this frame's columns,
# so it is defined here as a working copy.
# NOTE(review): confirm this matches how `bad_copy` was originally created.
bad_copy = bad_addresses_df.copy()

In [141]:
bad_indices = bad_copy['index'].tolist()

In [124]:
# Hand-corrected addresses for the rows flagged in `bad_addresses`; both lists
# have 36 entries.
# NOTE(review): presumably positional - entry k corrects the k-th flagged row;
# confirm before reordering either list.
# NOTE(review): a few entries look malformed ('Ga' casing, 'CtNewnan' without a
# separator, one trailing space). They are left untouched because these exact
# strings are what gets sent to the geocoder.
corrected = ['356 N Rover Rd, Williamson, GA 30292',
             '3001 Tree Lodge Parkway Atlanta, Ga 30350',
             '3328 E Ponce de Leon Ave, Scottdale, GA 30079',
             '100 Tinsley Road, Peachtree City, GA 30269',
             '35 Firethorne Dr, Newnan, GA 30265',
             '172 Takela Forest, Fairmount, GA 30139',
             '1081 Brite Ct, Austell, GA 30106',
             '606 Wyndham Ct, Canton, GA 30115',
             '705 Cobb Crossing SE, Smyrna, GA 30080',
             '245 Omin Rd, Fayetteville, GA 30214',
             '1336 Oakdale Dr, Griffin, GA 30224',
             '50 Hollow Oaks CtNewnan, GA 30263',
             '3696 Tulip Dr, Decatur, GA 30032',
             '5411 Huron Dr, Lake City, GA 30260',
             '717 Sorrel Ln, Alpharetta, GA 30005',
             '410 Sassafras Rd, Roswell, GA 30076',
             '4032 Jonesboro Rd, Hampton, GA 30228 ',
             '3210 Wexford Dr, Albany, GA 31721',
             '2910 Lake Colony Dr NW, Norcross, GA 30071',
             'Hyacinth Ln, Peachtree City, GA 30269',
             '2470 Chestnut Landing, Atlanta, GA 30360',
             '2821 Gillionville Rd, Albany, GA 31721',
             '5858 Waggoner Ct, Rex, GA 30273',
             '3799 Main St, Atlanta, GA 30337',
             '3517 Shadowood Pkwy SE, Atlanta, GA 30339',
             '706 Lake Cove Dr, Hampton, GA 30228',
             '538 Country Greens Dr, Jonesboro, GA 30238',
             '4020 Jeffrey Dr, College Park, GA 30349',
             '4045 George Busbee Pkwy NW, Kennesaw, GA 30144',
             '818 S Nottingham Rd, Jonesboro, GA 30236',
             '848 Oglethorpe Ave SW, Atlanta, GA 30310',
             '708 Cobblestone Blvd, Fayetteville, GA 30215',
             '1645 NE Executive Park Lane, Atlanta, GA 30329',
             '8275 Sentinae Chase Dr, Roswell, GA 30076',
             '319 Awendaw Cir, Ellenwood, GA 30294',
             '8827 Fairview Bluff, Johns Creek, GA 30022']

In [125]:
len(corrected)

36

In [65]:
# Read the geocoding credentials from a YAML file under the user's home
# directory, so the key itself never appears in the notebook.
# safe_load builds only plain Python objects; yaml.load without an explicit
# Loader is deprecated and is rejected outright by PyYAML >= 6.
with open('/home/kurtrm/.secrets/geocoding.yaml', 'r') as f:
    key = yaml.safe_load(f)

In [68]:
gmaps = googlemaps.Client(key=key['API_KEY'])

In [131]:
# NOTE(review): geocode() is handed the whole list here rather than a single
# address, and the call produced 10 results for 36 inputs (see the
# len(corrected_locations) cell below). accumulate_addresses() geocodes the
# same list one address at a time and returns 36 results.
corrected_locations = gmaps.geocode(corrected)

In [153]:
# Fix the state and zip code of the row labelled 2 (shown with AssetState 'AL'
# and AssetZip 30078 in the next cell's output).
# .at is the scalar accessor for one row/column pair; assigning to two columns
# at once is a .loc operation.
bad_copy.loc[2, ['AssetState', 'AssetZip']] = ['GA', 30079]

In [152]:
bad_copy.iloc[2]

index                    47
AssetLocation    PO BOX 408
AssetCity         SCOTTDALE
AssetState               AL
AssetZip              30078
Name: 2, dtype: object

In [156]:
len(corrected_locations)

10

In [160]:
 path = '/mnt/c/Users/kurtrm/' \
           'projects/predicting_equipment_failure/' \
           'src/static/data/corrected_addresses.json'
with open(path, 'w') as f:
    json.dump(adds, f)

In [157]:
def accumulate_addresses(addresses, client=None):
    """
    Geocode addresses one at a time and accumulate every result in a flat list.

    addresses: iterable of address strings, each sent in its own geocode call.
    client: object exposing geocode(address) and returning a list of results.
        When omitted, the notebook-level `gmaps` client is used, so existing
        calls behave exactly as before.

    Returns one flat list holding every result of every call, in input order.
    Because the per-address result lists are flattened, the output has the
    same length as the input only when each address yields exactly one result.
    """
    if client is None:
        # Fall back to the global client created earlier in the notebook.
        client = gmaps
    new_addresses = []
    for address in addresses:
        # extend() flattens: an address with no results adds nothing, and one
        # with several results adds several entries.
        new_addresses.extend(client.geocode(address))

    return new_addresses

In [132]:
len(corrected)

36

In [158]:
adds = accumulate_addresses(corrected)

In [159]:
len(adds)

36

In [75]:
corrected_addresses = [location['formatted_address'] for location in corrected_locations]

In [80]:
len(corrected_addresses)

10

In [185]:
# Show the flagged rows as they appear in `df`.
# `bad_addresses` holds positions within the `not_equal` list, so indexing `df`
# with it selects unrelated rows; the matching `df` row labels are the ones
# collected in `bad_indices`.
df.loc[bad_indices, ['AssetLocation', 'AssetCity', 'AssetState', 'AssetZip']];

In [35]:
with open('../src/static/data/geocoded_address.json', 'w') as f:
    json.dump(geocoded, f)

In [36]:
geocoded[0][0]['geometry']['location']

{'lat': 33.5168046, 'lng': -84.2584781}

In [37]:
# The original assigned into `df.copy()` without keeping a reference, so the
# result was thrown away, and it stored the whole {'lat': ..., 'lng': ...} dict
# in 'Latitude'. Keep the copy under a name and store only the latitude of each
# address's first geocoding result. `df` itself is still left untouched.
df_with_latitude = df.copy()
df_with_latitude['Latitude'] = [location[0]['geometry']['location']['lat']
                                for location in geocoded]

In [38]:
# One row per address with the 'lat'/'lng' of its first geocoding result.
coordinates = pd.DataFrame([location[0]['geometry']['location']
                            for location in geocoded])
# The original assigned into an unreferenced `df.copy()`, so nothing was kept.
# Keep the copy under a name; `df` itself is still left untouched.
df_geocoded = df.copy()
# .values assigns by position: the source columns are named 'lat'/'lng', not
# 'Latitude'/'Longitude', so label alignment must not be relied on.
df_geocoded[['Latitude', 'Longitude']] = coordinates[['lat', 'lng']].values

In [39]:
drop = DropColumns(['AssetCity', 'AssetId', 'AvgRepairCost', 'AssetState', 'MilesFromOcean', 'AssetLocation', 'locationID', 'Latitude1', 'Longitude1'])

In [40]:
dropped = drop.fit_transform(df)

In [41]:
dropped.head()

Unnamed: 0,VegMgmt,PMLate,Manufacturer,WaterExposure,MultipleConnects,Storm,AssetType,Repairs,Age,AssetZip,Overloads,Latitude,Longitude,Status
0,Yes,N,Other,No,Yes,No,1-Phase Pole Transformer,Rebuild+3,703,30281,Below 100%,518781,1766461,1
1,Yes,N,Schneider Electric,No,No,No,1-Phase Pole Transformer,Rebuild+2,703,30068,Below 100%,569386,1541825,1
2,Yes,N,Other,No,No,No,1-Phase Pole Transformer,Original,703,30091,Above 150%,609047,1614225,0
3,Yes,N,Other,No,Yes,No,1-Phase Pole Transformer,Original,703,30281,Above 150%,609028,1612306,0
4,Yes,N,Siemens,No,No,No,1-Phase Pole Transformer,Original,703,30238,Above 150%,6268,16466,0


In [42]:
CurrentMakeDummies(['Manufacturer',
                    'Repairs',
                    'Overloads',
                    'AssetType']).fit_transform(dropped).head()

Unnamed: 0,VegMgmt,PMLate,WaterExposure,MultipleConnects,Storm,Age,AssetZip,Latitude,Longitude,Status,...,Repairs_Rebuild+3,Overloads_100-120%,Overloads_120-150%,Overloads_Above 150%,Overloads_Below 100%,AssetType_1-Phase Pole Transformer,AssetType_3-Phase Transformer,AssetType_DF-series Transformer,AssetType_Padmount Transformer,AssetType_Voltage Transformer
0,Yes,N,No,Yes,No,703,30281,518781,1766461,1,...,1,0,0,0,1,1,0,0,0,0
1,Yes,N,No,No,No,703,30068,569386,1541825,1,...,0,0,0,0,1,1,0,0,0,0
2,Yes,N,No,No,No,703,30091,609047,1614225,0,...,0,0,0,1,0,1,0,0,0,0
3,Yes,N,No,Yes,No,703,30281,609028,1612306,0,...,0,0,0,1,0,1,0,0,0,0
4,Yes,N,No,No,No,703,30238,6268,16466,0,...,0,0,0,1,0,1,0,0,0,0


In [186]:
Binarize(['VegMgmt',
          'PMLate',
          'WaterExposure',
          'MultipleConnects',
          'Storm']).fit_transform(dropped);

In [187]:
ChangeTypes(['Age', 'AssetZip'], [lambda x: float(x.replace(',', '.')),
                                  custom_zip_cleaning]).fit_transform(dropped);

## Pipeline Finale

In [175]:
# Preprocessing steps, in the order they are applied to the raw frame.
pipeline_steps = [
    # Remove columns that are not used downstream.
    DropColumns(['AssetCity', 'AssetId', 'AvgRepairCost', 'AssetState',
                 'MilesFromOcean', 'AssetLocation', 'locationID',
                 'Latitude1', 'Longitude1']),
    # Dummy-encode the categorical columns.
    CurrentMakeDummies(['Manufacturer', 'Repairs', 'Overloads', 'AssetType']),
    # Column-wise converters: Age (',' -> '.' then float) and AssetZip.
    ChangeTypes(['Age', 'AssetZip'],
                [lambda x: float(x.replace(',', '.')), custom_zip_cleaning]),
    # Binarize the two-valued flag columns.
    Binarize(['VegMgmt', 'PMLate', 'WaterExposure', 'MultipleConnects', 'Storm']),
    AddressLatLong(),
]
pipe = make_pipeline(*pipeline_steps)
transformed = pipe.fit_transform(df)

In [188]:
transformed['Latitude'].value_counts();