In [1]:
import pandas as pd

from sklearn.preprocessing import OneHotEncoder

# Preprocessing ev_stations data

In [2]:

stations_df = pd.read_csv(r'C:\Users\clava\data_miners\resources\TRNSFRM_ev_stations_v1.csv')
stations_df

Unnamed: 0,Station_Name,Street_Address,City,State_Code,Zip_Code,Status_Code,Access_Days_Time,EV_Level1_EVSE_Num,EV_Level2_EVSE_Num,EV_DC_Fast_Count,EV_Network,Latitude,Longitude,EV_ID,Owner_Type_Code,Open_Date,Access_Code,Facility_Type,Charging_Cost
0,LADWP - Truesdale Center,11797 Truesdale St,Sun Valley,CA,91352,E,Fleet use only,,39.0,3.0,Non-Networked,34.248319,-118.387971,1517,LG,10/15/1999,private,UTILITY,
1,LADWP - West LA District Office,1394 S Sepulveda Blvd,Los Angeles,CA,90024,E,,,4.0,,Non-Networked,34.052542,-118.448504,1519,LG,2/28/2020,private,UTILITY,Free
2,Los Angeles Convention Center,1201 S Figueroa St,Los Angeles,CA,90015,E,24 hours daily; pay lot,,12.0,,Non-Networked,34.040539,-118.271387,1523,P,8/30/1995,public,PARKING_GARAGE,
3,LADWP - John Ferraro Building,111 N Hope St,Los Angeles,CA,90012,E,For fleet and employee use only,,311.0,2.0,Non-Networked,34.059133,-118.248589,1525,LG,10/15/1999,private,UTILITY,
4,LADWP - Haynes Power Plant,6801 E 2nd St,Long Beach,CA,90803,E,Fleet use only,,19.0,1.0,Non-Networked,33.759802,-118.096665,1531,LG,5/1/2018,private,UTILITY,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50216,VIP LOT STATION1,1501 Kirkwood Meadows Dr,Markleeville,CA,96120,E,24 hours daily,,2.0,,ChargePoint Network,38.684660,-120.065169,204783,,1/14/2022,public,,
50217,Prunedale Shopping Center,8065 San Miguel Canyon Rd,Salinas,CA,93907,E,24 hours daily,,,6.0,eVgo Network,36.801716,-121.664153,204784,,1/14/2022,public,,
50218,Beaverton Electric Avenue,11665 SW Beaverton Hillsdale Hwy,Beaverton,OR,97005,E,24 hours daily,,,2.0,Greenlots,45.489030,-122.798151,204785,,1/14/2022,public,,
50219,Shell - Inman,2040 Highway 292,Inman,SC,29349,E,24 hours daily,,,1.0,Greenlots,35.082476,-82.058433,204786,,1/14/2022,public,,


In [3]:
# Drop columns with unusable date
stations_df = stations_df.drop(columns=['Status_Code', 'Access_Days_Time', 'EV_ID', 'Owner_Type_Code', 'Open_Date', 'Access_Code', 'Facility_Type', 'Charging_Cost'])

# bin network type
other_network = ['Volta', 'EV Connect', 'OpConnect', 'SemaCharge Network', 'Tesla Destination',
                 'Greenlots', 'EVGATEWAY', 'POWERFLEX', 'eVgo Network', 'Webasto', 'CHARGELAB',
                 'AMPUP', 'EVCS', 'Blink Network', 'FCN', 'Tesla', 'Electrify America', 'FLO',
                 'FPLEV', 'ZEFNET', 'LIVINGSTON', 'Electrify Canada']

for network in other_network:
  stations_df.EV_Network = stations_df.EV_Network.replace(network, 'Other-Network')

stations_df.EV_Network = stations_df.EV_Network.replace('ChargePoint Network', 'ChargePoint')

# drop rows with States not used in current model
states = ['MI', 'MN', 'NJ', 'NY', 'OR', 'TX', 'WA', 'WI']

stations_df = stations_df[stations_df.State_Code.isin(states) == True]

# FUTURE OPTION - Classify Charging_Cost as FREE or PAID - Need solution to classify NaNs
# stations_df['Charging_Cost'] = stations_df['Charging_Cost'].replace(to_replace = ('Free', 'Paid'), value = ('FREE', 'PAID'))

# replace NaNs in EVSE type columns
stations_df['EV_Level1_EVSE_Num'] = stations_df['EV_Level1_EVSE_Num'].fillna(0)
stations_df['EV_Level2_EVSE_Num'] = stations_df['EV_Level2_EVSE_Num'].fillna(0)
stations_df['EV_DC_Fast_Count'] = stations_df['EV_DC_Fast_Count'].fillna(0)

# reorder columns in table
stations_df = stations_df[['Station_Name', 'Street_Address', 'City', 'State_Code', 'Zip_Code', 'Latitude', 'Longitude', 'EV_Level1_EVSE_Num', 'EV_Level2_EVSE_Num', 'EV_DC_Fast_Count', 'EV_Network']]

# rename column names
stations_df = stations_df.rename(columns={
    'Zip_Code': 'ZIP_CODE',
    'EV_Level1_EVSE_Num': 'EV_LEVEL_1',
    'EV_Level2_EVSE_Num': 'EV_LEVEL_2',
    'EV_DC_Fast_Count': 'EV_FAST',
    'EV_Network': 'NETWORK_TYPE'})

# drop rows with nan values
stations_df = stations_df.dropna().reset_index(drop=True)

# OneHotEncode Network Type
enc = OneHotEncoder(sparse=False)
encode_df = pd.DataFrame(enc.fit_transform(stations_df.NETWORK_TYPE.values.reshape(-1,1)))
encode_df.columns = enc.get_feature_names_out(['NETWORK_TYPE'])

stations_df = stations_df.merge(encode_df, left_index=True, right_index=True).drop('NETWORK_TYPE', 1)

stations_df.head()

AttributeError: 'OneHotEncoder' object has no attribute 'get_feature_names_out'

# Preprocessing Registration Data

In [None]:
# read in CSV files
mi = (r'C:\Users\clava\data_miners\resources\mi_ev_registrations_public.csv')
mn = (r'C:\Users\clava\data_miners\resources\mn_ev_registrations_public.csv')
nj = (r'C:\Users\clava\data_miners\resources\nj_ev_registrations_public.csv')
ny = (r'C:\Users\clava\data_miners\resources\ny_ev_registrations_public.csv')
ore = (r'C:\Users\clava\data_miners\resources\or_ev_registrations_public.csv')
tx = (r'C:\Users\clava\data_miners\resources\tx_ev_registrations_public.csv')
wa = (r'C:\Users\clava\data_miners\resources\wa_ev_registrations_public.csv')
wi = (r'C:\Users\clava\data_miners\resources\wi_ev_registrations_public.csv')

# function to create dataframes from csvs
def clean_state(state):
  state_df = pd.read_csv(state)

  state_df = state_df.Zip_code.value_counts().rename_axis('ZIP_CODE').reset_index(name='REG_COUNTS')

  state_df['ZIP_CODE'] = state_df['ZIP_CODE'].astype(int)

  return state_df

# create dataframes using clean_state function
mi_df = clean_state(mi)
mn_df = clean_state(mn)
nj_df = clean_state(nj)
ny_df = clean_state(ny)
ore_df = clean_state(ore)
tx_df = clean_state(tx)
wa_df = clean_state(wa)
wi_df = clean_state(wi)



# create registration_df and set index as ZIP_CODE
registration_df = pd.concat([mi_df,
                             mn_df,
                             nj_df,
                             ore_df,
                             tx_df,
                             wa_df,
                             wi_df])

In [4]:
wa_df

NameError: name 'wa_df' is not defined

In [None]:
registration_df

# Preprocessing Median Income Data

In [None]:
median_inc = (r'C:\Users\clava\data_miners\resources\TRNSFRM_census_median_income.csv')
median_df = pd.read_csv(median_inc)
median_df = median_df[median_df.MEDIAN_INCOME != '2,500-']
median_df = median_df[median_df.MEDIAN_INCOME != '250,000+']
median_df = median_df[median_df.MEDIAN_INCOME != '-']
median_df['TOTAL_HOUSEHOLDS'] = median_df['TOTAL_HOUSEHOLDS'].astype(int)
median_df['MEDIAN_INCOME'] = median_df['MEDIAN_INCOME'].astype(int)
median_df.dtypes

In [None]:
median_df

# Final model Dataframe

In [None]:
# Drop location info for ml model
model_df = stations_df.drop(columns=['Station_Name',
                                     'Street_Address',
                                     #'City',
                                     'State_Code',
                                     'Latitude',
                                     'Longitude'])



# change data types for sorting and merging
model_df['ZIP_CODE'] = model_df['ZIP_CODE'].astype(int)
model_df['EV_LEVEL_1'] = model_df['EV_LEVEL_1'].astype(int)
model_df['EV_LEVEL_2'] = model_df['EV_LEVEL_2'].astype(int)
model_df['EV_FAST'] = model_df['EV_FAST'].astype(int)
model_df['NETWORK_TYPE_ChargePoint'] = model_df['NETWORK_TYPE_ChargePoint'].astype(int)
model_df['NETWORK_TYPE_Non-Networked'] = model_df['NETWORK_TYPE_Non-Networked'].astype(int)
model_df['NETWORK_TYPE_Other-Network'] = model_df['NETWORK_TYPE_Other-Network'].astype(int)

# mege the data fram with median_df
model_df = model_df.merge(median_df, how='left', on='ZIP_CODE')

# merge the data frame with registration_df
model_df = model_df.merge(registration_df, how='left', on='ZIP_CODE')

# fill nans with zero
model_df['REG_COUNTS'] = model_df['REG_COUNTS'].fillna(0)

model_df = model_df.dropna(how='all')

# change data type for sorting and merging
model_df['REG_COUNTS'] = model_df['REG_COUNTS'].astype(int)



model_df

In [None]:
model_df.to_csv('viz_df_citystate.csv')

In [None]:
model_df


In [None]:
# count each level type by zip code
model_df = model_df.groupby(['ZIP_CODE'], as_index=False).sum()
model_df

# soft dataframe by zip code
model_df = model_df.sort_values(by='ZIP_CODE')

# change zip code to string and reformat
model_df['ZIP_CODE'] = model_df['ZIP_CODE'].apply(lambda x: '0' + str(x) if x < 10000 else str(x)) 

# set ZIP_CODE as index
model_df = model_df.set_index('ZIP_CODE')

model_df = model_df.dropna(axis=0, how='any')

In [None]:
model_df.dtypes

In [None]:
model_df

In [None]:
model_df.nunique()

In [None]:
# export model_df for the ML Model predictions.
model_df.to_csv('viz_df.csv')