# Data Merging

Some observations
- We choose the 5 non-RCV cities with highest cosine similary score compared to the 7 RCV cities in CA
- There were 33 distinct cities among those 35 cities
- There are 66 non-registered voters among 21.7 million voters
- There are total of 3.9 million voters in the sampled cities
- City 'El Paso de Robles' didn't match in demographic data, so we manually searched for possible names for that city and found 'Paso Robles'
- We found 122 cases out of 312 with 0% voter turnout. This notebook is an attempt to identify correct election dates for each cities that were selected.
    
# Find four most recent election dates
    
Vote History file doesn't contain city, so we need to merge it with the DEMOGRAPHIC file in order to find out the four most recent election dates for the selected cities. 

1. Load the DEMOGRAPHIC parquet file with only registered voters from selected cities and of selected ethnicities.
     - Get the list of RCV and non-RCV cities computed based on cosine similarity in ca_similarity_search.ipynb
2. Merge the DEMOGRAPHIC with VOTE HISTORY data
3. Find 4 most recent General elections and 4 most recent Local_or_Municipal elections


In [1]:
import pandas as pd
import janitor
import gc
import numpy as np
import time
start_time = time.time()

In [2]:
state = 'CA' ##california
# state = 'CO' ##colorado
# state = 'MD' ##maryland
# state = 'ME' ##maine
# state = 'MN' ##minnesota
# state = 'NM' ##new mexico
# state = 'UT' ##utah
# state = 'VT' ##vermont

In [3]:
if state == "CA":
    filepath = '../data/VM2--CA--2022-04-25/'

    DEMO_filename = 'VM2--CA--2022-04-25-DEMOGRAPHIC_selected_cols.parquet'
    VOTE_filename = 'VM2--CA--2022-04-25-VOTEHISTORY_selected_cols.parquet'
    VOTE_filename_orig = 'VM2--CA--2022-04-25-VOTEHISTORY.tab'

if state=='CO':
    filepath = '../data/VM2--CO--2022-04-26/'
    DEMO_filename = 'VM2--CO--2022-04-26-DEMOGRAPHIC_selected_cols.parquet'
    VOTE_filename = 'VM2--CO--2022-04-26-VOTEHISTORY_selected_cols.parquet'
    VOTE_filename_orig = 'VM2--CO--2022-04-26-VOTEHISTORY.tab'


if state=='MD':
    filepath = '../data/VM2--MD--2022-04-08/'
    DEMO_filename = 'VM2--MD--2022-04-08-DEMOGRAPHIC_selected_cols.parquet'
    VOTE_filename = 'VM2--MD--2022-04-08-VOTEHISTORY_selected_cols.parquet'
    VOTE_filename_orig = 'VM2--MD--2022-04-08-VOTEHISTORY.tab'

if state=='ME':
    filepath = '../data/VM2--ME--2022-03-02/'
    DEMO_filename = 'VM2--ME--2022-03-02-DEMOGRAPHIC_selected_cols.parquet'
    VOTE_filename = 'VM2--ME--2022-03-02-VOTEHISTORY_selected_cols.parquet'
    VOTE_filename_orig = 'VM2--ME--2022-03-02-VOTEHISTORY.tab'


if state=='MN':
    filepath = '../data/VM2--MN--2022-03-25/'
    DEMO_filename = 'VM2--MN--2022-03-25-DEMOGRAPHIC_selected_cols.parquet'
    VOTE_filename = 'VM2--MN--2022-03-25-VOTEHISTORY_selected_cols.parquet'
    VOTE_filename_orig = 'VM2--MN--2022-03-25-VOTEHISTORY.tab'

if state=='NM':

    filepath = '../data/VM2--NM--2022-03-30/'
    DEMO_filename = 'VM2--NM--2022-03-30-DEMOGRAPHIC_selected_cols.parquet'
    VOTE_filename = 'VM2--NM--2022-03-30-VOTEHISTORY_selected_cols.parquet'
    VOTE_filename_orig = 'VM2--NM--2022-03-30-VOTEHISTORY.tab'

if state=='VT':

    filepath = '../Downloads/data/VM2--VT--2022-04-20/'
    DEMO_filename = 'VM2--VT--2022-04-20-DEMOGRAPHIC_selected_cols.parquet'
    VOTE_filename = 'VM2--VT--2022-04-20-VOTEHISTORY_selected_cols.parquet'
    VOTE_filename_orig = 'VM2--VT--2022-04-20-VOTEHISTORY.tab'

    
if state == "UT":
    filepath = '../data/VM2--UT--2022-03-30/'
    DEMO_filename = 'VM2--UT--2022-03-30-DEMOGRAPHIC_selected_cols.parquet'
    VOTE_filename = 'VM2--UT--2022-03-30-VOTEHISTORY_selected_cols.parquet'
    VOTE_filename_orig = 'VM2--UT--2022-03-30-VOTEHISTORY.tab'

## 1.  Load new Demographic Data

1. use parquet file that was created in by Reduce_to_parquet.ipynb
2. filter the data based on the list of cities found in ca_similarity_search.ipynb


In [4]:
def combine_cities_list(RCV_list, NonRCV_list):

    print("total number of cities:", len(RCV_list))

    print("number of distinct cities:", len(set(NonRCV_list)))

    print("name of cities that were duplicated:", set([x for x in NonRCV_list if NonRCV_list.count(x) > 1]))

    combined_cityName = RCV_list+list(set(NonRCV_list))
    print("number of distinct RCV and sampled nonRCV cities:", len(combined_cityName))
    return combined_cityName

In [5]:
# ## 1. List of RCV and non-RCV cities 

all_sampled_cityName = {'CA': ['San Francisco',
  'Oakland',
  'Berkeley',
  'San Leandro',
  'Palm Desert',
  'Eureka',
  'Albany',
  'Fresno',
  'San Diego',
  'Sacramento',
  'Riverside',
  'San Jose',
  'Santa Ana',
  'Anaheim',
  'Santa Rosa',
  'Merced',
  'Santa Clarita',
  'Alhambra',
  'Davis',
  'Montebello',
  'Burbank',
  'Huntington Park',
  'Bellflower',
  'Watsonville',
  'Gilroy',
  'Whittier',
  'Lynwood',
  'Lakewood',
  'Pico Rivera',
  'Lake Forest',
  'Livermore',
  'Chino Hills',
  'Paramount',
  'El Paso de Robles',
  'Pico Rivera',
  'Buena Park',
  'Whittier',
  'Calabasas',
  'Carpinteria',
  'Morro Bay',
  'San Carlos',
  'Solvang'],
 'NM': ['Los Alamos',
  'Albuquerque',
  'Rio Rancho',
  'Farmington',
  'North Valley',
  'Taos',
  'Las Cruces',
  'Silver City',
  'Roswell',
  'Lovington',
  'Deming',
  'Alamogordo',
  'Chaparral',
  'Las Vegas',
  'Los Lunas',
  'Hobbs',
  'South Valley',
  'Clovis',
  'Sunland Park',
  'Artesia',
  'Grants',
  'Carlsbad',
  'Portales',
  'Gallup',
  'Espanola',
  'Santa Fe'],
 'CO': ['Boulder',
  'Littleton',
  'Lafayette',
  'Wheat Ridge',
  'Englewood',
  'Broomfield',
  'Montrose',
  'Loveland',
  'Commerce City',
  'Longmont',
  'Golden',
  'Durango',
  'Canon City',
  'Alamosa',
  'Brighton',
  'Castle Rock',
  'Northglenn',
  'Highlands Ranch',
  'Centennial',
  'Silverthorne',
  'Steamboat Springs',
  'Sherrelwood',
  'Glenwood Springs',
  'Louisville',
  'Parker',
  'Greenwood Village',
  'Ken Caryl',
  'Arvada',
  'Cherry Creek',
  'Fountain',
  'Windsor'],
 'VT': ['Burlington',
  'South Burlington',
  'Essex',
  'Rutland',
  'Bennington',
  'Milton',
  'Essex Junction',
  'Barre',
  'Colchester',
  'Brattleboro'],
 'ME': ['Sanford',
  'Westbrook',
  'Lewiston',
  'Wells',
  'Standish',
  'Waterville',
  'Falmouth',
  'Windham',
  'Kennebunk',
  'Scarborough',
  'South Portland',
  'Bangor',
  'Augusta',
  'Brunswick',
  'Auburn',
  'Portland',
  'Biddeford',
  'York',
  'Saco',
  'Orono',
  'Gorham'],
 'MD': ['Takoma Park',
  'White Oak',
  'Bethesda',
  'Easton',
  'Cockeysville',
  'Potomac',
  'Travilah',
  'Princess Anne',
  'Glenmont',
  'Parole',
  'North Bethesda',
  'Annapolis Neck',
  'Annapolis',
  'North Potomac',
  'Timonium',
  'South Laurel',
  'Ocean Pines',
  'Calverton',
  'Kemp Mill',
  'Hyattsville',
  'Cloverly',
  'Adelphi',
  'Glassmanor',
  'College Park',
  'Redland',
  'Fairland',
  'Severna Park',
  'Westminster',
  'Colesville',
  'New Carrollton',
  'Aspen Hill'],
 'UT': ['Bluffdale',
  'Payson',
  'Cottonwood Heights',
  'Salt Lake City',
  'Sandy',
  'Midvale',
  'Draper',
  'Lehi',
  'Springville',
  'South Salt Lake',
  'Magna',
  'Heber',
  'Millcreek',
  'Riverton',
  'Highland',
  'Lindon',
  'Alpine',
  'West Haven',
  'North Logan',
  'Saratoga Springs',
  'Kaysville',
  'Brigham City',
  'North Salt Lake',
  'American Fork',
  'Washington',
  'Hurricane',
  'Vernal',
  'Holladay',
  'Herriman',
  'Ogden',
  'Provo',
  'West Valley City',
  'Logan',
  'St. George',
  'Taylorsville',
  'Layton',
  'Orem',
  'South Jordan',
  'Murray',
  'Bountiful',
  'Pleasant Grove',
  'Washington',
  'South Jordan',
  'Vernal',
  'Vernal',
  'Hurricane',
  'Herriman',
  'American Fork',
  'Washington',
  'South Jordan',
  'Clearfield',
  'Spanish Fork',
  'Tooele',
  'Kearns',
  'Pleasant Grove',
  'American Fork',
  'Herriman',
  'Eagle Mountain',
  'Vernal',
  'Washington',
  'Bountiful',
  'Pleasant Grove',
  'Hurricane',
  'Cedar City',
  'Tooele',
  'Spanish Fork',
  'Clearfield',
  'Kearns',
  'Eagle Mountain',
  'Holladay',
  'Washington',
  'Hurricane',
  'Farmington',
  'Highland',
  'Cedar City',
  'Murray',
  'Bountiful',
  'South Jordan',
  'Pleasant Grove',
  'Eagle Mountain',
  'Brigham City',
  'American Fork',
  'Herriman',
  'Spanish Fork'],
 'MN': ['St. Louis Park',
  'Bloomington',
  'Minneapolis',
  'Minnetonka',
  'Eden Prairie',
  'Winona',
  'Richfield',
  'Maplewood',
  'Brainerd',
  'Brooklyn Center',
  'Plymouth',
  'Roseville',
  'Mankato',
  'Brooklyn Park',
  'Coon Rapids',
  'Burnsville',
  'Blaine',
  'Eagan',
  'Moorhead',
  'Maple Grove',
  'St. Paul',
  'Duluth',
  'St. Cloud',
  'Rochester',
  'Brooklyn Park',
  'Mankato',
  'Coon Rapids',
  'Blaine',
  'Roseville',
  'Richfield',
  'Brainerd',
  'Eden Prairie',
  'Alexandria',
  'Bemidji',
  'Inver Grove Heights',
  'Fridley']}


combined_sampled_cityName = list(set(all_sampled_cityName[state]))

In [6]:
def read_DEMOGRAPHIC():
    df_demographic = pd.read_parquet(f'{filepath}{DEMO_filename}')
    print("Total number of unique cities:", df_demographic.Residence_Addresses_City.nunique())
    print("Total number of unique voters:", df_demographic.LALVOTERID.nunique())
    print("Count of non-registered voters:", len(df_demographic[df_demographic['Voters_OfficialRegDate'].isnull()]))
    
    print("Number of expected cities:", len(combined_sampled_cityName))
    missing_cities = [city for city in combined_sampled_cityName if city not in df_demographic['Residence_Addresses_City'].unique()]
    if len(missing_cities) > 0:
        print("number of cities not found in demographic data:", len(missing_cities))
        print(missing_cities)
        
    return df_demographic
        
state_demographic = read_DEMOGRAPHIC()

Total number of unique cities: 1533
Total number of unique voters: 21711617
Count of non-registered voters: 66
Number of expected cities: 40
number of cities not found in demographic data: 1
['El Paso de Robles']


In [8]:
state_demographic.head(5)

Unnamed: 0,LALVOTERID,Residence_Addresses_City,Voters_Gender,Voters_Age,Voters_BirthDate,Parties_Description,EthnicGroups_EthnicGroup1Desc,Voters_OfficialRegDate,County,CommercialData_Education,CommercialData_EstimatedHHIncome,CommercialData_EstimatedHHIncomeAmount,FECDonors_NumberOfDonations,FECDonors_TotalDonationsAmount
0,LALCA453164106,Oakland,F,29,04/29/1993,Democratic,Other,06/18/2021,ALAMEDA,,,,,
1,LALCA453008306,Oakland,F,26,02/02/1996,Non-Partisan,Likely African-American,04/01/2021,ALAMEDA,,,,,
2,LALCA22129469,Oakland,F,47,02/02/1975,Democratic,European,11/16/2021,ALAMEDA,HS Diploma - Extremely Likely,,,,
3,LALCA549803906,Oakland,M,60,02/09/1962,Democratic,Other,02/07/2022,ALAMEDA,,,,,
4,LALCA24729024,San Leandro,F,56,01/01/1966,Democratic,European,02/28/2016,ALAMEDA,HS Diploma - Extremely Likely,,,,


In [9]:
#standardize duplicate city names to names as presented in "cities.csv"

def rename_dup_city(df, old_name, new_name):
    print("number of records before fixing duplicates")
    print("\t",old_name, len(df[df['Residence_Addresses_City'] == old_name]))
    print("\t",new_name, len(df[df['Residence_Addresses_City'] == new_name]))
    df.loc[df['Residence_Addresses_City'] == old_name, 'Residence_Addresses_City'] = new_name
    print("number of records after fixing duplicates")
    print("\t",old_name, len(df[df['Residence_Addresses_City'] == old_name]))
    print("\t",new_name, len(df[df['Residence_Addresses_City'] == new_name]))
    return df

if state == 'CA':
    # Modifying the names of cities that were duplicate (Calabasas Hills > Calabasas and Huntington Pk > Huntington Park)
    rename_dup_city(state_demographic, 'Paso Robles', 'El Paso de Robles')
    rename_dup_city(state_demographic, 'Huntington Pk', 'Huntington Park')
    rename_dup_city(state_demographic, 'Calabasas Hills', 'Calabasas')


if state=='CO':
    # NOTE: ['Sherrelwood', 'Cherry Creek', 'Ken Caryl'] not found in demographic data so were removed from 
    # sampled_non_RCV_cities_CO list

    # Modifying the names of cities that were duplicate 
    rename_dup_city(state_demographic, 'Hghlnds Ranch', 'Highlands Ranch')
    rename_dup_city(state_demographic, 'Glenwood Spgs', 'Glenwood Springs')
    rename_dup_city(state_demographic, 'Steamboat Spr', 'Steamboat Springs')
    rename_dup_city(state_demographic, 'Greenwood Vlg', 'Greenwood Village')
    

if state=='MD':
    # NOTE: ['Colesville', 'Fairland', 'Cloverly', 'Annapolis Neck', 'Redland', 'Glenmont', 'Travilah', 
    # 'South Laurel', 'White Oak', 'Glassmanor', 'Kemp Mill', 'Parole', 'Calverton'] not found in demographic data 
    # so were removed from sampled_non_RCV_cities_MD list

    # Modifying the names of cities that were duplicate 
    rename_dup_city(state_demographic, 'N Bethesda', 'North Bethesda')
    
if state=='ME':
    # Modifying the names of cities that were duplicate 
    rename_dup_city(state_demographic, 'S Portland', 'South Portland')
    
if state=='MN':
    # Modifying the names of cities that were duplicate 
    rename_dup_city(state_demographic, 'St Louis Park', 'St. Louis Park')
    rename_dup_city(state_demographic, 'Saint Paul', 'St. Paul')
    rename_dup_city(state_demographic, 'Saint Cloud', 'St. Cloud')
    # Inver Grove Heights
    # Inver Grove

# didn't see any duplicate for new mexico
# if state=='NM':
    # NOTE: ['North Valley', 'South Valley'] not found in demographic data so were removed 
    # from sampled_non_RCV_cities_NM list
    
if state=='UT':
    # Modifying the names of cities that were duplicate 
    rename_dup_city(state_demographic, 'W Valley City', 'West Valley City')
    rename_dup_city(state_demographic, 'Saint George', 'St. George')
    rename_dup_city(state_demographic, 'St George', 'St. George')
    rename_dup_city(state_demographic, 'Saratoga Spgs', 'Saratoga Springs')
    rename_dup_city(state_demographic, 'Salt Lake Cty', 'Salt Lake City')
    rename_dup_city(state_demographic, 'S Salt Lake', 'South Salt Lake')
    rename_dup_city(state_demographic, 'Pleasant Grv', 'Pleasant Grove')
    rename_dup_city(state_demographic, 'N Salt Lake', 'North Salt Lake')

if state=='VT':
    # Modifying the names of cities that were duplicate 
    rename_dup_city(state_demographic, 'Essex Jct', 'Essex Junction')
    rename_dup_city(state_demographic, 'S Burlington', 'South Burlington')
    

number of records before fixing duplicates
	 Paso Robles 28787
	 El Paso de Robles 0
number of records after fixing duplicates
	 Paso Robles 0
	 El Paso de Robles 28787
number of records before fixing duplicates
	 Huntington Pk 711
	 Huntington Park 28745
number of records after fixing duplicates
	 Huntington Pk 0
	 Huntington Park 29456
number of records before fixing duplicates
	 Calabasas Hills 1
	 Calabasas 18630
number of records after fixing duplicates
	 Calabasas Hills 0
	 Calabasas 18631


In [10]:
# 2. filter DEMOGRAPHIC data based on the list of cities, ethnicities and registered voters

selected_ethnicities = ['European', 'Likely African-American','Hispanic and Portuguese', 'East and South Asian', 'Other']

def filter_demo(df, list_cityNames):
    filtered_df = df[df['Residence_Addresses_City'].isin(list_cityNames) &
            df['EthnicGroups_EthnicGroup1Desc'].isin(selected_ethnicities) &
            df['Voters_OfficialRegDate'].notnull()][['LALVOTERID', 'Residence_Addresses_City']]
    
    print(filtered_df.shape)
    print("number of unique cities:", filtered_df.Residence_Addresses_City.nunique())
    
    return filtered_df

state_demographic_subset = filter_demo(df = state_demographic, list_cityNames = combined_sampled_cityName)
state_demographic_subset.head()

(4140404, 2)
number of unique cities: 40


Unnamed: 0,LALVOTERID,Residence_Addresses_City
0,LALCA453164106,Oakland
1,LALCA453008306,Oakland
2,LALCA22129469,Oakland
3,LALCA549803906,Oakland
4,LALCA24729024,San Leandro


In [11]:
count_total_voters = state_demographic_subset.groupby('Residence_Addresses_City').size().reset_index()
count_total_voters.columns = ['Residence_Addresses_City', 'total_voters']
count_total_voters.head()

Unnamed: 0,Residence_Addresses_City,total_voters
0,Albany,10397
1,Alhambra,42566
2,Anaheim,161104
3,Bellflower,37759
4,Berkeley,63930


In [12]:
del state_demographic
gc.collect()

20

## 2. Merge VoteHistory with DEMOGRAPHIC Data 
1. Kernel died when trying to load all General and Local, so we load the two types of elections separately
    1. Load the original data in order to get the complete list of all possible columns containing "General" and "Local_or_Municipal" (need only one row)
    2. create two lists with the column names one for each type of election
2. merge only city (`Residence_Addresses_City`) from DEMOGRAPHIC file to VOTE HISTORY reduce computation time

In [13]:
# 1.A select only one rows to find the column names that are General and Local_or_Municipal elections
# need to use original tab file because pandas' read_parquet doesn't support nrows

state_voterhistory_cols = pd.read_csv(f'{filepath}{VOTE_filename_orig}',
                                 sep='\t', dtype=str, encoding='unicode_escape',
                                nrows=1)
state_voterhistory_cols

Unnamed: 0,LALVOTERID,Special_2022_04_19,Special_2022_04_12,Special_2022_04_05,Special_2022_02_15,Special_2022_02_01,Special_2021_12_14,Special_2021_12_07,Special_2021_11_02,Consolidated_General_2021_11_02,...,BallotReturnDate_General_2018_11_06,BallotReturnDate_Primary_2018_06_05,BallotReturnDate_General_2016_11_08,BallotReturnDate_Primary_2016_06_07,BallotReturnDate_General_2014_11_04,BallotReturnDate_Primary_2014_06_03,BallotReturnDate_General_2012_11_06,BallotReturnDate_Primary_2012_06_05,BallotReturnDate_General_2010_11_02,BallotReturnDate_Primary_2010_06_08
0,LALCA453164106,,,,,,,,,,...,,,11/07/2016,,,,,,,


In [14]:
# 1.B select only voter ID and columns with General or Local_or_Municipal election dates
def get_elec_cols(df, string):
    matched_cols = [col for col in df.columns if col.startswith(string)]
    parse_date_df = pd.DataFrame(matched_cols, columns = ['elec_type_date'])
    parse_date_df['elec_year'] = parse_date_df['elec_type_date'].str[-10:-6].astype(int)
#     print(parse_date_df.head())
    print("total number of dates:", len(parse_date_df))
    parse_date_df = parse_date_df[parse_date_df['elec_year'] >= 2008]
    print("number of dates on 2008 and after:", len(parse_date_df))
    return list(parse_date_df['elec_type_date'])
    
print("General election dates")
GE_cols = get_elec_cols(state_voterhistory_cols, 'General')

print("\nLocal or Municipal election dates")
LM_cols = get_elec_cols(state_voterhistory_cols, 'Local_or_Municipal')

print("\nConsolidated General election dates")
CG_cols = get_elec_cols(state_voterhistory_cols, 'Consolidated_General')

General election dates
total number of dates: 18
number of dates on 2008 and after: 7

Local or Municipal election dates
total number of dates: 131
number of dates on 2008 and after: 110

Consolidated General election dates
total number of dates: 18
number of dates on 2008 and after: 7


In [15]:
print(LM_cols)
print(CG_cols)

['Local_or_Municipal_2021_08_31', 'Local_or_Municipal_2021_07_20', 'Local_or_Municipal_2021_06_08', 'Local_or_Municipal_2021_06_01', 'Local_or_Municipal_2021_05_11', 'Local_or_Municipal_2021_05_04', 'Local_or_Municipal_2021_04_20', 'Local_or_Municipal_2021_03_09', 'Local_or_Municipal_2021_03_02', 'Local_or_Municipal_2020_08_03', 'Local_or_Municipal_2020_05_19', 'Local_or_Municipal_2020_05_05', 'Local_or_Municipal_2020_04_14', 'Local_or_Municipal_2019_08_27', 'Local_or_Municipal_2019_08_13', 'Local_or_Municipal_2019_06_04', 'Local_or_Municipal_2019_05_07', 'Local_or_Municipal_2019_04_16', 'Local_or_Municipal_2019_03_05', 'Local_or_Municipal_2018_07_24', 'Local_or_Municipal_2018_05_21', 'Local_or_Municipal_2018_04_10', 'Local_or_Municipal_2018_03_06', 'Local_or_Municipal_2018_01_30', 'Local_or_Municipal_2017_08_29', 'Local_or_Municipal_2017_07_11', 'Local_or_Municipal_2017_06_30', 'Local_or_Municipal_2017_06_06', 'Local_or_Municipal_2017_05_09', 'Local_or_Municipal_2017_05_02', 'Local_or

In [16]:
del state_voterhistory_cols
gc.collect()

63

In [None]:
# 2. read the VOTEHISTORY parquet file and merge the city from DEMOGRAPHIC file 
df_voterhistory_LM = pd.merge(state_demographic_subset, 
                              pd.read_parquet(f'{filepath}{VOTE_filename}', columns =['LALVOTERID'] +LM_cols), 
                               how='inner', on = 'LALVOTERID') 
df_voterhistory_LM

In [None]:
# 2.1. reduce number of columns by removing columns if all rows are None

def remove_all_None(df, selected_cols):
    print("-"*20, "\nBefore filtering Vote and Demographic\n", "-"*20)
    print("Total number of records", len(df))
    if len([col for col in selected_cols if 'General' in col]) > 0: 
        print("Total number of General election dates", len(get_elec_cols(df, 'General')))
    if len([col for col in selected_cols if 'Local_or_Municipal' in col]) > 0: 
        print("Total number of Local or Municipal election dates", len(get_elec_cols(df, 'Local_or_Municipal')))
    if len([col for col in selected_cols if 'Consolidated_General' in col]) > 0: 
        print("Total number of Consolidated General election dates", len(get_elec_cols(df, 'Consolidated_General')))
    print("\n")

    # reduce the search space with this step
    cols_all_None = [col for col in selected_cols if len(df[col].value_counts()) == 0]
    print("number of columns with all None:", len(cols_all_None))
    print(cols_all_None)
    if len(cols_all_None) > 0:
        df = df.drop(columns = cols_all_None)
    
    print("-"*20, "\nAfter removing dates with all None\n", "-"*20)
    print("Total number of records", len(df))
    if len([col for col in selected_cols if 'General' in col]) > 0: 
        print("Total number of General election dates", len(get_elec_cols(df, 'General')))
    if len([col for col in selected_cols if 'Local_or_Municipal' in col]) > 0: 
        print("Total number of Local or Municipal election dates", len(get_elec_cols(df, 'Local_or_Municipal')))
    if len([col for col in selected_cols if 'Consolidated_General' in col]) > 0:
        print("Total number of Consolidated General election dates", len(get_elec_cols(df, 'Consolidated_General')))

    gc.collect()
    return df

In [None]:
df_voterhistory_LM = remove_all_None(df = df_voterhistory_LM, selected_cols = LM_cols)


## 3. Find 4 most recent General elections and 4 most recent Local_or_Municipal elections
1. reduce number of columns by removing columns if all rows are None
2. run a loop to only keep election date columns that are associated with the chosen subset of cities. 
3. create two dictionaries
    1. `init_city_cnt_dates_{LM|CG|GE}` will count the number of election dates for each city
        - for each city if the count of election dates reaches 4 then stop checking more dates for that city (to do this we will remove the city from the `init_city_cnt_dates_{LM|CG|GE}` dictionary) 
    2. `init_city_4_dates_{LM|CG|GE}` will keep track of the cities and their election dates
        - for a given election date if at least one voter has "Y" then proceed to find which cities took part on that date
        - for each city in `init_city_cnt_dates_{LM|CG|GE}` if the city is also present in the dataframe (i.e. the vote "Y" is counted) then increment the count by 1 in `init_city_cnt_dates_{LM|CG|GE}` and also add the date to `init_city_4_dates_{LM|CG|GE}`

    

In [153]:
df_voterhistory_LM.head()

Unnamed: 0,LALVOTERID,Residence_Addresses_City,Local_or_Municipal_2018_01_08,Local_or_Municipal_2017_10_02,Local_or_Municipal_2017_07_29,Local_or_Municipal_2017_05_02,Local_or_Municipal_2017_05_01,Local_or_Municipal_2015_07_25
0,LALMD1367135,Severna Park,,,,,,
1,LALMD1926065,Annapolis,,,,,,
2,LALMD380171,Severna Park,,,,,,
3,LALMD1935983,Annapolis,,,,,,
4,LALMD3801734,Annapolis,,,,,,


In [154]:
def get_list_elec_dates(df, date_cols, list_city_cnt_dates, list_city_4_dates):
    for date_col in date_cols:
        cnt_df = df[df[date_col] == 'Y'][[date_col, 'Residence_Addresses_City']].groupby('Residence_Addresses_City').count()
        cnt_df = cnt_df.merge(count_total_voters, on = 'Residence_Addresses_City')
        cnt_df['perc_voters'] = cnt_df[date_col]/cnt_df['total_voters']*100
        cnt_df = cnt_df.set_index('Residence_Addresses_City')

        # If no rows found then none of the city had election held on that date 
        # assuming that at least one voter will present on an election date
        
        if len(cnt_df) > 1 and len(list_city_cnt_dates) > 0:
            # for the selected date check which cities held the election on that date
            for city in list(list_city_cnt_dates.keys()): 
                # first check if the city is present in list_city_cnt_dates, 
                # not being present means we have already found the dates so no need to check 
                if city in cnt_df.index and cnt_df.loc[city]['perc_voters'] > 5:
                    print(cnt_df.loc[city])
                    # older cnt_df.loc[city][date_col] > 500:
                    # second check if the city is present in the dataframe with "Y"
                    # not being present means the date is not the election date for this city
                    list_city_cnt_dates[city] = list_city_cnt_dates[city]+1
                    list_city_4_dates[city].append(date_col)                
                    if list_city_cnt_dates[city] == 4:
                        # remove the city from dictionary list_city_cnt_dates so that we know when to stop checking for more dates
                        del list_city_cnt_dates[city]

        elif len(cnt_df) == 0:
            print("No cities found for ", date_col)
            
        elif len(list_city_cnt_dates) == 0:
            # means all 4 dates for all cities found since we removed cities every time 4 dates were found
            break
            
    return list_city_cnt_dates, list_city_4_dates

In [155]:
init_city_cnt_dates_LM = {key: 0 for key in df_voterhistory_LM['Residence_Addresses_City']}
init_city_4_dates_LM = {key: [] for key in df_voterhistory_LM['Residence_Addresses_City']}

#need to recompute the list of election dates because some columns were removed in the previous step
LM_cols = get_elec_cols(df_voterhistory_LM, 'Local_or_Municipal')

list_city_cnt_dates_LM, list_city_4_dates_LM = get_list_elec_dates(df_voterhistory_LM, 
                                                                   LM_cols, 
                                                                   init_city_cnt_dates_LM, 
                                                                   init_city_4_dates_LM)

if len(list_city_cnt_dates_LM) == 0:
    print("\nAll local and municipal election dates found!")
else:
    print("\nNeed to find more local and municipal election dates!!!")
print(list_city_cnt_dates_LM)
list_city_4_dates_LM

total number of dates: 6
number of dates on 2008 and after: 6

Need to find more local and municipal election dates!!!
{'Severna Park': 0, 'Annapolis': 0, 'Cockeysville': 0, 'Timonium': 0, 'Westminster': 0, 'Bethesda': 0, 'North Potomac': 0, 'Potomac': 0, 'Takoma Park': 0, 'North Bethesda': 0, 'Adelphi': 0, 'Aspen Hill': 0, 'College Park': 0, 'Hyattsville': 0, 'New Carrollton': 0, 'Princess Anne': 0, 'Easton': 0, 'Ocean Pines': 0}


{'Severna Park': [],
 'Annapolis': [],
 'Cockeysville': [],
 'Timonium': [],
 'Westminster': [],
 'Bethesda': [],
 'North Potomac': [],
 'Potomac': [],
 'Takoma Park': [],
 'North Bethesda': [],
 'Adelphi': [],
 'Aspen Hill': [],
 'College Park': [],
 'Hyattsville': [],
 'New Carrollton': [],
 'Princess Anne': [],
 'Easton': [],
 'Ocean Pines': []}

In [156]:
del df_voterhistory_LM
gc.collect()

21

## Redo 2.2 and all steps of 3 on Consolidated General election

In [157]:
# 2. read the VOTEHISTORY parquet file and merge the city from DEMOGRAPHIC file 
df_voterhistory_CG = pd.merge(state_demographic_subset, 
                              pd.read_parquet(f'{filepath}{VOTE_filename}', columns =['LALVOTERID'] + CG_cols), 
                               how='inner', on = 'LALVOTERID') 
df_voterhistory_CG

Unnamed: 0,LALVOTERID,Residence_Addresses_City,Consolidated_General_2017_11_07,Consolidated_General_2013_11_05,Consolidated_General_2011_11_08,Consolidated_General_2009_11_03
0,LALMD1367135,Severna Park,,,,
1,LALMD1926065,Annapolis,,,,
2,LALMD380171,Severna Park,,,,
3,LALMD1935983,Annapolis,,,,
4,LALMD3801734,Annapolis,,,,
...,...,...,...,...,...,...
307378,LALMD3328771,Ocean Pines,,,,
307379,LALMD3355955,Ocean Pines,,,,
307380,LALMD630189591,Ocean Pines,,,,
307381,LALMD3166841,Ocean Pines,,,,


In [158]:
df_voterhistory_CG = remove_all_None(df_voterhistory_CG, CG_cols)

init_city_cnt_dates_CG = {key: 0 for key in df_voterhistory_CG['Residence_Addresses_City']}
init_city_4_dates_CG = {key: [] for key in df_voterhistory_CG['Residence_Addresses_City']}

CG_cols = get_elec_cols(df_voterhistory_CG, 'Consolidated_General')

list_city_cnt_dates_CG, list_city_4_dates_CG = get_list_elec_dates(df_voterhistory_CG, 
                                                                   CG_cols, 
                                                                   init_city_cnt_dates_CG, 
                                                                   init_city_4_dates_CG)

if len(list_city_cnt_dates_CG) == 0:
    print("\nAll Consolidated General election dates found!")
else:
    print("\nNeed to find more Consolidated General election dates!!!")
print(list_city_cnt_dates_CG)
list_city_4_dates_CG

-------------------- 
Before filtering Vote and Demographic
 --------------------
Total number of records 307383
total number of dates: 0
number of dates on 2008 and after: 0
Total number of General election dates 0
total number of dates: 4
number of dates on 2008 and after: 4
Total number of Consolidated General election dates 4


number of columns with all None: 0
[]
-------------------- 
After removing dates with all None
 --------------------
Total number of records 307383
total number of dates: 0
number of dates on 2008 and after: 0
Total number of General election dates 0
total number of dates: 4
number of dates on 2008 and after: 4
Total number of Consolidated General election dates 4
total number of dates: 4
number of dates on 2008 and after: 4
Consolidated_General_2017_11_07     7010.00000
total_voters                       59711.00000
perc_voters                           11.73988
Name: Annapolis, dtype: float64
Consolidated_General_2017_11_07     1598.000000
total_voters    

{'Severna Park': [],
 'Annapolis': ['Consolidated_General_2017_11_07',
  'Consolidated_General_2013_11_05',
  'Consolidated_General_2009_11_03'],
 'Cockeysville': [],
 'Timonium': [],
 'Westminster': [],
 'Bethesda': [],
 'North Potomac': [],
 'Potomac': [],
 'Takoma Park': [],
 'North Bethesda': [],
 'Adelphi': [],
 'Aspen Hill': [],
 'College Park': ['Consolidated_General_2017_11_07'],
 'Hyattsville': [],
 'New Carrollton': [],
 'Princess Anne': [],
 'Easton': [],
 'Ocean Pines': []}

## Combine Consolidated_General and Local_or_Municipal and find the most recent dates


In [159]:
def comb_LM_CG(LM_dict, CG_dict):
    list_city_4_dates_LM_combined  = LM_dict.copy()
    list_city_4_dates_LM_combined
    for key in list_city_4_dates_LM_combined:
        list_city_4_dates_LM_combined[key] = list_city_4_dates_LM_combined[key] + CG_dict[key]
    election_df = pd.DataFrame(dict([(k,pd.Series(v)) 
                                     for k,v in list_city_4_dates_LM_combined.items()])).T.reset_index()
    print(election_df.head())
    election_df = election_df.melt(id_vars=['index'], 
                  value_vars=list(election_df.columns[1:]),
                  var_name='date_index',
                  value_name='elec_type_date')

    election_df = election_df.drop(columns = ['date_index'])
    election_df['date'] = election_df['elec_type_date'].str[-10:]
    election_df = election_df.sort_values(by=['index', 'date'], ascending= False)
#     print(election_df.head())
    
    init_city_4_dates_LM_combined = {key: [] for key in df_voterhistory_CG['Residence_Addresses_City']}

    for city in election_df['index'].unique():
        init_city_4_dates_LM_combined[city] = list(election_df[election_df['index'] == city]['elec_type_date'][:4]) 
    return init_city_4_dates_LM_combined
    
    
list_city_4_dates_LM_combined = comb_LM_CG(list_city_4_dates_LM, list_city_4_dates_CG)
list_city_4_dates_LM_combined

          index                                0  \
0  Severna Park                              NaN   
1     Annapolis  Consolidated_General_2017_11_07   
2  Cockeysville                              NaN   
3      Timonium                              NaN   
4   Westminster                              NaN   

                                 1                                2  
0                              NaN                              NaN  
1  Consolidated_General_2013_11_05  Consolidated_General_2009_11_03  
2                              NaN                              NaN  
3                              NaN                              NaN  
4                              NaN                              NaN  


  election_df = pd.DataFrame(dict([(k,pd.Series(v))


{'Severna Park': [nan, nan, nan],
 'Annapolis': ['Consolidated_General_2017_11_07',
  'Consolidated_General_2013_11_05',
  'Consolidated_General_2009_11_03'],
 'Cockeysville': [nan, nan, nan],
 'Timonium': [nan, nan, nan],
 'Westminster': [nan, nan, nan],
 'Bethesda': [nan, nan, nan],
 'North Potomac': [nan, nan, nan],
 'Potomac': [nan, nan, nan],
 'Takoma Park': [nan, nan, nan],
 'North Bethesda': [nan, nan, nan],
 'Adelphi': [nan, nan, nan],
 'Aspen Hill': [nan, nan, nan],
 'College Park': ['Consolidated_General_2017_11_07', nan, nan],
 'Hyattsville': [nan, nan, nan],
 'New Carrollton': [nan, nan, nan],
 'Princess Anne': [nan, nan, nan],
 'Easton': [nan, nan, nan],
 'Ocean Pines': [nan, nan, nan]}

In [161]:
def check_nan(list_var):
    print("total number of cities before cleaning:", len(list_var))
    check_nans = pd.DataFrame(list_var).dropna(axis='columns')
    print("total number of cities after cleaning:", check_nans.shape[1])
    check_nans = check_nans.T
    check_nans.index.name = 'city'
    check_nans = check_nans.reset_index()
    return check_nans
    
list_city_4_dates_LM_combined_cleaned =  check_nan(list_city_4_dates_LM_combined).set_index('city').T.to_dict('list')
print("list of consolidated general and local or municipal election dates changed:", list_city_4_dates_LM_combined != list_city_4_dates_LM_combined_cleaned)
list_city_4_dates_LM_combined_cleaned

total number of cities before cleaning: 18
total number of cities after cleaning: 1
list of consolidated general and local or municipal election dates changed: True


{'Annapolis': ['Consolidated_General_2017_11_07',
  'Consolidated_General_2013_11_05',
  'Consolidated_General_2009_11_03']}

In [162]:
# remove cities with less than 4 LM or CG dates
cities_w_no_LM_CG = []
for city in list_city_4_dates_LM_combined_cleaned:
    if len(list_city_4_dates_LM_combined_cleaned[city]) < 4:
        cities_w_no_LM_CG.append(city)
for city in cities_w_no_LM_CG :
    del list_city_4_dates_LM_combined_cleaned[city]
    
list_city_4_dates_LM_combined_cleaned

{}

In [163]:
del df_voterhistory_CG
gc.collect()

21

## Redo 2.2 and all steps of 3 on General election

In [164]:
# 2. read the VOTEHISTORY parquet file and merge the city from DEMOGRAPHIC file 
df_voterhistory_GE = pd.merge(state_demographic_subset, 
                              pd.read_parquet(f'{filepath}{VOTE_filename}', columns =['LALVOTERID'] + GE_cols), 
                               how='inner', on = 'LALVOTERID') 
df_voterhistory_GE

Unnamed: 0,LALVOTERID,Residence_Addresses_City,General_2020_11_03,General_2018_11_06,General_2016_11_08,General_2014_11_04,General_2012_11_06,General_2010_11_02,General_2008_11_04
0,LALMD1367135,Severna Park,Y,,Y,,Y,,Y
1,LALMD1926065,Annapolis,,,,,,,
2,LALMD380171,Severna Park,Y,Y,Y,,Y,Y,Y
3,LALMD1935983,Annapolis,Y,Y,Y,Y,Y,,Y
4,LALMD3801734,Annapolis,Y,Y,Y,Y,Y,Y,
...,...,...,...,...,...,...,...,...,...
307378,LALMD3328771,Ocean Pines,Y,,Y,Y,Y,,
307379,LALMD3355955,Ocean Pines,Y,,Y,,Y,,Y
307380,LALMD630189591,Ocean Pines,,,,,,,
307381,LALMD3166841,Ocean Pines,Y,Y,Y,,Y,Y,Y


In [165]:
df_voterhistory_GE = remove_all_None(df_voterhistory_GE, GE_cols)

init_city_cnt_dates_GE = {key: 0 for key in df_voterhistory_GE['Residence_Addresses_City']}
init_city_4_dates_GE = {key: [] for key in df_voterhistory_GE['Residence_Addresses_City']}

GE_cols = get_elec_cols(df_voterhistory_GE, 'General')

list_city_cnt_dates_GE, list_city_4_dates_GE = get_list_elec_dates(df_voterhistory_GE, 
                                                                   GE_cols, 
                                                                   init_city_cnt_dates_GE, 
                                                                   init_city_4_dates_GE)
# Remove cities with no general elections
cities_w_no_GE = []
for city in list_city_4_dates_GE:
    if len(list_city_4_dates_GE[city]) < 4:
        cities_w_no_GE.append(city)
for city in cities_w_no_GE:
    del list_city_4_dates_GE[city]


if len(list_city_cnt_dates_GE) == 0:
    print("\nAll local and municipal election dates found!")
else:
    print("\nNeed to find more local and municipal election dates!!!")
print(list_city_cnt_dates_GE)
list_city_4_dates_GE

-------------------- 
Before filtering Vote and Demographic
 --------------------
Total number of records 307383
total number of dates: 7
number of dates on 2008 and after: 7
Total number of General election dates 7


number of columns with all None: 0
[]
-------------------- 
After removing dates with all None
 --------------------
Total number of records 307383
total number of dates: 7
number of dates on 2008 and after: 7
Total number of General election dates 7
total number of dates: 7
number of dates on 2008 and after: 7
General_2020_11_03    15475.000000
total_voters          19552.000000
perc_voters              79.147913
Name: Severna Park, dtype: float64
General_2020_11_03    45280.000000
total_voters          59711.000000
perc_voters              75.831924
Name: Annapolis, dtype: float64
General_2020_11_03    10253.000000
total_voters          14396.000000
perc_voters              71.221173
Name: Cockeysville, dtype: float64
General_2020_11_03      6.0
total_voters            

{'Severna Park': ['General_2020_11_03',
  'General_2018_11_06',
  'General_2016_11_08',
  'General_2014_11_04'],
 'Annapolis': ['General_2020_11_03',
  'General_2018_11_06',
  'General_2016_11_08',
  'General_2014_11_04'],
 'Cockeysville': ['General_2020_11_03',
  'General_2018_11_06',
  'General_2016_11_08',
  'General_2014_11_04'],
 'Timonium': ['General_2020_11_03',
  'General_2018_11_06',
  'General_2016_11_08',
  'General_2014_11_04'],
 'Westminster': ['General_2020_11_03',
  'General_2018_11_06',
  'General_2016_11_08',
  'General_2014_11_04'],
 'Bethesda': ['General_2020_11_03',
  'General_2018_11_06',
  'General_2016_11_08',
  'General_2014_11_04'],
 'North Potomac': ['General_2020_11_03',
  'General_2018_11_06',
  'General_2016_11_08',
  'General_2014_11_04'],
 'Potomac': ['General_2020_11_03',
  'General_2018_11_06',
  'General_2016_11_08',
  'General_2014_11_04'],
 'Takoma Park': ['General_2020_11_03',
  'General_2018_11_06',
  'General_2016_11_08',
  'General_2014_11_04'],


In [166]:
list_city_4_dates_GE

{'Severna Park': ['General_2020_11_03',
  'General_2018_11_06',
  'General_2016_11_08',
  'General_2014_11_04'],
 'Annapolis': ['General_2020_11_03',
  'General_2018_11_06',
  'General_2016_11_08',
  'General_2014_11_04'],
 'Cockeysville': ['General_2020_11_03',
  'General_2018_11_06',
  'General_2016_11_08',
  'General_2014_11_04'],
 'Timonium': ['General_2020_11_03',
  'General_2018_11_06',
  'General_2016_11_08',
  'General_2014_11_04'],
 'Westminster': ['General_2020_11_03',
  'General_2018_11_06',
  'General_2016_11_08',
  'General_2014_11_04'],
 'Bethesda': ['General_2020_11_03',
  'General_2018_11_06',
  'General_2016_11_08',
  'General_2014_11_04'],
 'North Potomac': ['General_2020_11_03',
  'General_2018_11_06',
  'General_2016_11_08',
  'General_2014_11_04'],
 'Potomac': ['General_2020_11_03',
  'General_2018_11_06',
  'General_2016_11_08',
  'General_2014_11_04'],
 'Takoma Park': ['General_2020_11_03',
  'General_2018_11_06',
  'General_2016_11_08',
  'General_2014_11_04'],


In [167]:
list_city_4_dates_GE_cleaned =  check_nan(list_city_4_dates_GE).set_index('city').T.to_dict('list')
print("list of general election dates changed:", list_city_4_dates_GE_cleaned != list_city_4_dates_GE)
# list_city_4_dates_GE_cleaned

total number of cities before cleaning: 17
total number of cities after cleaning: 17
list of general election dates changed: False


In [168]:
GE_dates_df = pd.DataFrame(list_city_4_dates_GE_cleaned.items(), columns=['city', 'GE_dates'])
LM_dates_df = pd.DataFrame(list_city_4_dates_LM_combined_cleaned.items(), columns=['city', 'LM_dates'])

GE_LM_dates_df = GE_dates_df.merge(LM_dates_df, on = "city", how = "outer")
print(GE_LM_dates_df.shape)
GE_LM_dates_df

(17, 3)


Unnamed: 0,city,GE_dates,LM_dates
0,Severna Park,"[General_2020_11_03, General_2018_11_06, Gener...",
1,Annapolis,"[General_2020_11_03, General_2018_11_06, Gener...",
2,Cockeysville,"[General_2020_11_03, General_2018_11_06, Gener...",
3,Timonium,"[General_2020_11_03, General_2018_11_06, Gener...",
4,Westminster,"[General_2020_11_03, General_2018_11_06, Gener...",
5,Bethesda,"[General_2020_11_03, General_2018_11_06, Gener...",
6,North Potomac,"[General_2020_11_03, General_2018_11_06, Gener...",
7,Potomac,"[General_2020_11_03, General_2018_11_06, Gener...",
8,Takoma Park,"[General_2020_11_03, General_2018_11_06, Gener...",
9,North Bethesda,"[General_2020_11_03, General_2018_11_06, Gener...",


In [169]:
## save in parquet format
GE_LM_dates_df.to_parquet(f'{filepath}GE_LM_dates_per_city_{state}.parquet')

In [170]:
end_time = time.time()
print("Time take to run this notebook in seconds: ", end_time - start_time)

Time take to run this notebook in seconds:  39.52402424812317


In [171]:
del df_voterhistory_GE, GE_LM_dates_df
gc.collect()

21

In [None]:
# GE_LM_dates_df= pd.read_parquet(f'{filepath}GE_LM_dates_per_city_{state}.parquet').sort_values('city')
# GE_LM_dates_df.head()

In [None]:
# GE_LM_dates_df_old= pd.read_parquet('../data/GE_LM_dates_per_city_UT_old.parquet').sort_values('city')
# GE_LM_dates_df_old.head()

In [None]:
## CA
# GE_LM_dates_df_old.loc[GE_LM_dates_df_old['city'] == 'Paso Robles', 'city'] = 'El Paso de Robles'
# GE_LM_dates_df_old.loc[GE_LM_dates_df_old['city'] == 'Huntington Pk', 'city'] = 'Huntington Park'
# GE_LM_dates_df_old.loc[GE_LM_dates_df_old['city'] == 'Calabasas Hills', 'city'] = 'Calabasas'
# GE_LM_dates_df_old=GE_LM_dates_df_old.sort_values('city')

## UT
# GE_LM_dates_df_old.loc[GE_LM_dates_df_old['city'] == 'W Valley City', 'city'] = 'West Valley City'
# GE_LM_dates_df_old.loc[GE_LM_dates_df_old['city'] == 'Saint George', 'city'] =  'St. George'
# GE_LM_dates_df_old.loc[GE_LM_dates_df_old['city'] == 'St George', 'city'] =  'St. George'

# GE_LM_dates_df_old.loc[GE_LM_dates_df_old['city'] ==  'Saratoga Spgs', 'city'] = 'Saratoga Springs'
# GE_LM_dates_df_old.loc[GE_LM_dates_df_old['city'] ==  'Salt Lake Cty', 'city'] =  'Salt Lake City'
# GE_LM_dates_df_old.loc[GE_LM_dates_df_old['city'] == 'S Salt Lake', 'city'] =  'South Salt Lake'

# GE_LM_dates_df_old.loc[GE_LM_dates_df_old['city'] == 'Pleasant Grv', 'city'] =   'Pleasant Grove'
# GE_LM_dates_df_old.loc[GE_LM_dates_df_old['city'] == 'N Salt Lake', 'city'] =  'North Salt Lake'

# GE_LM_dates_df_old=GE_LM_dates_df_old.sort_values('city')
