In [1]:
import pandas as pd
import numpy as np
from numpy.linalg import norm
# Directory prefix for the input CSV files. Later cells build paths as
# f'{filepath}<name>.csv', so the trailing slash must be kept.
filepath = '../data/'


In [2]:
# Columns kept from each raw CSV by read_cities(), in display order.
metrics = [
    # RCV flag plus the identifying columns
    'RCV', 'state_name', 'city',
    # size and age
    'population', 'age_median',
    # income and housing
    'income_household_median', 'home_value', 'rent_median',
    # education and employment
    'education_college_or_above', 'unemployment_rate',
    # race / ethnicity shares
    'race_white', 'hispanic',
]

def read_cities(filename):
    """Load a city CSV and keep only the columns listed in ``metrics``.

    Parameters
    ----------
    filename : str or path-like
        Location of the CSV file, passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The file's rows restricted to the ``metrics`` columns, in that order.
        Selecting a column that is absent from the file raises ``KeyError``.
    """
    return pd.read_csv(filename).loc[:, metrics]

def get_rvn_nonrcv_cities(df):
    """Split a city table into RCV and non-RCV cities.

    A row counts as RCV when its ``RCV`` value equals 1; every other row
    (including rows where ``RCV`` is missing) counts as non-RCV.

    Parameters
    ----------
    df : pandas.DataFrame
        City table containing an ``RCV`` column.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(rcv_cities, non_rcv_cities)``. Each frame is renumbered from 0 and
        keeps the original row labels in a leading ``index`` column.
    """
    is_rcv = df['RCV'] == 1
    rcv_part = df.loc[is_rcv].reset_index()
    non_rcv_part = df.loc[~is_rcv].reset_index()
    return rcv_part, non_rcv_part
    
def find_similar(rcv_city_lookup, rcv_cities_df, non_rcv_cities_df,  n=1):
    """For a given RCV city, find the top ``n`` most similar non-RCV cities.

    Similarity is the cosine similarity between the two cities' numeric
    feature columns. The ``RCV`` flag and the ``index`` column that
    ``reset_index()`` adds are bookkeeping, not features, and are excluded;
    non-numeric columns such as ``city`` are skipped as well.

    NOTE(review): features are compared unscaled, so columns with large
    magnitudes (e.g. population, home value) dominate the score. Consider
    standardising the features if that is not intended.

    Parameters
    ----------
    rcv_city_lookup : str
        Name of the RCV city, matched against ``rcv_cities_df['city']``.
    rcv_cities_df : pandas.DataFrame
        RCV cities; must contain ``city`` and the numeric feature columns.
    non_rcv_cities_df : pandas.DataFrame
        Candidate cities with the same feature columns. It is not modified.
    n : int, default 1
        Number of best matches to return.

    Returns
    -------
    pandas.DataFrame
        Copy of the ``n`` best rows of ``non_rcv_cities_df`` with an added
        ``similarity`` column, sorted from most to least similar.

    Raises
    ------
    ValueError
        If ``rcv_city_lookup`` is not present in ``rcv_cities_df``.
    """
    matches = rcv_cities_df[rcv_cities_df['city'] == rcv_city_lookup]
    if matches.empty:
        raise ValueError(f"city {rcv_city_lookup!r} not found in rcv_cities_df")

    # Compare on named numeric features only, so that row numbering left over
    # from reset_index() and the RCV flag cannot leak into the similarity.
    bookkeeping_cols = {'RCV', 'index'}
    numeric_cols = rcv_cities_df.select_dtypes(include='number').columns
    feature_cols = [col for col in numeric_cols if col not in bookkeeping_cols]

    # If the city name appears more than once, the first matching row is used.
    rcv_vec = matches[feature_cols].iloc[0].to_numpy(dtype=float)
    candidates = non_rcv_cities_df[feature_cols].to_numpy(dtype=float)

    # Row-wise cosine similarity; a zero-length vector yields NaN rather than
    # a warning, and NaN scores sort to the end.
    with np.errstate(divide='ignore', invalid='ignore'):
        cos_sim_scores = (candidates @ rcv_vec) / (norm(candidates, axis=1) * norm(rcv_vec))

    # Scores are attached positionally, so the candidates' index labels do not
    # need to be 0..n-1.
    ranked = non_rcv_cities_df.assign(similarity=cos_sim_scores)
    return ranked.sort_values(by=['similarity'], ascending=False).head(n=n)

def print_similar(rcv_cities_df, non_rcv_cities_df, n):
    """Print the RCV cities and the non-RCV cities matched to them.

    For every city in ``rcv_cities_df`` the ``n`` best matches returned by
    ``find_similar`` are collected. Both lists are printed alphabetically; a
    non-RCV city matched to several RCV cities appears once per match.

    Parameters
    ----------
    rcv_cities_df : pandas.DataFrame
        RCV cities, with a ``city`` column.
    non_rcv_cities_df : pandas.DataFrame
        Candidate non-RCV cities.
    n : int
        Number of matches to collect per RCV city.
    """
    rcv_names = list(rcv_cities_df['city'])
    print("RCV cities:\n", sorted(rcv_names))

    matched_names = []
    for rcv_name in rcv_names:
        best_matches = find_similar(rcv_name, rcv_cities_df, non_rcv_cities_df, n)
        matched_names.extend(best_matches['city'])
    print("Non-RCV cities:\n", sorted(matched_names))


In [3]:
#------ California ------
# Load the California file, then drop 'state_name' so that only 'RCV', 'city'
# and the numeric metrics remain for the similarity computation.
ca_cities = read_cities(f'{filepath}ca-cities.csv')
ca_cities_filtered = ca_cities.drop(columns = ['state_name'])

# Preview the first rows.
ca_cities_filtered.head()

Unnamed: 0,RCV,city,population,age_median,income_household_median,home_value,rent_median,education_college_or_above,unemployment_rate,race_white,hispanic
0,,Los Angeles,12815475,35.2,54501,549600.0,2085.0,33.1,8.1,52.2,48.7
1,1.0,San Francisco,3603761,38.3,96265,927729.0,2435.0,55.8,5.4,47.2,15.3
2,,San Diego,3210314,34.3,71535,523012.0,1925.0,44.4,7.0,64.7,30.0
3,,Riverside,2084749,31.3,62460,318031.0,1556.0,22.5,9.5,61.9,52.8
4,,Sacramento,1854698,34.3,54615,286886.0,1349.0,31.5,9.3,48.5,28.3


In [4]:
#------ California ------
# Split into RCV / non-RCV cities and show the size and first rows of each part.
ca_rcv_cities, ca_non_rcv_cities = get_rvn_nonrcv_cities(ca_cities_filtered)

for label, frame in (("RCV", ca_rcv_cities), ("Non-RCV", ca_non_rcv_cities)):
    print("-"*10, label, "-"*10)
    print(frame.shape)
    print(frame.head())

# Spot check: the five non-RCV cities closest to Oakland.
find_similar('Oakland', ca_rcv_cities, ca_non_rcv_cities, n=5)

---------- RCV ----------
(7, 12)
   index  RCV           city  population  age_median  income_household_median  \
0      1  1.0  San Francisco     3603761        38.3                    96265   
1     12  1.0        Oakland      425195        36.4                    63251   
2     61  1.0       Berkeley      122324        31.0                    75709   
3    101  1.0    San Leandro       90553        40.8                    66178   
4    184  1.0    Palm Desert       52932        52.4                    56262   

   home_value  rent_median  education_college_or_above  unemployment_rate  \
0    927729.0       2435.0                        55.8                5.4   
1    562908.0       2021.0                        40.6                8.0   
2    861440.0       2218.0                        72.3                6.8   
3    473702.0       1679.0                        29.9                6.3   
4    325566.0       1324.0                        35.9                5.2   

   race_white  h

Unnamed: 0,index,RCV,city,population,age_median,income_household_median,home_value,rent_median,education_college_or_above,unemployment_rate,race_white,hispanic,similarity
18,20,,Santa Ana,334136,31.0,57151,419851.0,1548.0,13.2,6.5,44.2,77.3,0.999544
15,17,,Anaheim,352497,34.0,65313,490973.0,1934.0,25.3,6.7,68.7,53.8,0.999543
19,21,,Santa Rosa,321908,38.1,67144,457902.0,1684.0,31.6,6.2,68.8,31.8,0.998974
50,52,,Merced,144117,29.2,40704,185191.0,1101.0,17.6,16.0,53.3,52.2,0.99651
20,22,,Santa Clarita,309378,36.9,90544,460639.0,2173.0,35.0,7.1,70.3,32.3,0.995872


In [5]:
#------ California ------
# List every California RCV city together with its 5 closest non-RCV cities.
print_similar(ca_rcv_cities, ca_non_rcv_cities, n = 5)

RCV cities:
 ['Albany', 'Berkeley', 'Eureka', 'Oakland', 'Palm Desert', 'San Francisco', 'San Leandro']
Non-RCV cities:
 ['Alhambra', 'Anaheim', 'Bellflower', 'Buena Park', 'Burbank', 'Calabasas', 'Carpinteria', 'Chino Hills', 'Davis', 'El Paso de Robles', 'Fresno', 'Gilroy', 'Huntington Park', 'Lake Forest', 'Lakewood', 'Livermore', 'Lynwood', 'Merced', 'Montebello', 'Morro Bay', 'Paramount', 'Pico Rivera', 'Pico Rivera', 'Riverside', 'Sacramento', 'San Carlos', 'San Diego', 'San Jose', 'Santa Ana', 'Santa Clarita', 'Santa Rosa', 'Solvang', 'Watsonville', 'Whittier', 'Whittier']


In [6]:
#------ Selected states ------
# Multi-state file: 'state_name' is kept here because the next cell groups the
# cities by state before dropping it.
cities_filtered = read_cities(f'{filepath}cities.csv')
cities_filtered.head()


Unnamed: 0,RCV,state_name,city,population,age_median,income_household_median,home_value,rent_median,education_college_or_above,unemployment_rate,race_white,hispanic
0,,New Mexico,Los Alamos,12035,41.7,101399,291325.0,1065.0,66.5,3.3,84.4,17.5
1,,New Mexico,Albuquerque,758523,36.2,49878,188094.0,1063.0,34.3,6.5,73.6,48.5
2,,New Mexico,Rio Rancho,96159,38.1,63180,179001.0,1165.0,30.2,7.2,79.9,40.7
3,,New Mexico,Farmington,52555,32.9,55179,174799.0,893.0,20.2,6.6,61.1,24.9
4,,New Mexico,North Valley,11999,48.1,50236,213404.0,706.0,28.0,4.9,74.9,61.0


In [7]:
# Tunable thresholds for the per-state comparison.
MIN_CITIES_PER_STATE = 30      # states with fewer cities are only listed, not matched
TARGET_COMPARISON_CITIES = 30  # rough size of the non-RCV comparison set per state
MIN_RCV_FOR_DEFAULT_N = 6      # below this many RCV cities, matches per city are scaled up
DEFAULT_N_SIMILAR = 5          # matches per RCV city when the state has enough RCV cities

for state in cities_filtered.state_name.unique():
    print("-"*10, state, "-"*10)
    subset_cities_filtered = cities_filtered[cities_filtered['state_name'] == state]
    subset_cities_filtered = subset_cities_filtered.drop(columns = ['state_name'])

    n_cities = subset_cities_filtered.city.nunique()
    n_rcv = subset_cities_filtered.RCV.count()  # number of non-missing RCV flags
    print("number of cities:", n_cities)

    if n_cities < MIN_CITIES_PER_STATE:
        print("low number of cities!!!")
        print(subset_cities_filtered.city.unique())
        continue

    if n_rcv == 0:
        # Without this guard the scaling below divides by zero for any state
        # that has no RCV city at all.
        print("no RCV cities!!!")
        continue

    if n_rcv < MIN_RCV_FOR_DEFAULT_N:
        print("low count of RCV!!!")
        # Few RCV cities: take more matches per city so the comparison set
        # still has roughly TARGET_COMPARISON_CITIES entries.
        n_sim = int(round(TARGET_COMPARISON_CITIES/n_rcv))
        print("number of similar non-RCV cities per RCV city:", n_sim)
    else:
        n_sim = DEFAULT_N_SIMILAR

    subset_rcv_cities, subset_non_rcv_cities = get_rvn_nonrcv_cities(subset_cities_filtered)
    print_similar(subset_rcv_cities, subset_non_rcv_cities, n = n_sim)


---------- New Mexico ----------
number of cities: 26
low number of cities!!!
['Los Alamos' 'Albuquerque' 'Rio Rancho' 'Farmington' 'North Valley'
 'Taos' 'Las Cruces' 'Silver City' 'Roswell' 'Lovington' 'Deming'
 'Alamogordo' 'Chaparral' 'Las Vegas' 'Los Lunas' 'Hobbs' 'South Valley'
 'Clovis' 'Sunland Park' 'Artesia' 'Grants' 'Carlsbad' 'Portales' 'Gallup'
 'Española' 'Santa Fe']
---------- Colorado ----------
number of cities: 67
low count of RCV!!!
number of similar non-RCV cities per RCV city: 30
RCV cities:
 ['Boulder']
Non-RCV cities:
 ['Alamosa', 'Arvada', 'Brighton', 'Broomfield', 'Castle Rock', 'Cañon City', 'Centennial', 'Cherry Creek', 'Commerce City', 'Durango', 'Englewood', 'Fountain', 'Glenwood Springs', 'Golden', 'Greenwood Village', 'Highlands Ranch', 'Ken Caryl', 'Lafayette', 'Littleton', 'Longmont', 'Louisville', 'Loveland', 'Montrose', 'Northglenn', 'Parker', 'Sherrelwood', 'Silverthorne', 'Steamboat Springs', 'Wheat Ridge', 'Windsor']
---------- Vermont ---------