In [1]:
import pandas as pd
import numpy as np
from numpy.linalg import norm

In [2]:
# Load the city table. The path is relative, so 'ca-cities.csv' must be in the working directory.
ca_cities = pd.read_csv('ca-cities.csv')

In [3]:
# Preview the first rows to sanity-check the load and see which columns are available.
ca_cities.head()

Unnamed: 0,RCV,city,city_ascii,city_alt,state_id,state_name,county_fips,county_name,county_fips_all,county_name_all,...,race_pacific,race_other,race_multiple,hispanic,disabled,poverty,limited_english,commute_time,health_uninsured,veteran
0,,Los Angeles,Los Angeles,,CA,California,6037,Los Angeles,6037,Los Angeles,...,0.2,22.9,3.5,48.7,10.0,20.4,14.9,30.9,15.5,2.9
1,1.0,San Francisco,San Francisco,,CA,California,6075,San Francisco,6075,San Francisco,...,0.4,7.5,5.1,15.3,10.6,11.7,11.8,32.8,5.5,3.3
2,,San Diego,San Diego,,CA,California,6073,San Diego,6073,San Diego,...,0.4,6.2,5.1,30.0,9.0,14.5,7.2,24.0,10.4,8.4
3,,Riverside,Riverside,,CA,California,6065,Riverside,6065,Riverside,...,0.3,18.9,4.8,52.8,10.3,16.6,6.1,30.4,12.5,5.3
4,,Sacramento,Sacramento,,CA,California,6067,Sacramento,6067,Sacramento,...,1.6,10.3,6.8,28.3,12.5,19.8,7.9,25.4,9.1,6.1


In [4]:
# Columns carried into the comparison: the two identifier columns ('RCV', 'city')
# followed by the city attributes used to describe each place.
metrics = [
    'RCV', 'city',
    'population', 'age_median',
    'income_household_median', 'home_value', 'rent_median',
    'education_college_or_above', 'unemployment_rate',
    'race_white', 'hispanic',
]

# Narrow the full table down to just those columns, then preview the result.
ca_cities_filtered = ca_cities.loc[:, metrics]
ca_cities_filtered.head()

Unnamed: 0,RCV,city,population,age_median,income_household_median,home_value,rent_median,education_college_or_above,unemployment_rate,race_white,hispanic
0,,Los Angeles,12815475,35.2,54501,549600.0,2085.0,33.1,8.1,52.2,48.7
1,1.0,San Francisco,3603761,38.3,96265,927729.0,2435.0,55.8,5.4,47.2,15.3
2,,San Diego,3210314,34.3,71535,523012.0,1925.0,44.4,7.0,64.7,30.0
3,,Riverside,2084749,31.3,62460,318031.0,1556.0,22.5,9.5,61.9,52.8
4,,Sacramento,1854698,34.3,54615,286886.0,1349.0,31.5,9.3,48.5,28.3


In [5]:
# Partition on the RCV flag: rows whose flag equals 1 are RCV cities; every other
# row (including rows where the flag is missing) lands in the non-RCV group.
# reset_index() renumbers each group from 0 and keeps the old row labels in a new
# 'index' column, so that column is a row identifier, not a city attribute.
rcv_cities = ca_cities_filtered.loc[ca_cities_filtered['RCV'].eq(1)].reset_index()
non_rcv_cities = ca_cities_filtered.loc[ca_cities_filtered['RCV'].ne(1)].reset_index()
print(rcv_cities.head())

   index  RCV           city  population  age_median  income_household_median  \
0      1  1.0  San Francisco     3603761        38.3                    96265   
1     12  1.0        Oakland      425195        36.4                    63251   
2     61  1.0       Berkeley      122324        31.0                    75709   
3    101  1.0    San Leandro       90553        40.8                    66178   
4    184  1.0    Palm Desert       52932        52.4                    56262   

   home_value  rent_median  education_college_or_above  unemployment_rate  \
0    927729.0       2435.0                        55.8                5.4   
1    562908.0       2021.0                        40.6                8.0   
2    861440.0       2218.0                        72.3                6.8   
3    473702.0       1679.0                        29.9                6.3   
4    325566.0       1324.0                        35.9                5.2   

   race_white  hispanic  
0        47.2      15.3 

In [6]:
# For a given RCV city, find the top n most similar non-RCV cities.
def find_similar(rcv_city, n=1):
    """Rank non-RCV cities by cosine similarity to one RCV city.

    Reads the notebook-level frames ``rcv_cities`` and ``non_rcv_cities``.

    Parameters
    ----------
    rcv_city : str
        Value of the ``'city'`` column identifying the row in ``rcv_cities``.
        If several rows share the name, the first one is used.
    n : int, default 1
        Number of top-ranked non-RCV rows to return.

    Returns
    -------
    pandas.DataFrame
        A copy of the ``n`` best-matching rows of ``non_rcv_cities`` (all of
        its columns are kept) with an added ``'similarity'`` column, sorted
        from most to least similar.

    Raises
    ------
    ValueError
        If ``rcv_city`` does not appear in ``rcv_cities['city']``.

    Notes
    -----
    The feature values are used unscaled, so columns with large magnitudes
    contribute most to the score. NOTE(review): consider standardizing the
    columns before comparing if that weighting is not intended.
    """
    # Columns that identify a row rather than describe the city. 'index' is the
    # old row label left behind by reset_index(); it must not be treated as a
    # feature. errors='ignore' keeps this working if that column is absent.
    non_feature_cols = ['index', 'RCV', 'city']

    rcv_row = rcv_cities[rcv_cities['city'] == rcv_city]
    if rcv_row.empty:
        raise ValueError('{!r} is not a city in rcv_cities'.format(rcv_city))

    # 1-D feature vector for the target city.
    rcv_vec = (rcv_row.drop(columns=non_feature_cols, errors='ignore')
               .iloc[0]
               .to_numpy(dtype=float))
    # One row of features per candidate city, in the same column order.
    non_rcv_mat = (non_rcv_cities.drop(columns=non_feature_cols, errors='ignore')
                   .to_numpy(dtype=float))

    # Cosine similarity of the target against every candidate at once.
    cos_sim_scores = (non_rcv_mat @ rcv_vec) / (norm(rcv_vec) * norm(non_rcv_mat, axis=1))

    # assign() returns a new frame, so non_rcv_cities itself is never modified;
    # scores are attached by position, independent of the frame's index labels.
    ranked = non_rcv_cities.assign(similarity=cos_sim_scores)
    return ranked.sort_values(by=['similarity'], ascending=False).head(n=n)
        

In [7]:
# Spot-check: the five non-RCV cities ranked closest to Oakland.
find_similar('Oakland', n=5)

Unnamed: 0,index,RCV,city,population,age_median,income_household_median,home_value,rent_median,education_college_or_above,unemployment_rate,race_white,hispanic,similarity
18,20,,Santa Ana,334136,31.0,57151,419851.0,1548.0,13.2,6.5,44.2,77.3,0.999544
15,17,,Anaheim,352497,34.0,65313,490973.0,1934.0,25.3,6.7,68.7,53.8,0.999543
19,21,,Santa Rosa,321908,38.1,67144,457902.0,1684.0,31.6,6.2,68.8,31.8,0.998974
50,52,,Merced,144117,29.2,40704,185191.0,1101.0,17.6,16.0,53.3,52.2,0.99651
20,22,,Santa Clarita,309378,36.9,90544,460639.0,2173.0,35.0,7.1,70.3,32.3,0.995872


In [8]:
# Show Oakland's own row so its values can be compared by eye with its matches.
rcv_cities[rcv_cities['city']=='Oakland']

Unnamed: 0,index,RCV,city,population,age_median,income_household_median,home_value,rent_median,education_college_or_above,unemployment_rate,race_white,hispanic
1,12,1.0,Oakland,425195,36.4,63251,562908.0,2021.0,40.6,8.0,36.7,27.0


In [9]:
# Gather the names of the five closest non-RCV matches for every RCV city into
# one flat list. A city matched by more than one RCV city appears once per match.
simi_cities = []
for city in rcv_cities['city']:
    simi_cities.extend(find_similar(city, n=5)['city'])


In [10]:
# Display the collected match names (five per RCV city, duplicates not removed).
simi_cities

['Fresno',
 'San Diego',
 'Sacramento',
 'Riverside',
 'San Jose',
 'Santa Ana',
 'Anaheim',
 'Santa Rosa',
 'Merced',
 'Santa Clarita',
 'Alhambra',
 'Davis',
 'Montebello',
 'Burbank',
 'Huntington Park',
 'Bellflower',
 'Watsonville',
 'Gilroy',
 'Whittier',
 'Lynwood',
 'Lakewood',
 'Pico Rivera',
 'Lake Forest',
 'Livermore',
 'Chino Hills',
 'Paramount',
 'El Paso de Robles',
 'Pico Rivera',
 'Buena Park',
 'Whittier',
 'Calabasas',
 'Carpinteria',
 'Morro Bay',
 'San Carlos',
 'Solvang']