In [1]:
import pandas as pd
import numpy as np
from numpy.linalg import norm
# Directory holding the per-state city CSVs. File names are appended to this
# string directly (e.g. f'{filepath}ca-cities.csv'), so keep the trailing '/'.
filepath = '../data/'


In [2]:
# Columns kept from each state's city CSV, in display order.
metrics = [
    # RCV flag (1 marks an RCV city) and the city name
    'RCV',
    'city',
    # numeric attributes used to compare cities
    'population',
    'age_median',
    'income_household_median',
    'home_value',
    'rent_median',
    'education_college_or_above',
    'unemployment_rate',
    'race_white',
    'hispanic',
]

def read_cities(filename):
    """Load a city CSV and keep only the columns named in ``metrics``.

    Parameters
    ----------
    filename : path of the CSV file, passed straight to ``pd.read_csv``.

    Returns
    -------
    DataFrame with exactly the ``metrics`` columns, in that order.
    A missing column raises ``KeyError`` from the column selection.
    """
    return pd.read_csv(filename)[metrics]

def get_rvn_nonrcv_cities(df):
    """Split a city table into RCV and non-RCV cities.

    A row counts as RCV when its ``RCV`` value equals 1; every other row
    (including rows where ``RCV`` is missing) goes to the non-RCV group.

    Returns
    -------
    (rcv_cities, non_rcv_cities) : two DataFrames, each re-indexed from 0.
    The previous row labels are kept as a leading ``index`` column because
    ``reset_index()`` is called without ``drop=True``.
    """
    is_rcv = df['RCV'] == 1
    rcv_part = df[is_rcv].reset_index()
    non_rcv_part = df[~is_rcv].reset_index()
    return rcv_part, non_rcv_part
    
#for a given rcv city find the top n most similar non-rcv cities
def find_similar(rcv_city_lookup, rcv_cities_df, non_rcv_cities_df,  n=1):
    """Return the ``n`` non-RCV cities most similar to one RCV city.

    Similarity is the cosine similarity between the numeric feature vector
    of the RCV city and that of each non-RCV city.

    Parameters
    ----------
    rcv_city_lookup : value of the ``city`` column identifying the RCV city.
        If several rows match, the first one is used.
    rcv_cities_df : DataFrame containing the RCV city row.
    non_rcv_cities_df : DataFrame of candidate cities; it must contain every
        feature column present in ``rcv_cities_df``.
    n : number of rows to return (default 1).

    Returns
    -------
    A copy of the top ``n`` rows of ``non_rcv_cities_df`` with an added
    ``similarity`` column, sorted by descending similarity.

    Raises
    ------
    ValueError
        If ``rcv_city_lookup`` is not present in ``rcv_cities_df['city']``.

    NOTE(review): features are used unscaled, so large-magnitude columns
    (e.g. population, home_value) dominate the cosine score. Consider
    standardising the features before comparing cities.
    """
    # 'index' is the leftover column produced by reset_index() without
    # drop=True; it is a row position, not a city attribute, so it must not
    # take part in the similarity.
    non_feature_cols = ('RCV', 'city', 'index')
    feature_cols = [col for col in rcv_cities_df.columns
                    if col not in non_feature_cols]

    rcv_match = rcv_cities_df[rcv_cities_df['city'] == rcv_city_lookup]
    if rcv_match.empty:
        raise ValueError(f"RCV city not found: {rcv_city_lookup!r}")

    # Select features by name on both sides so the two vectors line up even
    # if the frames order their columns differently.
    rcv_vec = rcv_match.iloc[0][feature_cols].to_numpy(dtype=float)
    candidates = non_rcv_cities_df[feature_cols].to_numpy(dtype=float)

    # One cosine score per candidate row, computed positionally so the result
    # does not depend on the DataFrame's index labels.
    cos_sim_scores = (candidates @ rcv_vec) / (norm(candidates, axis=1) * norm(rcv_vec))

    ranked = non_rcv_cities_df.copy()
    ranked['similarity'] = cos_sim_scores
    ranked = ranked.sort_values(by=['similarity'], ascending=False)
    return ranked.head(n=n)

def print_similar(rcv_cities_df, non_rcv_cities_df, n):
    """Print every RCV city, then the non-RCV cities most similar to them.

    For each city in ``rcv_cities_df['city']`` (in row order) the names of
    its top ``n`` matches from ``find_similar`` are appended to one flat
    list, so a non-RCV city can appear more than once. Returns nothing.
    """
    rcv_names = list(rcv_cities_df['city'])
    print("RCV cities:\n", rcv_names)

    matched_names = []
    for rcv_name in rcv_names:
        top_matches = find_similar(rcv_name, rcv_cities_df, non_rcv_cities_df, n)
        matched_names.extend(top_matches['city'])
    print("Non-RCV cities:\n", matched_names)


In [3]:
#------ California ------
# Load the California city table, restricted to the `metrics` columns,
# and preview its first rows.
ca_cities_filtered = read_cities(f'{filepath}ca-cities.csv')
ca_cities_filtered.head()

Unnamed: 0,RCV,city,population,age_median,income_household_median,home_value,rent_median,education_college_or_above,unemployment_rate,race_white,hispanic
0,,Los Angeles,12815475,35.2,54501,549600.0,2085.0,33.1,8.1,52.2,48.7
1,1.0,San Francisco,3603761,38.3,96265,927729.0,2435.0,55.8,5.4,47.2,15.3
2,,San Diego,3210314,34.3,71535,523012.0,1925.0,44.4,7.0,64.7,30.0
3,,Riverside,2084749,31.3,62460,318031.0,1556.0,22.5,9.5,61.9,52.8
4,,Sacramento,1854698,34.3,54615,286886.0,1349.0,31.5,9.3,48.5,28.3


In [4]:
#------ California ------
# Split the California cities by RCV flag, print the shape and first rows of
# each group, then show the 5 non-RCV cities most similar to Oakland.
ca_rcv_cities, ca_non_rcv_cities = get_rvn_nonrcv_cities(ca_cities_filtered)

print("-"*10, "RCV", "-"*10)
print(ca_rcv_cities.shape)
print(ca_rcv_cities.head())

print("-"*10, "Non-RCV", "-"*10)
print(ca_non_rcv_cities.shape)
print(ca_non_rcv_cities.head())

# Last expression of the cell, so the resulting frame is displayed as its output.
find_similar('Oakland', ca_rcv_cities, ca_non_rcv_cities,n=5)

---------- RCV ----------
(7, 12)
   index  RCV           city  population  age_median  income_household_median  \
0      1  1.0  San Francisco     3603761        38.3                    96265   
1     12  1.0        Oakland      425195        36.4                    63251   
2     61  1.0       Berkeley      122324        31.0                    75709   
3    101  1.0    San Leandro       90553        40.8                    66178   
4    184  1.0    Palm Desert       52932        52.4                    56262   

   home_value  rent_median  education_college_or_above  unemployment_rate  \
0    927729.0       2435.0                        55.8                5.4   
1    562908.0       2021.0                        40.6                8.0   
2    861440.0       2218.0                        72.3                6.8   
3    473702.0       1679.0                        29.9                6.3   
4    325566.0       1324.0                        35.9                5.2   

   race_white  h

Unnamed: 0,index,RCV,city,population,age_median,income_household_median,home_value,rent_median,education_college_or_above,unemployment_rate,race_white,hispanic,similarity
18,20,,Santa Ana,334136,31.0,57151,419851.0,1548.0,13.2,6.5,44.2,77.3,0.999544
15,17,,Anaheim,352497,34.0,65313,490973.0,1934.0,25.3,6.7,68.7,53.8,0.999543
19,21,,Santa Rosa,321908,38.1,67144,457902.0,1684.0,31.6,6.2,68.8,31.8,0.998974
50,52,,Merced,144117,29.2,40704,185191.0,1101.0,17.6,16.0,53.3,52.2,0.99651
20,22,,Santa Clarita,309378,36.9,90544,460639.0,2173.0,35.0,7.1,70.3,32.3,0.995872


In [5]:
#------ California ------
# For every California RCV city, list its 5 most similar non-RCV cities
# (one flat list in RCV-city order; a city may be listed more than once).
print_similar(ca_rcv_cities, ca_non_rcv_cities, n = 5)

RCV cities:
 ['San Francisco', 'Oakland', 'Berkeley', 'San Leandro', 'Palm Desert', 'Eureka', 'Albany']
Non-RCV cities:
 ['Fresno', 'San Diego', 'Sacramento', 'Riverside', 'San Jose', 'Santa Ana', 'Anaheim', 'Santa Rosa', 'Merced', 'Santa Clarita', 'Alhambra', 'Davis', 'Montebello', 'Burbank', 'Huntington Park', 'Bellflower', 'Watsonville', 'Gilroy', 'Whittier', 'Lynwood', 'Lakewood', 'Pico Rivera', 'Lake Forest', 'Livermore', 'Chino Hills', 'Paramount', 'El Paso de Robles', 'Pico Rivera', 'Buena Park', 'Whittier', 'Calabasas', 'Carpinteria', 'Morro Bay', 'San Carlos', 'Solvang']


In [6]:
#------ Utah ------
# Load the Utah city table, restricted to the `metrics` columns,
# and preview its first rows.
ut_cities_filtered = read_cities(f'{filepath}ut-cities.csv')
ut_cities_filtered.head()

Unnamed: 0,RCV,city,population,age_median,income_household_median,home_value,rent_median,education_college_or_above,unemployment_rate,race_white,hispanic
0,1.0,Salt Lake City,1098400.0,31.9,54009.0,266711.0,1174.0,45.0,4.6,73.7,21.3
1,,Ogden,573632.0,31.0,43361.0,140282.0,924.0,19.8,6.4,84.8,32.3
2,,Provo,503695.0,23.7,44312.0,227580.0,1096.0,42.5,5.0,88.7,16.3
3,,West Valley City,136170.0,30.4,59954.0,183108.0,1139.0,13.4,5.4,57.4,37.7
4,,St. George,113906.0,36.2,54022.0,241888.0,962.0,29.7,4.6,88.6,13.0


In [7]:
#------ Utah ------
# Split the Utah cities by RCV flag and print the shape and first rows of
# each group.
ut_rcv_cities, ut_non_rcv_cities = get_rvn_nonrcv_cities(ut_cities_filtered)

print("-"*10, "RCV", "-"*10)
print(ut_rcv_cities.shape)
print(ut_rcv_cities.head())

print("-"*10, "Non-RCV", "-"*10)
print(ut_non_rcv_cities.shape)
print(ut_non_rcv_cities.head())

---------- RCV ----------
(13, 12)
   index  RCV            city  population  age_median  \
0      0  1.0  Salt Lake City   1098400.0        31.9   
1      8  1.0           Sandy     96145.0        35.7   
2     11  1.0            Lehi     62712.0        25.1   
3     12  1.0       Millcreek     60192.0        36.6   
4     15  1.0          Draper     47710.0        32.0   

   income_household_median  home_value  rent_median  \
0                  54009.0    266711.0       1174.0   
1                  87012.0    309704.0       1390.0   
2                  85794.0    287703.0       1466.0   
3                  61888.0    314968.0       1215.0   
4                 110270.0    407246.0       1840.0   

   education_college_or_above  unemployment_rate  race_white  hispanic  
0                        45.0                4.6        73.7      21.3  
1                        40.3                3.4        90.5       8.9  
2                        43.1                2.0        93.7       7.2  

In [8]:
#------ Utah ------
# Show the 5 non-RCV Utah cities most similar to Salt Lake City.
find_similar('Salt Lake City', ut_rcv_cities, ut_non_rcv_cities,n=5)

Unnamed: 0,index,RCV,city,population,age_median,income_household_median,home_value,rent_median,education_college_or_above,unemployment_rate,race_white,hispanic,similarity
0,1,,Ogden,573632.0,31.0,43361.0,140282.0,924.0,19.8,6.4,84.8,32.3,0.999672
1,2,,Provo,503695.0,23.7,44312.0,227580.0,1096.0,42.5,5.0,88.7,16.3,0.982277
2,3,,West Valley City,136170.0,30.4,59954.0,183108.0,1139.0,13.4,5.4,57.4,37.7,0.755258
5,6,,Logan,100774.0,23.9,38412.0,170329.0,931.0,35.7,4.0,84.2,15.1,0.693424
3,4,,St. George,113906.0,36.2,54022.0,241888.0,962.0,29.7,4.6,88.6,13.0,0.623799


In [9]:
#------ Utah ------
# For every Utah RCV city, list its 5 most similar non-RCV cities
# (one flat list in RCV-city order; a city may be listed more than once).
print_similar(ut_rcv_cities, ut_non_rcv_cities, n = 5)

RCV cities:
 ['Salt Lake City', 'Sandy', 'Lehi', 'Millcreek', 'Draper', 'Riverton', 'Cottonwood Heights', 'Springville', 'Midvale', 'Magna', 'South Salt Lake', 'Payson', 'Bluffdale']
Non-RCV cities:
 ['Ogden', 'Provo', 'West Valley City', 'Logan', 'St. George', 'Taylorsville', 'Layton', 'Orem', 'South Jordan', 'Murray', 'South Jordan', 'Clearfield', 'Spanish Fork', 'Tooele', 'Kearns', 'Cedar City', 'Murray', 'Bountiful', 'South Jordan', 'Pleasant Grove', 'Vernal', 'Hurricane', 'Herriman', 'American Fork', 'Washington', 'Eagle Mountain', 'Brigham City', 'American Fork', 'Herriman', 'Spanish Fork', 'Washington', 'Heber', 'Hurricane', 'Vernal', 'Holladay', 'Pleasant Grove', 'American Fork', 'Herriman', 'Eagle Mountain', 'Vernal', 'Bountiful', 'Pleasant Grove', 'Washington', 'South Jordan', 'Vernal', 'Tooele', 'Spanish Fork', 'Clearfield', 'Kearns', 'Eagle Mountain', 'Washington', 'Bountiful', 'Pleasant Grove', 'Hurricane', 'Cedar City', 'Saratoga Springs', 'Kaysville', 'Brigham City', 'No