In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.cluster import KMeans

In [72]:
# Load the merged city/county table that every feature cell below extends.
# NOTE(review): the file carries "Unnamed: 0.1" / "Unnamed: 0" columns, which
# looks like a row index that was written to CSV and re-read more than once.
df = pd.read_csv(filepath_or_buffer="FINAL_DATASETS/END.csv")
df

Unnamed: 0.1,Unnamed: 0,city,state_name_x,state_id,lat,lng,city_population,county_fips,county,county_population,...,lesscollege_pct,lesshs_whites_pct,lesscollege_whites_pct,rural_pct,ruralurban_cc,rep_pct,dem_pct,crime_per_100,avg_propty_tax,state_name_y
0,0,New York,New York,NY,40.6943,-73.9249,8336817,36061,New York County,1653877,...,39.565360,1.784064,17.134644,0.000000,1,9.706266,86.556921,0.036231,11811.008190,NY
1,1,Los Angeles,California,CA,34.1139,-118.4068,3979576,6037,Los Angeles County,10105722,...,69.235895,5.681371,51.917013,0.605218,1,22.413336,71.757222,0.053825,6782.028673,CA
2,2,Long Beach,California,CA,33.7981,-118.1675,462628,6037,Los Angeles County,10105722,...,69.235895,5.681371,51.917013,0.605218,1,22.413336,71.757222,0.066151,6782.028673,CA
3,3,Santa Clarita,California,CA,34.4175,-118.4964,212979,6037,Los Angeles County,10105722,...,69.235895,5.681371,51.917013,0.605218,1,22.413336,71.757222,0.029470,6782.028673,CA
4,4,Glendale,California,CA,34.1818,-118.2468,199303,6037,Los Angeles County,10105722,...,69.235895,5.681371,51.917013,0.605218,1,22.413336,71.757222,0.035725,6782.028673,CA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
696,696,Battle Creek,Michigan,MI,42.2986,-85.2296,51093,26025,Calhoun County,134327,...,79.249181,8.712921,78.917095,30.977039,3,53.468473,41.012190,0.080781,4082.319040,MI
697,697,Florissant,Missouri,MO,38.7996,-90.3269,50952,29189,St. Louis County,999539,...,57.617910,4.973065,52.119318,1.137990,1,39.318414,55.686034,0.046894,4874.699159,MO
698,698,Joplin,Missouri,MO,37.0758,-94.5018,50925,29097,Jasper County,118522,...,77.943574,10.368353,77.136641,23.691697,3,72.569631,21.876423,0.143410,1990.127907,MO
699,699,Enid,Oklahoma,OK,36.4061,-97.8701,49688,40047,Garfield County,62421,...,78.643327,9.287302,76.710624,21.411357,5,73.740212,20.253339,0.075323,2540.766614,OK


In [73]:
# Empty accumulator: each clustering cell adds one row per cluster here
# (feature name, group label, cluster centre).
centroids = pd.DataFrame(
    columns=["feature", "group_name", "center"],
)

In [74]:
def create_groups(cluster_centers, categories = 3):
    """Rank 1-D cluster centres and attach an ordinal label to each cluster.

    Parameters
    ----------
    cluster_centers : array-like of shape (n_clusters, 1)
        One centre per cluster; row ``i`` is taken to be cluster id ``i``
        (e.g. ``KMeans.cluster_centers_`` fitted on a single column).
    categories : int, default 3
        Label scheme selector. ``3`` uses high/medium/low; any other value
        uses highest/high/medium/low (kept for backward compatibility).

    Returns
    -------
    pandas.DataFrame
        Indexed by ``group_id`` (the cluster id), sorted by ``center`` in
        descending order, with columns ``center`` and ``group_name``.

    Raises
    ------
    ValueError
        If the number of centres differs from the number of labels in the
        selected scheme.
    """
    labels = ["high", "medium", "low"] if categories == 3 else ["highest", "high", "medium", "low"]

    # Keep the original row position as `group_id` so labels can later be
    # looked up by the cluster id that KMeans assigned to each row.
    mapping = (
        pd.DataFrame(cluster_centers, columns=["center"])
        .reset_index()
        .rename(columns={"index": "group_id"})
        .sort_values(by="center", ascending=False)
    )

    # Fail with an actionable message instead of pandas' generic
    # "length of values does not match" error.
    if len(mapping) != len(labels):
        raise ValueError(
            f"create_groups: got {len(mapping)} cluster centers but "
            f"categories={categories} provides {len(labels)} labels {labels}"
        )

    # Largest centre gets the first label, smallest gets the last.
    mapping["group_name"] = labels

    return mapping.set_index("group_id")

In [125]:
# Size
# Feature: raw city population. rank(ascending=False) gives rank 1 to the largest value.
df["size_feature"] = df["city_population"]
df["size_rank"] = df["size_feature"].rank(ascending=False)

# NOTE: despite its name, `knn` holds a fitted KMeans model (1-D clustering).
# random_state pins the initialisation so a fresh "Run All" reproduces the same groups.
knn = KMeans(n_clusters=4, random_state=0).fit(df[["size_feature"]])

# One row per cluster, labelled highest/high/medium/low by descending centre.
mapping = create_groups(knn.cluster_centers_, 4)
mapping["feature"] = "size"

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
# Rows left by an earlier run of this cell are dropped first so re-running is idempotent.
centroids = pd.concat([centroids[centroids["feature"] != "size"], mapping])

# Translate every row's cluster id into its group label (positional assignment).
df["size_group"] = mapping.loc[knn.labels_, "group_name"].to_numpy()

In [126]:
# Diversity
# Feature: 1 minus the sum of squared group shares (each pct_* column is divided
# by 100 first), so a value nearer 1 means the population is spread over more groups.
df["diversity_feature"] = 1 - (
    ((df.pct_hispanic / 100) ** 2)
    + ((df.pct_white / 100) ** 2)
    + ((df.pct_black / 100) ** 2)
    + ((df.pct_native / 100) ** 2)
    + ((df.pct_asian / 100) ** 2)
    + ((df.pct_pacific / 100) ** 2)
)
df["diversity_rank"] = df["diversity_feature"].rank(ascending=False)

# NOTE: despite its name, `knn` holds a fitted KMeans model (1-D clustering).
# random_state pins the initialisation so a fresh "Run All" reproduces the same groups.
knn = KMeans(n_clusters=3, random_state=0).fit(df[["diversity_feature"]])

# One row per cluster, labelled high/medium/low by descending centre.
mapping = create_groups(knn.cluster_centers_, 3)
mapping["feature"] = "diversity"

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
# Rows left by an earlier run of this cell are dropped first so re-running is idempotent.
centroids = pd.concat([centroids[centroids["feature"] != "diversity"], mapping])

# Translate every row's cluster id into its group label (positional assignment).
df["diversity_group"] = mapping.loc[knn.labels_, "group_name"].to_numpy()

In [127]:
# Educational
# Feature: 0.5 * lesscollege_pct + 1.0 * (100 - lesscollege_pct - lesshs_pct).
# NOTE(review): if lesscollege_pct already includes the lesshs_pct population,
# the second term subtracts that group twice - confirm against the data dictionary.
df["education_feature"] = 0.5 * df["lesscollege_pct"] + 1.0 * (100 - (df["lesscollege_pct"] + df["lesshs_pct"]))
df["education_rank"] = df["education_feature"].rank(ascending=False)

# NOTE: despite its name, `knn` holds a fitted KMeans model (1-D clustering).
# random_state pins the initialisation so a fresh "Run All" reproduces the same groups.
knn = KMeans(n_clusters=3, random_state=0).fit(df[["education_feature"]])

# One row per cluster, labelled high/medium/low by descending centre.
mapping = create_groups(knn.cluster_centers_, 3)
mapping["feature"] = "education"

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
# Rows left by an earlier run of this cell are dropped first so re-running is idempotent.
centroids = pd.concat([centroids[centroids["feature"] != "education"], mapping])

# Translate every row's cluster id into its group label (positional assignment).
df["education_group"] = mapping.loc[knn.labels_, "group_name"].to_numpy()

In [128]:
# Wealth
# Feature: income_per_cap as-is. rank(ascending=False) gives rank 1 to the largest value.
df["wealth_feature"] = df["income_per_cap"]
df["wealth_rank"] = df["wealth_feature"].rank(ascending=False)

# NOTE: despite its name, `knn` holds a fitted KMeans model (1-D clustering).
# random_state pins the initialisation so a fresh "Run All" reproduces the same groups.
knn = KMeans(n_clusters=3, random_state=0).fit(df[["wealth_feature"]])

# One row per cluster, labelled high/medium/low by descending centre.
mapping = create_groups(knn.cluster_centers_, 3)
mapping["feature"] = "wealth"

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
# Rows left by an earlier run of this cell are dropped first so re-running is idempotent.
centroids = pd.concat([centroids[centroids["feature"] != "wealth"], mapping])

# Translate every row's cluster id into its group label (positional assignment).
df["wealth_group"] = mapping.loc[knn.labels_, "group_name"].to_numpy()

In [129]:
# Home prices
# Feature: implied home value, back-computed from the average property tax bill.
df["home_price_feature"] = df["avg_propty_tax"] / (1.11 / 100) # 1.11% = average national property tax rate
# Fixed: the rank was previously computed from education_feature (copy-paste slip),
# so home_price_rank silently duplicated education_rank.
df["home_price_rank"] = df["home_price_feature"].rank(ascending=False)

# NOTE: despite its name, `knn` holds a fitted KMeans model (1-D clustering).
# random_state pins the initialisation so a fresh "Run All" reproduces the same groups.
knn = KMeans(n_clusters=4, random_state=0).fit(df[["home_price_feature"]])

# One row per cluster, labelled highest/high/medium/low by descending centre.
mapping = create_groups(knn.cluster_centers_, 4)
mapping["feature"] = "home_price"

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
# Rows left by an earlier run of this cell are dropped first so re-running is idempotent.
centroids = pd.concat([centroids[centroids["feature"] != "home_price"], mapping])

# Translate every row's cluster id into its group label (positional assignment).
df["home_price_group"] = mapping.loc[knn.labels_, "group_name"].to_numpy()

In [130]:
# Urban/rural
# Display label for each ruralurban_cc code, phrased to complete the
# sentence "I want to live in a: ...".
urban_mapping = {
    1: "Large Metro Area",
    2: "Medium Metro Area",
    3: "Small Metro Area",
    4: "Adjacent to Metro Area",
    5: "Non-Metro",
}

# Feature: the code itself; ascending rank, so the lowest code ranks first.
df["urban_feature"] = df["ruralurban_cc"]
df["urban_rank"] = df["urban_feature"].rank(ascending=True)

# Plain dict indexing per element: a code that is missing from urban_mapping
# raises KeyError rather than silently producing a missing label.
df["urban_group"] = df["ruralurban_cc"].apply(urban_mapping.__getitem__)

In [131]:
# Political party
# Feature: dem_pct as-is. rank(ascending=False) gives rank 1 to the largest value.
df["political_feature"] = df["dem_pct"]
df["political_rank"] = df["political_feature"].rank(ascending=False)


def group_political(dem_pct):
    """Bucket a dem_pct value into a political-leaning label.

    Returns "Left" when the value is above 52, "Moderate" when it is above 48
    (and at most 52), and "Conservative" otherwise. A value for which both
    comparisons are false (e.g. NaN) therefore also yields "Conservative".
    """
    # Thresholds are checked from highest to lowest; the first one exceeded wins.
    for floor, label in ((52, "Left"), (48, "Moderate")):
        if dem_pct > floor:
            return label
    return "Conservative"


df["political_group"] = df["dem_pct"].map(group_political)

In [132]:
# Weather
# Feature: LTM_max_temp as-is (presumably a long-term mean of max temperature - confirm).
# rank(ascending=False) gives rank 1 to the largest value.
df["weather_feature"] = df["LTM_max_temp"]
df["weather_rank"] = df["weather_feature"].rank(ascending=False)

# NOTE: despite its name, `knn` holds a fitted KMeans model (1-D clustering).
# random_state pins the initialisation so a fresh "Run All" reproduces the same groups.
knn = KMeans(n_clusters=3, random_state=0).fit(df[["weather_feature"]])

# One row per cluster, labelled high/medium/low by descending centre.
mapping = create_groups(knn.cluster_centers_, 3)
mapping["feature"] = "weather"

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
# Rows left by an earlier run of this cell are dropped first so re-running is idempotent.
centroids = pd.concat([centroids[centroids["feature"] != "weather"], mapping])

# Translate every row's cluster id into its group label (positional assignment).
df["weather_group"] = mapping.loc[knn.labels_, "group_name"].to_numpy()

In [133]:
# COVID
# Feature: cases per 100k plus deaths per 100k weighted x5.
# Ascending rank: rank 1 is the lowest score, while the "high" group is the
# cluster with the largest centre (i.e. the highest score).
df["covid_feature"] = df["covid_cases_per_100k"] + 5*df["covid_deaths_per_100k"]
df["covid_rank"] = df["covid_feature"].rank(ascending=True)

# NOTE: despite its name, `knn` holds a fitted KMeans model (1-D clustering).
# random_state pins the initialisation so a fresh "Run All" reproduces the same groups.
knn = KMeans(n_clusters=3, random_state=0).fit(df[["covid_feature"]])

# One row per cluster, labelled high/medium/low by descending centre.
mapping = create_groups(knn.cluster_centers_, 3)
mapping["feature"] = "covid"

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
# Rows left by an earlier run of this cell are dropped first so re-running is idempotent.
centroids = pd.concat([centroids[centroids["feature"] != "covid"], mapping])

# Translate every row's cluster id into its group label (positional assignment).
df["covid_group"] = mapping.loc[knn.labels_, "group_name"].to_numpy()

In [134]:
# Profession
# Feature: pct_private_work as-is. rank(ascending=False) gives rank 1 to the largest value.
df["profession_feature"] = df["pct_private_work"]
df["profession_rank"] = df["profession_feature"].rank(ascending=False)

# NOTE: despite its name, `knn` holds a fitted KMeans model (1-D clustering).
# random_state pins the initialisation so a fresh "Run All" reproduces the same groups.
knn = KMeans(n_clusters=3, random_state=0).fit(df[["profession_feature"]])

# One row per cluster, labelled high/medium/low by descending centre.
mapping = create_groups(knn.cluster_centers_, 3)
mapping["feature"] = "profession"

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
# Rows left by an earlier run of this cell are dropped first so re-running is idempotent.
centroids = pd.concat([centroids[centroids["feature"] != "profession"], mapping])

# Translate every row's cluster id into its group label (positional assignment).
df["profession_group"] = mapping.loc[knn.labels_, "group_name"].to_numpy()

In [135]:
# Commute / walkability / public transit
# Feature: pct_walk + pct_transit. rank(ascending=False) gives rank 1 to the largest value.
# NOTE(review): the feature/rank columns use the "transportation_" prefix while the
# centroid key and group column use "transit". Names are kept as-is because
# downstream consumers of the CSVs may depend on them.
df["transportation_feature"] = df["pct_walk"] + df["pct_transit"]
df["transportation_rank"] = df["transportation_feature"].rank(ascending=False)

# NOTE: despite its name, `knn` holds a fitted KMeans model (1-D clustering).
# random_state pins the initialisation so a fresh "Run All" reproduces the same groups.
knn = KMeans(n_clusters=4, random_state=0).fit(df[["transportation_feature"]])

# One row per cluster, labelled highest/high/medium/low by descending centre.
mapping = create_groups(knn.cluster_centers_, 4)
mapping["feature"] = "transit"

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
# Rows left by an earlier run of this cell are dropped first so re-running is idempotent.
centroids = pd.concat([centroids[centroids["feature"] != "transit"], mapping])

# Translate every row's cluster id into its group label (positional assignment).
df["transit_group"] = mapping.loc[knn.labels_, "group_name"].to_numpy()

In [136]:
# Poverty
# Feature: pct_poverty plus pct_child_poverty weighted x2.
# Ascending rank: rank 1 is the lowest score, while the "high" group is the
# cluster with the largest centre (i.e. the highest score).
df["poverty_feature"] = df["pct_poverty"] + 2 * df["pct_child_poverty"]
df["poverty_rank"] = df["poverty_feature"].rank(ascending=True)

# NOTE: despite its name, `knn` holds a fitted KMeans model (1-D clustering).
# random_state pins the initialisation so a fresh "Run All" reproduces the same groups.
knn = KMeans(n_clusters=3, random_state=0).fit(df[["poverty_feature"]])

# One row per cluster, labelled high/medium/low by descending centre.
mapping = create_groups(knn.cluster_centers_, 3)
mapping["feature"] = "poverty"

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
# Rows left by an earlier run of this cell are dropped first so re-running is idempotent.
centroids = pd.concat([centroids[centroids["feature"] != "poverty"], mapping])

# Translate every row's cluster id into its group label (positional assignment).
df["poverty_group"] = mapping.loc[knn.labels_, "group_name"].to_numpy()

In [137]:
# Age
# Feature: weighted average of three fixed ages, weighted by the share of the
# population in each band (29 and under, the remainder, 65 and older).
# NOTE(review): 15 / 47.5 / 73 look like representative ages per band - confirm.
df["age_feature"] = (
    (15 * (df["age29andunder_pct"]/100))
    + (47.5 * (1 - df["age29andunder_pct"]/100 - df["age65andolder_pct"]/100))
    + (73 * (df["age65andolder_pct"]/100))
)
# Ascending rank: rank 1 is the lowest (youngest) score.
df["age_rank"] = df["age_feature"].rank(ascending=True)

# NOTE: despite its name, `knn` holds a fitted KMeans model (1-D clustering).
# random_state pins the initialisation so a fresh "Run All" reproduces the same groups.
knn = KMeans(n_clusters=3, random_state=0).fit(df[["age_feature"]])

# One row per cluster, labelled high/medium/low by descending centre.
mapping = create_groups(knn.cluster_centers_, 3)
mapping["feature"] = "age"

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
# Rows left by an earlier run of this cell are dropped first so re-running is idempotent.
centroids = pd.concat([centroids[centroids["feature"] != "age"], mapping])

# Translate every row's cluster id into its group label (positional assignment).
df["age_group"] = mapping.loc[knn.labels_, "group_name"].to_numpy()

In [139]:
# Persist the city table with all *_feature / *_rank / *_group columns.
# index=True is the pandas default, spelled out here: the row index is written
# as an unnamed first column, which comes back as "Unnamed: 0" on a plain read_csv.
df.to_csv("FINAL_DATASETS/df_w_column_groups.csv", index=True)

In [89]:
# Persist the per-feature cluster centres. The index (the KMeans cluster id,
# `group_id`) is written too; index=True is the pandas default, spelled out here.
# NOTE(review): this cell's saved execution count (89) is lower than those of the
# grouping cells (125-137); re-run it after them so the file matches `centroids`.
centroids.to_csv("FINAL_DATASETS/column_centroids.csv", index=True)

In [141]:
import seaborn as sns

# Render 50 samples of seaborn's "icefire" palette as 'rgb(r, g, b)' strings.
# NOTE(review): the components are floats in the 0-1 range; a consumer that
# expects 0-255 channel values would need them rescaled - confirm the target.
[f"rgb{color}" for color in sns.color_palette("icefire", n_colors=50)]

['rgb(0.68629021, 0.86497329, 0.84398721)',
 'rgb(0.63117745, 0.827294, 0.83113431)',
 'rgb(0.57489525, 0.7907688, 0.82022901)',
 'rgb(0.51728854, 0.75509528, 0.81194156)',
 'rgb(0.45770893, 0.72006648, 0.80667756)',
 'rgb(0.3976693, 0.68498786, 0.80448272)',
 'rgb(0.34358916, 0.64853177, 0.8043)',
 'rgb(0.29623491, 0.61072284, 0.80569021)',
 'rgb(0.25588696, 0.57157984, 0.80875922)',
 'rgb(0.22397687, 0.53086094, 0.8137141)',
 'rgb(0.21409503, 0.48734861, 0.81566163)',
 'rgb(0.23176013, 0.44119137, 0.80494325)',
 'rgb(0.25950239, 0.39535521, 0.77376848)',
 'rgb(0.28023585, 0.35244234, 0.72622103)',
 'rgb(0.28862749, 0.31519824, 0.66278813)',
 'rgb(0.28233561, 0.28527482, 0.58742866)',
 'rgb(0.2659204, 0.25949691, 0.511417)',
 'rgb(0.24433654, 0.23539226, 0.4397245)',
 'rgb(0.22048761, 0.21211723, 0.37372555)',
 'rgb(0.19619947, 0.18972425, 0.31383846)',
 'rgb(0.17300684, 0.16876921, 0.26024893)',
 'rgb(0.15277065, 0.14994111, 0.21368395)',
 'rgb(0.13667798, 0.13446606, 0.17493774)',
 