In [1]:
import numpy as np
import pandas as pd
import random
import warnings

In [2]:
# import data
centroid_locations = pd.read_csv(r"C:\Users\dodie\Downloads\spring 2024\trio-capstone\data\CensusTractCentroids.csv")
metro_locations = pd.read_csv(r"C:\Users\dodie\Downloads\spring 2024\trio-capstone\data\MetroLinkStations_REGISTERED.csv")
census_data = data = pd.read_csv(r"C:\Users\dodie\Downloads\spring 2024\trio-capstone\data\B08119_stl_city.csv")

In [3]:
# clean census data

warnings. filterwarnings('ignore')

# Only keep important columns
census_data = census_data[['location','label','estimate','moe']]
census_data['label'] = census_data['label'].str.replace('Estimate', '').str.replace('Total', '').str.replace('!!', '').str.replace(':', '')
census_data['location'] = census_data['location'].str.replace('; St. Louis city; Missouri','')

# Filter out aggregate columns
all_totals = census_data[census_data['label']=='']
trans_totals_mask = ~census_data['label'].astype(str).str.contains('[$]')
trans_totals = census_data.loc[trans_totals_mask][1:]
income_totals = census_data[census_data['label'].str.startswith("$")]
not_to_include = pd.concat([trans_totals, income_totals, all_totals])
merged_df = pd.merge(census_data, not_to_include, how='left', indicator=True)
no_totals = merged_df[merged_df['_merge'] == 'left_only'].drop(columns=['_merge'])[1:]
split_data = no_totals['label'].str.split('$', 1, expand=True)
no_totals[['transportation', 'income']] = split_data
no_totals.drop(columns=['label'], inplace=True)
no_totals['income'] = '$' + no_totals['income']
from_this = no_totals['income'].unique()
to_this = [1, 10000, 15000, 25000, 35000, 50000, 65000, 75000]
no_totals['avg income'] = no_totals['income'].replace(from_this, to_this)
clean_census_data = no_totals

# Group by census tract and mode of transportation
by_tract_trans = clean_census_data.groupby(['location','transportation'])['estimate'].sum()

# Find percentage of people who take public transport in each tract
tract_pops = clean_census_data.groupby('location', as_index = False)['estimate'].sum()['estimate'].tolist()
public_transit_pops = clean_census_data[clean_census_data['transportation']=='Public transportation (excluding taxicab)'].groupby(['location','transportation'], as_index = False)['estimate'].sum()['estimate'].tolist()

pct_public_trans = []
for i in range(len(tract_pops)):
    pct_public_trans.append(public_transit_pops[i]/tract_pops[i])

# Add percentages back in to groupby
by_tract = clean_census_data.groupby('location', as_index = False)['estimate'].sum()
by_tract['public transport count'] = public_transit_pops
by_tract['public transport %'] = pct_public_trans

In [4]:
# merge census tract location data with census data

centroid_locations_small = centroid_locations[['NAMELSAD','INTPTLAT','INTPTLON']]
centroid_locations_small = centroid_locations_small.rename(columns = {"NAMELSAD": "location", "INTPTLAT" : "lat", "INTPTLON" : "lon"})

full_data = by_tract.merge(centroid_locations_small, on='location')
full_data.head()

Unnamed: 0,location,estimate,public transport count,public transport %,lat,lon
0,Census Tract 1011,1039,31,0.029836,38.554718,-90.27362
1,Census Tract 1012,1921,11,0.005726,38.561824,-90.274863
2,Census Tract 1013,2161,35,0.016196,38.56656,-90.264211
3,Census Tract 1014,1502,95,0.063249,38.562243,-90.253522
4,Census Tract 1015,1318,32,0.024279,38.548544,-90.264511


In [5]:
# weighted k-means function!

def weighted_kmeans(data, k):
    n = len(data[0]) # number of census tracts
    stop = False
    labels = np.zeros(n) # assigns each centroid to a cluster
    centroids = np.zeros([k, 2]) # centroid for each cluster
    count = 0

    # randomly initialize centroids to be somewhere in the STL City region (between min and max of data)
    for i in range(k):
        centroids[i][0] = round(random.uniform(np.min(data[0]), np.max(data[0])), 6)
        centroids[i][1] = round(random.uniform(np.min(data[1]), np.max(data[1])), 6)
        
    # iterate until labels do not change (or at least twice)
    while stop == False or count < 2:
        count = count + 1
        old_labels = labels

        # Calculating labels by finding nearest centroid for each census tract
        for i in range(n): # for each tract
            closest_distance = float('inf')
            tract_center = np.array([data[0][i], data[1][i]])
            for j in range(k): # for each centroid/cluster
                centroid = np.array([centroids[j][0], centroids[j][1]])
                dist = np.linalg.norm(tract_center - centroid)
                if (dist < closest_distance):
                    closest_distance = dist
                    labels[i] = j

        # Check if any of the labels changed
        all_same = True
        for i in range(n):
            if (labels[i] != old_labels[i]):
                all_same = False
        if all_same == True:
            stop = True

        # updating centroid locations as the (weighted?) mean of each census tract in its cluster
        new_centroids = np.zeros([k,2])
        for i in range(k): # for each cluster
            weights = []
            cluster = []
            for j in range(n): # for each tract
                if (labels[j] == i):
                    weights.append(data[2][j]) # weights will not all sum to 1 (different number in each cluster)
                    cluster.append(np.array([data[0][j], data[1][j]]))

            new_x = [point[0] for point in cluster]
            new_y = [point[1] for point in cluster]
            normalized_weights = np.array(weights)/np.sum(weights)

            if len(new_x) > 1: # if there are points in the cluster
                new_centroids[i][0] = np.dot(normalized_weights, new_x)
                new_centroids[i][1] = np.dot(normalized_weights, new_y)
            else:
                new_centroids[i][0] = centroids[i][0]
                new_centroids[i][1] = centroids[i][1]

        centroids = new_centroids
        
        return centroids, labels

In [6]:
# data is coordinates of tracts and weights
data = [list(full_data['lon']), list(full_data['lat']), list(full_data['public transport count'])]
k = 12

centroids, labels = weighted_kmeans(data, k)

print(f'Final centroids: {centroids} \n')

print(f'Final centroid labels: {labels} \n')
print(f'{len(np.unique(labels))} out of {k} total clusters have tracts in them!')

Final centroids: [[-90.2508135   38.70578821]
 [-90.26598479  38.59538746]
 [-90.19549304  38.64841085]
 [-90.23081195  38.59119676]
 [-90.24438463  38.66100377]
 [-90.25555865  38.56534393]
 [-90.22665184  38.70193049]
 [-90.28507388  38.65126426]
 [-90.19432573  38.62718119]
 [-90.21857537  38.6193805 ]
 [-90.27918232  38.67388788]
 [-90.30279419  38.6125421 ]] 

Final centroid labels: [ 5.  5.  5.  5.  5.  5.  1.  1.  5.  1.  5. 11. 11. 11. 11. 11. 11. 11.
  7.  7.  7. 10.  7. 10. 10. 10.  4.  4.  4.  4.  0.  0.  0.  4.  4.  6.
  0.  6.  6.  6.  4.  4.  4.  4.  6.  4.  4.  4.  7.  4.  4.  4.  1.  1.
  1.  1.  1.  1.  1.  1.  5.  5.  3.  3.  1.  1.  1.  1.  3.  3.  1.  1.
  9.  4.  4.  4.  4.  4.  4.  2.  4.  9.  9.  3.  3.  3.  3.  3.  8.  8.
  2.  2.  2. 11.  4.  6.  2.  1.  9.  9.  9.  8.  4.  4.] 

12 out of 12 total clusters have tracts in them!


In [7]:
full_data['cluster'] = labels
full_data.head()

Unnamed: 0,location,estimate,public transport count,public transport %,lat,lon,cluster
0,Census Tract 1011,1039,31,0.029836,38.554718,-90.27362,5.0
1,Census Tract 1012,1921,11,0.005726,38.561824,-90.274863,5.0
2,Census Tract 1013,2161,35,0.016196,38.56656,-90.264211,5.0
3,Census Tract 1014,1502,95,0.063249,38.562243,-90.253522,5.0
4,Census Tract 1015,1318,32,0.024279,38.548544,-90.264511,5.0


In [None]:
# next idea: add a new column to the dataframe for "distance to cluster centroid"
# then, we can run both weighted and unweighted k-means and see how helpful weighting by population was