# Fitting Equal-Sized K-Means

NOTE: This notebook will not run without the data directory at the root of the repository. The address files are too large to store on GitHub; download them from [https://openaddresses.io](https://openaddresses.io) and save them in `gerrymandering/data`, with one sub-directory per lowercase state abbreviation (e.g. `gerrymandering/data/nc/`).

In [2]:
# Import packages

## Core packages
import pandas as pd
import numpy as np

## Directory manipulation
import glob
import os

## Fitting and mathematics
import k_means_eq
# from geovoronoi import voronoi_regions_from_coords

## Plotting
import matplotlib.pyplot as plt
import plotly.express as px
# import geopandas as gpd
# from geovoronoi.plotting import subplot_for_map, plot_voronoi_polys_with_points_in_area

## Quality of life
from tqdm.notebook import tqdm

In [3]:
# Load the per-state summary table (provides the `state_abbrev` column used below)
state_summary_path = "../../dash/state_summary.csv"
df_state_info = pd.read_csv(state_summary_path)

In [5]:
# Consolidate address files (if separated)
## Set static variables
root_directory = os.path.dirname(os.path.dirname(os.getcwd()))
extension = 'csv'
## Loop through every available state; raw files are expected in data/<abbrev>/
for state_abbrev in tqdm(df_state_info["state_abbrev"].str.lower()):
    working_directory = root_directory + "/data/" + state_abbrev + "/"
    ## Glob with the full path instead of calling os.chdir(): changing the
    ## kernel's working directory would change what os.getcwd() returns on a
    ## re-run of this cell (and therefore root_directory) and would affect the
    ## relative paths used by other cells. A directory that was never
    ## downloaded simply yields no matches, so that state is skipped.
    files = sorted(
        glob.glob(os.path.join(working_directory, '*.{}'.format(extension)))
    )
    ## Exclude the statewide extract when per-region files are present.
    ## The comparison is made on the file name without its extension, because
    ## glob returns names such as "statewide.csv", never bare "statewide".
    filtered_files = [
        path for path in files
        if os.path.splitext(os.path.basename(path))[0] != "statewide"
    ]
    ## If the statewide extract is the only file, use it rather than nothing
    if not filtered_files:
        filtered_files = files
    ## Nothing to consolidate (pd.concat raises ValueError on an empty list)
    if not filtered_files:
        continue
    combined_files = pd.concat([
        pd.read_csv(f, low_memory=False) for f in filtered_files
    ])
    combined_files.to_csv(
        root_directory + "/data/" + state_abbrev + ".csv", index=False
    )

HBox(children=(FloatProgress(value=0.0, max=51.0), HTML(value='')))




#### NOTE: Some fit times will take much longer than others. We will go through one example of how to fit data using our algorithm.

In [6]:
# Read in addresses from North Carolina
## Only the longitude/latitude coordinates are needed, so restrict parsing to
## those two columns instead of loading every column and discarding the rest.
coordinate_columns = ["LON", "LAT"]
df_state = pd.read_csv(
    root_directory + "/data/nc.csv",
    usecols=coordinate_columns,
    low_memory=False,
)
## `usecols` keeps the file's column order, so enforce LON, LAT explicitly
df_state = df_state[coordinate_columns]
## Plain array of (LON, LAT) rows for the clustering model
array_state = np.array(df_state)

In [10]:
# Fit the equal-size KMeans model; one cluster per congressional seat
n_congressional_seats = 13
clf = k_means_eq.EqualGroupsKMeans(n_clusters=n_congressional_seats)
clf.fit(array_state)
# Keep the fitted centers and attach each address's cluster label to the frame
centroids = clf.cluster_centers_
df_state["CLUSTER"] = clf.labels_

In [49]:
# Plot Voronoi regions for `coords` clipped to the outline of the United States.
# NOTE(review): this cell cannot run on a fresh kernel as written:
#   - `gpd`, `voronoi_regions_from_coords`, `subplot_for_map` and
#     `plot_voronoi_polys_with_points_in_area` come from imports that are
#     commented out in the imports cell;
#   - `coords` is not assigned in any cell of this notebook.
# NOTE(review): the outline is reprojected to EPSG:3395 below; presumably
# `coords` must be in the same CRS -- confirm before re-enabling this cell.
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
# Select the row(s) whose name is the United States
area = world[world.name == 'United States of America']

area = area.to_crs(epsg=3395)    # convert to World Mercator CRS
area_shape = area.iloc[0].geometry   # get the Polygon

# Compute the Voronoi regions of `coords` within `area_shape`
poly_shapes, pts, poly_to_pt_assignments = voronoi_regions_from_coords(coords, area_shape)

# Draw the regions and their points on a map subplot
fig, ax = subplot_for_map()
plot_voronoi_polys_with_points_in_area(ax, area_shape, poly_shapes, coords, poly_to_pt_assignments)
plt.show()


'+init=<authority>:<code>' syntax is deprecated. '<authority>:<code>' is the preferred initialization method.



In [15]:
# Save final dataframe with labels as .csv

## Build the output from `df_state`, the LON/LAT frame whose coordinates were
## clustered; `X` is never defined in this notebook, so referencing it raises
## NameError on a fresh kernel.
X_plot = pd.DataFrame(dict(lon=df_state["LON"], lat=df_state["LAT"], label=clf.labels_))
## Per-cluster grouping of the labelled addresses
groups = X_plot.groupby('label')
## NOTE(review): the output directory is "ar" although the data loaded above is
## nc.csv -- confirm the intended destination before running.
X_plot.to_csv("../../usa/ar/clustered.csv")