# Prerequisites

You need to have the `powerlaw` package installed in order to run any of the following. Installation is easy through `pip`:

In [None]:
!pip install powerlaw

# Analysis

## Basic Setup

In [None]:
import data # local module
import resulthelpers # local module
import powerlaw
import numpy
import scipy.stats
import pandas
import math
import os
import re
from matplotlib import pyplot
%matplotlib inline
# Tell numpy to ignore divide-by-zero and invalid-value floating-point errors
# instead of warning about them. Background:
# https://github.com/jeffalstott/powerlaw/issues/25
numpy.seterr(divide="ignore", invalid="ignore")

In [None]:
# Registry of the datasets analysed in this notebook, keyed by display name.
# Every entry carries the same four fields:
#   get_data          -- loader function from the local `data` module
#   params            -- positional arguments passed to that loader
#   population_cutoff -- lower population bound; also used as xmin for the fit
#   discrete          -- whether the bootstrap script is given --discrete
# NOTE(review): a few labels do not obviously match their cutoff (e.g.
# "CCA Tract (10k+)" uses 4000) -- confirm these values are intended.
datasets = {
    "CCA Tract": dict(
        get_data=data.get_cca_tract_data, params=[3000],
        population_cutoff=1, discrete=True,
    ),
    "CCA Tract (10k+)": dict(
        get_data=data.get_cca_tract_data, params=[3000],
        population_cutoff=4000, discrete=True,
    ),
    "CCA Tract (3+)": dict(
        get_data=data.get_cca_tract_data, params=[3000, 3],
        population_cutoff=12000, discrete=True,
    ),
    "CCA Tract (3+, 20k+)": dict(
        get_data=data.get_cca_tract_data, params=[3000, 3],
        population_cutoff=20000, discrete=True,
    ),
    "CCA Raster": dict(
        get_data=data.get_cca_raster_data, params=[1000, 5],
        population_cutoff=4000, discrete=False,
    ),
    "CCA Block": dict(
        get_data=data.get_cca_block_data, params=[1000, 1000],
        population_cutoff=50, discrete=True,
    ),
    "Census UA/UC": dict(
        get_data=data.get_census_uauc_data, params=[],
        population_cutoff=2500, discrete=True,
    ),
    "Census Place": dict(
        get_data=data.get_census_place_data, params=[],
        population_cutoff=1, discrete=True,
    ),
    "Census Place (1k+)": dict(
        get_data=data.get_census_place_data, params=[],
        population_cutoff=1000, discrete=True,
    ),
    "Census Place (5k+)": dict(
        get_data=data.get_census_place_data, params=[],
        population_cutoff=5000, discrete=True,
    ),
    "Census Place (12k+)": dict(
        get_data=data.get_census_place_data, params=[],
        population_cutoff=12000, discrete=True,
    ),
    "Census CBSA": dict(
        get_data=data.get_census_cbsa_data, params=[],
        population_cutoff=10000, discrete=True,
    ),
    "Census CBSA (20k+)": dict(
        get_data=data.get_census_cbsa_data, params=[],
        population_cutoff=20000, discrete=True,
    ),
    "Census CBSA (50k+)": dict(
        get_data=data.get_census_cbsa_data, params=[],
        population_cutoff=50000, discrete=True,
    ),
    "Census CBSA (100k+)": dict(
        get_data=data.get_census_cbsa_data, params=[],
        population_cutoff=100000, discrete=True,
    ),
    "CCA Street Network": dict(
        get_data=data.get_cca_street_network_data, params=["full", 5000000],
        population_cutoff=10, discrete=False,
    ),
}

In [None]:
def analyze_dataset(name, outputdir = "./results/lognormal"):
    """Fit a lognormal (truncated at the cutoff) to one dataset and plot the fit.

    Loads the dataset registered under `name` in `datasets`, keeps the
    populations at or above its `population_cutoff`, and fits them with
    `powerlaw.Fit` using that cutoff as xmin. Two figures are written into
    `outputdir`:

    * ``<slug>-truncatedln.png`` -- empirical CCDF with the fitted lognormal CCDF
    * ``<slug>-hist.png`` -- normalised histogram of log(population) with the
      fitted truncated-normal density overlaid

    ``<slug>`` is `name` lower-cased with every character other than a-z and
    0-9 removed.

    Args:
        name: key into the module-level `datasets` dict.
        outputdir: directory that receives the figures; created if missing.

    Returns:
        dict with the keys name, col, x_min, n, mu, sigma, ks, lratio,
        p_lratio and batchid (the latter computed by
        `resulthelpers.lognormal_batch_id` from the other entries).
    """
    dataset = datasets[name]
    x_min = dataset["population_cutoff"]
    points = dataset["get_data"](*dataset["params"])["population"]
    points = points[points >= x_min]

    fit = powerlaw.Fit(data=points, xmin=x_min)
    mu = fit.lognormal.mu
    sigma = fit.lognormal.sigma
    ks = fit.lognormal.KS()
    # Loglikelihood-ratio comparison of the power-law and lognormal fits.
    ratio, p = fit.distribution_compare("power_law", "lognormal")

    # savefig does not create missing directories, so make sure it exists.
    os.makedirs(outputdir, exist_ok=True)
    filename = os.path.join(outputdir, re.sub("[^a-z0-9]", "", name.lower()))

    # Figure 1: empirical CCDF (points) against the fitted lognormal CCDF (line).
    fig = pyplot.figure()
    ax = fig.add_subplot(111)
    ax.set_xlabel(name + " population")
    ax.set_ylabel("CCDF")
    ccdf = fit.ccdf()
    ax.plot(ccdf[0], ccdf[1], '.', color="#00000030", markersize=8, markeredgewidth=0)
    fit.lognormal.plot_ccdf(ax=ax, color="#0099cc", linewidth=1.5)
    fig.savefig(filename + "-truncatedln.png", bbox_inches="tight", dpi=300)
    pyplot.close(fig)

    # Figure 2: histogram of log(population) against the fitted density.
    fig = pyplot.figure()
    ax = fig.add_subplot(111)
    ax.set_xlabel("log(" + name + " population)")
    ax.set_ylabel("Density")
    # density=True normalises the bars to unit area so they share a scale with
    # the pdf drawn below (raw counts would dwarf the curve).
    ax.hist(numpy.log(points), bins=50, color="#00000030", density=True)

    # In log space the fit is a normal truncated below at log(x_min); truncnorm
    # takes its bounds in standardised units, i.e. (bound - loc) / scale.
    a = (math.log(x_min) - mu) / sigma
    b = math.inf
    t_rv = scipy.stats.truncnorm(a, b, mu, sigma)
    x = numpy.linspace(t_rv.ppf(1e-6), t_rv.ppf(1 - 1e-6), 100)
    ax.plot(x, t_rv.pdf(x), color="#0099cc", linewidth=1.5)

    fig.savefig(filename + "-hist.png", bbox_inches="tight", dpi=300)
    pyplot.close(fig)

    ret = {
        "name": name,
        "col": "population", # we're only doing population
        "x_min": x_min,
        "n": len(points),
        "mu": mu,
        "sigma": sigma,
        "ks": ks,
        "lratio": ratio,
        "p_lratio": p
    }
    ret["batchid"] = resulthelpers.lognormal_batch_id(**ret)
    return ret


In [None]:
results_filename = "./results/lognormal-results.tsv"

def save_results(df):
    """Write the results table to `results_filename` as tab-separated text.

    The DataFrame index is written as the first column. The parent directory
    of `results_filename` is created if it does not exist yet.
    """
    os.makedirs(os.path.dirname(results_filename) or ".", exist_ok=True)
    df.to_csv(results_filename, sep="\t")

def load_results():
    """Read back the table written by `save_results`.

    Returns:
        pandas.DataFrame with the first file column restored as the index.
    """
    return pandas.read_csv(results_filename, sep="\t", index_col=0)

def analyze_all_datasets():
    """Run `analyze_dataset` on every entry of `datasets` and tabulate the output.

    Returns:
        pandas.DataFrame with one row per dataset, built from the dicts that
        `analyze_dataset` returns.
    """
    records = [analyze_dataset(dataset_name) for dataset_name in datasets]
    return pandas.DataFrame.from_records(records)

save_results(analyze_all_datasets())

## Write Bootstrap Files

As in the main analysis, we write the bootstrap simulations out as SLURM batch scripts to be run on an HPC cluster.

In [None]:
def write_bootstrap_batch_files(trials=625, batchdir="batch"):
    """Write one SLURM batch script per fitted dataset for the bootstrap runs.

    Reads the fit results through `load_results()` and, for each row, writes
    ``<batchdir>/ln<i>.sh`` (i counting from 1). Each script runs
    ``simulate_lognormal.py`` with the positional arguments mu, sigma, x_min,
    n and batchid, followed by ``--trials`` and, for datasets flagged as
    discrete in `datasets`, ``--discrete``.

    NOTE(review): `load_results` must be defined in an earlier cell for this
    to run on a fresh kernel.

    Args:
        trials: value passed as ``--trials`` to the simulation script.
        batchdir: directory that receives the scripts; created if missing.
    """
    os.makedirs(batchdir, exist_ok=True)
    records = load_results().to_records()
    for i, record in enumerate(records, start=1):
        args = [
            record["mu"],
            record["sigma"],
            record["x_min"],
            record["n"],
            record["batchid"]
        ]

        args.append("--trials " + str(trials))

        if datasets[record["name"]]["discrete"]:
            args.append("--discrete")

        with open(os.path.join(batchdir, "ln" + str(i) + ".sh"), "w") as f:
            f.write("#!/bin/bash\n")
            f.write("#SBATCH --partition=batch\n")
            f.write("#SBATCH --cpus-per-task=20\n")
            f.write("#SBATCH --ntasks=4\n")
            # No time limit is requested for the street network run.
            if record["name"] != "CCA Street Network": # those take forever...
                f.write("#SBATCH --time=6:00:00\n")
            f.write("#SBATCH --output=results/lognormal/%j.out\n\n")
            f.write("# " + record["name"] + " lognormal\n")

            # NOTE(review): /path/to/python looks like a placeholder for the
            # interpreter path on the cluster -- confirm before submitting.
            command = "srun /path/to/python simulate_lognormal.py " + " ".join(map(str, args))
            # End the script with a newline so it is a well-formed text file.
            f.write(command + "\n")

write_bootstrap_batch_files()

In [None]:
# List the simulation runs whose KS statistic ("fit_ks") is null.
# These NaN KS results are errors due to lack of precision on extreme parameters.
sim_results = resulthelpers.load_lognormal_simulation_results("./batch/all-results-lognormal.tsv")
sim_results[pandas.isnull(sim_results["fit_ks"])]

## Kernel Density Estimates

In [None]:
def generate_combined_kde():
    """Draw stacked KDE panels of ln(population) for three datasets and save them.

    One panel per dataset, sharing the x-axis. Each panel shows a Gaussian
    kernel density estimate of the log of all positive populations and a dotted
    vertical line at the log of that dataset's population cutoff. The figure is
    written to ``illustrations/combined-pop-density.png``.
    """
    # NOTE(review): the per-dataset "color" values are not used below; every
    # curve is drawn in "#004454".
    kde_datasets = {
        "CCA Block": {
            "get_data": data.get_cca_block_data, "params": [1000, 1000],
            "population_cutoff": 50, "color": "#ff9900",
        },
        "Census Place": {
            "get_data": data.get_census_place_data, "params": [],
            "population_cutoff": 5000, "color": "#cc0033",
        },
        "CCA Raster": {
            "get_data": data.get_cca_raster_data, "params": [1000, 5],
            "population_cutoff": 4000, "color": "#006699",
        },
    }

    panel_count = len(kde_datasets)
    fig, axes = pyplot.subplots(panel_count, 1, sharex=True, figsize=(10,6))
    grid = numpy.linspace(-2, 17, 5000)

    for position, (label, spec) in enumerate(kde_datasets.items(), start=1):
        panel = axes[position - 1]
        frame = spec["get_data"](*spec["params"])

        # Only strictly positive populations have a logarithm.
        is_positive = frame["population"] > 0
        log_population = numpy.log(frame["population"][is_positive])
        density = scipy.stats.gaussian_kde(log_population, bw_method=.02)

        panel.set_title(label + " Log Population Density")
        # Only the bottom panel carries the shared x-axis label.
        if position == panel_count:
            panel.set_xlabel("ln(population)")
        panel.plot(grid, density(grid), color="#004454")
        panel.axvline(numpy.log(spec["population_cutoff"]), color="#00000080", linestyle="dotted")

    pyplot.tight_layout()
    fig.savefig("illustrations/combined-pop-density.png", dpi=300)
    pyplot.close(fig)

generate_combined_kde()