This script takes Texas Education Agency data about school district demographics and disciplinary actions, and puts them together in one GeoJSON file for the Texas Appleseed "School to Prison Pipeline" map. See http://www.texasdisciplinelab.org/

To use the script, follow these instructions:

1. For every year that you want to cover, download all 20 of the region files from http://rptsvr1.tea.texas.gov/adhocrpt/Disciplinary_Data_Products/Download_Region_Districts.html and put them in the directory '../data/from_agency/by_region/'

2. For every year that you want to cover, download the "District and Charter Detail Data" Snapshot Data File (comma-delimited *.dat)" from https://rptsvr1.tea.texas.gov/perfreport/snapshot/download.html. The website automatically delivers these files with the same filename: district.dat. You will need to rename them to have different names by adding the year after "district". For instance, "district2016.dat"

3. This script needs a GeoJSON file of district shapes. Make sure it can find that file at '../geojson/base_districts.geojson'

4. Change the first_year and last_year variables below to reflect the years you want your file to cover.

5. Run the notebook with "Kernel -> Restart and Run All"

6. Wait a while for it to finish. After about 15 minutes, the notebook should produce 'districts_with_data.geojson' in the '../geojson/' directory.

7. The resulting file will be about 20 MB depending on how many years it covers. You can make it smaller (about 10 MB) by uploading it to http://mapshaper.org/, using the "simplify" function to reduce the number of lines in the district boundaries, and exporting the file as TopoJSON instead of GeoJSON. I did this and put the result in the '../topojson/' directory.

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats

pd.options.display.max_columns = 999

first_year = 2006 # the year 2006 is the first year on the TEA site
last_year = 2016


def formatDF(apple, year_col):
    
    # Removes rows and columns not needed for the map
    
    apple = apple.drop(["AGGREGATION LEVEL","REGION","DISTNAME"], axis = 1)
    
    # Adding totals for each discipline in each district, by adding up actions 
    # against special ed students and non-special ed students. This will be 
    # inefficient because it makes a dict list first instead of staying in pandas.
    
    non_special = {"D06": ("D05","D-EXPULSION ACTIONS"), 
                   "D09": ("D08","E-DAEP PLACEMENTS"), 
                   "D12": ("D11", "F-OUT OF SCHOOL SUSPENSIONS"), 
                   "D15": ("D14", "G-IN SCHOOL SUSPENSIONS")}
    
    all_actions = []
    
    # if it was a .csv, the headers would be ["DISTRICT", "SECTION", "HEADING", "HEADING NAME", year_col]
    
    unfound = []
    
    for d in apple.index.get_level_values(0).unique():
        for key in non_special:
            try: 
                a = apple.loc[(d, key)][year_col]
            except KeyError:
                a = 0
            try:
                b = apple.loc[(d, non_special[key][0])][year_col]
            except KeyError:
                b = 0
            if a < 0: # in case of dummy values like -999
                a = 1
            if b < 0:
                b = 1
            total = a + b
            all_actions.append({"DISTRICT": d, "HEADING": key, "SECTION": non_special[key][1], 
                                "HEADING NAME": "ALL", year_col: total})
    
    new = pd.DataFrame(all_actions)
    new = new.set_index(["DISTRICT", "HEADING"])

    # Keeping only the rows that categorize students by protected class.
    
    patternIn = 'WHITE|BLACK OR AFRICAN AMERICAN|AMERICAN INDIAN OR ALASKA NAT|HISPANIC|NATIVE HAWAIIAN|ASIAN|TWO OR MORE RACES|SPEC. ED|ECO. DISAD|ECO DISAD.|TOTAL'
    apple = apple[apple["HEADING NAME"].str.contains(patternIn)]
    
    # Getting rid of rows that count students instead of incidents, or non-disadvantaged kids.
    
    patternOut = 'SPEC. ED. STUDENTS| SPEC. ED. EXPULSIONS TO JJAEP|ECO DISAD. STUDENTS|ECO. DISAD. STUDENTS|AT RISK|NON AT|UNKNOWN AT|NON SPEC. ED.|NON ECO DISAD.|NON ECO. DISAD.'
    apple = apple[apple["HEADING NAME"].str.contains(patternOut) == False]

    # Delete rows appearing to double-count the same expulsions.
    
    JJAEPReplace = {"SECTION": {
                        'M-ECO\. DISADV\. JJAEP PLACEMENTS|H-SPEC\. ED\. JJAEP EXPULSIONS': 'C-JJAEP EXPULSIONS'}}
    apple = apple.replace(to_replace=JJAEPReplace, regex=True)
    apple = apple[apple["SECTION"].str.contains("JJAEP EXPULSIONS|DISCIPLINE ACTION COUNTS") == False]
                    
    apple = apple.append(new)
    
    # Consolidating some of the descriptors into broader categories
    
    appleReplace = {year_col:
                        {-99999999: 1, -999999: 1, -999: 1},
                    "SECTION": {
                        'D-EXPULSION ACTIONS|N-ECO\. DISADV\. EXPULSIONS|I-SPEC\. ED\. EXPULSIONS': 'EXP',
                        'E-DAEP PLACEMENTS|O-ECO\. DISADV\. DAEP PLACEMENTS|J-SPEC\. ED\. DAEP PLACEMENTS': 'DAE',
                        'F-OUT OF SCHOOL SUSPENSIONS|P-ECO\. DISADV\. OUT OF SCHOOL SUS.|K-SPEC\. ED\. OUT OF SCHOOL SUS\.': 'OSS',
                        'G-IN SCHOOL SUSPENSIONS|Q-ECO\. DISADV\. IN SCHOOL SUS\.|L-SPEC\. ED\. IN SCHOOL SUS\.': 'ISS'},
                    "HEADING NAME": {'SPEC\. ED.*$': 'SPE',
                                     'ECO?. DISAD.*$': 'ECO',
                                     'HISPANIC': 'HIS',
                                     'HIS/LATINO': 'HIS',
                                     'HISPANIC/LATINO': 'HIS',
                                     'BLACK OR AFRICAN AMERICAN': 'BLA',
                                     'BLACK/AFRICAN AMERICAN': 'BLA',
                                     'WHITE': 'WHI',
                                     'AMERICAN INDIAN OR ALASKA NAT': 'IND',
                                     'ASIAN': 'ASI',
                                     'NATIVE HAWAIIAN/OTHER PACIFIC': 'PCI',
                                     'TWO OR MORE RACES': 'TWO',
                                    }
                    }

    apple = apple.replace(to_replace=appleReplace, regex=True)
    
    return apple

def getYear(year):
    year_col = "YR{}".format(str(year)[-2:])
    apple_path = '../data/from_agency/by_region/REGION_{}_DISTRICT_summary_{}.csv'
    one_year = [pd.read_csv(apple_path.format(str(region).zfill(2),str(year)[-2:]), 
                            index_col = ["DISTRICT","HEADING"], dtype = {year_col: int})
                for region in range(1,21)]
    a = pd.concat(one_year)
    
    # a = a.set_index(["DISTRICT","HEADING"] )
    a = a[~a.index.duplicated(keep='last')]  # a single row was causing a non-unique multiindex error 
    # print(a.loc[31901])
    a = formatDF(a, year_col)
    
        # the path to the files in the district demographics directory
    districtPath = '../data/from_agency/districts/district{}.dat'.format(year)
    district = populations(districtPath)

    statewide_students_count = district["DPETALLC"].sum()

    apple = a.reset_index()
    appleAll = apple[apple["HEADING NAME"] == "ALL"].rename({year_col: "all_punishments"}, axis = 1).drop(["HEADING NAME", "HEADING"], axis = 1)
    print(appleAll[:5])
    apple = apple.merge(district, how = "left", left_on = "DISTRICT", right_index = True)
    
    apple = apple[apple["DPETALLC"].notnull()]
    
    punishment_totals = {}
    for p in apple["SECTION"].unique():
        punishment_totals[p] = apple[apple["SECTION"] == p][apple["HEADING NAME"] == "ALL"][year_col].sum()
        
    # apple[18464:18470]  previous problem rows, gone because of the .notnull()
    
    apple = apple.merge(appleAll, how = "left", left_on = ["DISTRICT","SECTION"], right_on = ["DISTRICT","SECTION"])
    
    """    
    # This line will run slowly because for each row, it searches the entire dataframe
    apple["all_punishments"] = apple.apply(lambda x: 
                                               apple[apple["DISTRICT"] == x["DISTRICT"]][apple["SECTION"] == x["SECTION"]][apple["HEADING NAME"] == "ALL"][year_col].values[0], axis=1)
    """
    
    # New column will show the district's percentage of the state's student population.
    district["DPETALLP"] = district.apply(lambda x: x["DPETALLC"] / statewide_students_count * 100, axis=1).round(2)
    
    apple["LikelyError"] = apple.apply(lambda x: getLE(x, year_col), axis=1)
    apple["Scale"] = apple.apply(lambda x: getScale(x, year_col, punishment_totals, statewide_students_count), axis=1)
    apple["Percentage"] = apple.apply(lambda x: getPercentage(x, year_col, punishment_totals), axis=1).round(3)
    
    apple["Year"] = year
    district["Year"] = year
    apple = apple.rename(index=str, columns={year_col: "Count"})
    
    # apple = apple.set_index(["DISTRICT","SECTION","HEADING NAME", "Year"])
    # apple = apple.sort_index() # trying to improve speed
    
    return apple[["DISTRICT","SECTION","HEADING NAME", "Year", "Count", "LikelyError", "Percentage", "Scale"]], district

In [2]:
def populations(districtPath):
    district = pd.read_csv(districtPath, index_col="DISTRICT")

    district = district.rename(columns = {"SNAPDIST": 'DISTNAME'})
    
    sometimes_missing = [ 'DPETINDP', 'DPETASIP', 'DPETPCIP', 'DPETTWOP']
    
    for c in sometimes_missing:
        if c not in district.columns:
            district[c] = np.nan
    
    # deleting redundant columns
    
    district = district[['DISTNAME', 'REGION', 'DPETALLC', 
                         'DPETBLAP', 'DPETHISP', 'DPETWHIP', 'DPETINDP',
                         'DPETASIP', 'DPETPCIP', 'DPETTWOP', 'DPETECOP', 
                         'DPETSPEP']] # 'DISTRICT' not listed because it's the index

    return district




def getRatio(distPop, racePop, all_punishments, group_punishments):
    # Calculating ratio of punishments for the demographic group compared to the punishments for the student population
    # as a whole. For instance, "0.505" in the disparity column indicates the group got the punishment 50.5% as often
    # as average for the student population.

    """
    >>> getRatio(200, 20, 20, 10)
    4.0
    >>> getRatio(200, 20, 20, 2)
    0.0
    >>> print(getRatio(200, 0, 20, 0))
    None
    """

    if max(racePop, group_punishments) == 0 or None:
        return None
    elif all_punishments == 0 or None:
        return 0
    else:
        disparity = (group_punishments / (max(all_punishments, group_punishments))
                     / (max(racePop, group_punishments) / distPop)) - 1
        disparity = Decimal(disparity)
        disparity = disparity.quantize(Decimal('0.01'))
    return float(disparity)

In [3]:
def impossible(distPop, raceP, all_punishments, group_punishments):

    """
    >>> print(impossible(50, 20, 20, 100))
    1
    >>> impossible(20, 0, 20, 0)
    0
    """

    # The "RecordError" column flags implausible data entries. Some of them could still be true if school administrators
    # applied different standards different standards to determine which students belong to which demographic group.
    # Or some could be the result of students not being counted because of the time they moved in and out of district.

    if group_punishments > max(all_punishments,8): # eight because TEA could report 2 masked columns with 4 each
        return 1
    if raceP == 0 and group_punishments > 0:
        return 1
    return 0



def getFisher(distPop, racePop, all_punishments, group_punishments):

    """
    >>> getFisher(20, 5, 20, 10)
    2
    >>> getFisher(20, 0, 20, 0)
    None
    """
    
    # I don't know if this is a valid way to report the Fisher's exact test statistic, but the idea is that if getFisher returns a
    # positive number over .95, there's a 95% chance that the group's better-than-average treatment is not due to chance.
    # If it returns a number under -.95, there's a 95% chance that the group's worse-than-average treatment is not due to chance.
    # I think it should be easier to create a color scale to show the scores on a map this way.

    # The getFisher function assumes wrongly that everyone can have only one punishment (of each type). If the number of
    # punishments exceeds the number of kids, it reduces the number of punishments (and assumes wrongly that every
    # kid has been punished) But maybe the results are still close enough to correct to use for scaling?

    
    if max(racePop, group_punishments) == 0 or None:
        return None
    if distPop == 0:
        return None
    elif max(group_punishments, all_punishments) == 0 or None:
        return 0
    else:
        try: 
            oddsratio, pvalueG = stats.fisher_exact([[racePop, max(distPop - racePop, 0)],
                                                 [group_punishments, max(all_punishments - group_punishments, 0)]],
                                                alternative='greater')
        except ValueError:
            print(distPop, racePop, all_punishments, group_punishments)
        oddsratio, pvalueL = stats.fisher_exact([[racePop, max(distPop - racePop, 0)],
                                                 [group_punishments, max(all_punishments - group_punishments, 0)]],
                                                alternative='less')
        if pvalueL < pvalueG:
            pv = 1 - pvalueL
        else:
            pv = pvalueG - 1
        
        # To save space in the JSON, this simplifies the decimal values to an integer from -6 to 6
        # It should replace similar code in txappleseedmap/js/index.js
        
        scale = -6
        scale_colors = (-0.99999,-0.9984,-0.992,-0.96,-0.8,-0.2,0.2,0.8,0.96,0.992,0.9984,0.99999)
        
        for v in scale_colors:
            if pv > v:
                scale += 1
        
        # pv = Decimal(pv)
        # pv = pv.quantize(Decimal('0.000001'))
    return scale

print(getFisher(20, 5, 20, 10))

2


In [4]:
import json
with open("../geojson/base_districts.geojson") as json_data:
    district_map = json.load(json_data)
    json_data.close()

In [5]:
shapeIDs = set()

for shape in district_map["features"]:
    shape["id"] = shape["properties"]["DISTRICT_N"]
    assert shape["id"] not in shapeIDs, "id already in list: %r" % shape["id"]
    shapeIDs.add(shape["id"])
    
    # These two fields look redundant. Let's try deleting them.
    
    shape["properties"].pop("DISTRICT_1", None)
    shape["properties"].pop("OBJECTID_1", None)


In [6]:
type(district_map["features"][1]['geometry']['coordinates'][0][1][1])

float

In [7]:
# For districts overall, need columns that show what percentage of the state population they have
# and what percentage of the punishments?

def getLE(x, year_col):
    
    # Collects the correct values from the dataframes called "apple" and "district"
    # and calls the "impossible" function, which looks for data errors
    
    distPop = x["DPETALLC"]
    if distPop in (0, None, np.nan):
        return 1
    elif x["HEADING NAME"] == "ALL":
        return 0
    else:    
        all_punishments = x["all_punishments"]
        # all_punishments = apple.loc[x["DISTRICT"]][x["SECTION"]]["ALL"]
        group_punishments = x[year_col]
        # trying to make this run faster by returning info for two columns, then splitting them
        raceP = x["DPET{}P".format(x["HEADING NAME"][:3])]
        return impossible(distPop, raceP, all_punishments, group_punishments)


In [8]:
def getScale(x, year_col, punishment_totals, statewide_students_count):
    
    """
    This function does something different for the "HEADING NAME == ALL" rows than for the other rows.
    For the "ALL" rows it uses the whole state population as the "distPop" and uses the entire district population
    as the "racePop". For the other rows, the entire district population is used as "distPop", not "racePop".
    
    And this function calls getFisher for the real calculation.
    """
    
    group_punishments = x[year_col]
    if x["HEADING NAME"] == "ALL":
        distPop = statewide_students_count
        racePop = x["DPETALLC"]
        all_punishments = punishment_totals[x["SECTION"]]
    else:
        distPop = x["DPETALLC"]
        racePop = x["DPET{}P".format(x["HEADING NAME"])] * distPop * .01
        if pd.isna(racePop):
            return None
        if pd.isnull(x["all_punishments"]):
            print("null all_punishments: " + str(x))
        all_punishments = x["all_punishments"]
    return getFisher(distPop, racePop, all_punishments, group_punishments)

def getPercentage(x, year_col, punishment_totals):
    if x["HEADING NAME"] == "ALL":
        return x["all_punishments"] / punishment_totals[x["SECTION"]] * 100
    else:
        return x[year_col] / x["all_punishments"] * 100

In [9]:
# Need to merge columns of apple and district.

years = [x for x in range(first_year, last_year + 1)] # change back to first_year

pop_stats = ("DPETALLC","DPETALLP", "DPETBLAP","DPETHISP","DPETWHIP","DPETINDP","DPETASIP","DPETPCIP",
             "DPETTWOP","DPETECOP","DPETSPEP")

demos = ('ALL','SPE', 'ECO','HIS','BLA', 'WHI','IND', 'ASI','PCI', 'TWO')

punishments = ('EXP','DAE','OSS','ISS')

fail = {} # for testing
noScale = {}


manyDFs = []
districtDFs = []

for year in years:
    print("starting year " + str(year))
    yearDFs = getYear(year)
    manyDFs.append(yearDFs[0])
    districtDFs.append(yearDFs[1])
    
    

starting year 2006
       DISTRICT SECTION  all_punishments
12970     31901     EXP               73
12971     31901     DAE              826
12972     31901     OSS             3144
12973     31901     ISS            15309
12974    108902     EXP               18




starting year 2007
       DISTRICT SECTION  all_punishments
13039     31901     EXP              161
13040     31901     DAE             1083
13041     31901     OSS             3797
13042     31901     ISS            16270
13043    108902     EXP               13
starting year 2008
       DISTRICT SECTION  all_punishments
13041     31901     EXP              167
13042     31901     DAE              924
13043     31901     OSS             4865
13044     31901     ISS            12851
13045    108902     EXP               25
starting year 2009
       DISTRICT SECTION  all_punishments
12939     31901     EXP               56
12940     31901     DAE              841
12941     31901     OSS             4136
12942     31901     ISS            13229
12943    108902     EXP               28
starting year 2010
       DISTRICT SECTION  all_punishments
13065     31901     EXP               58
13066     31901     DAE              889
13067     31901     OSS             4952
13068     31901     IS

In [10]:
yearDFs[1][:5]

Unnamed: 0_level_0,DISTNAME,REGION,DPETALLC,DPETBLAP,DPETHISP,DPETWHIP,DPETINDP,DPETASIP,DPETPCIP,DPETTWOP,DPETECOP,DPETSPEP,DPETALLP,Year
DISTRICT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1902,CAYUGA ISD,7,568,4.2,7.0,82.9,0.0,0.7,0.0,5.1,33.8,13.7,0.01,2016
1903,ELKHART ISD,7,1244,5.4,9.6,81.4,0.1,0.9,0.2,2.6,54.7,11.6,0.02,2016
1904,FRANKSTON ISD,7,841,8.8,10.2,75.9,0.5,0.7,0.1,3.8,56.5,8.2,0.02,2016
1906,NECHES ISD,7,383,8.4,11.5,77.3,0.0,0.8,0.0,2.1,42.8,11.0,0.01,2016
1907,PALESTINE ISD,7,3385,27.0,41.2,27.9,0.2,0.8,0.0,2.8,74.1,8.5,0.06,2016


In [18]:
help(districtDFs)

Help on list object:

class list(object)
 |  list() -> new empty list
 |  list(iterable) -> new list initialized from iterable's items
 |  
 |  Methods defined here:
 |  
 |  __add__(self, value, /)
 |      Return self+value.
 |  
 |  __contains__(self, key, /)
 |      Return key in self.
 |  
 |  __delitem__(self, key, /)
 |      Delete self[key].
 |  
 |  __eq__(self, value, /)
 |      Return self==value.
 |  
 |  __ge__(self, value, /)
 |      Return self>=value.
 |  
 |  __getattribute__(self, name, /)
 |      Return getattr(self, name).
 |  
 |  __getitem__(...)
 |      x.__getitem__(y) <==> x[y]
 |  
 |  __gt__(self, value, /)
 |      Return self>value.
 |  
 |  __iadd__(self, value, /)
 |      Implement self+=value.
 |  
 |  __imul__(self, value, /)
 |      Implement self*=value.
 |  
 |  __init__(self, /, *args, **kwargs)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  __iter__(self, /)
 |      Implement iter(self).
 |  
 |  __le__(self, value, /

In [11]:
districtDFs[0]

Unnamed: 0_level_0,DISTNAME,REGION,DPETALLC,DPETBLAP,DPETHISP,DPETWHIP,DPETINDP,DPETASIP,DPETPCIP,DPETTWOP,DPETECOP,DPETSPEP,DPETALLP,Year
DISTRICT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1902,Cayuga,7,582,14,4,81,,,,,40.2,16,0.01,2006
1903,Elkhart,7,1294,9,5,86,,,,,47.6,17,0.03,2006
1904,Frankston,7,797,14,7,79,,,,,50.2,14,0.02,2006
1906,Neches,7,329,8,4,87,,,,,38.0,21,0.01,2006
1907,Palestine,7,3372,30,30,40,,,,,63.5,11,0.07,2006
1908,Westwood,7,1789,17,12,71,,,,,43.7,14,0.04,2006
1909,Slocum,7,389,2,5,93,,,,,58.1,22,0.01,2006
2901,Andrews,18,2898,2,58,40,,,,,47.3,16,0.06,2006
3801,Pineywoods Community Acad,7,220,21,8,69,,,,,60.0,12,0.00,2006
3902,Hudson,7,2429,4,20,75,,,,,54.3,11,0.05,2006


In [12]:
apple = pd.concat(manyDFs)

apple.to_csv('../data/processed/DistrictDisparitiesAll.csv', index = False)

In [19]:
districts = pd.concat(districtDFs)

districts.reset_index().to_csv('../data/processed/districtDFs.csv', index = False)

In [14]:
removedErrors = apple[apple["LikelyError"] == 0]

In [15]:

removedErrors.to_csv('../data/processed/DistrictDisparitiesRE.csv', index = False,
                    columns = ["DISTRICT","SECTION","HEADING NAME", "Year", "Count", "Percentage", "Scale"])

In [16]:
help(apple.to_csv)

Help on method to_csv in module pandas.core.frame:

to_csv(path_or_buf=None, sep=',', na_rep='', float_format=None, columns=None, header=True, index=True, index_label=None, mode='w', encoding=None, compression=None, quoting=None, quotechar='"', line_terminator='\n', chunksize=None, tupleize_cols=None, date_format=None, doublequote=True, escapechar=None, decimal='.') method of pandas.core.frame.DataFrame instance
    Write DataFrame to a comma-separated values (csv) file
    
    Parameters
    ----------
    path_or_buf : string or file handle, default None
        File path or object, if None is provided the result is returned as
        a string.
    sep : character, default ','
        Field delimiter for the output file.
    na_rep : string, default ''
        Missing data representation
    float_format : string, default None
        Format string for floating point numbers
    columns : sequence, optional
        Columns to write
    header : boolean or list of string, default Tr

In [17]:
# populating the GeoJSON file, which already has geometry for the districts.
    # This previously was called in the loop for creating a single year of data, 
    # will need to be changed to apply it to all years at once.
    
    for entry in district_map["features"]:
        if entry["id"] in district.index:
            entry["properties"][year] = {}
            for stat in pop_stats:
                # This will give NaN (numpy.float64) when empty
                if pd.notnull(district.loc[entry["id"]][stat]):
                    try:
                        entry["properties"][year][stat] = district.loc[entry["id"]][stat]
                    except KeyError:
                        # for when the map has a district not in the TEA's data
                        print("no stats for " + str(year) + " " + str(entry["id"]))
                        entry["properties"][year][stat] = None
        if entry["id"] in apple.index.get_level_values(0):
            for punishment in punishments:
                entry["properties"][year][punishment] = {}
                for demo in demos:
                    if (entry["id"],punishment,demo) in apple.index:
                    # if pd.notnull(apple.loc[entry["id"],punishment,demo][year_col]): # should prevent empty dicts at "demo" level
                        entry["properties"][year][punishment][demo] = {} 
                        try:
                            entry["properties"][year][punishment][demo]["C"] = int(apple.loc[entry["id"],punishment,demo][year_col])
                            entry["properties"][year][punishment][demo]["E"] = int(apple.loc[entry["id"],punishment,demo]["LikelyError"])
                            entry["properties"][year][punishment][demo]["%"] = float(apple.loc[entry["id"],punishment,demo]["Percentage"])
                        except:
                            fail[entry["id"]] = (year,punishment,demo)
                        try:
                            entry["properties"][year][punishment][demo]["S"] = int(apple.loc[entry["id"],punishment,demo]["Scale"])
                        except:
                            noScale[entry["id"]] = (year,punishment,demo)
    print(district_map["features"][30]["properties"])
                    # print("Nothing for {} {} {}".format(entry["id"],punishment,demo))
                    # impossible(distPop, racePop, all_punishments, group_punishments)

IndentationError: unexpected indent (<ipython-input-17-f3d52f1c47d3>, line 5)

In [None]:
pd.notnull(district.loc[228905])

In [None]:
int(apple.loc[entry["id"],punishment,demo][year_col])

In [None]:
float(apple.loc[entry["id"],punishment,demo]["Percentage"])

In [None]:
apple.loc[entry["id"],punishment,demo]["Scale"]

In [None]:
with open('../geojson/districts_with_data.geojson', 'w') as fp:
    json.dump(district_map, fp, default=int)
    fp.close()

In [None]:
import geojson

fc = geojson.FeatureCollection(district_map)

In [None]:
type(fc)

In [None]:
help(geojson.load)

In [None]:
with open("../geojson/districts_with_data.geojson") as geo_data:
    fc = geojson.load(geo_data)
    geo_data.close()

In [None]:
district_map["features"][3]

In [None]:
# Testing to see if the file we produced is valid GeoJSON

fc.is_valid

In [None]:
district.loc[67908]

In [None]:
len(fail)

In [None]:
len(noScale)

In [None]:
# df[df['A'] > 0]

q = district_map["features"][900]["properties"]["DISTRICT_N"]

district.loc[q]

# district[district["DISTRICT"] == 167903]