In [1]:
import geopandas as gpd

## Texas Precinct Cleaning

In [2]:
# Read the Texas precinct GeoJSON into a GeoDataFrame.
file_path = "./clean_data/texas_data/texasAggPrecinct.geojson"
gdf = gpd.read_file(file_path)

# Sanity-check the load: first five rows, then the coordinate reference system.
print(gdf.head(n=5))
print(gdf.crs)

   CNTY  COLOR  PREC   PCTKEY  CNTYKEY  G20VR  G20SSVR  G20PRERTRU  \
0   113      7  1104  1131104       57   2745     39.5         221   
1   201      2  0312  2010312      101   3973     11.3        1124   
2   351      4  0003  3510003      176    626      1.1         412   
3   181      4  0304  1810304       91   3058      4.5        1290   
4   201      2  0877  2010877      101   5743     27.1        1352   

   G20PREDBID  G20PRELJOR  ...  G20SCCDBIR  C_TOT20  C_WHT20  C_BLK20  \
0        1173           7  ...        1162  5838.24   163.87  1279.49   
1        1460          21  ...        1345  7205.53  3282.30  1884.00   
2          28           0  ...          31   934.30   922.73     0.00   
3         676          29  ...         619  4857.44  3410.96   908.94   
4        2554          43  ...        2490  9865.82  1459.26  2882.58   

   C_HSP20  C_ASN20  C_AIA20  C_NHP20  C_2OM20  \
0  4358.57     0.00     0.00     9.34      0.0   
1  1308.67   641.17    69.00     0.00   

In [3]:
# Show every column label of the loaded precinct frame (rendered as the cell output).
gdf.columns

Index(['CNTY', 'COLOR', 'PREC', 'PCTKEY', 'CNTYKEY', 'G20VR', 'G20SSVR',
       'G20PRERTRU', 'G20PREDBID', 'G20PRELJOR', 'G20PREGHAW', 'G20PREOWRI',
       'G20USSRCOR', 'G20USSDHEG', 'G20USSLMCK', 'G20USSGCOL', 'G20RRCRWRI',
       'G20RRCDCAS', 'G20RRCLSTE', 'G20RRCGGRU', 'G20SSCRHEC', 'G20SSCDMEA',
       'G20SSCLASH', 'G20SSCRBLA', 'G20SSCDCHE', 'G20SSCRBOY', 'G20SSCDWIL',
       'G20SSCLSTR', 'G20SSCRBUS', 'G20SSCDTRI', 'G20SSCLOXF', 'G20SCCRRIC',
       'G20SCCDFRI', 'G20SCCRYEA', 'G20SCCDCLI', 'G20SCCRNEW', 'G20SCCDBIR',
       'C_TOT20', 'C_WHT20', 'C_BLK20', 'C_HSP20', 'C_ASN20', 'C_AIA20',
       'C_NHP20', 'C_2OM20', 'geometry'],
      dtype='object')

In [8]:
# Source column -> output column for the precinct layer.
# Insertion order matters: the key order of this dict is later used as the
# column order of the filtered frame.
# NOTE(review): only PREC is kept as the precinct identifier. In the preview,
# PCTKEY looks like CNTY followed by PREC, so PREC alone may repeat across
# counties -- confirm that is acceptable downstream.
columnNameMapping = dict(
    [
        # Total population and precinct identifier.
        ("C_TOT20", "Population"),
        ("PREC", "Precinct"),
        # Vote tallies (presumably the 2020 presidential race -- TODO confirm).
        ("G20PRERTRU", "Republican"),
        ("G20PREDBID", "Democrat"),
        # Population by group.
        ("C_WHT20", "White"),
        ("C_BLK20", "Black"),
        ("C_HSP20", "Hispanic"),
        ("C_ASN20", "Asian"),
        ("C_NHP20", "Pacific"),
        ("C_AIA20", "Native"),
        ("C_2OM20", "Other"),
        # Geometry keeps its name; listing it here keeps it in the selection.
        ("geometry", "geometry"),
    ]
)

In [9]:
# Select only the mapped source columns (in mapping order), then apply the
# output names in a single chained step.
columns_to_keep = [*columnNameMapping]
gdf_filtered = gdf[columns_to_keep].rename(columns=columnNameMapping)

# Preview the trimmed, renamed frame.
gdf_filtered.head()

Unnamed: 0,Population,Precinct,Republican,Democrat,White,Black,Hispanic,Asian,Pacific,Native,Other,geometry
0,5838.24,1104,221,1173,163.87,1279.49,4358.57,0.0,9.34,0.0,0.0,"MULTIPOLYGON (((721018.74 3624264.239, 721021...."
1,7205.53,312,1124,1460,3282.3,1884.0,1308.67,641.17,0.0,69.0,0.0,"MULTIPOLYGON (((836708.731 3295423.758, 836717..."
2,934.3,3,412,28,922.73,0.0,5.82,0.0,0.0,0.0,0.0,"MULTIPOLYGON (((1009076.553 3451230.248, 10090..."
3,4857.44,304,1290,676,3410.96,908.94,147.2,137.99,0.0,276.53,10.0,"MULTIPOLYGON (((720126.266 3726117.785, 720147..."
4,9865.82,877,1352,2554,1459.26,2882.58,5051.91,388.85,0.0,68.55,0.0,"MULTIPOLYGON (((813359.332 3308655.067, 813368..."


In [11]:
# Data-quality check: which precincts are missing a value in any retained column?
variables = list(columnNameMapping.values())
print(variables)

# Boolean Series with one entry per precinct: True when at least one column is null.
precincts_with_nan = gdf_filtered[variables].isnull().any(axis=1)

# Tally the flagged rows and express them as a share of all precincts.
nan_precincts_count = precincts_with_nan.sum()
total_precincts = len(gdf_filtered)
nan_percentage = (nan_precincts_count / total_precincts) * 100

# Report the findings.
print(f"Number of precincts with NaN values in any variable: {nan_precincts_count}")
print(f"Percentage of precincts with NaN values: {nan_percentage:.2f}%")

['Population', 'Precinct', 'Republican', 'Democrat', 'White', 'Black', 'Hispanic', 'Asian', 'Pacific', 'Native', 'Other', 'geometry']
Number of precincts with NaN values in any variable: 5
Percentage of precincts with NaN values: 0.06%


In [12]:
# Reproject the precinct geometries to EPSG:4326 before exporting.
gdf_filtered = gdf_filtered.to_crs(epsg=4326)

In [13]:
# Repeat the missing-value scan on the reprojected frame.
variables = list(columnNameMapping.values())
print(variables)

# Row-wise flag: True where any of the listed columns is null for that precinct.
precincts_with_nan = gdf_filtered[variables].isna().any(axis="columns")

# Number of flagged precincts, total precincts, and the flagged share in percent.
nan_precincts_count = precincts_with_nan.sum()
total_precincts = len(gdf_filtered)
nan_percentage = (nan_precincts_count / total_precincts) * 100

# Summarise.
print(f"Number of precincts with NaN values in any variable: {nan_precincts_count}")
print(f"Percentage of precincts with NaN values: {nan_percentage:.2f}%")

['Population', 'Precinct', 'Republican', 'Democrat', 'White', 'Black', 'Hispanic', 'Asian', 'Pacific', 'Native', 'Other', 'geometry']
Number of precincts with NaN values in any variable: 5
Percentage of precincts with NaN values: 0.06%


In [14]:
# Write the cleaned, reprojected precinct frame out as GeoJSON.
# NOTE(review): the NaN scan run before this export reported 5 precincts with
# missing values and nothing drops or fills them, so they are written as-is --
# confirm that is intended.
precinct_output_path = "./clean_data/texas_data/texas_precinct_data.geojson"
gdf_filtered.to_file(precinct_output_path, driver="GeoJSON")

## Texas Current District Plan Cleaning

In [2]:
# Read the Texas district GeoJSON into a GeoDataFrame.
# This rebinds both file_path and gdf, replacing the precinct frame loaded earlier.
# NOTE(review): this path is under ./texas_data/, whereas the precinct layer is
# read from ./clean_data/texas_data/ -- confirm the difference is intentional.
file_path = "./texas_data/texasAggDistrict.geojson"
gdf = gpd.read_file(file_path)

# Sanity-check the load: first five rows, then the coordinate reference system.
print(gdf.head(n=5))
print(gdf.crs)

NameError: name 'gpd' is not defined

In [15]:
# Party label for each row of gdf, assigned purely by row position: entry i is
# written to row i. The list holds 38 labels, so gdf must have exactly 38 rows
# or the column assignment raises.
# NOTE(review): nothing ties a label to a district key, so a different row
# order in the input file would silently mislabel districts. The recorded
# output is cut off after row 27; please verify the labels for rows 28-37
# (in particular the final four: R, D, R, D) against REP_NM.
incumbents = [
    # rows 0-9
    "Republican", "Republican", "Republican", "Republican", "Republican",
    "Republican", "Democrat", "Republican", "Democrat", "Republican",
    # rows 10-19
    "Republican", "Republican", "Republican", "Republican", "Republican",
    "Democrat", "Republican", "Democrat", "Republican", "Democrat",
    # rows 20-29
    "Republican", "Republican", "Republican", "Republican", "Republican",
    "Republican", "Republican", "Democrat", "Democrat", "Democrat",
    # rows 30-37
    "Republican", "Democrat", "Democrat", "Democrat",
    "Republican", "Democrat", "Republican", "Democrat",
]
gdf["Incumbent"] = incumbents

# Print each label next to the representative's name so the pairing can be checked by eye.
display_columns = gdf[["Incumbent", "REP_NM"]]
print(display_columns)

     Incumbent              REP_NM
0   Republican     Nathaniel Moran
1   Republican        Dan Crenshaw
2   Republican          Keith Self
3   Republican          Pat Fallon
4   Republican        Lance Gooden
5   Republican         Jake Ellzey
6     Democrat     Lizzie Fletcher
7   Republican     Morgan Luttrell
8     Democrat            Al Green
9   Republican      Michael McCaul
10  Republican      August Pfluger
11  Republican         Kay Granger
12  Republican       Ronny Jackson
13  Republican         Randy Weber
14  Republican   Monica De La Cruz
15    Democrat    Veronica Escobar
16  Republican       Pete Sessions
17    Democrat  Sheila Jackson Lee
18  Republican     Jodey Arrington
19    Democrat      Joaquin Castro
20  Republican            Chip Roy
21  Republican          Troy Nehls
22  Republican       Tony Gonzales
23  Republican      Beth Van Duyne
24  Republican      Roger Williams
25  Republican     Michael Burgess
26  Republican       Michael Cloud
27    Democrat      

In [16]:
# Source column -> output column for the district layer.
# NOTE: this rebinds columnNameMapping, replacing the precinct mapping defined
# earlier in the notebook.
columnNameMapping = {
    "DISTRICT": "District",
    "REP_NM": "Representative",
    "Incumbent": "Incumbent",  # unchanged name; listed so the mapping documents the full output schema
    "TOT_POP21": "Population",
    "G20PRERTRU": "Republican",
    "G20PREDBID": "Democrat",
    "WHT_NHSP21": "White",
    "BLK_NHSP21": "Black",
    "HSP_POP21": "Hispanic",
    "ASN_NHSP21": "Asian",
    "HPI_NHSP21": "Pacific",
    "AIA_NHSP21": "Native",
    "OTH_NHSP21": "Other",
}

# Vote-tally columns that are not carried into the cleaned output
# (presumably the remaining 2020 presidential candidates -- TODO confirm).
unwanted_columns = [
    "G20PRELJOR",
    "G20PREGHAW",
    "G20PREOWRI",
]

# This cell rebinds `gdf`, so it must tolerate being run more than once:
# - rename() already skips source names that are no longer present;
# - errors="ignore" makes drop() skip columns removed by a previous run
#   (the strict default raises KeyError on the second execution).
gdf = gdf.rename(columns=columnNameMapping).drop(columns=unwanted_columns, errors="ignore")

# rename() stays silent when a source column is absent, so verify explicitly
# that every expected output column exists instead of exporting a partial schema.
missing_columns = [name for name in columnNameMapping.values() if name not in gdf.columns]
if missing_columns:
    raise KeyError(f"Expected columns missing after rename: {missing_columns}")

gdf.head()

Unnamed: 0,Representative,District,Population,White,Black,Hispanic,Asian,Native,Pacific,Other,Republican,Democrat,geometry,Incumbent
0,Nathaniel Moran,1,755951.0,462992.0,134268.0,128887.0,8208.0,1413.0,457.0,1212.0,233263,85451,"MULTIPOLYGON (((-94.12964 31.09929, -94.13149 ...",Republican
1,Dan Crenshaw,2,746875.0,397228.0,76916.0,221684.0,28252.0,1259.0,852.0,1572.0,195333,121928,"MULTIPOLYGON (((-95.12464 29.75725, -95.1252 2...",Republican
2,Keith Self,3,746534.0,445066.0,71208.0,112590.0,87496.0,2140.0,226.0,2983.0,204906,152128,"MULTIPOLYGON (((-96.29566 32.8978, -96.29591 3...",Republican
3,Pat Fallon,4,762614.0,474925.0,65851.0,107312.0,83372.0,3018.0,410.0,2773.0,216555,125752,"MULTIPOLYGON (((-95.65981 32.71141, -95.66079 ...",Republican
4,Lance Gooden,5,756845.0,374352.0,102182.0,222093.0,31884.0,2415.0,219.0,1736.0,180386,113485,"MULTIPOLYGON (((-96.05279 32.0059, -96.05279 3...",Republican


In [17]:
# Write the cleaned district frame out as GeoJSON.
# NOTE(review): the path is relative to the current working directory, unlike
# the precinct export, which goes under ./clean_data/texas_data/ -- confirm
# the intended output location.
district_output_path = "texas_current_district_data.geojson"
gdf.to_file(district_output_path, driver="GeoJSON")

NameError: name 'gdf' is not defined

In [7]:
# Read the exported district GeoJSON back in to inspect what was written.
# NOTE(review): this rebinds gdf to the re-read frame. The recorded output shows
# the reader skipping the District field ("unsupported OGR type: 1"), so the
# frame loaded here has no District column -- avoid exporting from it.
file_path = "texas_current_district_data.geojson"
gdf = gpd.read_file(file_path)

# First five rows, then the coordinate reference system.
print(gdf.head(n=5))
print(gdf.crs)

Skipping field District: unsupported OGR type: 1


    Representative  Population     White     Black  Hispanic    Asian  Native  \
0  Nathaniel Moran    755951.0  462992.0  134268.0  128887.0   8208.0  1413.0   
1     Dan Crenshaw    746875.0  397228.0   76916.0  221684.0  28252.0  1259.0   
2       Keith Self    746534.0  445066.0   71208.0  112590.0  87496.0  2140.0   
3       Pat Fallon    762614.0  474925.0   65851.0  107312.0  83372.0  3018.0   
4     Lance Gooden    756845.0  374352.0  102182.0  222093.0  31884.0  2415.0   

   Pacific   Other  Republican  Democrat   Incumbent  \
0    457.0  1212.0      233263     85451  Republican   
1    852.0  1572.0      195333    121928  Republican   
2    226.0  2983.0      204906    152128  Republican   
3    410.0  2773.0      216555    125752  Republican   
4    219.0  1736.0      180386    113485  Republican   

                                            geometry  
0  MULTIPOLYGON (((-94.12964 31.09929, -94.13149 ...  
1  MULTIPOLYGON (((-95.12464 29.75725, -95.1252 2...  
2  MULTIPOL

KeyError: 'District'

In [6]:
# Second export of the district frame to the same GeoJSON path as the earlier export cell.
# NOTE(review): this looks like a leftover duplicate. If it runs after the
# read-back cell, gdf lacks the District column and this overwrites the file
# without it -- confirm whether this cell is still needed.
district_output_path = "texas_current_district_data.geojson"
gdf.to_file(district_output_path, driver="GeoJSON")