# MSA Analysis - Build Datasets for ML for site selection recommendations

- has similar code to other files
- combine accordingly

## Import Packages and Libraries

In [1]:
#import functions
import os
import sys

sys.path.append(
    os.path.join(os.path.abspath(os.path.join(os.path.curdir, os.path.pardir)))
)

#censusdis
from collections import OrderedDict

import geopandas as gpd
import matplotlib.pyplot as plt

from typing import Optional

import censusdis.data as ced
import censusdis.maps as cem
import censusdis.values as cev
import censusdis.geography as cgeo
from censusdis.states import STATE_MA
from censusdis import states
from censusdis.maps import ShapeReader, plot_us_boundary
import censusdis.maps as cmap


# Make sure it is there.
from censusdis.values import ALL_SPECIAL_VALUES

# _______________________________________________________________________

#standard packages
import pandas as pd
import numpy as np
import math
from math import pi, sqrt
import matplotlib.pyplot as plt
# import pygwalker as pyg

# import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import datetime
import time
# from tqdm import tqdm, trange

#gis packages
import osmnx as ox
import logging as lg
ox.settings.log_console=True #use cache to avoid overloading the server
# ox.settings.memory_cache=True #use cache to avoid overloading the server
ox.settings.memory = 4294967296 #set memory cache to 4GB

from shapely.geometry import Point
import folium
import networkx as nx
## future libaries
# import contextily as cx
# import fiona
# from pandana.loaders import osm
# import momepy
# import missingno as msno
# from us import states
# import imageio

import warnings
warnings.filterwarnings("ignore")

%matplotlib inline
ox.__version__



pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', 1000)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 1000)

# Import Datasets

## Import ACS5 Shell File for General Model

In [2]:
# download shell file and keep only General Model rows

acs22_5_shell_genmodel = pd.read_excel('C:/Users/jerem/OneDrive/Documents/Git Projects/MeridianXYZ/data/census/acs/summary files/2022/ACS20225YR_Table_Shells.xlsx', sheet_name='ACS20225YR_Table_Shells')

In [3]:
#ger varialbes for general model

variables = acs22_5_shell_genmodel[acs22_5_shell_genmodel['General_Model'] == 1][['Table_ID']].drop_duplicates()
variable_list = variables['Table_ID'].tolist()
variable_list =variable_list[:5] #test with 5 variables
variable_list

['B01001', 'B01002', 'B01003', 'B02001', 'B08103']

## Import datasets based on variables on model choice

In [8]:
# import ACS data files based on variable list

for i in range(0, len(variable_list)):
    print(i, variable_list[i])
    file_path = f'C:/Users/jerem/OneDrive/Documents/Git Projects/MeridianXYZ/data/census/acs/summary files/2022/5YRData/acsdt5y2022-{variable_list[i]}.dat'
    data = pd.read_csv(file_path, sep='|')
    
    if i == 0:
        dataset = data  # Initialize dataset with the first file
        print('i')
    else:
        # Ensure both dataframes have the 'GEO_ID' column before merging
        if 'GEO_ID' in dataset.columns and 'GEO_ID' in data.columns:
            dataset = pd.merge(dataset, data, on='GEO_ID')
            print(dataset.shape)
        else:
            print(f"Error: 'GEO_ID' not found in both datasets for {variable_list[i]}")

# Print the shape of the dataset to verify the merges
print(dataset.shape)


0 B01001
i
1 B01002
(546335, 105)
2 B01003
(546335, 107)
3 B02001
(546335, 127)
4 B08103
(301904, 141)
(301904, 141)


In [7]:
dataset.head()

Unnamed: 0,GEO_ID,B01001_E001,B01001_M001,B01001_E002,B01001_M002,B01001_E003,B01001_M003,B01001_E004,B01001_M004,B01001_E005,B01001_M005,B01001_E006,B01001_M006,B01001_E007,B01001_M007,B01001_E008,B01001_M008,B01001_E009,B01001_M009,B01001_E010,B01001_M010,B01001_E011,B01001_M011,B01001_E012,B01001_M012,B01001_E013,B01001_M013,B01001_E014,B01001_M014,B01001_E015,B01001_M015,B01001_E016,B01001_M016,B01001_E017,B01001_M017,B01001_E018,B01001_M018,B01001_E019,B01001_M019,B01001_E020,B01001_M020,B01001_E021,B01001_M021,B01001_E022,B01001_M022,B01001_E023,B01001_M023,B01001_E024,B01001_M024,B01001_E025,B01001_M025,B01001_E026,B01001_M026,B01001_E027,B01001_M027,B01001_E028,B01001_M028,B01001_E029,B01001_M029,B01001_E030,B01001_M030,B01001_E031,B01001_M031,B01001_E032,B01001_M032,B01001_E033,B01001_M033,B01001_E034,B01001_M034,B01001_E035,B01001_M035,B01001_E036,B01001_M036,B01001_E037,B01001_M037,B01001_E038,B01001_M038,B01001_E039,B01001_M039,B01001_E040,B01001_M040,B01001_E041,B01001_M041,B01001_E042,B01001_M042,B01001_E043,B01001_M043,B01001_E044,B01001_M044,B01001_E045,B01001_M045,B01001_E046,B01001_M046,B01001_E047,B01001_M047,B01001_E048,B01001_M048,B01001_E049,B01001_M049,B01002_E001,B01002_M001,B01002_E002,B01002_M002,B01002_E003,B01002_M003,B01003_E001,B01003_M001,B02001_E001,B02001_M001,B02001_E002,B02001_M002,B02001_E003,B02001_M003,B02001_E004,B02001_M004,B02001_E005,B02001_M005,B02001_E006,B02001_M006,B02001_E007,B02001_M007,B02001_E008,B02001_M008,B02001_E009,B02001_M009,B02001_E010,B02001_M010,B08103_E001,B08103_M001,B08103_E002,B08103_M002,B08103_E003,B08103_M003,B08103_E004,B08103_M004,B08103_E005,B08103_M005,B08103_E006,B08103_M006,B08103_E007,B08103_M007
0,0100000US,331097593,-555555555,164200298,8084,9725644,3889,10210019,22849,10974635,22485,6577849,4115,4618967,4552,2348124,12572,2326225,13955,6726381,18245,11574776,5454,11533188,4030,11139243,21904,10497372,21381,10160361,4646,10433237,3881,10628155,19903,4278900,13890,5965215,17271,3629086,16608,4859756,14500,6716886,16130,4325136,13369,2626186,10105,2324957,11059,166897295,8129,9279281,3720,9728129,20323,10458513,21187,6259635,4293,4424350,5213,2208387,13108,2192443,12749,6438019,17629,11099035,4512,11181154,4495,10869175,22739,10304569,22506,10086469,4564,10407290,3698,10907144,19158,4453471,14475,6344590,15712,3962963,13595,5412557,14610,7679431,19424,5393379,14196,3622161,12936,4185150,15412,38.5,0.1,37.4,0.1,39.7,0.1,331097593,-555555555,331097593,-555555555,218123424,99387,41288572,37945,2786431,16957,19112979,20786,624863,6340,20018544,83160,29142780,73187,17351681,86240,11791099,79222,41.6,0.1,42.1,0.1,38.2,0.1,38.5,0.2,33.4,0.2,38.2,0.2,44.0,0.1
1,0100089US,1072026,13373,536487,7273,34798,983,40417,1357,42181,1096,25419,838,16147,703,7576,445,7270,499,21479,1075,34914,1212,34011,1121,31710,903,29735,959,29076,947,31403,920,34699,829,14715,548,20010,771,11855,574,17509,643,22089,580,14136,468,8784,400,6554,316,535539,6633,33889,1072,36630,1137,40905,1209,24229,786,15117,674,7311,503,7100,426,20405,778,33141,950,32241,1064,31190,973,28931,965,29286,780,30861,788,34773,813,14433,551,21371,725,12833,511,17735,601,24294,772,17676,708,10343,451,10845,724,36.6,0.2,35.6,0.2,37.7,0.4,1072026,13373,1072026,13373,405726,4478,17265,1300,511394,7390,16489,1138,3616,544,34620,1634,82916,3044,31579,1784,51337,1990,42.9,0.3,43.0,0.4,39.0,0.6,41.8,2.6,41.0,1.2,42.2,1.6,48.8,0.8
2,0100090US,1171,359,600,199,39,41,65,61,26,24,9,10,5,10,1,2,12,15,26,31,15,15,16,14,38,31,43,48,48,38,19,22,48,27,23,16,37,30,16,14,49,34,25,17,19,23,15,17,6,8,571,180,52,37,16,17,36,32,7,11,10,19,8,15,2,4,9,15,34,32,39,33,51,45,17,15,44,40,35,34,31,25,13,10,23,20,16,19,17,17,16,12,54,49,13,13,28,30,46.9,8.5,45.8,6.8,47.1,13.9,1171,359,1171,359,216,93,190,151,606,175,38,41,0,13,2,3,119,91,2,4,117,92,43.8,6.0,43.3,6.1,-666666666.0,-222222222.0,-666666666.0,-222222222.0,33.5,21.1,-666666666.0,-222222222.0,50.8,14.7
3,0100091US,2593741,2617,1295420,1534,81664,577,86798,1203,94372,1411,54614,495,35611,417,19253,918,18121,813,49850,1108,85865,626,85974,733,87319,1232,81277,1259,77221,551,78127,579,80689,1216,33196,897,48622,1064,28613,937,39593,1098,53176,1089,36774,813,22984,787,15707,521,1298321,1965,77979,466,84495,1397,89337,1359,52687,466,32477,494,17776,804,16613,814,44621,952,82001,587,82101,640,84218,1156,76728,1196,74928,596,75407,580,83356,1091,34317,960,50097,933,30027,867,42500,1283,62397,1199,44127,936,29495,1095,30637,1012,38.1,0.2,37.1,0.2,39.1,0.1,2593741,2617,2593741,2617,1788619,4259,142056,1916,250839,2623,43677,835,3325,292,67878,2542,297347,3611,66424,2200,230923,3095,41.4,0.1,41.7,0.1,37.5,0.4,37.3,4.3,36.5,1.0,38.5,1.6,44.8,0.5
4,0100092US,54870,936,27183,547,1136,125,1390,160,1472,176,859,108,595,129,250,60,225,58,746,123,1272,134,1538,169,1493,108,1444,143,1688,169,1401,88,1915,142,832,108,1262,103,945,87,1353,85,2272,127,1563,125,868,77,664,65,27687,524,1098,146,1182,128,1173,107,894,125,566,103,290,71,147,46,742,122,1293,104,1292,122,1566,110,1467,152,1418,131,1673,120,2096,179,983,102,1453,120,1133,118,1524,106,2419,153,1553,115,908,81,817,85,50.3,0.8,48.6,0.8,52.1,0.9,54870,936,54870,936,45274,687,3217,429,899,141,897,152,75,27,1174,235,3334,343,1338,208,1996,252,46.4,0.6,45.7,1.0,41.2,3.2,47.5,2.9,44.4,5.9,39.5,3.9,53.2,1.8


## Import Geography File

In [7]:
# Import Geography

geography = pd.read_csv('C:/Users/jerem/OneDrive/Documents/Git Projects/MeridianXYZ/data/census/acs/summary files/2022/Geos20225YR.txt', sep='|')
geography.head()

Unnamed: 0,FILEID,STUSAB,SUMLEVEL,COMPONENT,US,REGION,DIVISION,STATE,COUNTY,COUSUB,PLACE,TRACT,BLKGRP,CONCIT,AIANHH,AIANHHFP,AIHHTLI,AITS,AITSFP,ANRC,CBSA,CSA,METDIV,MACC,MEMI,NECTA,CNECTA,NECTADIV,UA,CDCURR,SLDU,SLDL,ZCTA5,SUBMCD,SDELM,SDSEC,SDUNI,UR,PCI,PUMA5,GEO_ID,NAME,BTTR,BTBG,TL_GEO_ID
0,ACSSF,US,10,0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0100000US,United States,,,
1,ACSSF,US,10,89,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0100089US,United States -- American Indian Reservation and Trust Land -- Federal,,,
2,ACSSF,US,10,90,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0100090US,United States -- American Indian Reservation and Trust Land -- State,,,
3,ACSSF,US,10,91,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0100091US,United States -- Oklahoma Tribal Statistical Area,,,
4,ACSSF,US,10,92,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0100092US,United States -- Tribal Designated Statistical Area,,,


In [8]:
# Select Geography Columns

# geography = geography[geography['BLKGRP'] > 1] # columns where there are blockgroup information
geography = geography[[
    'FILEID',
    'US',
    'REGION',
    'DIVISION',
    'STATE',
    'COUNTY',
    'COUSUB',
    'PLACE',
    'TRACT',
    'BLKGRP',
    'CBSA',
    'CSA
    'GEO_ID',
    'NAME',
    'TL_GEO_ID'
]]
geography

Unnamed: 0,FILEID,US,REGION,DIVISION,STATE,COUNTY,COUSUB,PLACE,TRACT,BLKGRP,CBSA,GEO_ID,NAME,TL_GEO_ID
0,ACSSF,1.0,,,,,,,,,,0100000US,United States,
1,ACSSF,1.0,,,,,,,,,,0100089US,United States -- American Indian Reservation and Trust Land -- Federal,
2,ACSSF,1.0,,,,,,,,,,0100090US,United States -- American Indian Reservation and Trust Land -- State,
3,ACSSF,1.0,,,,,,,,,,0100091US,United States -- Oklahoma Tribal Statistical Area,
4,ACSSF,1.0,,,,,,,,,,0100092US,United States -- Tribal Designated Statistical Area,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
619335,ACSSF,,,,56.0,,,,,,,9700000US5605830,"Teton County School District 1, Wyoming",5605830.0
619336,ACSSF,,,,56.0,,,,,,,9700000US5606090,"Weston County School District 7, Wyoming",5606090.0
619337,ACSSF,,,,56.0,,,,,,,9700000US5606240,"Washakie County School District 1, Wyoming",5606240.0
619338,ACSSF,,,,56.0,,,,,,,9700000US5699999,"Remainder of Wyoming, Wyoming",


In [10]:
geo_dataset = pd.merge(geography, dataset, on='GEO_ID')
geo_dataset

Unnamed: 0,FILEID,US,REGION,DIVISION,STATE,COUNTY,COUSUB,PLACE,TRACT,BLKGRP,CBSA,GEO_ID,NAME,TL_GEO_ID,B01001_E001,B01001_M001,B01001_E002,B01001_M002,B01001_E003,B01001_M003,B01001_E004,B01001_M004,B01001_E005,B01001_M005,B01001_E006,B01001_M006,B01001_E007,B01001_M007,B01001_E008,B01001_M008,B01001_E009,B01001_M009,B01001_E010,B01001_M010,B01001_E011,B01001_M011,B01001_E012,B01001_M012,B01001_E013,B01001_M013,B01001_E014,B01001_M014,B01001_E015,B01001_M015,B01001_E016,B01001_M016,B01001_E017,B01001_M017,B01001_E018,B01001_M018,B01001_E019,B01001_M019,B01001_E020,B01001_M020,B01001_E021,B01001_M021,B01001_E022,B01001_M022,B01001_E023,B01001_M023,B01001_E024,B01001_M024,B01001_E025,B01001_M025,B01001_E026,B01001_M026,B01001_E027,B01001_M027,B01001_E028,B01001_M028,B01001_E029,B01001_M029,B01001_E030,B01001_M030,B01001_E031,B01001_M031,B01001_E032,B01001_M032,B01001_E033,B01001_M033,B01001_E034,B01001_M034,B01001_E035,B01001_M035,B01001_E036,B01001_M036,B01001_E037,B01001_M037,B01001_E038,B01001_M038,B01001_E039,B01001_M039,B01001_E040,B01001_M040,B01001_E041,B01001_M041,B01001_E042,B01001_M042,B01001_E043,B01001_M043,B01001_E044,B01001_M044,B01001_E045,B01001_M045,B01001_E046,B01001_M046,B01001_E047,B01001_M047,B01001_E048,B01001_M048,B01001_E049,B01001_M049,B01002_E001,B01002_M001,B01002_E002,B01002_M002,B01002_E003,B01002_M003,B01003_E001,B01003_M001,B02001_E001,B02001_M001,B02001_E002,B02001_M002,B02001_E003,B02001_M003,B02001_E004,B02001_M004,B02001_E005,B02001_M005,B02001_E006,B02001_M006,B02001_E007,B02001_M007,B02001_E008,B02001_M008,B02001_E009,B02001_M009,B02001_E010,B02001_M010,B08103_E001,B08103_M001,B08103_E002,B08103_M002,B08103_E003,B08103_M003,B08103_E004,B08103_M004,B08103_E005,B08103_M005,B08103_E006,B08103_M006,B08103_E007,B08103_M007
0,ACSSF,1.0,,,,,,,,,,0100000US,United States,,331097593,-555555555,164200298,8084,9725644,3889,10210019,22849,10974635,22485,6577849,4115,4618967,4552,2348124,12572,2326225,13955,6726381,18245,11574776,5454,11533188,4030,11139243,21904,10497372,21381,10160361,4646,10433237,3881,10628155,19903,4278900,13890,5965215,17271,3629086,16608,4859756,14500,6716886,16130,4325136,13369,2626186,10105,2324957,11059,166897295,8129,9279281,3720,9728129,20323,10458513,21187,6259635,4293,4424350,5213,2208387,13108,2192443,12749,6438019,17629,11099035,4512,11181154,4495,10869175,22739,10304569,22506,10086469,4564,10407290,3698,10907144,19158,4453471,14475,6344590,15712,3962963,13595,5412557,14610,7679431,19424,5393379,14196,3622161,12936,4185150,15412,38.5,0.1,37.4,0.1,39.7,0.1,331097593,-555555555,331097593,-555555555,218123424,99387,41288572,37945,2786431,16957,19112979,20786,624863,6340,20018544,83160,29142780,73187,17351681,86240,11791099,79222,41.6,0.1,42.1,0.1,38.2,0.1,38.5,0.2,33.4,0.2,38.2,0.2,44.0,0.1
1,ACSSF,1.0,,,,,,,,,,0100089US,United States -- American Indian Reservation and Trust Land -- Federal,,1072026,13373,536487,7273,34798,983,40417,1357,42181,1096,25419,838,16147,703,7576,445,7270,499,21479,1075,34914,1212,34011,1121,31710,903,29735,959,29076,947,31403,920,34699,829,14715,548,20010,771,11855,574,17509,643,22089,580,14136,468,8784,400,6554,316,535539,6633,33889,1072,36630,1137,40905,1209,24229,786,15117,674,7311,503,7100,426,20405,778,33141,950,32241,1064,31190,973,28931,965,29286,780,30861,788,34773,813,14433,551,21371,725,12833,511,17735,601,24294,772,17676,708,10343,451,10845,724,36.6,0.2,35.6,0.2,37.7,0.4,1072026,13373,1072026,13373,405726,4478,17265,1300,511394,7390,16489,1138,3616,544,34620,1634,82916,3044,31579,1784,51337,1990,42.9,0.3,43.0,0.4,39.0,0.6,41.8,2.6,41.0,1.2,42.2,1.6,48.8,0.8
2,ACSSF,1.0,,,,,,,,,,0100090US,United States -- American Indian Reservation and Trust Land -- State,,1171,359,600,199,39,41,65,61,26,24,9,10,5,10,1,2,12,15,26,31,15,15,16,14,38,31,43,48,48,38,19,22,48,27,23,16,37,30,16,14,49,34,25,17,19,23,15,17,6,8,571,180,52,37,16,17,36,32,7,11,10,19,8,15,2,4,9,15,34,32,39,33,51,45,17,15,44,40,35,34,31,25,13,10,23,20,16,19,17,17,16,12,54,49,13,13,28,30,46.9,8.5,45.8,6.8,47.1,13.9,1171,359,1171,359,216,93,190,151,606,175,38,41,0,13,2,3,119,91,2,4,117,92,43.8,6.0,43.3,6.1,-666666666.0,-222222222.0,-666666666.0,-222222222.0,33.5,21.1,-666666666.0,-222222222.0,50.8,14.7
3,ACSSF,1.0,,,,,,,,,,0100091US,United States -- Oklahoma Tribal Statistical Area,,2593741,2617,1295420,1534,81664,577,86798,1203,94372,1411,54614,495,35611,417,19253,918,18121,813,49850,1108,85865,626,85974,733,87319,1232,81277,1259,77221,551,78127,579,80689,1216,33196,897,48622,1064,28613,937,39593,1098,53176,1089,36774,813,22984,787,15707,521,1298321,1965,77979,466,84495,1397,89337,1359,52687,466,32477,494,17776,804,16613,814,44621,952,82001,587,82101,640,84218,1156,76728,1196,74928,596,75407,580,83356,1091,34317,960,50097,933,30027,867,42500,1283,62397,1199,44127,936,29495,1095,30637,1012,38.1,0.2,37.1,0.2,39.1,0.1,2593741,2617,2593741,2617,1788619,4259,142056,1916,250839,2623,43677,835,3325,292,67878,2542,297347,3611,66424,2200,230923,3095,41.4,0.1,41.7,0.1,37.5,0.4,37.3,4.3,36.5,1.0,38.5,1.6,44.8,0.5
4,ACSSF,1.0,,,,,,,,,,0100092US,United States -- Tribal Designated Statistical Area,,54870,936,27183,547,1136,125,1390,160,1472,176,859,108,595,129,250,60,225,58,746,123,1272,134,1538,169,1493,108,1444,143,1688,169,1401,88,1915,142,832,108,1262,103,945,87,1353,85,2272,127,1563,125,868,77,664,65,27687,524,1098,146,1182,128,1173,107,894,125,566,103,290,71,147,46,742,122,1293,104,1292,122,1566,110,1467,152,1418,131,1673,120,2096,179,983,102,1453,120,1133,118,1524,106,2419,153,1553,115,908,81,817,85,50.3,0.8,48.6,0.8,52.1,0.9,54870,936,54870,936,45274,687,3217,429,899,141,897,152,75,27,1174,235,3334,343,1338,208,1996,252,46.4,0.6,45.7,1.0,41.2,3.2,47.5,2.9,44.4,5.9,39.5,3.9,53.2,1.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
301899,ACSSF,,,,56.0,,,,,,,9700000US5605830,"Teton County School District 1, Wyoming",5605830.0,23346,-555555555,12114,145,541,76,697,187,596,216,301,60,165,102,57,65,70,62,393,123,1104,91,1145,147,1014,273,1117,272,786,24,833,119,730,172,255,140,375,140,288,136,544,176,493,166,426,87,139,79,45,46,11232,145,421,83,316,166,901,178,342,83,216,113,16,21,96,60,345,117,855,74,1022,101,1087,237,935,202,666,77,702,104,446,173,242,124,672,229,298,114,454,177,494,153,366,118,177,109,163,85,39.9,0.6,39.9,1.3,40.0,0.6,23346,-555555555,23346,-555555555,19424,416,147,80,29,48,302,113,16,23,2365,502,1063,413,649,396,414,125,40.5,0.4,41.3,1.2,37.0,4.7,28.4,1.4,33.0,7.3,36.4,7.7,50.9,8.9
301900,ACSSF,,,,56.0,,,,,,,9700000US5606090,"Weston County School District 7, Wyoming",5606090.0,1414,288,703,159,34,37,37,31,30,38,11,21,0,13,0,13,0,13,0,13,31,48,16,28,0,13,81,67,109,46,124,48,12,13,11,14,33,30,35,34,11,19,56,45,8,14,12,15,52,43,711,189,46,37,55,62,10,18,17,29,24,29,0,13,24,31,0,13,36,45,0,13,40,46,29,30,46,42,104,45,9,15,26,23,38,39,23,20,17,20,59,37,21,33,73,45,14,24,50.3,2.3,50.1,2.6,50.4,5.0,1414,288,1414,288,1316,266,0,13,0,13,0,13,0,13,8,14,90,104,0,13,90,104,50.4,0.8,50.6,0.8,45.5,7.9,40.9,8.4,61.6,21.5,-666666666.0,-222222222.0,46.6,28.0
301901,ACSSF,,,,56.0,,,,,,,9700000US5606240,"Washakie County School District 1, Wyoming",5606240.0,6861,187,3606,103,138,13,237,87,324,72,221,86,80,76,0,18,40,40,163,98,169,60,190,44,188,92,188,59,254,55,194,30,180,80,76,57,190,86,146,69,206,67,97,39,168,54,111,37,46,36,3255,136,147,65,130,51,231,82,118,8,73,53,19,25,0,18,93,71,180,83,142,21,234,87,199,63,163,10,199,47,207,104,107,62,241,80,95,45,82,37,220,58,108,46,130,44,137,47,44.1,2.8,41.1,1.8,47.3,3.6,6861,187,6861,187,5837,310,1,3,49,43,28,30,0,18,287,209,659,209,452,199,207,69,45.4,2.4,46.7,4.3,34.3,5.0,-666666666.0,-222222222.0,49.3,8.1,35.7,16.5,54.8,13.1
301902,ACSSF,,,,56.0,,,,,,,9700000US5699999,"Remainder of Wyoming, Wyoming",,1836,237,1045,162,60,29,52,23,76,25,62,36,27,18,15,11,29,23,48,23,65,32,52,30,44,26,78,24,82,53,81,31,62,52,26,17,26,15,16,11,30,22,13,11,73,56,20,23,8,12,791,133,27,20,65,30,70,26,33,18,39,26,13,10,8,10,43,28,45,21,36,20,70,36,45,30,31,16,52,21,55,26,36,21,20,12,26,21,21,15,33,19,8,9,9,12,6,9,38.3,5.7,39.5,8.0,37.3,7.1,1836,237,1836,237,465,114,21,33,1192,206,4,9,0,13,32,21,122,62,56,47,66,36,48.6,4.5,47.1,5.2,46.3,19.2,-666666666.0,-222222222.0,50.1,34.1,-666666666.0,-222222222.0,48.8,6.6


In [13]:
cbsa_data = geo_dataset[geo_dataset != US	REGION	DIVISION	STATE	COUNTY	COUSUB	PLACE	TRACT	BLKGRP

Index(['FILEID', 'US', 'REGION', 'DIVISION', 'STATE', 'COUNTY', 'COUSUB', 'PLACE', 'TRACT', 'BLKGRP',
       ...
       'B08103_E003', 'B08103_M003', 'B08103_E004', 'B08103_M004', 'B08103_E005', 'B08103_M005', 'B08103_E006', 'B08103_M006', 'B08103_E007', 'B08103_M007'], dtype='object', length=154)


In [None]:
# geo_dataset =
geo_dataset[geo_dataset['TRACT'] >= 1]
geo_dataset[geo_dataset['STATE'] >= 0]
# geo_dataset

In [None]:
# Create property column values

# geography['BLKGRP'] = geography['STATE'].astype(int).astype(str)+geography['COUNTY'].astype(int).astype(str)+geography['TRACT'].astype(int).astype(str)+geography['BLKGRP'].astype(int).astype(str)
# geography['TRACT'] = geography['STATE'].astype(int).astype(str)+geography['COUNTY'].astype(int).astype(str)+geography['TRACT'].astype(int).astype(str)
# geography['COUNTY'] = geography['STATE'].astype(int).astype(str)+geography['COUNTY'].astype(int).astype(str)

# geography

In [None]:
api_key = "781655d3b92f252d48d8acae0f5669dddf3cb9a4"

# Choose variable list from csv file

In [None]:
#load csv and drop old index file
#choose model type

acs5 = pd.read_csv('C:/Users/jerem/OneDrive/Documents/Git Projects/MeridianXYZ/data/census/ACS Variables.csv')
acs5 = acs5[(acs5['General Model'] == 1) & (acs5['ACS1'] != 1)].reset_index().drop(columns = ['index'])
acs5

In [None]:
acs1 = pd.read_csv('C:/Users/jerem/OneDrive/Documents/Git Projects/MeridianXYZ/data/census/ACS Variables.csv')
acs1 = acs1[(acs1['General Model'] == 1) & (acs1['ACS1'] == 1)].reset_index().drop(columns = ['index'])
acs1

In [None]:
# extract single variable from group list
value = acs5.loc[1, 'Code']
value

group_acs5 = acs5['Code'].to_list()
print(len(group_acs5))
group_acs5[:10]

In [None]:
# extract single variable from group list
value = acs1.loc[1, 'Code']
value

group_acs1 = acs1['Code'].to_list()
print(len(group_acs1))
group_acs1

In [None]:
dataset = 'acs/acs5'
year = 2022

# get human readable variable names ACS5

group_acs5_names = []
group_acs5_updated = []

for i in range(len(group_acs5)):
    try:
        #test which group is going through the loop
        # print(i)
        # print(f'Group: {group[i]}')
    
        #get the first group variable
        group_variables = ced.variables.group_variables(dataset, year, group_acs5[i])
        group_variables = group_variables[0]
        # print(group_vaariables)
    
        #get the dict for each group
        get_group = ced.variables.get_group(dataset, year, group_acs5[i])
        group_info = get_group[group_variables]
        # print(group_info)
    
        #get readable name
        label = get_group[group_variables]['label'].replace(' --','').replace(':!!', '-').replace('!!','_').replace(' ','_').replace(':','')
        concept = get_group[group_variables]['concept'].replace(' ','_')
        name = concept + " - " + label
        # print(name)
        # print(" ")
    
        group_acs5_names.append(name)
        group_acs5_updated.append(group_variables)
        
    except: 
        print(f'Group: {group_acs5[i]} FAILED')
        print(" ")
        continue
print(len(group_acs5_names))
print(len(group_acs5_updated))

group_acs5_updated
group_acs5_names

In [None]:
variable_and_names = pd.DataFrame(list(zip(group_acs5_updated, group_acs5_names)), columns=['variables', 'names'])
variable_and_names

In [None]:
dataset = 'acs/acs1'
year = 2022

# get human readable variable names ACS5

group_acs1_names = []
group_acs1_updated = []

for i in range(len(group_acs1)):
    try:
        #test which group is going through the loop
        # print(i)
        # print(f'Group: {group[i]}')
    
        #get the first group variable
        group_variables = ced.variables.group_variables(dataset, year, group_acs1[i])
        group_variables = group_variables[0]
        # print(group_vaariables)
    
        #get the dict for each group
        get_group = ced.variables.get_group(dataset, year, group_acs1[i])
        group_info = get_group[group_variables]
        # print(group_info)
    
        #get readable name
        label = get_group[group_variables]['label'].replace(' --','').replace(':!!', '-').replace('!!','_').replace(' ','_').replace(':','')
        concept = get_group[group_variables]['concept'].replace(' ','_')
        name = concept + " - " + label
        # print(name)
        # print(" ")
    
        group_acs1_names.append(name)
        group_acs1_updated.append(group_variables)
        
    except: 
        print(f'Group: {group_acs1[i]} FAILED')
        print(" ")
        continue
print(len(group_acs1_names))
print(len(group_acs1_updated))

group_acs1_updated
group_acs1_names

In [None]:
# help(ced.variables.group_tree)
# help(ced.variables)
# help(ced.variables.group_variables)

***
# Get Data from Censusdis
***

In [None]:
dataset = 'acs/acs5'
year = 2022
state = STATE_MA

In [None]:
reader = ShapeReader(year=year)

In [None]:
variables = 'NAME'
counties_ma = ['017', '009', '021', '023', '025'] # ['Middlesex County, Massachusetts', 'Essex County, Massachusetts', 'Norfolk County, Massachusetts', 'Plymouth County, Massachusetts', 'Suffolk County, Massachusetts']

polygon_block_groups = ced.download(
    dataset,
    year,
    variables,
    state = state,
    county = counties_ma,
    tract = '*',
    block_group = "*",
    with_geometry = True,
    # remove_water = True,
    set_to_nan=ALL_SPECIAL_VALUES
)

polygon_block_groups

#### Check Tests

In [None]:
def chunk_list(lst, chunk_size):
    """Yield successive chunk_size chunks from lst."""
    for i in range(0, len(lst), chunk_size):
        yield lst[i:i + chunk_size]


def process_data(data_chunk):
    """Process the data in some way."""
    processed = [item * 2 for item in data_chunk]  # Example processing
    return processed

# Example list
data = list(range(10))

# Break the list into chunks of size 10
chunk_size = 10
chunks = chunk_list(data, chunk_size)
chunks

# Process each chunk and collect the results
results = []
for chunk in chunks:
    processed_chunk = process_data(chunk)
    results.extend(processed_chunk)

data

results

In [None]:
def process_data(data_chunk):
    polygon_block_groups = ced.download(
        dataset,
        year,
        data_chunk, #variables,
        state = state,
        county = counties_ma,
        tract = '*',
        block_group = "*",
        # with_geometry = True,
        # remove_water = True,
        set_to_nan=ALL_SPECIAL_VALUES
    )
    return polygon_block_groups


# Break the list into chunks of size 10
chunk_size = 40
chunks = chunk_list(group_updated, chunk_size)

# Process each chunk and collect the results
results = []
for chunk in chunks:
    processed_chunk = process_data(chunk)
    results.extend(processed_chunk)

results

#### Chunk Tests over

In [None]:
# Variables
# TOTAL_POPULATION_VARIABLE = "B01003_001E"
variables_0 = 'NAME'
variables_1 = group_acs5_updated[:40]
variables_2 = group_acs5_updated[40:80]
variables_3 = group_acs5_updated[80:120]
variables_4 = group_acs5_updated[120:]


counties_ma = ['017', '009', '021', '023', '025'] # ['Middlesex County, Massachusetts', 'Essex County, Massachusetts', 'Norfolk County, Massachusetts', 'Plymouth County, Massachusetts', 'Suffolk County, Massachusetts']

In [None]:
#0
polygon_block_groups_0 = ced.download(
    dataset,
    year,
    variables_0,
    state = state,
    county = counties_ma,
    tract = '*',
    block_group = "*",
    with_geometry = True,
    # remove_water = True,
    set_to_nan=ALL_SPECIAL_VALUES
)

#1
polygon_block_groups_1 = ced.download(
    dataset,
    year,
    variables_1,
    state = state,
    county = counties_ma,
    tract = '*',
    block_group = "*",
    with_geometry = True,
    # remove_water = True,
    set_to_nan=ALL_SPECIAL_VALUES
)

#2
polygon_block_groups_2 = ced.download(
    dataset,
    year,
    variables_2,
    state = state,
    county = counties_ma,
    tract = '*',
    block_group = "*",
    with_geometry = True,
    # remove_water = True,
    set_to_nan=ALL_SPECIAL_VALUES
)

#3
polygon_block_groups_3 = ced.download(
    dataset,
    year,
    variables_3,
    state = state,
    county = counties_ma,
    tract = '*',
    block_group = "*",
    with_geometry = True,
    # remove_water = True,
    set_to_nan=ALL_SPECIAL_VALUES
)

#4
polygon_block_groups_4 = ced.download(
    dataset,
    year,
    variables_4,
    state = state,
    county = counties_ma,
    tract = '*',
    block_group = "*",
    with_geometry = True,
    # remove_water = True,
    set_to_nan=ALL_SPECIAL_VALUES
)

In [None]:
# polygon_block_groups_0
# polygon_block_groups_1
# polygon_block_groups_2
# polygon_block_groups_3
# polygon_block_groups_4

In [None]:
polygon_block_groups = polygon_block_groups.dropna(subset=['geometry'])
polygon_block_groups_0 = polygon_block_groups_0.dropna(subset=['geometry']).drop(columns=['STATE','COUNTY','TRACT','geometry'])
polygon_block_groups_1 = polygon_block_groups_1.dropna(subset=['geometry']).drop(columns=['STATE','COUNTY','TRACT','geometry'])
polygon_block_groups_2 = polygon_block_groups_2.dropna(subset=['geometry']).drop(columns=['STATE','COUNTY','TRACT','geometry'])
polygon_block_groups_3 = polygon_block_groups_3.dropna(subset=['geometry']).drop(columns=['STATE','COUNTY','TRACT','geometry'])
polygon_block_groups_4 = polygon_block_groups_4.dropna(subset=['geometry']).drop(columns=['STATE','COUNTY','TRACT','geometry'])

result_df_bg= pd.concat([polygon_block_groups, polygon_block_groups_0, polygon_block_groups_1, polygon_block_groups_2, polygon_block_groups_3, polygon_block_groups_4], axis=1)

In [None]:
result_df_bg.shape

In [None]:
result_df_bg.describe()

In [None]:
df_describe = result_df_bg.describe()[:1]
df_describe

In [None]:
def contains_zero(series):
    return (series == 0).any()

# Use apply to check each column
columns_with_zero = df_describe.apply(contains_zero)
true_columns = columns_with_zero[columns_with_zero].index.tolist()
true_columns

In [None]:
variable_and_names.head()

In [None]:
df_subset = variable_and_names[variable_and_names['variables'].isin(true_columns)]

df_subset['names'] = 'Variable_' + df_subset['names']
df_subset

# test variables

In [None]:
import requests

API_KEY = 'YOUR_API_KEY'  # Place your API key here
VARIABLE = 'B09001_003E'  # Example variable

data = {
    'county': [],
    'tract': [],
    'block group': []
}
geo_status = pd.DataFrame(data)
geo_status

In [None]:
def check_variable_by_geography(variable, api_key):
    # Common geographic levels to check
    geographies = ['county', 'tract', 'block group']
    for geo in geographies:
        url = f"https://api.census.gov/data/2022/acs/acs5?get=NAME,{variable}&for={geo}:*"
        response = requests.get(url + f"&key={api_key}")
        if response.status_code == 200:
            geo_status[f'{geo}'] = 1
            # print(f"YES {variable} @ {geo}")
            # break
        else:
            geo_status[f'{geo}'] = 0
            # print(f"NO  {variable} @ {geo}")

In [None]:
# Run the check
for i in range(len(true_columns)):
    check_variable_by_geography(true_columns[i], api_key)

In [None]:
#FIX THIS TO RUN IN A DATAFRAME WHERE IT UPDATES THE ROWS ACCORDING TO THE NUMBER OF VARIABLES
geo_status

In [None]:
DATASET = 'B09001'
YEAR = 2022

In [None]:
# cgeo.geo_path_snake_specs(DATASET, YEAR)
help(cgeo)

In [None]:
import matplotlib.pyplot as plt

# This is a census variable for median household income.
# See https://api.census.gov/data/2020/acs/acs5/variables/B19013_001E.html
MEDIAN_HOUSEHOLD_INCOME_VARIABLE = "B19013_001E"

gdf_state_bounds = reader.read_cb_shapefile("us", "state")
gdf_state_bounds = gdf_state_bounds[
    gdf_state_bounds["STATEFP"].isin(states.ALL_STATES_AND_DC)
]

reader = cem.ShapeReader(year=YEAR)

In [None]:
plt.rcParams["figure.figsize"] = (18, 8)


def plot_map(
    gdf: gpd.GeoDataFrame,
    geo: str,
    *,
    gdf_bounds: Optional[gpd.GeoDataFrame] = None,
    bounds_color: str = "white",
    max_income: float = 200_000.0,
):
    if gdf_bounds is None:
        gdf_bounds = gdf

    ax = cem.plot_us(gdf_bounds, color="lightgray")

    ax = cem.plot_us(
        gdf,
        MEDIAN_HOUSEHOLD_INCOME_VARIABLE,
        cmap="autumn",
        legend=True,
        vmin=0.0,
        vmax=max_income,
        ax=ax,
    )

    ax = cem.plot_us_boundary(gdf_bounds, edgecolor=bounds_color, linewidth=0.5, ax=ax)

    ax.set_title(f"{YEAR} Median Household Income by {geo.title()}")

    ax.axis("off")

In [None]:
DATASET = 'acs/acs5'
YEAR = 2022
STATE = states.MA
VARIABLES = 'B09001_001E'

In [None]:
gdf_bg = ced.download(
    DATASET,
    YEAR,
    VARIABLES,
    state=STATE,
    block_group="*",
    with_geometry=True,
    set_to_nan=cev.ALL_SPECIAL_VALUES,
)

In [None]:
plt.rcParams["figure.figsize"] = (8, 8)

plot_map(
    gdf_bg,
    f"block group in {states.NAMES_FROM_IDS[STATE]}",
    gdf_bounds=gdf_state_bounds[gdf_state_bounds["STATEFP"] == STATE],
    bounds_color="black",
)