# Produce Geo x File Matrix

In [3]:
#import functions
import os
import sys

sys.path.append(
    os.path.join(os.path.abspath(os.path.join(os.path.curdir, os.path.pardir)))
)

#censusdis
from collections import OrderedDict

import geopandas as gpd
import matplotlib.pyplot as plt

from typing import Optional

import censusdis.data as ced
import censusdis.maps as cem
import censusdis.values as cev
import censusdis.geography as cgeo
from censusdis import states
from censusdis.maps import ShapeReader, plot_us_boundary
import censusdis.maps as cmap


# Make sure it is there.
from censusdis.values import ALL_SPECIAL_VALUES

# _______________________________________________________________________

#standard packages
import pandas as pd
import numpy as np
import math
import glob
from math import pi, sqrt

#plotting
import plotly.express as px
import matplotlib.pyplot as plt
# import pygwalker as pyg

# import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import datetime
import time
from tqdm import tqdm, trange

#gis packages
import osmnx as ox
import logging as lg
# Echo OSMnx log messages to the console (this line does not enable caching).
ox.settings.log_console=True
# ox.settings.memory_cache=True #use cache to avoid overloading the server
# NOTE(review): per the OSMnx settings documentation, `memory` appears to be the
# Overpass server memory allocation (in bytes) requested per query rather than a
# local cache size -- confirm that 4294967296 (4 GiB) is what is intended here.
ox.settings.memory = 4294967296

from shapely.geometry import Point
import folium
import networkx as nx


## future libraries
# import contextily as cx
# import fiona
# from pandana.loaders import osm
# import momepy
# import missingno as msno
# from us import states
# import imageio

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation and dtype warnings from pandas -- confirm this is intended.
warnings.filterwarnings("ignore")

# IPython magic (only valid inside a notebook/IPython session): render matplotlib figures inline.
%matplotlib inline
# Reference the installed OSMnx version for inspection.
ox.__version__

# pandas display options: truncate cell text at 50 characters, allow 1000-character-wide
# output, show every column, and show at most 10 rows.
pd.set_option('display.max_colwidth', 50)
pd.set_option('display.width', 1000)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 10)

# Create Geography x Filename Matrix
- This code takes every file in the directory and records whether that file contains data at each geographic level.
- This is useful for understanding which data can be used for different types of analysis.
- Many of the data sets are not available at the block group level, so it is necessary to record which data can be used at each geographic level of analysis.

In [None]:
# Folder that holds the ACS 2022 5-year summary-file data

acsdt5y2022_directory = 'C:/Users/jerem/OneDrive/Documents/Git Projects/MeridianXYZ/data/census/acs/summary files/2022/5YRData'


# Collect the names of the regular files in that folder (sub-directories are skipped)

acsdt5y2022_file_list_os = []
for entry_name in os.listdir(acsdt5y2022_directory):
    entry_path = os.path.join(acsdt5y2022_directory, entry_name)
    if os.path.isfile(entry_path):
        acsdt5y2022_file_list_os.append(entry_name)

# Sanity check: print the first five file names
print(acsdt5y2022_file_list_os[:5])


# Geographic levels tracked in the matrix (one matrix column per entry)

geographic_levels_nogeoid = [
    'US', 'REGION', 'DIVISION', 'STATE', 'COUNTY', 'COUSUB', 'PLACE', 'TRACT',
    'BLKGRP', 'CONCIT', 'AIANHH', 'AIANHHFP', 'AIHHTLI', 'AITS', 'AITSFP',
    'ANRC', 'CBSA', 'CSA', 'METDIV', 'MACC', 'MEMI', 'NECTA', 'CNECTA',
    'NECTADIV', 'UA', 'CDCURR', 'SLDU', 'SLDL', 'ZCTA5', 'SUBMCD', 'SDELM',
    'SDSEC', 'SDUNI', 'UR', 'PCI', 'PUMA5', 'BTTR', 'BTBG',
]

# Same levels with 'GEO_ID' inserted just before the two trailing entries ('BTTR', 'BTBG')
geographic_levels = (
    geographic_levels_nogeoid[:-2] + ['GEO_ID'] + geographic_levels_nogeoid[-2:]
)


# Build the file x geography matrix: one row per data file, one column per
# geographic level. Constructing it with a scalar fill value of 0 gives an
# integer frame directly, instead of creating an all-NaN object-dtype frame
# and overwriting every cell afterwards.

acs5_22_geo_lookup = pd.DataFrame(
    0,
    index=acsdt5y2022_file_list_os,
    columns=geographic_levels_nogeoid,
)
# acs5_22_geo_lookup.head(1)


# Keys: the geographic level names, in the same order as the matrix columns.

geoid_key = list(geographic_levels_nogeoid)


# Values: for each level, the pair of columns to use from the geography file --
# the level's own column plus 'GEO_ID'.

geoid_values = [[name, 'GEO_ID'] for name in geoid_key]

# Map level name -> [level column, 'GEO_ID'].
# dict(zip(...)) avoids a loop whose targets reuse the names `geoid_key` and
# `geoid_values`, which would rebind both lists to their last elements.
geo_levels = dict(zip(geoid_key, geoid_values))

# Load the pipe-delimited geography file once
file_path = 'C:/Users/jerem/OneDrive/Documents/Git Projects/MeridianXYZ/data/census/acs/summary files/2022/Geos20225YR.txt'
geo_levels_data = pd.read_csv(file_path, sep='|')

# Maps each geographic level to the list of GEO_ID values whose row has a
# non-missing entry in that level's column.
geos_dict = {}

# Reuse the frame loaded above rather than re-reading the file from disk for
# every geographic level.
for name, geo_levels_df in geo_levels.items():
    geo_levels_name = geo_levels_df[0]  # first entry is the level's own column
    has_level = geo_levels_data[geo_levels_name].notna()
    geos_dict[name] = geo_levels_data.loc[has_level, 'GEO_ID'].tolist()


# Evaluate each file to see if it contains data at each geographic level.
# This is the time-consuming step, so per-file work is kept to a minimum:
#   * only the GEO_ID column of each data file is parsed,
#   * the set of GEO_IDs is built once per file, not once per geographic level,
#   * isdisjoint() stops at the first shared GEO_ID instead of materialising
#     the whole intersection.

for file in tqdm(acsdt5y2022_file_list_os):
    file_geoids = set(
        pd.read_csv(
            f'{acsdt5y2022_directory}/{file}',
            sep='|',
            usecols=['GEO_ID'],
        )['GEO_ID']
    )

    # Flag every geographic level that shares at least one GEO_ID with this file
    for key, level_geoids in geos_dict.items():
        if not file_geoids.isdisjoint(level_geoids):
            acs5_22_geo_lookup.loc[file, key] = 1

# Inspect the completed matrix (uncomment .columns to look at the column labels only)

acs5_22_geo_lookup#.columns

# Write the matrix to CSV; the file-name index is written as the first column

matrix_output_path = 'C:/Users/jerem/OneDrive/Documents/Git Projects/MeridianXYZ/data/census/acs/summary files/2022/Meridian_ACS522_Geos20225YR_Matrix.csv'
acs5_22_geo_lookup.to_csv(matrix_output_path)

# Get Data from Summary Files

In [4]:
# Load the previously saved file x geography matrix.
# The path has no placeholders, so it is a plain string rather than an f-string.
# NOTE(review): the CSV stores the file names as an unnamed first column, which
# loads here as 'Unnamed: 0'; pass index_col=0 if the file names should be the
# index instead -- confirm against downstream usage before changing.

matrix = pd.read_csv('C:/Users/jerem/OneDrive/Documents/Git Projects/MeridianXYZ/data/census/acs/summary files/2022/Meridian_ACS522_Geos20225YR_Matrix.csv')

In [8]:
# Preview the first ten rows of the loaded matrix
matrix.head(10)

Unnamed: 0.1,Unnamed: 0,US,REGION,DIVISION,STATE,COUNTY,COUSUB,PLACE,TRACT,BLKGRP,CONCIT,AIANHH,AIANHHFP,AIHHTLI,AITS,AITSFP,ANRC,CBSA,CSA,METDIV,MACC,MEMI,NECTA,CNECTA,NECTADIV,UA,CDCURR,SLDU,SLDL,ZCTA5,SUBMCD,SDELM,SDSEC,SDUNI,UR,PCI,PUMA5,BTTR,BTBG
0,acsdt5y2022-b01001.dat,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
1,acsdt5y2022-b01001a.dat,1,1,1,1,1,1,1,1,0,1,1,0,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0
2,acsdt5y2022-b01001b.dat,1,1,1,1,1,1,1,1,0,1,1,0,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0
3,acsdt5y2022-b01001c.dat,1,1,1,1,1,1,1,1,0,1,1,0,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0
4,acsdt5y2022-b01001d.dat,1,1,1,1,1,1,1,1,0,1,1,0,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0
5,acsdt5y2022-b01001e.dat,1,1,1,1,1,1,1,1,0,1,1,0,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0
6,acsdt5y2022-b01001f.dat,1,1,1,1,1,1,1,1,0,1,1,0,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0
7,acsdt5y2022-b01001g.dat,1,1,1,1,1,1,1,1,0,1,1,0,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0
8,acsdt5y2022-b01001h.dat,1,1,1,1,1,1,1,1,0,1,1,0,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0
9,acsdt5y2022-b01001i.dat,1,1,1,1,1,1,1,1,0,1,1,0,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0
