In [10]:
import pandas as pd
import os
from io import StringIO
import sqlite3
import csv
import geopandas as gpd
import re

# Step 2A : Match Combined Random Samples

HCAD Account Number with Polygons

Download Real Property from HCAD : http://hcad.org/pdata/pdata-property-downloads.html

The 2025 Real Property data file is ~890 MB, so it is provided here split into smaller chunked `.txt` files.

##### *To recreate the original file from the chunks*


``` python
def load_real_property_chunks_safe(folder_path, delimiter='\t', log_bad_lines=True):
    """Rebuild the Real Property table from chunked ``real_acct_2025*.txt`` files.

    Every file in ``folder_path`` whose name starts with ``real_acct_2025`` and
    ends with ``.txt`` is read in sorted filename order. The first line of each
    file is treated as a header and skipped; the header of the first non-empty
    file supplies the column names. Rows whose field count differs from the
    header are left out of the result.

    Parameters
    ----------
    folder_path : str
        Directory containing the chunk files.
    delimiter : str, default '\\t'
        Field separator used in the chunk files.
    log_bad_lines : bool, default True
        When True, rejected rows are collected and the first 10 are printed.

    Returns
    -------
    pandas.DataFrame
        One row per accepted line, all values as strings, fields kept verbatim
        (no whitespace trimming inside a row).
    """
    files = sorted(
        name for name in os.listdir(folder_path)
        if name.startswith('real_acct_2025') and name.endswith('.txt')
    )

    data_rows = []
    header = None
    expected_cols = 0
    bad_lines = []

    for file in files:
        file_path = os.path.join(folder_path, file)
        with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
            # Stream the file line by line instead of holding it all in memory.
            for lineno, line in enumerate(f, start=1):
                # Remove only the line terminator. A plain strip() would also
                # eat trailing tabs, so a row ending in empty fields would
                # lose columns and be rejected as malformed.
                parts = line.rstrip('\r\n').split(delimiter)

                if lineno == 1:
                    # First line of every chunk is a header; keep the first one seen.
                    if header is None:
                        header = [name.strip() for name in parts]
                        expected_cols = len(header)
                    continue

                if len(parts) == expected_cols:
                    data_rows.append(parts)
                elif log_bad_lines:
                    bad_lines.append((file, lineno, len(parts), line.strip()))

    # Create DataFrame
    HCAD_RealProperty = pd.DataFrame(data_rows, columns=header)

    # Optionally print or log bad lines
    if log_bad_lines and bad_lines:
        print(f"\nSkipped {len(bad_lines)} malformed rows:")
        for file, lineno, cols, preview in bad_lines[:10]:  # show only first 10
            print(f"{file}, line {lineno}: expected {expected_cols}, found {cols} columns — {preview[:100]}...")

    return HCAD_RealProperty
```

Example call:

``` python
HCAD_RealProperty = load_real_property_chunks_safe("../2025_RPD")
```

## Real Property Database

In [2]:
# Column names for the Real Property table, in file order.
# The loader below builds the table schema and positional INSERTs from this list,
# so the order here must not change.
columns = [
    # account / year
    "acct", "yr",
    # mail* fields
    "mailto", "mail_addr_1", "mail_addr_2", "mail_city", "mail_state", "mail_zip", "mail_country",
    "undeliverable",
    # str* fields
    "str_pfx", "str_num", "str_num_sfx", "str", "str_sfx", "str_sfx_dir", "str_unit",
    # site_addr* fields
    "site_addr_1", "site_addr_2", "site_addr_3",
    "state_class", "school_dist", "map_facet", "key_map",
    "Neighborhood_Code", "Neighborhood_Grp",
    "Market_Area_1", "Market_Area_1_Dscr", "Market_Area_2", "Market_Area_2_Dscr",
    "econ_area", "econ_bld_class", "center_code",
    "yr_impr", "yr_annexed", "splt_dt", "dsc_cd", "nxt_bld",
    "bld_ar", "land_ar", "acreage", "Cap_acct", "shared_cad",
    # *_val fields
    "land_val", "bld_val", "x_features_val", "ag_val",
    "assessed_val", "tot_appr_val", "tot_mkt_val",
    # prior_* fields
    "prior_land_val", "prior_bld_val", "prior_x_features_val", "prior_ag_val",
    "prior_tot_appr_val", "prior_tot_mkt_val",
    "new_construction_val", "tot_rcn_val",
    "value_status", "noticed", "notice_dt", "protested", "certified_date",
    "rev_dt", "rev_by", "new_own_dt",
    # lgl_* fields
    "lgl_1", "lgl_2", "lgl_3", "lgl_4",
    "jurs",
]

In [3]:
def load_realacct_to_sqlite(txt_file, db_file, table_name, delimiter='\t', column_names=None):
    """Load a delimited Real Property text file into a fresh SQLite table.

    The table is dropped if it exists and recreated with one TEXT column per
    name in ``column_names``. The first line of ``txt_file`` is treated as a
    header and skipped. Only rows with exactly ``len(column_names)`` fields are
    inserted; other rows are counted and reported.

    Parameters
    ----------
    txt_file : str
        Path to the delimited source file.
    db_file : str
        Path to the SQLite database file (created if missing).
    table_name : str
        Name of the table to (re)create. It is quoted as a single identifier.
    delimiter : str, default '\\t'
        Field separator in ``txt_file``.
    column_names : sequence of str, optional
        Column names for the table. Defaults to the notebook-level ``columns`` list.

    Raises
    ------
    ValueError
        If ``column_names`` is empty.
    """
    if column_names is None:
        column_names = columns  # notebook-level schema list
    column_names = list(column_names)
    if not column_names:
        raise ValueError("column_names must contain at least one column")

    def _quote_identifier(identifier):
        # Double-quote an SQL identifier, escaping embedded double quotes.
        return '"' + str(identifier).replace('"', '""') + '"'

    table_sql = _quote_identifier(table_name)
    col_defs = ', '.join(f'{_quote_identifier(col)} TEXT' for col in column_names)
    placeholders = ','.join('?' * len(column_names))
    insert_sql = f'INSERT INTO {table_sql} VALUES ({placeholders})'
    expected_width = len(column_names)

    skipped = 0
    conn = sqlite3.connect(db_file)
    try:
        cur = conn.cursor()

        # Drop and create table with appropriate schema
        cur.execute(f'DROP TABLE IF EXISTS {table_sql}')
        cur.execute(f'CREATE TABLE {table_sql} ({col_defs})')

        # newline='' is what the csv module asks for when it is handed a file object.
        with open(txt_file, 'r', encoding='utf-8', errors='replace', newline='') as f:
            reader = csv.reader(f, delimiter=delimiter)
            next(reader, None)  # Skip header in file; tolerate an empty file

            # Insert valid rows in chunks
            batch = []
            for row in reader:
                if len(row) != expected_width:
                    skipped += 1
                    continue
                batch.append(row)
                if len(batch) >= 10000:
                    cur.executemany(insert_sql, batch)
                    batch = []
            if batch:
                cur.executemany(insert_sql, batch)

        conn.commit()
    finally:
        # Always release the database handle, even if reading or inserting failed.
        conn.close()

    print(f"Loaded data into table '{table_name}' in '{db_file}'.")
    if skipped:
        print(f"Skipped {skipped} row(s) whose field count was not {expected_width}.")

In [4]:
# Populate the SQLite table from the raw Real Property text file
load_realacct_to_sqlite(
    txt_file='../2025_RPD/real_acct.txt',
    db_file='real_property.db',
    table_name='HCAD_RealProperty',
)

Loaded data into table 'HCAD_RealProperty' in 'real_property.db'.


In [5]:
# SQL Connect: open the database written by the load step and get a cursor.
# `conn` and `cur` are reused by the following cells until `conn.close()` is called.
conn = sqlite3.connect('real_property.db')
cur = conn.cursor()

In [6]:
# Rebuild the view on every run. `CREATE VIEW IF NOT EXISTS` would silently keep
# an older definition if the column list below is edited after the first run.
cur.execute("DROP VIEW IF EXISTS HCAD_RealProperty_View")

# Narrow view over the full table: only the columns used in the analysis below.
cur.execute("""
CREATE VIEW HCAD_RealProperty_View AS
SELECT
    acct,
    mail_city,
    state_class,
    school_dist,
    Neighborhood_Code,
    Neighborhood_Grp,
    Market_Area_1,
    Market_Area_1_Dscr,
    Market_Area_2,
    Market_Area_2_Dscr,
    econ_area,
    econ_bld_class,
    center_code,
    yr_impr,
    yr_annexed,
    splt_dt,
    dsc_cd,
    acreage,
    land_val,
    bld_val,
    x_features_val,
    ag_val,
    assessed_val,
    tot_appr_val,
    tot_mkt_val
FROM HCAD_RealProperty;
""")

# Commit
conn.commit()

In [9]:
# Read the trimmed view into a DataFrame
HCAD_RealProperty = pd.read_sql_query(
    sql="SELECT * FROM HCAD_RealProperty_View",
    con=conn,
)

In [11]:
# Done with SQLite: HCAD_RealProperty is now held in memory as a DataFrame.
conn.close()

## Analysis

In [26]:
# Forward slash keeps the path portable and avoids the invalid "\c" escape sequence
# that the backslash form produced; it also matches the save path used at the end.
gdf_allsamples = gpd.read_file("OUTPUT/combined_random_samples.geojson")

Temporary diagnostic step: compare how account numbers are formatted in the two sources before joining.

In [27]:
# Peek at the first few distinct account IDs on the sample side
print(
    "HCAD_NUM samples from gdf:",
    gdf_allsamples["HCAD_NUM"].dropna().astype(str).unique()[:5],
    sep="\n",
)

HCAD_NUM samples from gdf:
['1274410000036' '1152790430005' '1219740030067' '1416470010001'
 '0432120010343']


In [28]:
# Peek at the first few distinct account IDs on the HCAD side
print(
    "\nacct samples from HCAD_RealProperty:",
    HCAD_RealProperty["acct"].dropna().astype(str).unique()[:5],
    sep="\n",
)


acct samples from HCAD_RealProperty:
['0010010000013            ' '0010020000001            '
 '0010020000003            ' '0010020000004            '
 '0010020000013            ']


In [29]:
# Distinct raw account IDs on each side (missing values excluded)
gdf_accts = {acct for acct in gdf_allsamples["HCAD_NUM"].dropna().astype(str)}
db_accts = {acct for acct in HCAD_RealProperty["acct"].dropna().astype(str)}

# IDs present in both sources before any cleaning
common = gdf_accts.intersection(db_accts)
print(f"Matching account count: {len(common)} of {len(gdf_accts)} in gdf")

Matching account count: 0 of 6076 in gdf


In [30]:
# Normalise both keys the same way: cast to text, trim surrounding whitespace,
# then left-pad with zeros to 13 characters (the width of the HCAD_NUM values printed above).
gdf_allsamples["HCAD_NUM_clean"] = (
    gdf_allsamples["HCAD_NUM"]
    .astype(str)
    .str.strip()
    .str.zfill(13)
)
HCAD_RealProperty["acct_clean"] = (
    HCAD_RealProperty["acct"]
    .astype(str)
    .str.strip()
    .str.zfill(13)
)


In [31]:
# Re-check the overlap using the cleaned keys
common_clean = set(gdf_allsamples["HCAD_NUM_clean"]).intersection(
    HCAD_RealProperty["acct_clean"]
)
print(f"Matches after cleaning: {len(common_clean)}")


Matches after cleaning: 6072


Actual join: left-join the sampled parcels to the HCAD real property records on the cleaned account numbers.

In [89]:
# Keep every sampled parcel (left join); column names present on both sides
# receive pandas' default suffixes: _x for the samples, _y for HCAD.
gdf_joined = gdf_allsamples.merge(
    right=HCAD_RealProperty,
    how="left",
    left_on="HCAD_NUM_clean",
    right_on="acct_clean",
)

In [90]:
# Columns not needed downstream; names that are absent are skipped silently.
columns_to_drop = [
    "parcel_type",
    "yr_annexed", "splt_dt", "dsc_cd",
    "timestamp",
    "sampling_batch_size", "total_properties_sampled", "sampling_attempts",
    "samplesetID",
]
gdf_joined = gdf_joined.drop(errors="ignore", columns=columns_to_drop)

In [91]:
# Columns to show first, arranged so that each pair being compared sits side by side.
front_cols = [
    'parcel_index',
    'unique_id',   # unique record ID
    'sample_id',   # sample set identifier
    'HCAD_NUM', 'HCAD_NUM_clean', 'acct_clean',
    'city', 'mail_city',
    'state_class_y', 'state_class_x',
    'appr_val', 'tot_appr_val',
    'mkt_val', 'tot_mkt_val',
]

In [92]:
# Every other column, kept in its current order
remaining_cols = list(filter(lambda name: name not in front_cols, gdf_joined.columns))

In [93]:
# Reorder: priority columns first, then the rest
gdf_joined = gdf_joined[[*front_cols, *remaining_cols]]

#### Data Type Conversion

In [94]:
# Value columns to convert to float, and a per-column record of what the conversion did
currency_cols = [
    'appr_val',
    'tot_appr_val',
    'mkt_val',
    'tot_mkt_val',
]
conversion_report = dict()

In [95]:
# Column Conversion: turn each currency column into float, treating missing and
# blank values as 0.
for col in currency_cols:
    if col not in gdf_joined.columns:
        continue

    raw_values = gdf_joined[col]
    original_nulls = raw_values.isna().sum()
    original_type = raw_values.dtype

    if not pd.api.types.is_numeric_dtype(raw_values):
        # Text columns may hold blanks of any width ('', ' ', '      ', ...).
        # Trim string values first so every blank becomes missing; matching only
        # '' and ' ' would leave wider blanks behind and make the float cast raise.
        trimmed = raw_values.map(lambda v: v.strip() if isinstance(v, str) else v)
        # Unparseable non-blank text still raises here rather than being hidden.
        raw_values = pd.to_numeric(trimmed.mask(trimmed == ""))

    gdf_joined[col] = (
        raw_values
        .fillna(0)        # missing / blank -> 0
        .astype(float)
        .round(2)
    )

    # Confirmation
    conversion_report[col] = {
        'original_dtype': str(original_type),
        'nulls_filled': original_nulls,
        'total_records': len(gdf_joined)
    }


In [96]:
# Summarise what the conversion did, one line per column
print("Currency Conversion Summary:")
for col in conversion_report:
    stats = conversion_report[col]
    print(
        f" - {col}: {stats['total_records']} rows processed"
        f" | {stats['nulls_filled']} nulls filled"
        f" | original type was {stats['original_dtype']}"
    )

Currency Conversion Summary:
 - appr_val: 12174 rows processed | 610 nulls filled | original type was float64
 - tot_appr_val: 12174 rows processed | 8 nulls filled | original type was object
 - mkt_val: 12174 rows processed | 610 nulls filled | original type was float64
 - tot_mkt_val: 12174 rows processed | 8 nulls filled | original type was object


#### Integrity Report

In [97]:
# Filled by check_and_cleanup(): "<col1> vs <col2>" -> {"count", "percent"}
mismatch_report = dict()

In [98]:
def check_and_cleanup(col1, col2, drop_col_if_match, rename_if_match=None):
    """Compare two columns of the global ``gdf_joined`` and de-duplicate them if identical.

    If either column is missing, a warning is printed and nothing changes.
    If every row compares equal, ``drop_col_if_match`` is dropped (when present)
    and, if ``rename_if_match`` is given, the surviving column of the pair is
    renamed to it. Otherwise the number and percentage of differing rows are
    stored in the global ``mismatch_report`` under the key ``"<col1> vs <col2>"``
    and the frame is left untouched.

    Rows where either value is missing compare as unequal, so they count as
    mismatches.

    Parameters
    ----------
    col1, col2 : str
        Names of the columns to compare.
    drop_col_if_match : str or None
        Column to drop when the two columns are identical.
    rename_if_match : str, optional
        New name for the column that remains after the drop.
    """
    global gdf_joined

    if col1 not in gdf_joined.columns or col2 not in gdf_joined.columns:
        print(f"Warning: One or both columns missing: {col1}, {col2}")
        return

    # Compare once and reuse the mask for both the decision and the count.
    mismatches = gdf_joined[col1] != gdf_joined[col2]
    mismatch_count = int(mismatches.sum())

    if mismatch_count:
        total = len(gdf_joined)
        mismatch_report[f"{col1} vs {col2}"] = {
            "count": mismatch_count,
            "percent": round((mismatch_count / total) * 100, 2)
        }
        return

    if drop_col_if_match and drop_col_if_match in gdf_joined.columns:
        gdf_joined = gdf_joined.drop(columns=[drop_col_if_match])

    if rename_if_match:
        # Rename whichever column of the pair is still there. Renaming col2
        # unconditionally did nothing whenever col2 was the column just dropped.
        survivor = col1 if drop_col_if_match == col2 else col2
        gdf_joined = gdf_joined.rename(columns={survivor: rename_if_match})

In [99]:
# Column pairs to compare, in the order they should be checked.
# Each entry holds the keyword arguments for one check_and_cleanup() call.
for comparison in (
    # HCAD_NUM comparisons
    dict(col1="HCAD_NUM", col2="HCAD_NUM_clean", drop_col_if_match="HCAD_NUM_clean"),
    dict(col1="HCAD_NUM", col2="acct_clean", drop_col_if_match="acct_clean"),
    # State class comparison
    dict(col1="state_class_y", col2="state_class_x",
         drop_col_if_match="state_class_x", rename_if_match="state_class"),
    # Appraised value
    dict(col1="appr_val", col2="tot_appr_val", drop_col_if_match="appr_val"),
    # Market value
    dict(col1="mkt_val", col2="tot_mkt_val", drop_col_if_match="mkt_val"),
):
    check_and_cleanup(**comparison)

In [101]:
# Report the outcome of the integrity checks
if not mismatch_report:
    print("All compared columns are fully aligned; cleanup applied.")
else:
    print("\nColumn Mismatches Found:")
    for pair_label in mismatch_report:
        stats = mismatch_report[pair_label]
        print(f"{pair_label}: {stats['count']} records do not match ({stats['percent']}% of total)")


Column Mismatches Found:
HCAD_NUM vs acct_clean: 8 records do not match (0.07% of total)
state_class_y vs state_class_x: 1162 records do not match (9.54% of total)
appr_val vs tot_appr_val: 14 records do not match (0.11% of total)
mkt_val vs tot_mkt_val: 14 records do not match (0.11% of total)


In [None]:
# Display the joined frame for a visual check (bare last expression -> notebook rich display).
gdf_joined

# Step 2B : Sample Matching Analysis

In [102]:
# Save point: write the joined frame to disk as GeoJSON
gdf_joined.to_file(
    filename="OUTPUT/gdf_joined_output.geojson",
    driver="GeoJSON",
)