### Get the data from the website and prepare it

File: PropertyData.txt  
Source: https://www.tad.org (Tarrant Appraisal District, TX)

In [6]:
import requests, zipfile, io, os
from pyspark.sql import SparkSession
from pyspark.sql.functions import monotonically_increasing_id, trim, col, when
from pyspark.ml.feature import StringIndexer, OneHotEncoder

# Start (or reuse) a local Spark session named "PropertyData" on master local[*].
# NOTE: no driver/executor memory is configured here. If the job runs into
# java.lang.OutOfMemoryError, see
# https://stackoverflow.com/questions/21138751/spark-java-lang-outofmemoryerror-java-heap-space
spark = SparkSession.builder.master("local[*]").appName("PropertyData").getOrCreate()
spark

# Where the archive comes from and where its extracted contents are expected.
zip_url = "https://www.tad.org/content/data-download/PropertyData(Delimited).ZIP"
source_file = "/tmp/spark/PropertyData.txt"
download_dir = "/tmp/spark/"

# Reuse a previously extracted file; otherwise download the archive and unpack it.
if os.path.isfile(source_file):
    print("Using existing property file")
else:
    # (connect, read) timeouts keep a stalled server from hanging the notebook forever.
    r = requests.get(zip_url, timeout=(10, 120))
    # Fail with a clear HTTP error instead of handing an error page to ZipFile.
    r.raise_for_status()
    # The archive is unpacked straight from memory; the context manager closes it afterwards.
    with zipfile.ZipFile(io.BytesIO(r.content)) as z:
        z.extractall(download_dir)

# Load the pipe-delimited export: the first row supplies the column names and
# Spark infers each column's type from the data.
df = spark.read.csv(source_file, sep="|", header=True, inferSchema=True)

# Append a generated row id; withColumn places it as the last column.
df = df.withColumn('id', monotonically_increasing_id())

# Move id to the front and trim every string column in a single projection.
# Calling withColumn once per column in a loop adds one projection per call,
# which the PySpark documentation warns can produce very large query plans.
data_cols = [
    trim(col(name)).alias(name) if dtype == "string" else col(name)
    for name, dtype in df.dtypes[:-1]   # every column except the trailing id
]
df = df.select(col('id'), *data_cols)


# Normalise the pool flag before indexing: 'X' becomes 'Y', anything else
# (including missing values) becomes 'N'. Leaving empty strings in place leads to
# the "Cannot have an empty string for name" error later, even if the column
# is converted to an index and dropped afterwards.
# see: https://stackoverflow.com/questions/33089781/spark-dataframe-handing-empty-string-in-onehotencoder
# Only Swimming_Pool_Ind is normalised here; the other indicator columns are used as-is.
pool_is_marked = col('Swimming_Pool_Ind') == 'X'
df = df.withColumn('Swimming_Pool_Ind', when(pool_is_marked, 'Y').otherwise('N'))

# Add a numeric <column>_idx companion for each indicator column so the
# logistic regression notebook can consume them.
string_to_idx_columns = ['Swimming_Pool_Ind', 'Central_Heat_Ind', 'Central_Air_Ind']
for source_col in string_to_idx_columns:
    indexer = StringIndexer(inputCol=source_col, outputCol=source_col + "_idx")
    fitted_indexer = indexer.fit(df)
    df = fitted_indexer.transform(df)


# Keep an integer-typed copy of the appraised value under a new column name;
# the original Appraised_Value column is removed by the drop below.
df = df.withColumn('Appraised_Value_Int', col('Appraised_Value').cast('int'))

# Columns that are not needed by the downstream analysis.
drop_cols = [
    'Sequence_No', 'Record_Type', 'PIDN',
    'Owner_Name', 'Owner_Address', 'Owner_CityState', 'Owner_Zip4', 'Owner_CRRT',
    'Situs_Address', 'TAD_Map', 'MAPSCO',
    'Exemption_Code', 'State_Use_Code', 'LegalDescription',
    'Notice_Date', 'Deed_Date', 'Deed_Book', 'Appraisal_Date', 'Deed_Page',
    'ARB_Indicator', 'From_Accts', 'GIS_Link', 'Instrument_No', 'Overlap_Flag',
    'Num_Bedrooms', 'Num_Bathrooms', 'Structure_Count', 'Ag_Code',
    'Appraisal_Year', 'Owner_Zip',
    'Appraised_Value',
]
df = df.drop(*drop_cols)

# One-hot encode each <column>_idx into a <column>_bin vector for the logistic columns.
# OneHotEncoder accepts several input/output columns at once, so a single
# estimator is fitted instead of building and fitting one per column.
# The list order fixes the order in which the _bin columns are appended.
ohe_source_cols = ['Swimming_Pool_Ind', 'Central_Air_Ind', 'Central_Heat_Ind']
enc = OneHotEncoder(
    inputCols=[f"{name}_idx" for name in ohe_source_cols],
    outputCols=[f"{name}_bin" for name in ohe_source_cols],
)
df = enc.fit(df).transform(df)

# Persist the prepared DataFrame as parquet inside the download directory,
# replacing any output left by an earlier run.
parquet_path = f"{download_dir}tarrant-tx.parquet"
df.write.mode("overwrite").parquet(parquet_path)
print("Parquet files created. Use tarrant-tx_multi-linear.ipynb and tarrant-tx_logistic.ipynb for analysis.")

Using existing property file


                                                                                

In [None]:
# !! Optional step to see how many rows should be removed for incomplete data
# Inspect dataframe for missing values

# from pyspark.sql.functions import sum, col
# missing_value_counts = df.select([sum(col(column).isNull().cast("int")).alias(column) for column in df.columns])
# missing_value_counts.show()