File: data-samples/tarrant-tx.csv  
Source: https://www.tad.org/data-reports

### Get the data from the website into a data-frame
1. The only clean-up we're doing here is making sure the string data is trimmed
2. The part is idempotent - after running it at least once, delete /tmp/spark/PropertyData.txt to have it use fresh data from the download source

In [17]:
import requests, zipfile, io, os
from pyspark.sql import SparkSession
from pyspark.sql.functions import monotonically_increasing_id, trim, col

spark = SparkSession.builder.master("local[*]").appName("PropertyData").getOrCreate()
spark.conf.set("spark.sql.repl.eagerEval.enabled", True) # Property used to format output tables better
spark
zip_url = "https://www.tad.org/content/data-download/PropertyData(Delimited).ZIP"
source_file = "/tmp/spark/PropertyData.txt"

if os.path.isfile(source_file):
    print("Using existing property file")
else:
    download_url = "/tmp/spark/"
    r = requests.get(zip_url)
    z = zipfile.ZipFile(io.BytesIO(r.content))
    z.extractall(download_url)

df = spark.read.csv(source_file, sep="|", header=True, inferSchema=True)
df = df.withColumn('id', monotonically_increasing_id())
df = df[['id'] + df.columns[:-1]]   # move id column to front
for name, dtype in df.dtypes:       # trim all string columns
    if dtype == "string":
        df = df.withColumn(name, trim(col(name)))

# drop columns we don't need for this
drop_cols = ['Sequence_No', 'Record_Type', 'PIDN', 'Owner_Name', 'Owner_Address', 'Owner_CityState', 'Owner_Zip4',
             'Owner_CRRT', 'Situs_Address', 'TAD_Map', 'MAPSCO', 'Exemption_Code', 'State_Use_Code', 'LegalDescription',
             'Notice_Date', 'Deed_Date', 'Deed_Book', 'Appraisal_Date', 'Deed_Page', 'ARB_Indicator', 'From_Accts',
             'GIS_Link', 'Instrument_No', 'Overlap_Flag']
df = df.drop(*drop_cols)

# optionally put the data into a view for SQL queries
df.createOrReplaceTempView("data")
spark.sql('select * from data limit 10')

Using existing property file


                                                                                

id,RP,Appraisal_Year,Account_Num,Record_Type,Owner_Zip,Property_Class,State_Use_Code,County,City,School,Num_Special_Dist,Spec1,Spec2,Spec3,Spec4,Spec5,Land_Value,Improvement_Value,Total_Value,Garage_Capacity,Num_Bedrooms,Num_Bathrooms,Year_Built,Living_Area,Swimming_Pool_Ind,Ag_Code,Land_Acres,Land_SqFt,Ag_Acres,Ag_Value,Central_Heat_Ind,Central_Air_Ind,Structure_Count,Appraised_Value
0,C,2023,51,AAAA,76102.0,C1C,C1,220,26,905,4,223,224,225,0,601,450000.0,0.0,450000.0,0.0,0,0,0.0,0.0,,,0.1147,5000.0,0.0,0.0,N,N,0.0,450000.0
1,C,2023,78,AAAA,76102.0,C1C,C1,220,26,905,4,223,224,225,0,601,450000.0,0.0,450000.0,0.0,0,0,0.0,0.0,,,0.1147,5000.0,0.0,0.0,N,N,0.0,450000.0
2,C,2023,86,AAAA,76102.0,C1C,C1,220,26,905,4,223,224,225,0,601,675000.0,0.0,675000.0,0.0,0,0,0.0,0.0,,,0.1721,7500.0,0.0,0.0,N,N,0.0,675000.0
3,C,2023,94,AAAA,76102.0,C1C,C1,220,26,905,4,223,224,225,0,601,225000.0,0.0,225000.0,0.0,0,0,0.0,0.0,,,0.0573,2500.0,0.0,0.0,N,N,0.0,225000.0
4,C,2023,108,AAAA,76102.0,C1C,C1,220,26,905,4,223,224,225,0,601,1080000.0,0.0,1080000.0,0.0,0,0,0.0,0.0,,,0.2754,12000.0,0.0,0.0,N,N,0.0,1080000.0
5,C,2023,116,AAAA,76102.0,F1,F1,220,26,905,4,223,224,225,0,601,0.0,0.0,0.0,0.0,0,0,0.0,0.0,,,0.2295,10000.0,0.0,0.0,N,N,0.0,0.0
6,C,2023,124,AAAA,76102.0,F1,F1,220,26,905,4,223,224,225,0,601,0.0,0.0,0.0,0.0,0,0,0.0,0.0,,,0.0688,3000.0,0.0,0.0,N,N,0.0,0.0
7,C,2023,132,AAAA,,F1,F1,220,26,905,4,223,224,225,0,601,1350000.0,2526017.0,3876017.0,0.0,0,0,1917.0,72779.0,,,0.3443,15000.0,0.0,0.0,N,N,0.0,3876017.0
8,C,2023,140,AAAA,76102.0,C2C,C2,220,26,905,4,223,224,225,0,601,630090.0,1000.0,631090.0,0.0,0,0,0.0,0.0,,,0.1607,7001.0,0.0,0.0,N,N,0.0,631090.0
9,C,2023,159,AAAA,76102.0,F1,F1,220,26,905,4,223,224,225,0,601,522000.0,199000.0,721000.0,0.0,0,0,1920.0,12400.0,,,0.1331,5800.0,0.0,0.0,N,N,0.0,721000.0


In [23]:
spark.sql('select Property_Class, State_Use_Code  from data group by 1, 2 limit 20')

                                                                                

Property_Class,State_Use_Code
J3,J3
F1,F1
EC,EC
AC,AC
C1C,C1
BC,BC
G1,G1
J2,J2
J4,J4
J6,J6


### Start the training.
First, use the numeric values.

In [18]:
train, test = df.randomSplit([0.7, 0.3])
numerical_features = ['Appraisal_Year', 'County', 'City', 'School', 'Num_Special_Dist', 'Spec1', 'Spec2',
                      'Spec3', 'Spec4', 'Spec5', 'Land_Value', 'Improvement_Value', 'Total_Value',
                      'Garage_Capacity', 'Num_Bedrooms', 'Num_Bathrooms', 'Year_Built', 'Living_Area',
                      'Land_Acres', 'Land_SqFt', 'Ag_Acres', 'Ag_Value', 'Structure_Count']

from pyspark.ml.feature import Imputer, VectorAssembler, StandardScaler

imputer = Imputer(inputCols=numerical_features, outputCols=numerical_features)
imputer = imputer.fit(train)
train = imputer.transform(train)
test = imputer.transform(test)

numerical_VA = VectorAssembler(inputCols=numerical_features, outputCol='numerical_vector')
train = numerical_VA.transform(train)
test = numerical_VA.transform(test)

scaler = StandardScaler(inputCol='numerical_vector', outputCol='scaled_vector', withStd=True, withMean=True)
scaler = scaler.fit(train)
train = scaler.transform(train)
test = scaler.transform(test)

                                                                                

Second, use indexed categorical (string) values

In [19]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder
pool_catidx = StringIndexer(inputCol='Swimming_Pool_Ind', outputCol='Swimming_Pool_Ind_catidx')
pool_catidx = pool_catidx.fit(train)
train = pool_catidx.transform(train)
test = pool_catidx.transform(test)
# to-do: add other categories as needed


                                                                                

In [20]:
ohe = OneHotEncoder(inputCol='Swimming_Pool_Ind_catidx', outputCol='Swimming_Pool_Ind_one_hot')
ohe = ohe.fit(train)
# train = ohe.transform(train)
# test = ohe.transform(test)

IllegalArgumentException: requirement failed: Cannot have an empty string for name.