File: data-samples/tarrant-tx.csv  
Source: https://www.tad.org/data-reports

### Get the data from the website into a data-frame
1. The only clean-up we're doing here is making sure the string data is trimmed
2. This step is idempotent: once the file has been downloaded, re-running the cell reuses it. Delete /tmp/spark/PropertyData.txt to force a fresh download from the source.

In [None]:
import requests, zipfile, io, os
from pyspark.sql import SparkSession
from pyspark.sql.functions import monotonically_increasing_id, trim, col

# Load the Tarrant County property data into a Spark DataFrame, downloading
# and extracting the source archive only when no local copy exists yet.
spark = SparkSession.builder.appName("PropertyData").getOrCreate()
zip_url = "https://www.tad.org/content/data-download/PropertyData(Delimited).ZIP"
source_file = "/tmp/spark/PropertyData.txt"

if os.path.isfile(source_file):
    print("Using existing property file")
else:
    extract_dir = "/tmp/spark/"
    # A timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get(zip_url, timeout=60)
    # Fail with the HTTP error itself instead of handing an error page to
    # ZipFile, which would surface as a confusing BadZipFile.
    response.raise_for_status()
    with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
        archive.extractall(extract_dir)
    # Catch a renamed or missing archive member here rather than as a
    # "path does not exist" error from the Spark reader below.
    if not os.path.isfile(source_file):
        raise FileNotFoundError(
            f"{source_file} was not found after extracting {zip_url}"
        )

df = spark.read.csv(source_file, sep="|", header=True, inferSchema=True)
df = df.withColumn('id', monotonically_increasing_id())
ordered_columns = ['id'] + df.columns[:-1]   # move id column to front
string_columns = {name for name, dtype in df.dtypes if dtype == "string"}
# Reorder and trim in a single projection: calling withColumn once per string
# column in a loop builds one nested projection per column and bloats the plan.
df = df.select([
    trim(col(name)).alias(name) if name in string_columns else col(name)
    for name in ordered_columns
])

### Start the training.
First, prepare the numeric features: impute missing values, assemble them into a single vector, and standardize it.

In [None]:
# Split the data 70/30 and build a standardized numeric feature vector.
# The split is seeded so that re-running the notebook yields the same
# train/test partition (and therefore the same fitted statistics).
train, test = df.randomSplit([0.7, 0.3], seed=42)
numerical_features = ['Appraisal_Year', 'County', 'City', 'School', 'Num_Special_Dist', 'Spec1', 'Spec2',
                      'Spec3', 'Spec4', 'Land_Value', 'Improvement_Value', 'Total_Value', 'Garage_Capacity',
                      'Num_Bedrooms', 'Num_Bathrooms', 'Year_Built', 'Living_Area', 'Land_Acres', 'Land_SqFt',
                      'Ag_Acres', 'Ag_Value', 'Structure_Count',]

from pyspark.ml.feature import Imputer, VectorAssembler, StandardScaler

# Every estimator below is fitted on the training split only and then applied
# to both splits, so no statistics leak from the test data.

# Fill missing numeric values in place (output columns overwrite the inputs).
imputer = Imputer(inputCols=numerical_features, outputCols=numerical_features).fit(train)
train = imputer.transform(train)
test = imputer.transform(test)

# Pack the numeric columns into a single vector column.
numerical_VA = VectorAssembler(inputCols=numerical_features, outputCol='numerical_vector')
train = numerical_VA.transform(train)
test = numerical_VA.transform(test)

# Center and scale the assembled vector to zero mean and unit variance.
scaler = StandardScaler(
    inputCol='numerical_vector',
    outputCol='scaled_vector',
    withStd=True,
    withMean=True,
).fit(train)
train = scaler.transform(train)
test = scaler.transform(test)

Second, index the categorical (string) values so they can be used as features.

In [None]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder
# Fit the string-to-index mapping for the pool indicator on the training split,
# then apply that same fitted mapping to both splits.
pool_catidx = StringIndexer(
    inputCol='Swimming_Pool_Ind',
    outputCol='Swimming_Pool_Ind_catidx',
).fit(train)
train, test = pool_catidx.transform(train), pool_catidx.transform(test)
# to-do: add other categories as needed


In [None]:
# Inspect the raw pool indicator next to its indexed value. A select() with no
# columns returns a DataFrame with no columns, which shows nothing useful.
train.select('Swimming_Pool_Ind', 'Swimming_Pool_Ind_catidx').show(5)

In [None]:
# Fit the one-hot encoder on the indexed pool column of the training split.
# The fitted model is not applied yet: the transform calls below are disabled.
ohe = OneHotEncoder(
    inputCol='Swimming_Pool_Ind_catidx',
    outputCol='Swimming_Pool_Ind_one_hot',
).fit(train)
# train = ohe.transform(train)
# test = ohe.transform(test)