In [1]:
from pyspark.ml.image import ImageSchema
from pyspark.sql import SparkSession, functions, types

In [2]:
# Spark session wired to the MongoDB connector: the same collection
# (database "CrimDB", collection "TrainData" on the local mongod) is
# configured as both the read source and the write target.
sparkTrain = (
    SparkSession.builder
    .appName("Crime DB")
    .config("spark.mongodb.input.uri", "mongodb://127.0.0.1/CrimDB.TrainData")
    .config("spark.mongodb.output.uri", "mongodb://127.0.0.1/CrimDB.TrainData")
    .getOrCreate()
)

In [3]:
import sys
assert sys.version_info >= (3, 5) # make sure we have Python 3.5+

from pyspark.sql import SparkSession, functions, types
# NOTE: getOrCreate() returns the already-active session when one exists
# (e.g. the one built in the previous cell), in which case this handle
# refers to that same session rather than a fresh one.
spark = SparkSession.builder.appName('tmax model tester').getOrCreate()
# Compare (major, minor) numerically: a plain string comparison such as
# '10.0' >= '2.3' is lexicographic and would wrongly fail.
assert tuple(int(part) for part in spark.version.split('.')[:2]) >= (2, 3) # make sure we have Spark 2.3+
spark.sparkContext.setLogLevel('WARN')

from pyspark.ml import PipelineModel
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import GBTRegressor,RandomForestRegressor
from pyspark.ml.feature import StringIndexer, VectorAssembler, SQLTransformer
from pyspark.ml import Pipeline



def main(seed=72):
    """Fit and score a random-forest model of crime TYPE on the MongoDB data.

    Reads the collection configured on the module-level ``sparkTrain``
    session, prints the first 10 rows, splits the data 90/10 into
    train/validation sets, fits a pipeline (pass-through SQL stage, feature
    assembler, string indexers, ``RandomForestRegressor``) on the training
    set and prints the r2 and rmse of the validation predictions.

    Args:
        seed: Seed applied to both the train/validation split and the
            forest so that repeated runs report comparable metrics.
            Defaults to 72, the seed the forest was already using.
    """
    data = sparkTrain.read.format("com.mongodb.spark.sql.DefaultSource").load()
    data.show(10)

    # Seed the split: an unseeded randomSplit yields a different partition
    # (and therefore different r2/rmse) on every run.
    train, validation = data.randomSplit([0.90, 0.10], seed=seed)
    train = train.cache()
    validation = validation.cache()

    # Pass-through stage; kept as a hook for later column selection/filtering.
    sql_trans = SQLTransformer(statement="SELECT  * FROM __THIS__")

    # The leading spaces in some names are deliberate: they appear to match
    # the column names as loaded from MongoDB (see the data.show() header).
    crime_assembler = VectorAssembler(
        inputCols=["YEAR", " HOUR", "MONTH", " DAY", " MINUTE", "LATITUDE_A", "LONGITUDE_A"],
        outputCol="features",
    )

    def make_indexer(input_col, output_col, handle_invalid):
        """Build a frequency-ordered StringIndexer for one column."""
        return StringIndexer(
            inputCol=input_col,
            outputCol=output_col,
            handleInvalid=handle_invalid,
            stringOrderType="frequencyDesc",
        )

    # The label must be known at training time, so unseen TYPE values are
    # still treated as an error.
    type_indexer = make_indexer("TYPE", "TYPEINDEX", "error")

    # These indexed columns are neither the label nor part of "features".
    # With a random split, the validation set can contain values that never
    # occurred in training; "keep" maps those to an extra bucket instead of
    # raising when the column is materialised (e.g. by predictions.show()).
    auxiliary_indexers = [
        make_indexer(input_col, output_col, "keep")
        for input_col, output_col in (
            ("NEIGHBOURHOOD", "NBINDEX"),
            ("FACILITY", "FCINDEX"),
            ("STATION", "STINDEX"),
            (" HUNDRED_BLOCK", "BLINDEX"),
        )
    ]

    # NOTE(review): TYPEINDEX is a frequency-ordered category index, so a
    # regressor (and r2/rmse) treats crime types as points on a numeric
    # scale. Consider RandomForestClassifier with a classification
    # evaluator if the goal is to predict the category itself.
    forest = RandomForestRegressor(
        featuresCol='features',
        labelCol='TYPEINDEX',
        predictionCol='prediction',
        numTrees=2,
        maxDepth=20,
        seed=seed,
    )

    crime_pipeline = Pipeline(
        stages=[sql_trans, crime_assembler, type_indexer] + auxiliary_indexers + [forest]
    )
    print("before fit")
    crime_model = crime_pipeline.fit(train)
    print("before transform")

    # use the model to make predictions
    predictions = crime_model.transform(validation)

    # evaluate the predictions
    r2_evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='TYPEINDEX', metricName='r2')
    r2 = r2_evaluator.evaluate(predictions)

    rmse_evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='TYPEINDEX', metricName='rmse')
    rmse = rmse_evaluator.evaluate(predictions)
    print('r2 =', r2)
    print('rmse =', rmse)

    


# Entry point: run the training/evaluation routine when this code is
# executed directly (not when it is imported as a module).
if __name__ == "__main__":
    main()


+----+-----+-------------------+-------+--------------------+------------------+-------------------+-----+-------------------+--------------------+--------------------+--------------------+------+--------------------+
| DAY| HOUR|      HUNDRED_BLOCK| MINUTE|            FACILITY|        LATITUDE_A|        LONGITUDE_A|MONTH|      NEIGHBOURHOOD|         SCHOOL_NAME|             STATION|                TYPE|  YEAR|                 _id|
+----+-----+-------------------+-------+--------------------+------------------+-------------------+-----+-------------------+--------------------+--------------------+--------------------+------+--------------------+
|16.0|  3.0|    32XX E 22ND AVE|    0.0|Grandview Calvary...| 49.25064827629371|-123.03445849473383|  5.0|Renfrew-Collingwood|Renfrew Community...|              RUPERT|Break and Enter C...|2003.0|[5c084820792d8035...|
|21.0|  3.0|       11XX BUTE ST|    0.0|Dusk to Dawn-Dire...| 49.28225755775164|-123.13272877223083|  5.0|           West End|  