In [1]:
from pyspark.ml.image import ImageSchema
from pyspark.sql import SparkSession, functions, types


In [2]:
sparkTrain = SparkSession \
    .builder \
    .appName("Crime DB") \
    .config("spark.mongodb.input.uri", "mongodb://127.0.0.1/CrimDB.Data") \
    .config("spark.mongodb.output.uri", "mongodb://127.0.0.1/CrimDB.Data") \
    .getOrCreate()

In [8]:
import sys
assert sys.version_info >= (3, 5) # make sure we have Python 3.5+
import pandas as pd,timeit as tm, pickle as pk, csv, sys

from pyspark.sql import SparkSession, types
spark = SparkSession.builder.appName('tmax model tester').getOrCreate()
assert spark.version >= '2.3' # make sure we have Spark 2.3+
spark.sparkContext.setLogLevel('WARN')

from pyspark.ml import PipelineModel
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.feature import StringIndexer, VectorAssembler, SQLTransformer
from pyspark.ml import Pipeline
from pyspark.sql import functions 
from math import radians, cos, sin, asin, sqrt
from pyspark.sql.types import *
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, col

crime_schema = types.StructType([
    types.StructField('TYPE', types.StringType()),
    types.StructField('YEAR', types.DoubleType()),
    types.StructField('MONTH', types.DoubleType()),
    types.StructField('DAY', types.DoubleType()),
    types.StructField('HOUR', types.DoubleType()),
    types.StructField('MINUTE', types.DoubleType()),
    types.StructField('HUNDRED_BLOCK', types.StringType()),
    types.StructField('NEIGHBOURHOOD', types.StringType()),
    types.StructField('LATITUDE', types.DoubleType()),
    types.StructField('LONGITUDE', types.DoubleType()),
])

gff_schema = types.StructType([
    types.StructField('COUNT', types.DoubleType()),
    types.StructField('LATITUDE', types.DoubleType()),
    types.StructField('LONGITUDE', types.DoubleType()),
])

hl_schema = types.StructType([
    types.StructField('FACILITY', types.StringType()),
    types.StructField('LAT', types.DoubleType()),
    types.StructField('LONG', types.DoubleType()),
    types.StructField('CATEGORY', types.StringType()),
    types.StructField('PHONE', types.StringType()),
    types.StructField('MEALS', types.StringType()),
    types.StructField('PETS', types.StringType()),
    types.StructField('CARTS', types.StringType()),
])

st_schema = types.StructType([
    types.StructField('STATION', types.StringType()),
    types.StructField('LAT', types.DoubleType()),
    types.StructField('LONG', types.DoubleType()),
])
sl_schema = types.StructType([
    types.StructField('NODE_NUMBER', types.DoubleType()),
    types.StructField('LAT', types.DoubleType()),
    types.StructField('LONG', types.DoubleType()),
    types.StructField('BLOCK_NUMBER', types.DoubleType()),
])

sc_schema = types.StructType([
    types.StructField('SCHOOL_NAME', types.StringType()),
    types.StructField('LATITUDE', types.DoubleType()),
    types.StructField('LONGITUDE', types.DoubleType()),
    types.StructField('ADDRESS', types.StringType()),
    types.StructField('SCHOOL_CATEGORY', types.StringType()),
])
def get_distance(longit_a, latit_a, longit_b, latit_b):

    # Transform to radians

    longit_a, latit_a, longit_b, latit_b = map(radians, [longit_a,  latit_a, longit_b, latit_b])

    dist_longit = longit_b -longit_a

    dist_latit = latit_b - latit_a

    # Calculate area

    area = sin(dist_latit/2)**2 + cos(latit_a) * sin(dist_longit/2)**2

    # Calculate the central angle

    central_angle = 2 * asin(sqrt(area))

    radius = 6371

    # Calculate Distance

    distance = central_angle * radius

    return abs(round(distance, 2))


udf_get_distance = functions.udf(get_distance)
def main(inputs):
    
    # get the data
    crime_df = spark.read.csv(inputs, schema = crime_schema)
    crime_df = crime_df.filter(crime_df.NEIGHBOURHOOD.isNotNull())
    crime_df.show()
    
    
    #graffiti = spark.read.csv('/Users/jaideepmishra/downloads/Big_Data_Project/data/graffiti/graffiti.csv', schema = gff_schema)
    #graffiti = graffiti.filter(graffiti.COUNT.isNotNull())
    
    homeless = spark.read.csv('/Users/jaideepmishra/downloads/Big_Data_Project/data/homeless_shelters/doc.csv', schema = hl_schema)
    homeless = homeless.filter(homeless.FACILITY.isNotNull())
    homeless = homeless.select("FACILITY","LAT", "LONG")
    #homeless.show()
    #homeless.write.format("com.mongodb.spark.sql.DefaultSource").mode("overwrite").save()
    #print('written')
    
    skystation  = spark.read.csv('/Users/jaideepmishra/downloads/Big_Data_Project/data/skytrain_stations/rapid_transit_stations.csv', schema = st_schema)
    skystation = skystation.filter(skystation.STATION.isNotNull())
    
    schools = spark.read.csv('/Users/jaideepmishra/downloads/Big_Data_Project/data/schools/schools.csv', schema = sc_schema)
    schools = schools.filter(schools.SCHOOL_NAME.isNotNull())
    schools = schools.select("SCHOOL_NAME","LATITUDE","LONGITUDE")

    # cross join starts
    
    nz_station_pairs = (crime_df.crossJoin(homeless).toDF("TYPE", "YEAR", "MONTH","DAY","HOUR","MINUTE","HUNDRED_BLOCK","NEIGHBOURHOOD", "LATITUDE_A", "LONGITUDE_A", "FACILITY", "LATITUDE_B", "LONGITUDE_B"))
    #nz_station_pairs.show()
    
    nz_pairs_distance = nz_station_pairs.withColumn("ABS_DISTANCE", udf_get_distance(nz_station_pairs.LONGITUDE_A, nz_station_pairs.LATITUDE_A, nz_station_pairs.LONGITUDE_B, nz_station_pairs.LATITUDE_B))
    nz_pairs_distance = nz_pairs_distance.select("TYPE","YEAR", "MONTH"," DAY"," HOUR"," MINUTE"," HUNDRED_BLOCK","NEIGHBOURHOOD","LATITUDE_A", "LONGITUDE_A", "ABS_DISTANCE", "FACILITY")
    nz_pairs_distance.show(100)
    
    window = Window.partitionBy(nz_pairs_distance['TYPE'], nz_pairs_distance['YEAR'],\
    nz_pairs_distance['MONTH'], nz_pairs_distance['LATITUDE_A'],\
    nz_pairs_distance['LONGITUDE_A']).orderBy(nz_pairs_distance['ABS_DISTANCE'].asc())
    
    nz_pairs_distance = nz_pairs_distance.select('*', rank().over(window).alias('rank')).filter(col('rank') <= 1)
    nz_pairs_distance = nz_pairs_distance.select("TYPE","YEAR", "MONTH", " DAY"," HOUR"," MINUTE"," HUNDRED_BLOCK","NEIGHBOURHOOD","LATITUDE_A", "LONGITUDE_A", "FACILITY")
    nz_pairs_distance.show(100)
    print('done_1')
    
    # cross join stations on crime_df
    nz_station_pairs_2 = (nz_pairs_distance.crossJoin(skystation).toDF("TYPE", "YEAR", "MONTH"," DAY"," HOUR"," MINUTE"," HUNDRED_BLOCK","NEIGHBOURHOOD", "LATITUDE_A", "LONGITUDE_A", "FACILITY", "STATION", "LATITUDE_B", "LONGITUDE_B"))

    nz_pairs_distance_2 = nz_station_pairs_2.withColumn("ABS_DISTANCE", udf_get_distance(nz_station_pairs_2.LONGITUDE_A, nz_station_pairs_2.LATITUDE_A, nz_station_pairs_2.LONGITUDE_B, nz_station_pairs_2.LATITUDE_B))
    nz_pairs_distance_2 = nz_pairs_distance_2.select("TYPE","YEAR", "MONTH"," DAY"," HOUR"," MINUTE"," HUNDRED_BLOCK","NEIGHBOURHOOD","LATITUDE_A", "LONGITUDE_A", "ABS_DISTANCE", "FACILITY","STATION")
    nz_pairs_distance_2.show(100)
    
    window = Window.partitionBy(nz_pairs_distance_2['TYPE'], nz_pairs_distance_2['YEAR'],\
    nz_pairs_distance_2['MONTH'], nz_pairs_distance_2['LATITUDE_A'],\
    nz_pairs_distance_2['LONGITUDE_A']).orderBy(nz_pairs_distance_2['ABS_DISTANCE'].asc())
    
    nz_pairs_distance_2 = nz_pairs_distance_2.select('*', rank().over(window).alias('rank')).filter(col('rank') <= 1)
    nz_pairs_distance_2 = nz_pairs_distance_2.select("TYPE","YEAR", "MONTH"," DAY"," HOUR"," MINUTE"," HUNDRED_BLOCK","NEIGHBOURHOOD", "LATITUDE_A", "LONGITUDE_A", "FACILITY","STATION")
    nz_pairs_distance_2.show(100)
    print('done_2')
    
    # cross join schools on crime_df
    
    nz_station_pairs_3 = (nz_pairs_distance_2.crossJoin(schools).toDF("TYPE", "YEAR", "MONTH"," DAY"," HOUR"," MINUTE"," HUNDRED_BLOCK","NEIGHBOURHOOD", "LATITUDE_A", "LONGITUDE_A", "FACILITY", "STATION", "SCHOOL_NAME", "LATITUDE_B", "LONGITUDE_B"))

    nz_pairs_distance_3 = nz_station_pairs_3.withColumn("ABS_DISTANCE", udf_get_distance(nz_station_pairs_3.LONGITUDE_A, nz_station_pairs_3.LATITUDE_A, nz_station_pairs_3.LONGITUDE_B, nz_station_pairs_3.LATITUDE_B))
    nz_pairs_distance_3 = nz_pairs_distance_3.select("TYPE","YEAR", "MONTH"," DAY"," HOUR"," MINUTE"," HUNDRED_BLOCK","NEIGHBOURHOOD","LATITUDE_A", "LONGITUDE_A", "ABS_DISTANCE", "FACILITY","STATION","SCHOOL_NAME")
    nz_pairs_distance_3.show(100)
    
    window = Window.partitionBy(nz_pairs_distance_3['TYPE'], nz_pairs_distance_3['YEAR'],\
    nz_pairs_distance_3['MONTH'], nz_pairs_distance_3['LATITUDE_A'],\
    nz_pairs_distance_3['LONGITUDE_A']).orderBy(nz_pairs_distance_3['ABS_DISTANCE'].asc())
    
    nz_pairs_distance_3 = nz_pairs_distance_3.select('*', rank().over(window).alias('rank')).filter(col('rank') <= 1)
    nz_pairs_distance_3 = nz_pairs_distance_3.select("TYPE","YEAR", "MONTH"," DAY"," HOUR"," MINUTE"," HUNDRED_BLOCK","NEIGHBOURHOOD", "LATITUDE_A", "LONGITUDE_A", "FACILITY","STATION", "SCHOOL_NAME")
    nz_pairs_distance_3.show(100)
    print('done_2 stop')
    
    
    nz_pairs_distance_3.write.format("com.mongodb.spark.sql.DefaultSource").mode("overwrite").save()
    #nz_pairs_distance_3.coalesce(1).write.csv('/Users/jaideepmishra/downloads/Big_Data_Project/data/output/output.csv')

if __name__ == '__main__':
    #model_file = sys.argv[1]
    inputs = '/Users/jaideepmishra/downloads/Big_Data_Project/data/crime_03_15/crime_latlong.csv'
    main(inputs)

+--------------------+------+-----+----+----+------+--------------------+--------------------+------------------+-------------------+
|                TYPE|  YEAR|MONTH| DAY|HOUR|MINUTE|       HUNDRED_BLOCK|       NEIGHBOURHOOD|          LATITUDE|          LONGITUDE|
+--------------------+------+-----+----+----+------+--------------------+--------------------+------------------+-------------------+
|            Mischief|2003.0|  6.0|14.0|21.0|   0.0|       6X E 52ND AVE|              Sunset| 49.22285547453633|-123.10457767461014|
|    Theft of Vehicle|2003.0|  9.0|14.0|21.0|   0.0|     71XX NANAIMO ST| Victoria-Fraserview| 49.21942208176436|-123.05928356709362|
|Break and Enter C...|2003.0|  7.0|16.0| 5.0|   0.0|     1XX E PENDER ST|Central Business ...|49.280454355702865|-123.10100566349294|
|            Mischief|2003.0|  9.0|24.0|20.0|   0.0|       9XX CHILCO ST|            West End| 49.29261448054877|-123.13962081805273|
|            Mischief|2003.0|  6.0|30.0|22.0|   0.0|       9XX

+--------------------+------+-----+----+-----+-------+--------------------+--------------------+------------------+-------------------+--------------------+
|                TYPE|  YEAR|MONTH| DAY| HOUR| MINUTE|       HUNDRED_BLOCK|       NEIGHBOURHOOD|        LATITUDE_A|        LONGITUDE_A|            FACILITY|
+--------------------+------+-----+----+-----+-------+--------------------+--------------------+------------------+-------------------+--------------------+
|Break and Enter C...|2003.0|  1.0|31.0| 20.0|    0.0|      6XX W 41ST AVE|            Oakridge| 49.23352408125696|-123.11850363836272| Tenth Avenue Church|
|Break and Enter C...|2003.0|  1.0|16.0| 17.0|   30.0|      6XX W 41ST AVE|            Oakridge| 49.23352408125696|-123.11850363836272| Tenth Avenue Church|
|Break and Enter C...|2003.0|  4.0|14.0|  5.0|   35.0|        64XX MAIN ST|              Sunset| 49.22651352431312|-123.10151371169188| Tenth Avenue Church|
|Break and Enter C...|2003.0|  5.0|17.0| 22.0|    0.0|    

+--------------------+------+-----+----+-----+-------+---------------+-------------------+-----------------+-------------------+------------+--------------------+--------------------+
|                TYPE|  YEAR|MONTH| DAY| HOUR| MINUTE|  HUNDRED_BLOCK|      NEIGHBOURHOOD|       LATITUDE_A|        LONGITUDE_A|ABS_DISTANCE|            FACILITY|             STATION|
+--------------------+------+-----+----+-----+-------+---------------+-------------------+-----------------+-------------------+------------+--------------------+--------------------+
|Break and Enter C...|2003.0|  1.0|31.0| 20.0|    0.0| 6XX W 41ST AVE|           Oakridge|49.23352408125696|-123.11850363836272|        5.87| Tenth Avenue Church|          WATERFRONT|
|Break and Enter C...|2003.0|  1.0|31.0| 20.0|    0.0| 6XX W 41ST AVE|           Oakridge|49.23352408125696|-123.11850363836272|        5.82| Tenth Avenue Church|             BURRARD|
|Break and Enter C...|2003.0|  1.0|31.0| 20.0|    0.0| 6XX W 41ST AVE|          

+--------------------+------+-----+----+-----+-------+--------------------+--------------------+------------------+-------------------+--------------------+--------------------+
|                TYPE|  YEAR|MONTH| DAY| HOUR| MINUTE|       HUNDRED_BLOCK|       NEIGHBOURHOOD|        LATITUDE_A|        LONGITUDE_A|            FACILITY|             STATION|
+--------------------+------+-----+----+-----+-------+--------------------+--------------------+------------------+-------------------+--------------------+--------------------+
|Break and Enter C...|2003.0|  1.0|31.0| 20.0|    0.0|      6XX W 41ST AVE|            Oakridge| 49.23352408125696|-123.11850363836272| Tenth Avenue Church|OAKRIDGE-\n41st. ...|
|Break and Enter C...|2003.0|  1.0|16.0| 17.0|   30.0|      6XX W 41ST AVE|            Oakridge| 49.23352408125696|-123.11850363836272| Tenth Avenue Church|OAKRIDGE-\n41st. ...|
|Break and Enter C...|2003.0|  4.0|14.0|  5.0|   35.0|        64XX MAIN ST|              Sunset| 49.2265135243

+--------------------+------+-----+----+-----+-------+---------------+-------------+-----------------+-------------------+------------+-------------------+--------------------+--------------------+
|                TYPE|  YEAR|MONTH| DAY| HOUR| MINUTE|  HUNDRED_BLOCK|NEIGHBOURHOOD|       LATITUDE_A|        LONGITUDE_A|ABS_DISTANCE|           FACILITY|             STATION|         SCHOOL_NAME|
+--------------------+------+-----+----+-----+-------+---------------+-------------+-----------------+-------------------+------------+-------------------+--------------------+--------------------+
|Break and Enter C...|2003.0|  1.0|31.0| 20.0|    0.0| 6XX W 41ST AVE|     Oakridge|49.23352408125696|-123.11850363836272|        6.07|Tenth Avenue Church|OAKRIDGE-\n41st. ...|Admiral Seymour E...|
|Break and Enter C...|2003.0|  1.0|31.0| 20.0|    0.0| 6XX W 41ST AVE|     Oakridge|49.23352408125696|-123.11850363836272|        6.07|Tenth Avenue Church|OAKRIDGE-\n41st. ...|Admiral Seymour S...|
|Break and

+--------------------+------+-----+----+-----+-------+--------------------+--------------------+------------------+-------------------+--------------------+--------------------+--------------------+
|                TYPE|  YEAR|MONTH| DAY| HOUR| MINUTE|       HUNDRED_BLOCK|       NEIGHBOURHOOD|        LATITUDE_A|        LONGITUDE_A|            FACILITY|             STATION|         SCHOOL_NAME|
+--------------------+------+-----+----+-----+-------+--------------------+--------------------+------------------+-------------------+--------------------+--------------------+--------------------+
|Break and Enter C...|2003.0|  1.0|31.0| 20.0|    0.0|      6XX W 41ST AVE|            Oakridge| 49.23352408125696|-123.11850363836272| Tenth Avenue Church|OAKRIDGE-\n41st. ...|King David High S...|
|Break and Enter C...|2003.0|  1.0|16.0| 17.0|   30.0|      6XX W 41ST AVE|            Oakridge| 49.23352408125696|-123.11850363836272| Tenth Avenue Church|OAKRIDGE-\n41st. ...|King David High S...|
|Brea

In [9]:
data = sparkTrain.read.format("com.mongodb.spark.sql.DefaultSource").load()

In [10]:
data.printSchema()

root
 |--  DAY: double (nullable = true)
 |--  HOUR: double (nullable = true)
 |--  HUNDRED_BLOCK: string (nullable = true)
 |--  MINUTE: double (nullable = true)
 |-- FACILITY: string (nullable = true)
 |-- LATITUDE_A: double (nullable = true)
 |-- LONGITUDE_A: double (nullable = true)
 |-- MONTH: double (nullable = true)
 |-- NEIGHBOURHOOD: string (nullable = true)
 |-- SCHOOL_NAME: string (nullable = true)
 |-- STATION: string (nullable = true)
 |-- TYPE: string (nullable = true)
 |-- YEAR: double (nullable = true)
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)



In [11]:
data.count()

673413

In [12]:
data.show(10)

+----+-----+-----------------+-------+--------------------+------------------+-------------------+-----+-------------------+--------------------+--------------------+--------------------+------+--------------------+
| DAY| HOUR|    HUNDRED_BLOCK| MINUTE|            FACILITY|        LATITUDE_A|        LONGITUDE_A|MONTH|      NEIGHBOURHOOD|         SCHOOL_NAME|             STATION|                TYPE|  YEAR|                 _id|
+----+-----+-----------------+-------+--------------------+------------------+-------------------+-----+-------------------+--------------------+--------------------+--------------------+------+--------------------+
|20.0|  0.0|    1XX W 6TH AVE|   30.0| Tenth Avenue Church| 49.26556956967855| -123.1071887799872|  4.0|     Mount Pleasant|St Patrick Region...|    OLYMPIC\nVILLAGE|Break and Enter C...|2003.0|[5c084820792d8035...|
|16.0|  3.0|  32XX E 22ND AVE|    0.0|Grandview Calvary...| 49.25064827629371|-123.03445849473383|  5.0|Renfrew-Collingwood|Renfrew Comm