## Part B

In [1]:
import os
from datetime import datetime

# Set before any SparkContext is created so the MongoDB connector package is requested at launch.
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.mongodb.spark:mongo-spark-connector_2.11:2.4.0 pyspark-shell'

import matplotlib.pyplot as plt
from pyspark import SparkContext # Spark
from pyspark.sql import SparkSession # Spark SQL
from pyspark.sql import SQLContext
from pyspark.sql.functions import col, udf
from pyspark.sql.functions import year, month, dayofmonth, to_date
from pyspark.sql.types import DateType




### Step 1

In [2]:
sc = SparkContext.getOrCreate() # reuse the active SparkContext, or start one if none exists yet

In [3]:
# Build (or fetch) the SparkSession configured for the MongoDB connector.
# SparkContext.getOrCreate() always returns a context, so the former
# `if sc is None` fallback could never run and has been removed. The builder
# picks up the already-running context on its own, so there is no need to
# construct a throwaway SparkSession just to reach `.builder`.
# NOTE(review): both URIs point at `fit5202_db.wk05_titanic_coll`, yet this
# notebook stores crime data there and the Step 3 write uses overwrite mode —
# confirm this collection is really the intended target.
spark = (
    SparkSession.builder
    .appName("MongoDB and Apache Spark Data Visualization")
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.11:2.4.0")
    .config("spark.mongodb.input.uri", "mongodb://127.0.0.1/fit5202_db.wk05_titanic_coll")
    .config("spark.mongodb.output.uri", "mongodb://127.0.0.1/fit5202_db.wk05_titanic_coll")
    .getOrCreate()
)

### Step 2

In [4]:
# Load the crime statistics CSV: the first row supplies column names and
# column types are inferred from the data.
crime_df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv('Crime_Statistics_SA_2010_present.csv')
)


In [5]:
# Peek at the first five rows (returned as a list of Row objects).
crime_df.take(5)

[Row(Reported Date='1/07/2010', Suburb - Incident='ADELAIDE', Postcode - Incident='5000', Offence Level 1 Description='OFFENCES AGAINST PROPERTY', Offence Level 2 Description='FRAUD DECEPTION AND RELATED OFFENCES', Offence Level 3 Description='Obtain benefit by deception', Offence Count=2),
 Row(Reported Date='1/07/2010', Suburb - Incident='ADELAIDE', Postcode - Incident='5000', Offence Level 1 Description='OFFENCES AGAINST PROPERTY', Offence Level 2 Description='PROPERTY DAMAGE AND ENVIRONMENTAL', Offence Level 3 Description='Other property damage and environmental', Offence Count=2),
 Row(Reported Date='1/07/2010', Suburb - Incident='ADELAIDE', Postcode - Incident='5000', Offence Level 1 Description='OFFENCES AGAINST PROPERTY', Offence Level 2 Description='SERIOUS CRIMINAL TRESPASS', Offence Level 3 Description='SCT - Non Residence', Offence Count=1),
 Row(Reported Date='1/07/2010', Suburb - Incident='ADELAIDE', Postcode - Incident='5000', Offence Level 1 Description='OFFENCES AGAINS

### Step 3

In [6]:
# Persist the crime data to MongoDB through the connector's data source;
# "overwrite" mode replaces whatever the target collection held before.
(
    crime_df.write
    .format("com.mongodb.spark.sql.DefaultSource")
    .mode("overwrite")
    .save()
)

In [7]:
# Print a tabular preview of the data (long string values are truncated in the output).
crime_df.show()

+-------------+-----------------+-------------------+---------------------------+---------------------------+---------------------------+-------------+
|Reported Date|Suburb - Incident|Postcode - Incident|Offence Level 1 Description|Offence Level 2 Description|Offence Level 3 Description|Offence Count|
+-------------+-----------------+-------------------+---------------------------+---------------------------+---------------------------+-------------+
|    1/07/2010|         ADELAIDE|               5000|       OFFENCES AGAINST ...|       FRAUD DECEPTION A...|       Obtain benefit by...|            2|
|    1/07/2010|         ADELAIDE|               5000|       OFFENCES AGAINST ...|       PROPERTY DAMAGE A...|       Other property da...|            2|
|    1/07/2010|         ADELAIDE|               5000|       OFFENCES AGAINST ...|       SERIOUS CRIMINAL ...|        SCT - Non Residence|            1|
|    1/07/2010|         ADELAIDE|               5000|       OFFENCES AGAINST ...|       

### Step 4

In [8]:
# Read the stored collection back from MongoDB into a new DataFrame.
mongo_reader = spark.read.format("com.mongodb.spark.sql.DefaultSource")
crime_df_from_mongodb = mongo_reader.load()

In [9]:
# Show the column names and types of the data as read back from MongoDB.
crime_df_from_mongodb.printSchema()

root
 |-- Offence Count: integer (nullable = true)
 |-- Offence Level 1 Description: string (nullable = true)
 |-- Offence Level 2 Description: string (nullable = true)
 |-- Offence Level 3 Description: string (nullable = true)
 |-- Postcode - Incident: string (nullable = true)
 |-- Reported Date: string (nullable = true)
 |-- Suburb - Incident: string (nullable = true)
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)



In [10]:
# Count how many records share each ('Offence Count', 'Reported Date') pair,
# ordered by 'Reported Date' descending.
# NOTE(review): 'Reported Date' is a string column at this point, so the
# ordering is lexicographic rather than chronological — confirm that is acceptable.
offencecount = crime_df_from_mongodb.groupBy('Offence Count', 'Reported Date').count().sort('Reported Date', ascending=False)

### Step 5

In [11]:
# Preview the first 20 grouped rows. collect() brought every group to the
# driver and dumped the whole list into the notebook output; a bounded
# take() shows the same leading rows without that cost.
offencecount.take(20)

[Row(Offence Count=3, Reported Date='9/12/2018', count=6),
 Row(Offence Count=6, Reported Date='9/12/2018', count=1),
 Row(Offence Count=2, Reported Date='9/12/2018', count=25),
 Row(Offence Count=5, Reported Date='9/12/2018', count=1),
 Row(Offence Count=1, Reported Date='9/12/2018', count=180),
 Row(Offence Count=5, Reported Date='9/12/2016', count=1),
 Row(Offence Count=4, Reported Date='9/12/2016', count=2),
 Row(Offence Count=1, Reported Date='9/12/2016', count=216),
 Row(Offence Count=3, Reported Date='9/12/2016', count=7),
 Row(Offence Count=2, Reported Date='9/12/2016', count=28),
 Row(Offence Count=6, Reported Date='9/12/2015', count=1),
 Row(Offence Count=4, Reported Date='9/12/2015', count=2),
 Row(Offence Count=3, Reported Date='9/12/2015', count=4),
 Row(Offence Count=1, Reported Date='9/12/2015', count=235),
 Row(Offence Count=2, Reported Date='9/12/2015', count=27),
 Row(Offence Count=2, Reported Date='9/12/2014', count=21),
 Row(Offence Count=1, Reported Date='9/12/2014

In [12]:
# Narrow the data to the two columns that are summarised next.
summary_columns = ['Offence Count', 'Reported Date']
summarydf = crime_df_from_mongodb.select(summary_columns)

In [13]:
# Summary statistics (count, mean, stddev, min, max) for the two selected columns.
summarydf.describe().show() # describe

+-------+------------------+-------------+
|summary|     Offence Count|Reported Date|
+-------+------------------+-------------+
|  count|            727407|       727407|
|   mean|1.1715174585892079|         null|
| stddev|0.5787050930378118|         null|
|    min|                 1|    1/01/2011|
|    max|                28|    9/12/2018|
+-------+------------------+-------------+



### Step 6

In [14]:
# Drop rows that have no 'Reported Date'.
# Spark DataFrames are immutable: na.drop returns a NEW DataFrame, so the
# result has to be kept. The original call discarded it and removed nothing.
crime_df_with_dates = crime_df_from_mongodb.na.drop(subset=["Reported Date"])
crime_df_with_dates

DataFrame[Offence Count: int, Offence Level 1 Description: string, Offence Level 2 Description: string, Offence Level 3 Description: string, Postcode - Incident: string, Reported Date: string, Suburb - Incident: string, _id: struct<oid:string>]

In [15]:
# Parse the day/month/year 'Reported Date' strings (e.g. '1/07/2010') into a
# real date column named 'Modified Reported Date'.
# Spark's built-in to_date replaces the former Python UDF: the parsing is done
# by Spark itself instead of calling back into Python for every row.
# Rows without a reported date are filtered out first.
dated_crime_df = (
    crime_df_from_mongodb
    .filter(col('Reported Date').isNotNull())
    .withColumn('Modified Reported Date', to_date(col('Reported Date'), 'd/M/yyyy'))
)


In [None]:
# Preview a handful of converted rows. collect() would pull every record
# (the describe output above reports 727407 rows) into driver memory and
# flood the notebook output.
dated_crime_df.take(5)

### Step 7

In [None]:
# Aggregation pipeline with a single $group stage: sum 'Offence Count'
# for each level-2 offence description.
group_stage = {
    '$group': {
        '_id': {'Level2Offence': '$Offence Level 2 Description'},
        'count': {'$sum': '$Offence Count'},
    }
}
pipeline = [group_stage]


In [None]:
# Read from MongoDB with the aggregation pipeline supplied as a reader option.
level_2_offence_crime_df = (
    spark.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .option("pipeline", pipeline)
    .load()
)


In [None]:
# Number of rows returned by the aggregation (one per $group key).
level_2_offence_crime_df.count()

In [None]:
# Print the level-2 offence description held in each aggregated row's first column.
for grouped_row in level_2_offence_crime_df.toLocalIterator():
    group_key = grouped_row[0]
    print(group_key.Level2Offence)

In [None]:
# Number of records whose level-1 category is "OFFENCES AGAINST THE PERSON".
# NOTE(review): count() tallies rows; it does not add up the 'Offence Count'
# column — confirm that a row count is what is wanted here.
dated_crime_df.filter(dated_crime_df['Offence Level 1 Description'] == "OFFENCES AGAINST THE PERSON").count()

In [None]:
# Serious criminal trespass records where more than one offence was counted.
is_serious_trespass = col('Offence Level 2 Description') == "SERIOUS CRIMINAL TRESPASS"
has_multiple_offences = col('Offence Count') > 1
tresspasses_crime_df = dated_crime_df.filter(is_serious_trespass & has_multiple_offences)

In [None]:
# Number of serious-criminal-trespass records with an offence count above one.
tresspasses_crime_df.count()

In [None]:
# Percentage of all records whose level-1 category is "OFFENCES AGAINST PROPERTY".
is_property_offence = dated_crime_df['Offence Level 1 Description'] == "OFFENCES AGAINST PROPERTY"
property_rows = dated_crime_df.filter(is_property_offence).count()
all_rows = dated_crime_df.count()
(property_rows / all_rows) * 100

### Step 8

In [None]:
month = dated_crime_df.select(month('Modified Reported Date').alias('month'))


In [None]:
month.count()

In [None]:
month_count = month.groupBy('month').count().sort('month', ascending=False)

In [None]:
month_count.collect()

In [None]:
# Split the (month, count) rows into two parallel lists for plotting.
month_rows = list(month_count.toLocalIterator())
montharray = [month_row[0] for month_row in month_rows]
montharraycount = [month_row[1] for month_row in month_rows]


In [None]:
# Line chart of record counts per month, with a title and axis labels so the
# figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(montharray, montharraycount)
ax.set(title="Crime records per month", xlabel="Month of report", ylabel="Number of records")
plt.show()

In [None]:
year = dated_crime_df.select(year('Modified Reported Date').alias('year'))


In [None]:
year.take(5)

In [None]:
year.count()

In [None]:
year_count = year.groupBy('year').count().sort('year', ascending=True)

In [None]:
year_count.collect()

In [None]:
# Inspect the Python type of the aggregated result.
type(year_count)

In [None]:
# Split the (year, count) rows into two parallel lists for plotting.
year_rows = list(year_count.toLocalIterator())
yeararray = [year_row[0] for year_row in year_rows]
yeararraycount = [year_row[1] for year_row in year_rows]


In [None]:
# Line chart of record counts per year, with a title and axis labels so the
# figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(yeararray, yeararraycount)
ax.set(title="Crime records per year", xlabel="Year of report", ylabel="Number of records")
plt.show()