# Introduction to Google Collab

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Import Libraries

In [None]:
!pip install pyspark
!pip install findspark



In [None]:
import pandas as pd
import numpy as np

## Install Spark

In [None]:
!ls


drive  sample_data


In [None]:
import findspark
from pyspark import SparkContext
from pyspark.sql import SparkSession

findspark.init()

## RDD
sc = SparkContext.getOrCreate()
sc

## DataFrame
spark = SparkSession.builder.getOrCreate()
spark

## Load Data

In [None]:
from pyspark.sql.functions import to_timestamp, col, lit

rc = spark.read.csv('./chicago-crimes-2001-present.csv', header=True)
rc = rc.withColumn('Date', to_timestamp(col('Date'), 'MM/dd/yyyy hh:mm:ss a'))



In [None]:
rc.show(5)

+-------------+----------------+----+------------------+----+------------+-----------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------+
|           id|     case_number|Date|             block|iucr|primary_type|description|location_description|arrest|domestic|beat|district|ward|community_area|fbi_code|x_coordinate|y_coordinate|year|          updated_on|    latitude|    longitude|location|
+-------------+----------------+----+------------------+----+------------+-----------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------+
|     13637238|        JH476216|NULL|   041XX W 13TH ST|0460|     BATTERY|     SIMPLE|              STREET| false|   false|1011|     010|  24|            29|     08B|     1149173|     1893724|2024|2024-10-28T15:40:...|41.864304053|-87.

In [None]:
rc.printSchema()

root
 |-- id: string (nullable = true)
 |-- case_number: string (nullable = true)
 |-- Date: timestamp (nullable = true)
 |-- block: string (nullable = true)
 |-- iucr: string (nullable = true)
 |-- primary_type: string (nullable = true)
 |-- description: string (nullable = true)
 |-- location_description: string (nullable = true)
 |-- arrest: string (nullable = true)
 |-- domestic: string (nullable = true)
 |-- beat: string (nullable = true)
 |-- district: string (nullable = true)
 |-- ward: string (nullable = true)
 |-- community_area: string (nullable = true)
 |-- fbi_code: string (nullable = true)
 |-- x_coordinate: string (nullable = true)
 |-- y_coordinate: string (nullable = true)
 |-- year: string (nullable = true)
 |-- updated_on: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- location: string (nullable = true)



We can see that some of the column data type is not what we need it to be. So we,

### Explicitly Set Schema Data Types

In [None]:
rc.columns

['id',
 'case_number',
 'Date',
 'block',
 'iucr',
 'primary_type',
 'description',
 'location_description',
 'arrest',
 'domestic',
 'beat',
 'district',
 'ward',
 'community_area',
 'fbi_code',
 'x_coordinate',
 'y_coordinate',
 'year',
 'updated_on',
 'latitude',
 'longitude',
 'location']

In [None]:
from pyspark.sql.types import StructType, StructField, StringType, \
  TimestampType, BooleanType, DoubleType, IntegerType, DataType

schema = StructType([
  StructField('id', StringType(), True),
  StructField('case_number', StringType(), True),
  StructField('Date', TimestampType(), True),
  StructField('block', StringType(), True),
  StructField('iucr', StringType(), True),
  StructField('primary_type', StringType(), True),
  StructField('description', StringType(), True),
  StructField('location_description', StringType(), True),
  StructField('arrest', StringType(), True),
  StructField('domestic', BooleanType(), True),
  StructField('beat', StringType(), True),
  StructField('district', StringType(), True),
  StructField('ward', StringType(), True),
  StructField('community_area', StringType(), True),
  StructField('fbi_code', StringType(), True),
  StructField('x_coordinate', StringType(), True),
  StructField('y_coordinate', StringType(), True),
  StructField('year', IntegerType(), True),
  StructField('updated_on', StringType(), True),
  StructField('latitude', DoubleType(), True),
  StructField('longitude', DoubleType(), True),
  StructField('location', StringType(), True),
])

In [None]:
rc = spark.read.csv('./chicago-crimes-2001-present.csv', header=True, schema=schema)
rc.printSchema()

root
 |-- id: string (nullable = true)
 |-- case_number: string (nullable = true)
 |-- Date: timestamp (nullable = true)
 |-- block: string (nullable = true)
 |-- iucr: string (nullable = true)
 |-- primary_type: string (nullable = true)
 |-- description: string (nullable = true)
 |-- location_description: string (nullable = true)
 |-- arrest: string (nullable = true)
 |-- domestic: boolean (nullable = true)
 |-- beat: string (nullable = true)
 |-- district: string (nullable = true)
 |-- ward: string (nullable = true)
 |-- community_area: string (nullable = true)
 |-- fbi_code: string (nullable = true)
 |-- x_coordinate: string (nullable = true)
 |-- y_coordinate: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- updated_on: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- location: string (nullable = true)



In [None]:
rc.show(5)

+-------------+----------------+-------------------+------------------+----+------------+-----------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------+
|           id|     case_number|               Date|             block|iucr|primary_type|description|location_description|arrest|domestic|beat|district|ward|community_area|fbi_code|x_coordinate|y_coordinate|year|          updated_on|    latitude|    longitude|location|
+-------------+----------------+-------------------+------------------+----+------------+-----------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------+
|     13637238|        JH476216|2024-10-21 00:00:00|   041XX W 13TH ST|0460|     BATTERY|     SIMPLE|              STREET| false|   false|1011|     010|  24|            29|     08B|     1149

## Working with Columns

In [None]:
# Show specific column using different methods:

#rc.select('iucr').show(5)
#rc.select(rc.iucr).show(5)
#rc.select(col('iucr')).show(5)

rc.select('case_number', 'Date', 'block').show(4)


+----------------+-------------------+------------------+
|     case_number|               Date|             block|
+----------------+-------------------+------------------+
|        JH476216|2024-10-21 00:00:00|   041XX W 13TH ST|
|                |               NULL|              NULL|
| -87.727866959)"|               NULL|              NULL|
|        JH477346|2024-10-21 00:00:00|017XX N BURLING ST|
+----------------+-------------------+------------------+
only showing top 4 rows



### Working with Rows



In [None]:
# Group by date, descending, filter by dates before filter date
rc.orderBy(['Date'], ascending=False).select(['Date', 'block']).filter(col('Date') < lit('2024-11-12')).show(5)

+-------------------+--------------------+
|               Date|               block|
+-------------------+--------------------+
|2024-10-21 00:00:00|     041XX W 13TH ST|
|2024-10-21 00:00:00|  017XX N BURLING ST|
|2024-10-21 00:00:00|067XX N GLENWOOD AVE|
|2024-10-21 00:00:00| 044XX S PRAIRIE AVE|
|2024-10-21 00:00:00|   079XX S TRIPP AVE|
+-------------------+--------------------+
only showing top 5 rows



In [None]:
# Filter crimes from specific cutoff-day
one_day = rc.filter(col('Date') < lit('2024-10-20')).filter(col('Date') > lit('2024-10-19'))
one_day.count()


249

### RDD

In [None]:
rdd = sc.textFile('./chicago-crimes-2001-present.csv')
header = rdd.first()

In [None]:
rows = rdd.filter(lambda l: l != header)
rows.first()

'"13637238","JH476216","2024-10-21T00:00:00.000","041XX W 13TH ST","0460","BATTERY","SIMPLE","STREET","false","false","1011","010","24","29","08B","1149173","1893724","2024","2024-10-28T15:40:58.000","41.864304053","-87.727866959","'

In [None]:
rows.map(lambda l: l.split(',')).count()




2976

In [None]:
rows.map(lambda l: l.split(',')).collect()[0]

['"13637238"',
 '"JH476216"',
 '"2024-10-21T00:00:00.000"',
 '"041XX W 13TH ST"',
 '"0460"',
 '"BATTERY"',
 '"SIMPLE"',
 '"STREET"',
 '"false"',
 '"false"',
 '"1011"',
 '"010"',
 '"24"',
 '"29"',
 '"08B"',
 '"1149173"',
 '"1893724"',
 '"2024"',
 '"2024-10-28T15:40:58.000"',
 '"41.864304053"',
 '"-87.727866959"',
 '"']