In [1]:
# Must be included at the beginning of each new notebook. Remember to change the app name.
import findspark
findspark.init('/home/ubuntu/spark-2.1.1-bin-hadoop2.7')
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('1').getOrCreate()

In [2]:
# Importing data.
df = spark.read.csv('dataset/ri_statewide_2019_02_25.csv', header=True, inferSchema=True)

In [3]:
#Intial look at the data
df.show()

+--------------+----------+--------+----+------------+-----------+-------------+---------+-----------+---------------+--------------+--------+----------------+----------------+------------------+------------------+----------------+---------------+----------------+------------+-----------------+--------------------+------------+-------------+
+--------------+----------+--------+----+------------+-----------+-------------+---------+-----------+---------------+--------------+--------+----------------+----------------+------------------+------------------+----------------+---------------+----------------+------------+-----------------+--------------------+------------+-------------+
|             1|2005-11-22|11:15:00|  X3|       white|       male|          200|vehicular|      FALSE|           TRUE|         FALSE|citation|              NA|              NA|                NA|                NA|           false|          FALSE|           FALSE|          NA|               NA|            Speed

In [4]:
#Print the schema
df.printSchema()

root
 |-- raw_row_number: integer (nullable = true)
 |-- date: string (nullable = true)
 |-- time: string (nullable = true)
 |-- zone: string (nullable = true)
 |-- subject_race: string (nullable = true)
 |-- subject_sex: string (nullable = true)
 |-- department_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- arrest_made: string (nullable = true)
 |-- citation_issued: string (nullable = true)
 |-- outcome: string (nullable = true)
 |-- contraband_found: string (nullable = true)
 |-- contraband_drugs: string (nullable = true)
 |-- contraband_weapons: string (nullable = true)
 |-- contraband_alcohol: string (nullable = true)
 |-- contraband_other: boolean (nullable = true)
 |-- frisk_performed: string (nullable = true)
 |-- search_conducted: string (nullable = true)
 |-- search_basis: string (nullable = true)
 |-- reason_for_search: string (nullable = true)
 |-- reason_for_stop: string (nullable = true)
 |-- vehicle_make: string (nullable = true)
 |-- vehicle_model:

In [5]:
#Total number of values
df.count()

509681

In [6]:
df.groupBy("subject_sex").count().show()

+-----------+------+
|subject_sex| count|
+-----------+------+
|         NA| 29097|
|     female|131138|
|       male|349446|
+-----------+------+



In [7]:
df.groupBy(["zone", "subject_sex"]).count().show()

+----+-----------+-----+
|zone|subject_sex|count|
+----+-----------+-----+
|  X3|     female|26653|
|  X3|         NA| 4627|
|  X3|       male|62778|
|  X4|         NA| 9679|
|  X4|       male|94953|
|  K3|     female|29097|
|  X1|         NA| 3491|
|  X1|     female| 2702|
|  K3|       male|79771|
|  X1|       male|10522|
|  K1|         NA| 2252|
|  K2|     female|28114|
|  K1|       male|32255|
|  NA|         NA|   10|
|  K3|         NA| 4916|
|  K1|     female|13855|
|  K2|         NA| 4122|
|  X4|     female|30717|
|  K2|       male|69167|
+----+-----------+-----+



In [8]:
df.groupBy(["subject_sex", "reason_for_stop"]).count().show()

+-----------+--------------------+------+
|subject_sex|     reason_for_stop| count|
+-----------+--------------------+------+
|     female|Other Traffic Vio...| 17911|
|       male|Special Detail/Di...| 12977|
|     female|Equipment/Inspect...| 14039|
|     female|                 APB|   109|
|     female|Violation of City...|   216|
|         NA|    Call for Service|     4|
|         NA|Equipment/Inspect...|     2|
|       male|    Call for Service|  5237|
|       male|Registration Viol...| 14181|
|       male|            Speeding|182538|
|         NA|            Speeding|     8|
|       male|Motorist Assist/C...|   657|
|       male|   Suspicious Person|   268|
|     female|  Seatbelt Violation|  3550|
|     female|Registration Viol...|  5649|
|       male|Other Traffic Vio...| 72317|
|         NA|                  NA| 29073|
|     female|   Suspicious Person|    74|
|         NA|  Seatbelt Violation|     3|
|       male|                 APB|   376|
+-----------+--------------------+

In [9]:
#Drop the rows from which subject_sex is missing
#Regular dropna wasn't working cuz data is a String "NA"
df = df.filter(df.subject_sex.endswith('ale'))

In [10]:
df.groupBy("subject_sex").count().show()

+-----------+------+
|subject_sex| count|
+-----------+------+
|     female|131138|
|       male|349446|
+-----------+------+



In [11]:
df.groupBy("contraband_found").count().show()

+----------------+------+
|contraband_found| count|
+----------------+------+
|           FALSE| 11183|
|              NA|462822|
|            TRUE|  6579|
+----------------+------+



In [12]:
#total number of values in data BEFORE dropping duplicates
df.count()

480584

In [13]:
#dropping duplicates from data
df = df.dropDuplicates()

In [14]:
#total number of values in data AFTER dropping duplicates
df.count()

480584

In [15]:
#For checking what to drop
df.printSchema()

root
 |-- raw_row_number: integer (nullable = true)
 |-- date: string (nullable = true)
 |-- time: string (nullable = true)
 |-- zone: string (nullable = true)
 |-- subject_race: string (nullable = true)
 |-- subject_sex: string (nullable = true)
 |-- department_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- arrest_made: string (nullable = true)
 |-- citation_issued: string (nullable = true)
 |-- outcome: string (nullable = true)
 |-- contraband_found: string (nullable = true)
 |-- contraband_drugs: string (nullable = true)
 |-- contraband_weapons: string (nullable = true)
 |-- contraband_alcohol: string (nullable = true)
 |-- contraband_other: boolean (nullable = true)
 |-- frisk_performed: string (nullable = true)
 |-- search_conducted: string (nullable = true)
 |-- search_basis: string (nullable = true)
 |-- reason_for_search: string (nullable = true)
 |-- reason_for_stop: string (nullable = true)
 |-- vehicle_make: string (nullable = true)
 |-- vehicle_model:

In [16]:
#Drop Prim keys from data
df = df.drop("raw_row_number", "department_id")

In [17]:
df.printSchema()

root
 |-- date: string (nullable = true)
 |-- time: string (nullable = true)
 |-- zone: string (nullable = true)
 |-- subject_race: string (nullable = true)
 |-- subject_sex: string (nullable = true)
 |-- type: string (nullable = true)
 |-- arrest_made: string (nullable = true)
 |-- citation_issued: string (nullable = true)
 |-- outcome: string (nullable = true)
 |-- contraband_found: string (nullable = true)
 |-- contraband_drugs: string (nullable = true)
 |-- contraband_weapons: string (nullable = true)
 |-- contraband_alcohol: string (nullable = true)
 |-- contraband_other: boolean (nullable = true)
 |-- frisk_performed: string (nullable = true)
 |-- search_conducted: string (nullable = true)
 |-- search_basis: string (nullable = true)
 |-- reason_for_search: string (nullable = true)
 |-- reason_for_stop: string (nullable = true)
 |-- vehicle_make: string (nullable = true)
 |-- vehicle_model: string (nullable = true)



In [18]:
#Check feature type
df.groupBy("type").count().show()

+---------+------+
|     type| count|
+---------+------+
|vehicular|480584|
+---------+------+



In [19]:
#Drop type feature from data, since it has only one value
df = df.drop('type')

In [20]:
#Checking if type was dropped
df.printSchema()

root
 |-- date: string (nullable = true)
 |-- time: string (nullable = true)
 |-- zone: string (nullable = true)
 |-- subject_race: string (nullable = true)
 |-- subject_sex: string (nullable = true)
 |-- arrest_made: string (nullable = true)
 |-- citation_issued: string (nullable = true)
 |-- outcome: string (nullable = true)
 |-- contraband_found: string (nullable = true)
 |-- contraband_drugs: string (nullable = true)
 |-- contraband_weapons: string (nullable = true)
 |-- contraband_alcohol: string (nullable = true)
 |-- contraband_other: boolean (nullable = true)
 |-- frisk_performed: string (nullable = true)
 |-- search_conducted: string (nullable = true)
 |-- search_basis: string (nullable = true)
 |-- reason_for_search: string (nullable = true)
 |-- reason_for_stop: string (nullable = true)
 |-- vehicle_make: string (nullable = true)
 |-- vehicle_model: string (nullable = true)



In [21]:
df.groupBy("subject_race").count().show()

+--------------------+------+
|        subject_race| count|
+--------------------+------+
|               white|344716|
|               black| 68577|
|            hispanic| 53123|
|       other/unknown|  1344|
|asian/pacific isl...| 12824|
+--------------------+------+



In [22]:
#NA Values in some fields
df.groupBy("search_conducted").count().show()

+----------------+------+
|search_conducted| count|
+----------------+------+
|           FALSE|462822|
|            TRUE| 17762|
+----------------+------+



In [23]:
#NA Values in some fields
df.groupBy("contraband_found").count().show()

+----------------+------+
|contraband_found| count|
+----------------+------+
|           FALSE| 11183|
|              NA|462822|
|            TRUE|  6579|
+----------------+------+



In [24]:
from pyspark.sql.functions import when
#Create a new column for contraband_found 
df = df.withColumn("contraband_found_resolved", when(df.contraband_found == "NA", 0).otherwise(1))

In [25]:
#Check newly created column
df.groupBy("contraband_found_resolved").count().show()

+-------------------------+------+
|contraband_found_resolved| count|
+-------------------------+------+
|                        1| 17762|
|                        0|462822|
+-------------------------+------+



In [26]:
df.printSchema()

root
 |-- date: string (nullable = true)
 |-- time: string (nullable = true)
 |-- zone: string (nullable = true)
 |-- subject_race: string (nullable = true)
 |-- subject_sex: string (nullable = true)
 |-- arrest_made: string (nullable = true)
 |-- citation_issued: string (nullable = true)
 |-- outcome: string (nullable = true)
 |-- contraband_found: string (nullable = true)
 |-- contraband_drugs: string (nullable = true)
 |-- contraband_weapons: string (nullable = true)
 |-- contraband_alcohol: string (nullable = true)
 |-- contraband_other: boolean (nullable = true)
 |-- frisk_performed: string (nullable = true)
 |-- search_conducted: string (nullable = true)
 |-- search_basis: string (nullable = true)
 |-- reason_for_search: string (nullable = true)
 |-- reason_for_stop: string (nullable = true)
 |-- vehicle_make: string (nullable = true)
 |-- vehicle_model: string (nullable = true)
 |-- contraband_found_resolved: integer (nullable = false)



In [None]:
from pyspark.sql.functions import to_date
#Creating a date time 
df['new_date'] = df.select(to_date(df.date).alias('date')).collect()

In [None]:
df.printSchema()