In [484]:
!pip install findspark
!pip install pyspark



In [485]:
import findspark
findspark.init()


In [486]:
from pyspark import SparkContext

In [487]:
from pyspark.sql import SparkSession

# Initialize SparkSession
spark = SparkSession.builder \
    .appName("CrimeDataAnalysis") \
    .getOrCreate()

# Now you can use the 'spark' variable to read CSV files or perform other Spark operations


In [594]:
from pyspark.sql.functions import to_timestamp,col,lit
read_csv = spark.read.csv('sample_data/crime.csv', header=True).withColumn('Date', to_timestamp(col('Date'), 'MM/dd/yyyy hh:mm:ss a'))
read_csv.show(5)


+--------+-----------+-------------------+-------------------+----+--------------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+--------+---------+--------+
|      ID|Case Number|               Date|              Block|IUCR|        Primary Type|         Description|Location Description|Arrest|Domestic|Beat|District|Ward|Community Area|FBI Code|X Coordinate|Y Coordinate|Year|          Updated On|Latitude|Longitude|Location|
+--------+-----------+-------------------+-------------------+----+--------------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+--------+---------+--------+
|12536164|   JE439378|2015-09-24 00:00:00|    031XX W 53RD PL|1753|OFFENSE INVOLVING...|SEXUAL ASSAULT OF...|           APARTMENT| false|    true|0923|     009|  14|            63|      02| 

**Schema Handling**

In [595]:
read_csv.printSchema()

root
 |-- ID: string (nullable = true)
 |-- Case Number: string (nullable = true)
 |-- Date: timestamp (nullable = true)
 |-- Block: string (nullable = true)
 |-- IUCR: string (nullable = true)
 |-- Primary Type: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Location Description: string (nullable = true)
 |-- Arrest: string (nullable = true)
 |-- Domestic: string (nullable = true)
 |-- Beat: string (nullable = true)
 |-- District: string (nullable = true)
 |-- Ward: string (nullable = true)
 |-- Community Area: string (nullable = true)
 |-- FBI Code: string (nullable = true)
 |-- X Coordinate: string (nullable = true)
 |-- Y Coordinate: string (nullable = true)
 |-- Year: string (nullable = true)
 |-- Updated On: string (nullable = true)
 |-- Latitude: string (nullable = true)
 |-- Longitude: string (nullable = true)
 |-- Location: string (nullable = true)



In [596]:
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, BooleanType, DoubleType, IntegerType

In [597]:
read_csv.columns

['ID',
 'Case Number',
 'Date',
 'Block',
 'IUCR',
 'Primary Type',
 'Description',
 'Location Description',
 'Arrest',
 'Domestic',
 'Beat',
 'District',
 'Ward',
 'Community Area',
 'FBI Code',
 'X Coordinate',
 'Y Coordinate',
 'Year',
 'Updated On',
 'Latitude',
 'Longitude',
 'Location']

Just an example on how the type of Date (timestamp type has been changed to string). However, timestamp is the preferred data-type for the Date column.

In [598]:
labels= [('ID',StringType()),
('Date',StringType()),
('Block',StringType()),
('IUCR',StringType()),
('Primary Type',StringType()),
('Description',StringType()),
('Location Description',StringType()),
('Arrest',StringType()),
('Domestic', BooleanType()),
('Beat', StringType()),
('Ward',StringType()),
('FBI Code', StringType()),
('X Coordinate',StringType()),
('Y Coordinate',StringType()),
('Latitude', DoubleType()),
('Longitude', DoubleType()),
('Location', StringType())]

In [599]:
schema = StructType([StructField(x[0],x[1],True) for x in labels])


In [600]:
csv_df = spark.read.csv('sample_data/crime.csv', schema = schema)

In [601]:
csv_df.printSchema()

root
 |-- ID: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Block: string (nullable = true)
 |-- IUCR: string (nullable = true)
 |-- Primary Type: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Location Description: string (nullable = true)
 |-- Arrest: string (nullable = true)
 |-- Domestic: boolean (nullable = true)
 |-- Beat: string (nullable = true)
 |-- Ward: string (nullable = true)
 |-- FBI Code: string (nullable = true)
 |-- X Coordinate: string (nullable = true)
 |-- Y Coordinate: string (nullable = true)
 |-- Latitude: double (nullable = true)
 |-- Longitude: double (nullable = true)
 |-- Location: string (nullable = true)



In [602]:
csv_df.show(5)

+--------+-----------+--------------------+-------------------+------------+--------------------+--------------------+--------------------+--------+--------+----+--------+------------+--------------+--------+---------+------------+
|      ID|       Date|               Block|               IUCR|Primary Type|         Description|Location Description|              Arrest|Domestic|    Beat|Ward|FBI Code|X Coordinate|  Y Coordinate|Latitude|Longitude|    Location|
+--------+-----------+--------------------+-------------------+------------+--------------------+--------------------+--------------------+--------+--------+----+--------+------------+--------------+--------+---------+------------+
|      ID|Case Number|                Date|              Block|        IUCR|        Primary Type|         Description|Location Description|    NULL|Domestic|Beat|District|        Ward|Community Area|    NULL|     NULL|Y Coordinate|
|12536164|   JE439378|09/24/2015 12:00:...|    031XX W 53RD PL|        1

**Showing column**

In [603]:
read_csv.select('IUCR').show(5)

+----+
|IUCR|
+----+
|1753|
|1754|
|1752|
|1752|
|1752|
+----+
only showing top 5 rows



In [604]:
read_csv.select(col('IUCR')).show(5)

+----+
|IUCR|
+----+
|1753|
|1754|
|1752|
|1752|
|1752|
+----+
only showing top 5 rows



**Showing multiple cols**

In [605]:
read_csv.select('IUCR', 'Primary Type', 'Domestic').show(5)

+----+--------------------+--------+
|IUCR|        Primary Type|Domestic|
+----+--------------------+--------+
|1753|OFFENSE INVOLVING...|    true|
|1754|OFFENSE INVOLVING...|    true|
|1752|OFFENSE INVOLVING...|    true|
|1752|OFFENSE INVOLVING...|    true|
|1752|OFFENSE INVOLVING...|    true|
+----+--------------------+--------+
only showing top 5 rows



**Add new column named NEW COL with all entries 1s**

In [606]:
from pyspark.sql.functions import lit

In [607]:
read_csv.withColumn('NEW COLUMN', lit(1)).show(5)

+--------+-----------+-------------------+-------------------+----+--------------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+--------+---------+--------+----------+
|      ID|Case Number|               Date|              Block|IUCR|        Primary Type|         Description|Location Description|Arrest|Domestic|Beat|District|Ward|Community Area|FBI Code|X Coordinate|Y Coordinate|Year|          Updated On|Latitude|Longitude|Location|NEW COLUMN|
+--------+-----------+-------------------+-------------------+----+--------------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+--------+---------+--------+----------+
|12536164|   JE439378|2015-09-24 00:00:00|    031XX W 53RD PL|1753|OFFENSE INVOLVING...|SEXUAL ASSAULT OF...|           APARTMENT| false|    true|0923|     0

**Dropping column ` IUCR`**

In [608]:
read_csv.drop('IUCR').show(5)

+--------+-----------+-------------------+-------------------+--------------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+--------+---------+--------+
|      ID|Case Number|               Date|              Block|        Primary Type|         Description|Location Description|Arrest|Domestic|Beat|District|Ward|Community Area|FBI Code|X Coordinate|Y Coordinate|Year|          Updated On|Latitude|Longitude|Location|
+--------+-----------+-------------------+-------------------+--------------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+--------+---------+--------+
|12536164|   JE439378|2015-09-24 00:00:00|    031XX W 53RD PL|OFFENSE INVOLVING...|SEXUAL ASSAULT OF...|           APARTMENT| false|    true|0923|     009|  14|            63|      02|        NULL|        

#### WORKING WITH ROWS

In [609]:
read_csv.select('Date').show(5)

+-------------------+
|               Date|
+-------------------+
|2015-09-24 00:00:00|
|2015-05-15 00:00:00|
|2015-06-18 00:00:00|
|2015-08-30 00:00:00|
|2015-01-12 12:00:00|
+-------------------+
only showing top 5 rows



Filtering on Date

In [610]:
one_day_crime_df = read_csv.filter(col('Date').contains('2015-01-12'))
one_day_crime_df.count()



608

In [611]:
one_day_crime_df.show(5)

+--------+-----------+-------------------+-------------------+----+--------------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------------------+
|      ID|Case Number|               Date|              Block|IUCR|        Primary Type|         Description|Location Description|Arrest|Domestic|Beat|District|Ward|Community Area|FBI Code|X Coordinate|Y Coordinate|Year|          Updated On|    Latitude|    Longitude|            Location|
+--------+-----------+-------------------+-------------------+----+--------------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------------------+
|12998264|   JG171087|2015-01-12 12:00:00|   0000X E 122ND ST|1752|OFFENSE INVOLVING...|AGGRAVATED CRIMIN...|           RESIDENCE|

In [612]:
one_day_crime_df.count()

608

In [613]:
read_csv.count()

264840

In [614]:
new_csv = read_csv.union(one_day_crime_df)

In [615]:
new_csv.count()

265448

In [616]:
new_csv.orderBy('Date',ascending = False ).show(5)

+--------+-----------+-------------------+--------------------+----+---------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------------------+
|      ID|Case Number|               Date|               Block|IUCR|   Primary Type|         Description|Location Description|Arrest|Domestic|Beat|District|Ward|Community Area|FBI Code|X Coordinate|Y Coordinate|Year|          Updated On|    Latitude|    Longitude|            Location|
+--------+-----------+-------------------+--------------------+----+---------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------------------+
|10365064|   HZ100370|2015-12-31 23:59:00| 075XX S EMERALD AVE|1320|CRIMINAL DAMAGE|          TO VEHICLE|              STREET| false|   false|

In [617]:
crime_type = read_csv.select('Primary Type').distinct()

In [618]:
crime_type.count()


33

In [619]:
crime_type_list = crime_type.select('Primary Type').collect()
crime_list = [row['Primary Type'] for row in crime_type_list]
print(crime_list)

['OFFENSE INVOLVING CHILDREN', 'CRIMINAL SEXUAL ASSAULT', 'STALKING', 'PUBLIC PEACE VIOLATION', 'OBSCENITY', 'ARSON', 'GAMBLING', 'CRIMINAL TRESPASS', 'ASSAULT', 'NON - CRIMINAL', 'LIQUOR LAW VIOLATION', 'MOTOR VEHICLE THEFT', 'THEFT', 'BATTERY', 'ROBBERY', 'HOMICIDE', 'PUBLIC INDECENCY', 'CRIM SEXUAL ASSAULT', 'HUMAN TRAFFICKING', 'INTIMIDATION', 'PROSTITUTION', 'DECEPTIVE PRACTICE', 'CONCEALED CARRY LICENSE VIOLATION', 'SEX OFFENSE', 'CRIMINAL DAMAGE', 'NARCOTICS', 'NON-CRIMINAL', 'OTHER OFFENSE', 'KIDNAPPING', 'BURGLARY', 'WEAPONS VIOLATION', 'OTHER NARCOTIC VIOLATION', 'INTERFERENCE WITH PUBLIC OFFICER']


**Top 10 number of reported crimes by Primary type in descending order of occurance**

In [620]:
new_csv.groupBy('Primary Type').count().orderBy('count', ascending=False).show(10)

+-------------------+-----+
|       Primary Type|count|
+-------------------+-----+
|              THEFT|57499|
|            BATTERY|49015|
|    CRIMINAL DAMAGE|28734|
|          NARCOTICS|24004|
|      OTHER OFFENSE|17613|
|            ASSAULT|17083|
| DECEPTIVE PRACTICE|16497|
|           BURGLARY|13220|
|MOTOR VEHICLE THEFT|10089|
|            ROBBERY| 9664|
+-------------------+-----+
only showing top 10 rows



Percentage of Reported Crime that resulted in arrest


In [621]:
new_csv.select('Arrest').distinct().show()

+------+
|Arrest|
+------+
| false|
|  true|
+------+



In [622]:
new_csv.select('Arrest').printSchema()

root
 |-- Arrest: string (nullable = true)



In [623]:
new_csv.filter(col('Arrest')=='true').count()/new_csv.select('Arrest').count()

0.26463940206744824

Top 3 locations for reported crimes

In [624]:
new_csv.groupBy('Location Description').count().orderBy('count', ascending=False).show(3)

+--------------------+-----+
|Location Description|count|
+--------------------+-----+
|              STREET|60896|
|           RESIDENCE|41984|
|           APARTMENT|35153|
+--------------------+-----+
only showing top 3 rows



Built-In Functions

In [625]:
from pyspark.sql import functions
print(dir(functions))



Example: Display PRIMARY DESCRIPTION column in Lower, Upper and first four substring

In [626]:
from pyspark.sql.functions import lower, upper, substring


In [627]:
help(substring)

Help on function substring in module pyspark.sql.functions:

substring(str: 'ColumnOrName', pos: int, len: int) -> pyspark.sql.column.Column
    Substring starts at `pos` and is of length `len` when str is String type or
    returns the slice of byte array that starts at `pos` in byte and is of length `len`
    when str is Binary type.
    
    .. versionadded:: 1.5.0
    
    .. versionchanged:: 3.4.0
        Supports Spark Connect.
    
    Notes
    -----
    The position is not zero based, but 1 based index.
    
    Parameters
    ----------
    str : :class:`~pyspark.sql.Column` or str
        target column to work on.
    pos : int
        starting position in str.
    len : int
        length of chars.
    
    Returns
    -------
    :class:`~pyspark.sql.Column`
        substring of given value.
    
    Examples
    --------
    >>> df = spark.createDataFrame([('abcd',)], ['s',])
    >>> df.select(substring(df.s, 1, 2).alias('s')).collect()
    [Row(s='ab')]



In [628]:
new_csv.select(lower(col('Primary Type')), upper('Primary Type'), substring(col('Primary Type'),1,4)).show(5)

+--------------------+--------------------+-----------------------------+
| lower(Primary Type)| upper(Primary Type)|substring(Primary Type, 1, 4)|
+--------------------+--------------------+-----------------------------+
|offense involving...|OFFENSE INVOLVING...|                         OFFE|
|offense involving...|OFFENSE INVOLVING...|                         OFFE|
|offense involving...|OFFENSE INVOLVING...|                         OFFE|
|offense involving...|OFFENSE INVOLVING...|                         OFFE|
|offense involving...|OFFENSE INVOLVING...|                         OFFE|
+--------------------+--------------------+-----------------------------+
only showing top 5 rows



**Working with Dates**
> - Show the oldest date and most recent date
> - What is 3 days earlier than the oldest date and 3 days later than the most recent date ?

User-defined functions

In [629]:
 from pyspark.sql.functions import  min, max


In [630]:
new_csv.select(min(col('Date')), max(col('Date'))).show(1)

+-------------------+-------------------+
|          min(Date)|          max(Date)|
+-------------------+-------------------+
|2015-01-01 00:00:00|2015-12-31 23:59:00|
+-------------------+-------------------+



In [631]:
from pyspark.sql.functions import date_add, date_sub

In [632]:
help(date_add)

Help on function date_add in module pyspark.sql.functions:

date_add(start: 'ColumnOrName', days: Union[ForwardRef('ColumnOrName'), int]) -> pyspark.sql.column.Column
    Returns the date that is `days` days after `start`. If `days` is a negative value
    then these amount of days will be deducted from `start`.
    
    .. versionadded:: 1.5.0
    
    .. versionchanged:: 3.4.0
        Supports Spark Connect.
    
    Parameters
    ----------
    start : :class:`~pyspark.sql.Column` or str
        date column to work on.
    days : :class:`~pyspark.sql.Column` or str or int
        how many days after the given date to calculate.
        Accepts negative value as well to calculate backwards in time.
    
    Returns
    -------
    :class:`~pyspark.sql.Column`
        a date after/before given number of days.
    
    Examples
    --------
    >>> df = spark.createDataFrame([('2015-04-08', 2,)], ['dt', 'add'])
    >>> df.select(date_add(df.dt, 1).alias('next_date')).collect()
    [Ro

In [633]:
new_csv.select(date_sub(min(col('Date')),3), date_add(max(col('Date')),3)).show(1)

+----------------------+----------------------+
|date_sub(min(Date), 3)|date_add(max(Date), 3)|
+----------------------+----------------------+
|            2014-12-29|            2016-01-03|
+----------------------+----------------------+



**Join Example**

Reported Crime dataset has only the distict number, now add the district name using the police station dataset using join

In [634]:
police_station_data = spark.read.csv("sample_data/police_station.csv", header=True)
police_station_data.show(5)

+------------+-------------+-------------------+-------+-----+-----+--------------------+------------+------------+------------+------------+------------+-----------+------------+--------------------+
|    DISTRICT|DISTRICT NAME|            ADDRESS|   CITY|STATE|  ZIP|             WEBSITE|       PHONE|         FAX|         TTY|X COORDINATE|Y COORDINATE|   LATITUDE|   LONGITUDE|            LOCATION|
+------------+-------------+-------------------+-------+-----+-----+--------------------+------------+------------+------------+------------+------------+-----------+------------+--------------------+
|Headquarters| Headquarters|3510 S Michigan Ave|Chicago|   IL|60653|http://home.chica...|        NULL|        NULL|        NULL| 1177731.401| 1881697.404|41.83070169|-87.62339535|(41.8307016873, -...|
|          18|   Near North| 1160 N Larrabee St|Chicago|   IL|60610|http://home.chica...|312-742-5870|312-742-5771|312-742-5773| 1172080.029| 1908086.527|41.90324165|-87.64335214|(41.9032416531, -

In [635]:

from pyspark.sql.functions import col

# Assuming you have a PySpark DataFrame named 'df'

# Remove rows with value 'xyz' in a specific column
police_station_data = police_station_data.filter(col('DISTRICT') != 'Headquarters')

# Show the DataFrame after filtering
police_station_data.show()


+--------+--------------+--------------------+-------+-----+-----+--------------------+------------+------------+------------+------------+------------+-----------+------------+--------------------+
|DISTRICT| DISTRICT NAME|             ADDRESS|   CITY|STATE|  ZIP|             WEBSITE|       PHONE|         FAX|         TTY|X COORDINATE|Y COORDINATE|   LATITUDE|   LONGITUDE|            LOCATION|
+--------+--------------+--------------------+-------+-----+-----+--------------------+------------+------------+------------+------------+------------+-----------+------------+--------------------+
|      18|    Near North|  1160 N Larrabee St|Chicago|   IL|60610|http://home.chica...|312-742-5870|312-742-5771|312-742-5773| 1172080.029| 1908086.527|41.90324165|-87.64335214|(41.9032416531, -...|
|      19|     Town Hall|    850 W Addison St|Chicago|   IL|60613|http://home.chica...|312-744-8320|312-744-4481|312-744-8011| 1169730.744| 1924160.317|41.94740046|-87.65151202|(41.9474004564, -...|
|    

In [636]:
police_station_data.select('DISTRICT').distinct().show(50)

+--------+
|DISTRICT|
+--------+
|       7|
|      15|
|      11|
|       3|
|       8|
|      22|
|      16|
|       5|
|      18|
|      17|
|       6|
|      19|
|      25|
|      24|
|       9|
|       1|
|      20|
|      10|
|       4|
|      14|
|       2|
+--------+



In [637]:
new_csv.select(col('District')).distinct().show(50)

+--------+
|District|
+--------+
|     009|
|     012|
|     024|
|     031|
|     015|
|     006|
|     019|
|     020|
|     011|
|     025|
|     005|
|     003|
|     016|
|     018|
|     008|
|     022|
|     001|
|     014|
|     010|
|     004|
|     017|
|     007|
|     002|
+--------+



In [638]:
from pyspark.sql.functions import lpad

In [639]:
help(lpad)

Help on function lpad in module pyspark.sql.functions:

lpad(col: 'ColumnOrName', len: int, pad: str) -> pyspark.sql.column.Column
    Left-pad the string column to width `len` with `pad`.
    
    .. versionadded:: 1.5.0
    
    .. versionchanged:: 3.4.0
        Supports Spark Connect.
    
    Parameters
    ----------
    col : :class:`~pyspark.sql.Column` or str
        target column to work on.
    len : int
        length of the final string.
    pad : str
        chars to prepend.
    
    Returns
    -------
    :class:`~pyspark.sql.Column`
        left padded result.
    
    Examples
    --------
    >>> df = spark.createDataFrame([('abcd',)], ['s',])
    >>> df.select(lpad(df.s, 6, '#').alias('s')).collect()
    [Row(s='##abcd')]



In [640]:
police_station_data.select(lpad(col('DISTRICT'),3,'0')).show()

+--------------------+
|lpad(DISTRICT, 3, 0)|
+--------------------+
|                 018|
|                 019|
|                 020|
|                 022|
|                 024|
|                 025|
|                 001|
|                 002|
|                 003|
|                 004|
|                 005|
|                 006|
|                 007|
|                 008|
|                 009|
|                 010|
|                 011|
|                 014|
|                 015|
|                 016|
+--------------------+
only showing top 20 rows



In [641]:
ps = police_station_data.withColumn('FormattedDistrict', lpad(col('DISTRICT'),3,'0'))

In [642]:
police_station_data.columns

['DISTRICT',
 'DISTRICT NAME',
 'ADDRESS',
 'CITY',
 'STATE',
 'ZIP',
 'WEBSITE',
 'PHONE',
 'FAX',
 'TTY',
 'X COORDINATE',
 'Y COORDINATE',
 'LATITUDE',
 'LONGITUDE',
 'LOCATION']

In [643]:
ps.show(5)

+--------+-------------+-------------------+-------+-----+-----+--------------------+------------+------------+------------+------------+------------+-----------+------------+--------------------+-----------------+
|DISTRICT|DISTRICT NAME|            ADDRESS|   CITY|STATE|  ZIP|             WEBSITE|       PHONE|         FAX|         TTY|X COORDINATE|Y COORDINATE|   LATITUDE|   LONGITUDE|            LOCATION|FormattedDistrict|
+--------+-------------+-------------------+-------+-----+-----+--------------------+------------+------------+------------+------------+------------+-----------+------------+--------------------+-----------------+
|      18|   Near North| 1160 N Larrabee St|Chicago|   IL|60610|http://home.chica...|312-742-5870|312-742-5771|312-742-5773| 1172080.029| 1908086.527|41.90324165|-87.64335214|(41.9032416531, -...|              018|
|      19|    Town Hall|   850 W Addison St|Chicago|   IL|60613|http://home.chica...|312-744-8320|312-744-4481|312-744-8011| 1169730.744| 19

In [644]:
police_station_data.show(5)

+--------+-------------+-------------------+-------+-----+-----+--------------------+------------+------------+------------+------------+------------+-----------+------------+--------------------+
|DISTRICT|DISTRICT NAME|            ADDRESS|   CITY|STATE|  ZIP|             WEBSITE|       PHONE|         FAX|         TTY|X COORDINATE|Y COORDINATE|   LATITUDE|   LONGITUDE|            LOCATION|
+--------+-------------+-------------------+-------+-----+-----+--------------------+------------+------------+------------+------------+------------+-----------+------------+--------------------+
|      18|   Near North| 1160 N Larrabee St|Chicago|   IL|60610|http://home.chica...|312-742-5870|312-742-5771|312-742-5773| 1172080.029| 1908086.527|41.90324165|-87.64335214|(41.9032416531, -...|
|      19|    Town Hall|   850 W Addison St|Chicago|   IL|60613|http://home.chica...|312-744-8320|312-744-4481|312-744-8011| 1169730.744| 1924160.317|41.94740046|-87.65151202|(41.9474004564, -...|
|      20|     

In [651]:
new_csv.join(ps, new_csv.District == ps.FormattedDistrict, 'left_outer').drop(
 'ADDRESS',
 'CITY',
 'STATE',
 'ZIP',
 'WEBSITE',
 'PHONE',
 'FAX',
 'TTY',
 'X COORDINATE',
 'Y COORDINATE',
 'LATITUDE',
 'LONGITUDE',
 'LOCATION', 'FormattedDistrict').show(5)

+--------+-----------+-------------------+-------------------+----+--------------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+----+--------------------+--------+-------------+
|      ID|Case Number|               Date|              Block|IUCR|        Primary Type|         Description|Location Description|Arrest|Domestic|Beat|District|Ward|Community Area|FBI Code|Year|          Updated On|DISTRICT|DISTRICT NAME|
+--------+-----------+-------------------+-------------------+----+--------------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+----+--------------------+--------+-------------+
|12536164|   JE439378|2015-09-24 00:00:00|    031XX W 53RD PL|1753|OFFENSE INVOLVING...|SEXUAL ASSAULT OF...|           APARTMENT| false|    true|0923|     009|  14|            63|      02|2015|09/14/2023 03:41:...|       9|      Deering|
|13188119|   JG397237|2015-05-15 00:00:00|  