In [1]:
# Dependencies
import pandas as pd
from pathlib import Path

In [2]:
# Name of the CSV file
file = Path('Crime_Data_from_2020_to_Present_20241017.csv')

In [3]:
# The correct encoding must be used to read the CSV in pandas
df = pd.read_csv(file)

In [8]:
df.columns

Index(['DR_NO', 'Date Rptd', 'DATE OCC', 'TIME OCC', 'AREA', 'AREA NAME',
       'Rpt Dist No', 'Part 1-2', 'Crm Cd', 'Crm Cd Desc', 'Mocodes',
       'Vict Age', 'Vict Sex', 'Vict Descent', 'Premis Cd', 'Premis Desc',
       'Weapon Used Cd', 'Weapon Desc', 'Status', 'Status Desc', 'Crm Cd 1',
       'Crm Cd 2', 'Crm Cd 3', 'Crm Cd 4', 'LOCATION', 'Cross Street', 'LAT',
       'LON'],
      dtype='object')

In [9]:
victim_descent = df['Vict Descent'].unique()
victim_descent

array(['O', 'X', 'H', 'B', 'W', nan, 'A', 'K', 'C', 'J', 'F', 'I', 'V',
       'S', 'P', 'Z', 'G', 'U', 'D', 'L', '-'], dtype=object)

In [4]:
# Preview of the DataFrame
# Note that Memo_CD is likely a meaningless column
df.head()

Unnamed: 0,DR_NO,Date Rptd,DATE OCC,TIME OCC,AREA,AREA NAME,Rpt Dist No,Part 1-2,Crm Cd,Crm Cd Desc,...,Status,Status Desc,Crm Cd 1,Crm Cd 2,Crm Cd 3,Crm Cd 4,LOCATION,Cross Street,LAT,LON
0,190326475,03/01/2020 12:00:00 AM,03/01/2020 12:00:00 AM,2130,7,Wilshire,784,1,510,VEHICLE - STOLEN,...,AA,Adult Arrest,510.0,998.0,,,1900 S LONGWOOD AV,,34.0375,-118.3506
1,200106753,02/09/2020 12:00:00 AM,02/08/2020 12:00:00 AM,1800,1,Central,182,1,330,BURGLARY FROM VEHICLE,...,IC,Invest Cont,330.0,998.0,,,1000 S FLOWER ST,,34.0444,-118.2628
2,200320258,11/11/2020 12:00:00 AM,11/04/2020 12:00:00 AM,1700,3,Southwest,356,1,480,BIKE - STOLEN,...,IC,Invest Cont,480.0,,,,1400 W 37TH ST,,34.021,-118.3002
3,200907217,05/10/2023 12:00:00 AM,03/10/2020 12:00:00 AM,2037,9,Van Nuys,964,1,343,SHOPLIFTING-GRAND THEFT ($950.01 & OVER),...,IC,Invest Cont,343.0,,,,14000 RIVERSIDE DR,,34.1576,-118.4387
4,220614831,08/18/2022 12:00:00 AM,08/17/2020 12:00:00 AM,1200,6,Hollywood,666,2,354,THEFT OF IDENTITY,...,IC,Invest Cont,354.0,,,,1900 TRANSIENT,,34.0944,-118.3277


In [10]:
df.dtypes

DR_NO               int64
Date Rptd          object
DATE OCC           object
TIME OCC            int64
AREA                int64
AREA NAME          object
Rpt Dist No         int64
Part 1-2            int64
Crm Cd              int64
Crm Cd Desc        object
Mocodes            object
Vict Age            int64
Vict Sex           object
Vict Descent       object
Premis Cd         float64
Premis Desc        object
Weapon Used Cd    float64
Weapon Desc        object
Status             object
Status Desc        object
Crm Cd 1          float64
Crm Cd 2          float64
Crm Cd 3          float64
Crm Cd 4          float64
LOCATION           object
Cross Street       object
LAT               float64
LON               float64
dtype: object

In [11]:
# Identify incomplete rows
df.count()

DR_NO             986500
Date Rptd         986500
DATE OCC          986500
TIME OCC          986500
AREA              986500
AREA NAME         986500
Rpt Dist No       986500
Part 1-2          986500
Crm Cd            986500
Crm Cd Desc       986500
Mocodes           840065
Vict Age          986500
Vict Sex          846925
Vict Descent      846914
Premis Cd         986486
Premis Desc       985915
Weapon Used Cd    326368
Weapon Desc       326368
Status            986499
Status Desc       986500
Crm Cd 1          986489
Crm Cd 2           68912
Crm Cd 3            2310
Crm Cd 4              64
LOCATION          986500
Cross Street      152270
LAT               986500
LON               986500
dtype: int64

In [12]:
crime_codes = df['Crm Cd'].unique()
crime_codes

array([510, 330, 480, 343, 354, 624, 821, 812, 230, 956, 341, 930, 668,
       420, 813, 440, 626, 762, 441, 310, 331, 210, 662, 860, 236, 820,
       661, 810, 901, 442, 740, 946, 761, 649, 845, 121, 745, 627, 653,
       928, 815, 940, 625, 352, 648, 886, 666, 921, 805, 932, 900, 903,
       439, 954, 434, 235, 220, 654, 922, 760, 670, 850, 237, 763, 345,
       888, 320, 122, 753, 822, 520, 806, 906, 437, 410, 350, 623, 522,
       450, 890, 755, 231, 664, 251, 951, 920, 250, 470, 902, 647, 651,
       910, 110, 351, 421, 444, 814, 756, 433, 931, 435, 438, 443, 660,
       950, 622, 943, 487, 949, 933, 865, 474, 652, 113, 446, 475, 471,
       451, 436, 485, 944, 349, 942, 347, 353, 870, 473, 880, 452, 924,
       840, 948, 884, 904, 830, 432, 882, 445, 926, 453], dtype=int64)

In [13]:
len(crime_codes)

140

In [14]:
crime_codes1 = df['Crm Cd 1'].unique()
crime_codes1

array([510., 330., 480., 343., 354., 624., 812., 230., 956., 341., 930.,
       668., 420., 813., 440., 626., 762., 441., 310., 331., 210., 662.,
       860., 236., 661., 810., 901., 442., 740., 946., 761., 649., 845.,
       121., 745., 627., 653., 821., 928., 815., 940., 625., 352., 648.,
       886., 666., 921., 805., 932., 900., 820., 903., 439., 760., 954.,
       434., 235., 220., 654., 922., 670., 850., 237., 763., 345., 888.,
       320., 122., 753., 822., 520., 806., 906., 437., 410., 350., 623.,
       522., 450., 890., 755., 231., 664., 251., 951., 920., 250., 470.,
       902., 647., 651., 910., 110., 351., 421., 444., 814., 756., 433.,
       931., 435., 438., 443., 660., 950., 622., 943., 487.,  nan, 949.,
       933., 865., 474., 652., 113., 446., 475., 471., 451., 436., 521.,
       485., 944., 349., 942., 347., 353., 430., 870., 473., 880., 452.,
       924., 840., 948., 884., 904., 830., 432., 882., 445., 926., 453.])

In [15]:
len(crime_codes1)

143

In [16]:
crime_desc = df['Crm Cd Desc'].unique()
crime_desc

array(['VEHICLE - STOLEN', 'BURGLARY FROM VEHICLE', 'BIKE - STOLEN',
       'SHOPLIFTING-GRAND THEFT ($950.01 & OVER)', 'THEFT OF IDENTITY',
       'BATTERY - SIMPLE ASSAULT',
       'SODOMY/SEXUAL CONTACT B/W PENIS OF ONE PERS TO ANUS OTH',
       'CRM AGNST CHLD (13 OR UNDER) (14-15 & SUSP 10 YRS OLDER)',
       'ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT',
       'LETTERS, LEWD  -  TELEPHONE CALLS, LEWD',
       'THEFT-GRAND ($950.01 & OVER)EXCPT,GUNS,FOWL,LIVESTK,PROD',
       'CRIMINAL THREATS - NO WEAPON DISPLAYED',
       'EMBEZZLEMENT, GRAND THEFT ($950.01 & OVER)',
       'THEFT FROM MOTOR VEHICLE - PETTY ($950 & UNDER)',
       'CHILD ANNOYING (17YRS & UNDER)',
       'THEFT PLAIN - PETTY ($950 & UNDER)',
       'INTIMATE PARTNER - SIMPLE ASSAULT', 'LEWD CONDUCT',
       'THEFT PLAIN - ATTEMPT', 'BURGLARY',
       'THEFT FROM MOTOR VEHICLE - GRAND ($950.01 AND OVER)', 'ROBBERY',
       'BUNCO, GRAND THEFT', 'BATTERY WITH SEXUAL CONTACT',
       'INTIMATE PARTNER - AGGRAVA

In [17]:
df.describe(include=['object'])

Unnamed: 0,Date Rptd,DATE OCC,AREA NAME,Crm Cd Desc,Mocodes,Vict Sex,Vict Descent,Premis Desc,Weapon Desc,Status,Status Desc,LOCATION,Cross Street
count,986500,986500,986500,986500,840065,846925,846914,985915,326368,986499,986500,986500,152270
unique,1749,1749,21,140,309567,5,20,306,79,6,6,66322,10337
top,02/02/2023 12:00:00 AM,01/01/2020 12:00:00 AM,Central,VEHICLE - STOLEN,344,M,H,STREET,"STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE)",IC,Invest Cont,800 N ALAMEDA ST,BROADWAY
freq,929,1157,68166,111632,41144,397948,293088,254978,174484,788335,788335,2556,2467


In [18]:
renamed_data = df.rename(columns={"DR_NO" : "dr_no","DATE OCC": "date_occ", "TIME OCC":"time_occ", "AREA":"division", "AREA NAME":"div_name", "Crm Cd": "crime_id", "Crm Cd Desc":"crime_desc","Vict Age":"vict_age","Vict Sex":"vict_sex","Vict Descent":"vict_descent", "Crm Cd 1":"crime_id1","LOCATION":"loc_desc", "LAT":"lat", "LON":"lon"})
renamed_data.count()

dr_no             986500
Date Rptd         986500
date_occ          986500
time_occ          986500
division          986500
div_name          986500
Rpt Dist No       986500
Part 1-2          986500
crime_id          986500
crime_desc        986500
Mocodes           840065
vict_age          986500
vict_sex          846925
vict_descent      846914
Premis Cd         986486
Premis Desc       985915
Weapon Used Cd    326368
Weapon Desc       326368
Status            986499
Status Desc       986500
crime_id1         986489
Crm Cd 2           68912
Crm Cd 3            2310
Crm Cd 4              64
loc_desc          986500
Cross Street      152270
lat               986500
lon               986500
dtype: int64

In [19]:
reduced_data = renamed_data[["dr_no","date_occ","time_occ","division","div_name","crime_id","crime_desc","vict_age","vict_sex","vict_descent","crime_id1","loc_desc", "lat","lon"]]
reduced_data.dtypes

dr_no             int64
date_occ         object
time_occ          int64
division          int64
div_name         object
crime_id          int64
crime_desc       object
vict_age          int64
vict_sex         object
vict_descent     object
crime_id1       float64
loc_desc         object
lat             float64
lon             float64
dtype: object

In [20]:
converted_data = reduced_data.copy()
converted_data = converted_data.astype({"date_occ":"datetime64[ns]"})
converted_data.dtypes

dr_no                    int64
date_occ        datetime64[ns]
time_occ                 int64
division                 int64
div_name                object
crime_id                 int64
crime_desc              object
vict_age                 int64
vict_sex                object
vict_descent            object
crime_id1              float64
loc_desc                object
lat                    float64
lon                    float64
dtype: object

In [21]:
reduced_data.head()

Unnamed: 0,dr_no,date_occ,time_occ,division,div_name,crime_id,crime_desc,vict_age,vict_sex,vict_descent,crime_id1,loc_desc,lat,lon
0,190326475,03/01/2020 12:00:00 AM,2130,7,Wilshire,510,VEHICLE - STOLEN,0,M,O,510.0,1900 S LONGWOOD AV,34.0375,-118.3506
1,200106753,02/08/2020 12:00:00 AM,1800,1,Central,330,BURGLARY FROM VEHICLE,47,M,O,330.0,1000 S FLOWER ST,34.0444,-118.2628
2,200320258,11/04/2020 12:00:00 AM,1700,3,Southwest,480,BIKE - STOLEN,19,X,X,480.0,1400 W 37TH ST,34.021,-118.3002
3,200907217,03/10/2020 12:00:00 AM,2037,9,Van Nuys,343,SHOPLIFTING-GRAND THEFT ($950.01 & OVER),19,M,O,343.0,14000 RIVERSIDE DR,34.1576,-118.4387
4,220614831,08/17/2020 12:00:00 AM,1200,6,Hollywood,354,THEFT OF IDENTITY,28,M,H,354.0,1900 TRANSIENT,34.0944,-118.3277


In [22]:
date_data = reduced_data.copy()
date_data = date_data.loc[(date_data["date_occ"] > '09/01/2023' )]
date_data.count()

dr_no           302294
date_occ        302294
time_occ        302294
division        302294
div_name        302294
crime_id        302294
crime_desc      302294
vict_age        302294
vict_sex        258823
vict_descent    258822
crime_id1       302290
loc_desc        302294
lat             302294
lon             302294
dtype: int64

In [None]:
# converted_data['time_occ'] = pd.to_datetime(converted_data['time_occ'], format='%H:%M').dt.time
# converted_data.dtypes

In [23]:
totals_division = date_data['division'].value_counts()
totals_division

division
1     21520
12    18360
14    17681
3     17429
6     15897
20    15250
15    15192
7     14791
18    14778
13    14773
2     14488
8     13904
11    12817
10    12757
9     12571
17    12416
5     12343
21    12269
19    11995
4     11213
16     9850
Name: count, dtype: int64

In [24]:
totals_crime = date_data['crime_id'].value_counts()
totals_crime

crime_id
510    34862
624    23133
330    19085
354    18282
310    17986
       ...  
906        1
830        1
432        1
347        1
446        1
Name: count, Length: 137, dtype: int64

In [25]:
totals_crime = date_data['crime_desc'].value_counts()
totals_crime

crime_desc
VEHICLE - STOLEN                                34862
BATTERY - SIMPLE ASSAULT                        23133
BURGLARY FROM VEHICLE                           19085
THEFT OF IDENTITY                               18282
BURGLARY                                        17986
                                                ...  
FIREARMS RESTRAINING ORDER (FIREARMS RO)            1
INCEST (SEXUAL ACTS BETWEEN BLOOD RELATIVES)        1
BLOCKING DOOR INDUCTION CENTER                      1
GRAND THEFT / INSURANCE FRAUD                       1
PETTY THEFT - AUTO REPAIR                           1
Name: count, Length: 137, dtype: int64

In [26]:
groups_crime = date_data.groupby(["crime_id"])
groups_crime.count()

Unnamed: 0_level_0,dr_no,date_occ,time_occ,division,div_name,crime_desc,vict_age,vict_sex,vict_descent,crime_id1,loc_desc,lat,lon
crime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
110,515,515,515,515,515,515,515,515,515,515,515,515,515
113,4,4,4,4,4,4,4,4,4,4,4,4,4
121,1167,1167,1167,1167,1167,1167,1167,1167,1167,1167,1167,1167,1167
122,102,102,102,102,102,102,102,102,102,102,102,102,102
210,10039,10039,10039,10039,10039,10039,10039,10035,10035,10039,10039,10039,10039
...,...,...,...,...,...,...,...,...,...,...,...,...,...
949,33,33,33,33,33,33,33,33,33,33,33,33,33
950,17,17,17,17,17,17,17,17,17,17,17,17,17
951,112,112,112,112,112,112,112,112,112,112,112,112,112
954,12,12,12,12,12,12,12,12,12,12,12,12,12


In [27]:
date_data.columns

Index(['dr_no', 'date_occ', 'time_occ', 'division', 'div_name', 'crime_id',
       'crime_desc', 'vict_age', 'vict_sex', 'vict_descent', 'crime_id1',
       'loc_desc', 'lat', 'lon'],
      dtype='object')

In [28]:
date_data.to_csv('date_data.csv', index=False)

In [None]:
date_data.to_json('date_data.json', orient='records')

In [None]:
crime_details = date_data.loc[(date_data["crime_id"] == 113)]
crime_details

In [None]:
crime_desc2 = date_data['crime_desc'].unique()
len(crime_desc2)

In [None]:
crime_id2 = date_data['crime_id'].unique()
len(crime_id2)

In [None]:
crimes = date_data.loc[(date_data["crime_id"] > 100) & (date_data["crime_id"] < 200) ]
crimes

In [None]:
crimes['crime_id'].unique()

In [None]:
crimes['crime_desc'].unique()

In [None]:
descriptions = date_data[f"{'crime_desc'}"]
descriptions.unique()

In [None]:
crimes2 = date_data.loc[(date_data["crime_id"] >= 200) & (date_data["crime_id"] < 300) ]

In [None]:
crimes2['crime_id'].unique()

In [None]:
crimes2['crime_desc'].unique()

In [None]:
crimes3 = date_data.loc[(date_data["crime_id"] >= 300) & (date_data["crime_id"] < 400) ]

In [None]:
crimes3['crime_id'].unique()

In [None]:
crimes3['crime_desc'].unique()

In [None]:
crimes4 = date_data.loc[(date_data["crime_id"] >= 400) & (date_data["crime_id"] < 500) ]

In [None]:
crimes4['crime_id'].unique()

In [None]:
crimes4['crime_desc'].unique()

In [None]:
crimes5 = date_data.loc[(date_data["crime_id"] >= 500) & (date_data["crime_id"] < 600) ]

In [None]:
crimes5['crime_id'].unique()

In [None]:
crimes5['crime_desc'].unique()

In [None]:
crimes6['crime_id'].unique()

In [None]:
crimes6['crime_desc'].unique()

In [None]:
crimes7 = date_data.loc[(date_data["crime_id"] >= 700) & (date_data["crime_id"] < 800) ]

In [None]:
crimes7['crime_id'].unique()

In [None]:
crimes7['crime_desc'].unique()

In [None]:
crimes8 = date_data.loc[(date_data["crime_id"] >= 800) & (date_data["crime_id"] < 900) ]

In [None]:
crimes8['crime_id'].unique()

In [None]:
crimes8['crime_desc'].unique()

In [None]:
crimes9 = date_data.loc[(date_data["crime_id"] >= 900) & (date_data["crime_id"] < 1000) ]

In [None]:
crimes9['crime_id'].unique()

In [None]:
crimes9['crime_desc'].unique()