In [1]:
import pandas as pd
from datetime import datetime
from datetime import time

In [2]:
# reading all call records
calls_22 = pd.read_csv("data/policecalls2022.csv")
calls_23 = pd.read_csv("data/policecalls2023.csv")
calls_24 = pd.read_csv("data/policecalls2024.csv")

In [3]:
calls_data = pd.concat([calls_22, calls_23, calls_24])

## Data Cleaning

In [4]:
calls_data.dtypes

CDTS                object
EID                  int64
START_DATE          object
CALL_NUMBER         object
PRIORITY             int64
REPORT_DATE         object
OFFENSE_DATE        object
OFFENSE_TIME        object
CALLTYPE_CODE       object
CALL_TYPE           object
FINAL_DISPO_CODE    object
FINAL_DISPO         object
ADDRESS             object
CITY                object
STATE               object
dtype: object

In [5]:
# let us first make column names lowercase
calls_data.columns = calls_data.columns.str.lower()

In [6]:
# EID is a identification so let us make it an object
calls_data['eid'] = calls_data['eid'].astype(str)

In [7]:
calls_data.calltype_code.value_counts()

1195      439816
415       370679
WELCK     324931
1033A     279998
22500     199219
           ...  
602.11         7
10751          7
308            7
BARCK          7
CURF           7
Name: calltype_code, Length: 212, dtype: int64

In [8]:
# checking for missingness
calls_data.isna().sum()

# we are only missing addresses in call the df

cdts                     0
eid                      0
start_date               0
call_number              0
priority                 0
report_date              0
offense_date             0
offense_time             0
calltype_code            0
call_type                0
final_dispo_code         0
final_dispo              0
address             113995
city                     0
state                    0
dtype: int64

In [9]:
# date_format = '%m/%d/%Y %I:%M:%S %p'

# def to_date(s):
#     # change str to datetime type
#     return datetime.strptime(s, date_format).date()

In [10]:
# change all start_date
# calls_data['start_date'] = calls_data.start_date.apply(to_date)

In [11]:
# # change all report_date
# calls_data['report_date'] = calls_data['report_date'].apply(to_date)


In [12]:
# # change all offense_date
# calls_data['offense_date'] = calls_data['offense_date'].apply(to_date)

In [13]:
calls_data['offense_time']

0         02:49:02
1         03:05:51
2         04:35:25
3         03:38:14
4         23:25:49
            ...   
130529    14:08:34
130530    14:06:06
130531    13:14:37
130532    13:53:41
130533    14:32:08
Name: offense_time, Length: 3998463, dtype: object

In [14]:
to_time = lambda x: time(*map(int, x.split(':')))

In [15]:
calls_data['offense_time'] = calls_data['offense_time'].apply(to_time)

In [16]:
# dropping city and state as they are all San Jose, CA
calls_data = calls_data.drop(columns=['city','state'])

In [17]:
calls_data.shape[0]

3998463

In [18]:
calls_data['cdts'].nunique()

602431

In [19]:
calls_data['eid'].nunique()

590411

In [20]:
calls_data['call_number'].nunique()

591141

In [21]:
calls_data['calltype_code'].value_counts()

1195      439816
415       370679
WELCK     324931
1033A     279998
22500     199219
           ...  
602.11         7
10751          7
308            7
BARCK          7
CURF           7
Name: calltype_code, Length: 212, dtype: int64

In [22]:
call_types = calls_data['call_type'].value_counts()

In [23]:
call_types[call_types > 2500]

VEHICLE STOP                   439816
DISTURBANCE                    370679
WELFARE CHECK                  324931
ALARM, AUDIBLE                 279998
PARKING VIOLATION              199219
                                ...  
FEMALE CALLING FOR HELP          3001
ELDER/DEPENDENT ADULT ABUSE      2904
POSSESSION OF NARCOTICS          2811
ILLEGAL WEAPONS                  2808
ARSON (447A)                     2757
Name: call_type, Length: 93, dtype: int64

In [24]:
calls_data["Timestamp"] = pd.to_datetime(calls_data["cdts"].str[:14], format='%Y%m%d%H%M%S', errors='coerce')

Add timestamp column

In [25]:
different_dates = calls_data[calls_data["report_date"] != calls_data["offense_date"]]
different_dates

Unnamed: 0,cdts,eid,start_date,call_number,priority,report_date,offense_date,offense_time,calltype_code,call_type,final_dispo_code,final_dispo,address,Timestamp


Every value for report_date and offense_date is same

Remove unnecessary columns: report_date, offense_date, start_date, cdts

In [30]:
pp_calls_data = calls_data.drop(columns=['eid',
                                     'call_number',
                                     'report_date', 
                                     'offense_date', 
                                     'start_date', 
                                     'cdts'])

In [39]:
pp_calls_data.dtypes

priority                     int64
offense_time                object
calltype_code               object
call_type                   object
final_dispo_code            object
final_dispo                 object
address                     object
Timestamp           datetime64[ns]
dtype: object

In [36]:
pp_calls_data.to_csv('police_calls.csv', index=False)