In [1]:
import json
import pandas as pd
from pathlib import Path
file_path = Path('/home/li2seo4/workspace/capstone/2025_nypd.json')

with open(file_path, 'r') as f:
    data = json.load(f)

print(f"Total record number: {len(data):,}")
print(f"\nRecord Structure:")
print(f"  attributes: {len(data[0])}")
print(f"\nStarting 15 Attributes:")
for i, key in enumerate(list(data[0].keys())[:15], 1):
    print(f"  {i:2d}. {key}")

remaining = len(data[0]) - 15
if remaining > 0:
    print(f"  ... remaining {remaining} Attributes")

print(f"\nFirst Record Sample: ")
sample = data[0]
for key in list(sample.keys())[:8]:
    val = sample[key]
    if isinstance(val, str) and len(val) > 50:
        val = val[:47] + "..."
    print(f"  {key}: {val}")
# Check null values
null_counts = {}
for record in data:
    for key, val in record.items():
        if val is None:
            null_counts[key] = null_counts.get(key, 0) + 1

print("\nAttributes with most nulls:")
for key in sorted(null_counts.keys(), key=lambda x: null_counts[x], reverse=True)[:10]:
    count = null_counts[key]
    pct = (count / len(data)) * 100
    print(f"  {key:20s}: {count:7,d} ({pct:5.1f}%)")
summary = {
    'Total Records': f"{len(data):,}",
    'Total Fields': len(data[0]),
    'Fields with Nulls': len(null_counts),
}

print("\nSummary:")
for key, val in summary.items():
    print(f"  {key:20s}: {val}")


df = pd.DataFrame(data)
df.to_csv('raw.csv', index=False)

Total record number: 202,732

Record Structure:
  attributes: 46

Starting 15 Attributes:
   1. type
   2. cmplntNum
   3. precinct
   4. borough
   5. cmplntStartDate
   6. cmplntEndDate
   7. dateId
   8. cmplntStartTime
   9. cmplntEndTime
  10. timeId
  11. suspId
  12. vicId
  13. crimeStatus
  14. jurisdictionCode
  15. jurisdictionDescription
  ... remaining 31 Attributes

First Record Sample: 
  type: COMPLAINT
  cmplntNum: 298725583
  precinct: 34
  borough: MANHATTAN
  cmplntStartDate: 1/1/2025
  cmplntEndDate: 1/1/2025
  dateId: 1/1/2025
  cmplntStartTime: 7:00:00

Attributes with most nulls:
  incidentId          : 202,562 ( 99.9%)
  locationType        : 202,562 ( 99.9%)
  venueType           : 202,562 ( 99.9%)
  fatalFlag           : 202,562 ( 99.9%)
  perpId              : 202,562 ( 99.9%)
  arrestKey           : 131,490 ( 64.9%)
  keyCode             : 131,490 ( 64.9%)
  lawCode             : 131,490 ( 64.9%)
  latitude            : 131,323 ( 64.8%)
  longitude         

#### Process Longitude and Latitude fields 
- The lonLat field and longitude/latitude fields are redundant, yet there are different format of lonLat
- POINT(lat, lon) and POINT(lon lat)

In [2]:
import numpy as np
df = pd.read_csv('raw.csv')

  df = pd.read_csv('raw.csv')


- First, check UNKNOWN, blanks ,and nulls of lonlat

In [3]:
is_unknown = df['lonLat'].astype(str).str.strip().str.upper().eq('UNKNOWN')
is_na = df['lonLat'].isna()
is_blank = df['lonLat'].astype(str).str.strip().eq('')
print(f"Unknown: {is_unknown.sum()}, NA: {is_na.sum()}, Blank: {is_blank.sum()}")
unknown = df[is_unknown].copy()
unknown[['lonLat','longitude','latitude']] 

Unknown: 6, NA: 0, Blank: 0


Unnamed: 0,lonLat,longitude,latitude
10103,UNKNOWN,,
50990,UNKNOWN,,
102433,UNKNOWN,,
202632,UNKNOWN,,
202640,UNKNOWN,,
202689,UNKNOWN,,


In [4]:
import re
import math
def parse_lon_lat(value):
    # blank and unknown processing
    if pd.isna(value) or str(value).strip().upper() == 'UNKNOWN' or str(value).strip() == '':
        return np.nan, np.nan
    # match with regexp
    value_str = str(value).strip().upper()
    pattern = r'POINT\s*\(\s*([-0-9.]+)\s*(?:,|\s+)\s*([-0-9.]+)\s*\)'
    match = re.search(pattern, value_str)
    if not match:
        return np.nan, np.nan
    # judge the order of longitude and latitude. Condering the location, use value of acquired numbers themselves
    x1 = float(match.group(1))
    x2 = float(match.group(2))
    if x1>0 and x2<0:
        lat = x1
        lon = x2
    else:
        lat = x2
        lon = x1

    return (lon, lat)
df[['lon', 'lat']] = df['lonLat'].apply(lambda x: pd.Series(parse_lon_lat(x)))
check = df[['lon', 'lat','longitude','latitude']].copy()
check.head()

Unnamed: 0,lon,lat,longitude,latitude
0,-73.928393,40.866479,,
1,-73.978747,40.68136,,
2,-74.157942,40.620452,,
3,-73.990868,40.750664,,
4,-74.215793,40.540771,,


In [5]:
# Now, compared parsed lon/lat with original longitude/latitude columns. skip if original longitude/latitude is NA or blank or unknown
def is_close(row):
    if pd.isna(row['longitude']) or str(row['longitude']).strip().upper() == 'UNKNOWN' or str(row['longitude']).strip() == '':
        return True
    if pd.isna(row['latitude']) or str(row['latitude']).strip().upper() == 'UNKNOWN' or str(row['latitude']).strip() == '':
        return True
    return math.isclose(row['lon'], row['longitude'], abs_tol=1e-6) and math.isclose(row['lat'], row['latitude'], abs_tol=1e-6)
check['is_close'] = check.apply(is_close, axis=1)
print(f"Number of close matches: {check['is_close'].sum()} out of {len(check)}")

Number of close matches: 202732 out of 202732


In [6]:
# count coverted nas
nacount = check[['lon', 'lat']].isna().sum()
print(nacount)
checkna = check[check[['lon', 'lat']].isna().any(axis=1)]
checkna.head()

lon    6
lat    6
dtype: int64


Unnamed: 0,lon,lat,longitude,latitude,is_close
10103,,,,,True
50990,,,,,True
102433,,,,,True
202632,,,,,True
202640,,,,,True


In [7]:
check.dropna(subset=['longitude', 'latitude'], inplace=True)
check.head()

Unnamed: 0,lon,lat,longitude,latitude,is_close
131320,-73.868498,40.669378,-73.868498,40.669378,True
131321,-73.904319,40.692065,-73.904319,40.692065,True
131322,-73.878308,40.803914,-73.878308,40.803914,True
131323,-73.994069,40.761556,-73.994069,40.761556,True
131324,-73.82374,40.751279,-73.82374,40.751279,True


- We can conclude that just using the extracted lon and lat would be enough. Drop original lonlat, longitude and latitude in df, rename the extracted ones. Save it for next step.

In [8]:
df.drop(columns=['lonLat', 'longitude','latitude'], inplace=True)
df.rename(columns={'lon': 'longitude', 'lat': 'latitude'}, inplace=True)
df.to_csv('step1.csv', index=False)


#### split the dataset into three parts by type

In [9]:
cmplt = df[df['type']=='COMPLAINT']
arrest = df[df['type']=='ARREST']
shooting = df[df['type']=='SHOOTING']
print(f"COMPLAINT records: {len(cmplt):,}")
print(f"ARREST records: {len(arrest):,}")
print(f"SHOOTING records: {len(shooting):,}")
print(f"Total records: {len(df):,}")

COMPLAINT records: 131,320
ARREST records: 71,242
SHOOTING records: 170
Total records: 202,732


#### Check complaint type records
- drop all null columns
- find redundant columns

In [10]:
empty_col_cmplt = cmplt.columns[cmplt.isnull().all()]
print(empty_col_cmplt)
cmplt.dropna(axis=1, how='all', inplace=True)
print(cmplt.columns)
print(cmplt.head())

Index(['arrestKey', 'keyCode', 'lawCode', 'perpAgeGroup', 'perpSex',
       'perpRace', 'xCoord', 'yCoord', 'incidentId', 'locationType',
       'venueType', 'fatalFlag', 'perpId'],
      dtype='object')
Index(['type', 'cmplntNum', 'precinct', 'borough', 'cmplntStartDate',
       'cmplntEndDate', 'dateId', 'cmplntStartTime', 'cmplntEndTime', 'timeId',
       'suspId', 'vicId', 'crimeStatus', 'jurisdictionCode',
       'jurisdictionDescription', 'offenseCode', 'offenseId', 'lawCategory',
       'spatialContext', 'offenseDescription', 'nypdCode', 'policeDescription',
       'premisesDescription', 'reportDate', 'suspAgeGroup', 'suspRace',
       'suspSex', 'vicAgeGroup', 'vicRace', 'vicSex', 'longitude', 'latitude'],
      dtype='object')
        type    cmplntNum  precinct        borough cmplntStartDate  \
0  COMPLAINT  298725583.0        34      MANHATTAN        1/1/2025   
1  COMPLAINT  298902928.0        78       BROOKLYN        1/1/2025   
2  COMPLAINT  298725764.0       121  STATEN 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cmplt.dropna(axis=1, how='all', inplace=True)


In [11]:
cmplt['cmplntNum'] = cmplt['cmplntNum'].astype(int, errors='ignore')
filter = cmplt['cmplntNum'].astype(str) != cmplt['suspId'].str[:-2]
filtered = cmplt[filter]
print(f"Records with different suspect and complaint IDs: {len(filtered):,}")
filter = cmplt['cmplntNum'].astype(str) != cmplt['vicId'].str[:-2]
filtered = cmplt[filter]
print(f"Records with different victim and complaint IDs: {len(filtered):,}")
filter = cmplt['cmplntNum'].astype(str) != cmplt['offenseId'].str[:-2]
filtered = cmplt[filter]
print(f"Records with different offense and complaint IDs: {len(filtered):,}")
filtered_date = cmplt[filter]
print(f"Records with different complaint start date and date ID: {len(filtered_date):,}")
filter = cmplt['cmplntStartTime'] !=cmplt['timeId']
filtered_time = cmplt[filter]
print(f"Records with different complaint start time and time ID: {len(filtered_time):,}")
cmplt.head()


Records with different suspect and complaint IDs: 0
Records with different victim and complaint IDs: 0
Records with different offense and complaint IDs: 0
Records with different complaint start date and date ID: 0
Records with different complaint start time and time ID: 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cmplt['cmplntNum'] = cmplt['cmplntNum'].astype(int, errors='ignore')


Unnamed: 0,type,cmplntNum,precinct,borough,cmplntStartDate,cmplntEndDate,dateId,cmplntStartTime,cmplntEndTime,timeId,...,premisesDescription,reportDate,suspAgeGroup,suspRace,suspSex,vicAgeGroup,vicRace,vicSex,longitude,latitude
0,COMPLAINT,298725583,34,MANHATTAN,1/1/2025,1/1/2025,1/1/2025,7:00:00,7:10:00,7:00:00,...,STREET,01/01/2025,UNKNOWN,BLACK HISPANIC,M,18-24,BLACK HISPANIC,M,-73.928393,40.866479
1,COMPLAINT,298902928,78,BROOKLYN,1/1/2025,1/2/2025,1/1/2025,20:00:00,17:00:00,20:00:00,...,RESIDENCE-HOUSE,01/02/2025,UNKNOWN,UNKNOWN,UNKNOWN,45-64,WHITE,M,-73.978747,40.68136
2,COMPLAINT,298725764,121,STATEN ISLAND,1/1/2025,1/1/2025,1/1/2025,12:30:00,12:40:00,12:30:00,...,RESIDENCE-HOUSE,01/01/2025,18-24,BLACK,M,45-64,BLACK,M,-74.157942,40.620452
3,COMPLAINT,298712881,14,MANHATTAN,1/1/2025,1/1/2025,1/1/2025,5:10:00,5:23:00,5:10:00,...,TRANSIT FACILITY (OTHER),01/01/2025,18-24,WHITE,M,UNKNOWN,UNKNOWN,E,-73.990868,40.750664
4,COMPLAINT,298949885,123,STATEN ISLAND,1/1/2025,1/1/2025,1/1/2025,0:00:00,0:15:00,0:00:00,...,RESIDENCE-HOUSE,01/05/2025,25-44,UNKNOWN,F,45-64,WHITE,F,-74.215793,40.540771


In [12]:
naid_s = cmplt[cmplt['suspId'].isna()]
naid_v = cmplt[cmplt['vicId'].isna()]
naid_o = cmplt[cmplt['offenseId'].isna()]
print(f"Records with null suspect ID: {len(naid_s):,}")
print(f"Records with null victim ID: {len(naid_v):,}")
print(f"Records with null offense ID: {len(naid_o):,}")

Records with null suspect ID: 0
Records with null victim ID: 0
Records with null offense ID: 0


- suspect id, victim id, offense id are the same. We can drop them.
- combine starttime, startdate, endtime, enddate as datetime type
- rename cmpIntNum as id

In [13]:
cmplt.rename(columns={'cmplntNum': 'id'}, inplace=True)
cmplt['start_datetime'] = pd.to_datetime(cmplt['cmplntStartDate']+' '+cmplt['cmplntStartTime'], errors='coerce')
cmplt['end_datetime'] = pd.to_datetime(cmplt['cmplntEndDate']+' '+cmplt['cmplntEndTime'], errors='coerce')
cmplt['end_datetime'].replace({pd.NaT:'UNKNOWN'}, inplace=True)
cmplt.drop(columns=['cmplntStartDate', 'cmplntStartTime', 'cmplntEndDate', 'cmplntEndTime','vicId', 'suspId', 'offenseId'], inplace=True)  # Not dropping dateId or timeId, other sections are using them
id = cmplt.pop('id')
cmplt.insert(0, 'id', id)
print(cmplt.isna().mean().sort_values(ascending=False))
cmplt.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cmplt.rename(columns={'cmplntNum': 'id'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cmplt['start_datetime'] = pd.to_datetime(cmplt['cmplntStartDate']+' '+cmplt['cmplntStartTime'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cmplt['end_datetime'] = pd.to_datetime(cmplt['cmplntEndDate']+' '+cmplt['cmplntEndTime'], err

latitude                   0.000023
longitude                  0.000023
id                         0.000000
policeDescription          0.000000
start_datetime             0.000000
vicSex                     0.000000
vicRace                    0.000000
vicAgeGroup                0.000000
suspSex                    0.000000
suspRace                   0.000000
suspAgeGroup               0.000000
reportDate                 0.000000
premisesDescription        0.000000
nypdCode                   0.000000
type                       0.000000
offenseDescription         0.000000
spatialContext             0.000000
lawCategory                0.000000
offenseCode                0.000000
jurisdictionDescription    0.000000
jurisdictionCode           0.000000
crimeStatus                0.000000
timeId                     0.000000
dateId                     0.000000
borough                    0.000000
precinct                   0.000000
end_datetime               0.000000
dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cmplt.drop(columns=['cmplntStartDate', 'cmplntStartTime', 'cmplntEndDate', 'cmplntEndTime','vicId', 'suspId', 'offenseId'], inplace=True)  # Not dropping dateId or timeId, other sections are using them


Unnamed: 0,id,type,precinct,borough,dateId,timeId,crimeStatus,jurisdictionCode,jurisdictionDescription,offenseCode,...,suspAgeGroup,suspRace,suspSex,vicAgeGroup,vicRace,vicSex,longitude,latitude,start_datetime,end_datetime
0,298725583,COMPLAINT,34,MANHATTAN,1/1/2025,7:00:00,COMPLETED,0,N.Y. POLICE DEPT,105,...,UNKNOWN,BLACK HISPANIC,M,18-24,BLACK HISPANIC,M,-73.928393,40.866479,2025-01-01 07:00:00,2025-01-01 07:10:00
1,298902928,COMPLAINT,78,BROOKLYN,1/1/2025,20:00:00,COMPLETED,0,N.Y. POLICE DEPT,341,...,UNKNOWN,UNKNOWN,UNKNOWN,45-64,WHITE,M,-73.978747,40.68136,2025-01-01 20:00:00,2025-01-02 17:00:00
2,298725764,COMPLAINT,121,STATEN ISLAND,1/1/2025,12:30:00,COMPLETED,0,N.Y. POLICE DEPT,121,...,18-24,BLACK,M,45-64,BLACK,M,-74.157942,40.620452,2025-01-01 12:30:00,2025-01-01 12:40:00
3,298712881,COMPLAINT,14,MANHATTAN,1/1/2025,5:10:00,COMPLETED,17,MTA POLICE DEPT,236,...,18-24,WHITE,M,UNKNOWN,UNKNOWN,E,-73.990868,40.750664,2025-01-01 05:10:00,2025-01-01 05:23:00
4,298949885,COMPLAINT,123,STATEN ISLAND,1/1/2025,0:00:00,COMPLETED,0,N.Y. POLICE DEPT,351,...,25-44,UNKNOWN,F,45-64,WHITE,F,-74.215793,40.540771,2025-01-01 00:00:00,2025-01-01 00:15:00


#### Check arrest type records
- Check if arrestKey and offenseId are the same
- drop all null columns

In [14]:
arrest.dropna(axis = 1, how='all', inplace=True)
arrest['arrestKey'] = arrest['arrestKey'].astype(int)
naid_o = arrest[arrest['offenseId'].isna()]
print(f"ARREST records with null offense ID: {len(naid_o):,}") 
filtered = arrest[arrest['arrestKey'].astype(str) != arrest['offenseId'].str[:-2]]
print(f"ARREST records with different arrest key and offense ID: {len(filtered):,}")

ARREST records with null offense ID: 0
ARREST records with different arrest key and offense ID: 0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  arrest.dropna(axis = 1, how='all', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  arrest['arrestKey'] = arrest['arrestKey'].astype(int)


- We can drop offenseId. Reset arrestKey as id

In [15]:
arrest.drop(columns = ['offenseId'], inplace=True)
arrest.rename(columns={'arrestKey': 'id'}, inplace=True)
id = arrest.pop('id')
arrest.insert(0, 'id', id)
print(arrest.isna().mean().sort_values(ascending=False))
arrest.head()

id                    0.0
type                  0.0
longitude             0.0
yCoord                0.0
xCoord                0.0
perpRace              0.0
perpSex               0.0
perpAgeGroup          0.0
lawCode               0.0
keyCode               0.0
policeDescription     0.0
nypdCode              0.0
offenseDescription    0.0
lawCategory           0.0
offenseCode           0.0
jurisdictionCode      0.0
dateId                0.0
borough               0.0
precinct              0.0
latitude              0.0
dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  arrest.drop(columns = ['offenseId'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  arrest.rename(columns={'arrestKey': 'id'}, inplace=True)


Unnamed: 0,id,type,precinct,borough,dateId,jurisdictionCode,offenseCode,lawCategory,offenseDescription,nypdCode,policeDescription,keyCode,lawCode,perpAgeGroup,perpSex,perpRace,xCoord,yCoord,longitude,latitude
131320,298725428,ARREST,75,BROOKLYN,1/1/2025,0,0,MISDEMEANOR,ASSAULT 3 & RELATED OFFENSES,101,ASSAULT 3,344,PL 1200001,25-44,M,BLACK,1020730.0,183173.0,-73.868498,40.669378
131321,298736076,ARREST,83,BROOKLYN,1/1/2025,0,0,MISDEMEANOR,VEHICLE AND TRAFFIC LAWS,922,"TRAFFIC,UNCLASSIFIED MISDEMEAN",348,VTL0511001,45-64,M,WHITE HISPANIC,1010783.0,191425.0,-73.904319,40.692065
131322,298746553,ARREST,41,BRONX,1/1/2025,72,0,MISDEMEANOR,OFFENSES AGAINST PUBLIC ADMINI,759,"PUBLIC ADMINISTATION,UNCLASS M",359,PL 2052002,18-24,M,BLACK HISPANIC,1017940.0,232184.0,-73.878308,40.803914
131323,298706302,ARREST,18,MANHATTAN,1/1/2025,0,0,FELONY,FELONY ASSAULT,105,STRANGULATION 1ST,106,PL 1211200,25-44,M,BLACK,985893.0,216732.0,-73.994069,40.761556
131324,298725497,ARREST,109,QUEENS,1/1/2025,0,0,MISDEMEANOR,ASSAULT 3 & RELATED OFFENSES,101,ASSAULT 3,344,PL 1200001,25-44,F,WHITE HISPANIC,1033086.0,213033.0,-73.82374,40.751279


#### Check shooting type records
- check if vicId,incidentId, perpId are the same.


In [16]:
shooting.dropna(axis = 1, how='all', inplace=True)
shooting['incidentId'] = shooting['incidentId'].astype(int)
filtered = shooting[shooting['incidentId'].astype(str) != shooting['perpId'].str[:-2]]
print(f"SHOOTING records with different incident ID and perpetrator ID: {len(filtered):,}")
filtered = shooting[shooting['incidentId'].astype(str) != shooting['vicId'].str[:-2]]
print(f"SHOOTING records with different incident ID and victim ID: {len(filtered):,}")
naid_p = shooting[shooting['incidentId'].isna()]
naid_v = shooting[shooting['vicId'].isna()]
print(f"SHOOTING records with null perpetrator ID: {len(naid_p):,}")
print(f"SHOOTING records with null victim ID: {len(naid_v):,}")

SHOOTING records with different incident ID and perpetrator ID: 0
SHOOTING records with different incident ID and victim ID: 0
SHOOTING records with null perpetrator ID: 0
SHOOTING records with null victim ID: 0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  shooting.dropna(axis = 1, how='all', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  shooting['incidentId'] = shooting['incidentId'].astype(int)


- We can drop victimId and perpId. Set incidentId as id

In [17]:
shooting.drop(columns=['perpId', 'vicId'], inplace=True)
shooting.rename(columns={'incidentId': 'id'}, inplace=True)
id = shooting.pop('id')
shooting.insert(0, 'id', id)
print(shooting.isna().mean().sort_values(ascending=False))
shooting.head()

latitude            0.017647
longitude           0.017647
vicSex              0.000000
fatalFlag           0.000000
venueType           0.000000
locationType        0.000000
yCoord              0.000000
xCoord              0.000000
perpRace            0.000000
perpSex             0.000000
perpAgeGroup        0.000000
id                  0.000000
type                0.000000
vicAgeGroup         0.000000
nypdCode            0.000000
spatialContext      0.000000
offenseCode         0.000000
jurisdictionCode    0.000000
timeId              0.000000
dateId              0.000000
borough             0.000000
precinct            0.000000
vicRace             0.000000
dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  shooting.drop(columns=['perpId', 'vicId'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  shooting.rename(columns={'incidentId': 'id'}, inplace=True)


Unnamed: 0,id,type,precinct,borough,dateId,timeId,jurisdictionCode,offenseCode,spatialContext,nypdCode,...,perpAgeGroup,perpSex,perpRace,xCoord,yCoord,locationType,venueType,fatalFlag,longitude,latitude
202562,298756519,SHOOTING,103,QUEENS,1/1/2025,23:18:00,0,0,OUTSIDE,0,...,UNKNOWN,UNKNOWN,UNKNOWN,1036944.0,194475.0,COMMERCIAL,BAR/NIGHT CLUB,False,-73.809959,40.700318
202563,298756516,SHOOTING,40,BRONX,1/1/2025,21:22:00,2,0,OUTSIDE,0,...,UNKNOWN,UNKNOWN,UNKNOWN,1000688.0,236005.0,HOUSE,HOUSE,False,-73.940614,40.814448
202564,298756517,SHOOTING,44,BRONX,1/1/2025,5:15:00,0,0,OUTSIDE,0,...,45-64,M,BLACK,1009255.0,244278.0,STREET,UNKNOWN,True,-73.909635,40.837127
202565,298756513,SHOOTING,88,BROOKLYN,1/1/2025,0:16:00,0,0,INSIDE,0,...,UNKNOWN,UNKNOWN,UNKNOWN,990117.0,192144.0,HOUSE,HOUSE,False,-73.978842,40.694076
202566,298756517,SHOOTING,44,BRONX,1/1/2025,5:15:00,0,0,OUTSIDE,0,...,45-64,M,BLACK,1009255.0,244278.0,STREET,UNKNOWN,False,-73.909635,40.837127


#### Find shared columns of all three sections.

In [18]:
shared_cols = set(cmplt.columns) & set(arrest.columns) & set(shooting.columns)
print(f"Shared columns: {shared_cols}")

Shared columns: {'jurisdictionCode', 'id', 'dateId', 'longitude', 'offenseCode', 'precinct', 'borough', 'nypdCode', 'latitude', 'type'}


In [19]:
cols = ['id','type','dateId','borough','precinct','nypdCode','latitude','longitude','jurisdictionCode','offenseCode']
cmplt_main = cmplt[cols].copy()
arrest_main = arrest[cols].copy()
shooting_main = shooting[cols].copy()
main = pd.concat([cmplt_main, arrest_main, shooting_main], ignore_index=True)
print(main.shape)
print(main.head())


(202732, 10)
          id       type    dateId        borough  precinct  nypdCode  \
0  298725583  COMPLAINT  1/1/2025      MANHATTAN        34       382   
1  298902928  COMPLAINT  1/1/2025       BROOKLYN        78       353   
2  298725764  COMPLAINT  1/1/2025  STATEN ISLAND       121       269   
3  298712881  COMPLAINT  1/1/2025      MANHATTAN        14       782   
4  298949885  COMPLAINT  1/1/2025  STATEN ISLAND       123       254   

    latitude  longitude  jurisdictionCode  offenseCode  
0  40.866479 -73.928393                 0          105  
1  40.681360 -73.978747                 0          341  
2  40.620452 -74.157942                 0          121  
3  40.750664 -73.990868                17          236  
4  40.540771 -74.215793                 0          351  


In [20]:
print(cmplt.head())
cmplt.drop(columns=cols[1:], inplace=True)
cmplt.head()

          id       type  precinct        borough    dateId    timeId  \
0  298725583  COMPLAINT        34      MANHATTAN  1/1/2025   7:00:00   
1  298902928  COMPLAINT        78       BROOKLYN  1/1/2025  20:00:00   
2  298725764  COMPLAINT       121  STATEN ISLAND  1/1/2025  12:30:00   
3  298712881  COMPLAINT        14      MANHATTAN  1/1/2025   5:10:00   
4  298949885  COMPLAINT       123  STATEN ISLAND  1/1/2025   0:00:00   

  crimeStatus  jurisdictionCode jurisdictionDescription  offenseCode  ...  \
0   COMPLETED                 0        N.Y. POLICE DEPT          105  ...   
1   COMPLETED                 0        N.Y. POLICE DEPT          341  ...   
2   COMPLETED                 0        N.Y. POLICE DEPT          121  ...   
3   COMPLETED                17         MTA POLICE DEPT          236  ...   
4   COMPLETED                 0        N.Y. POLICE DEPT          351  ...   

  suspAgeGroup        suspRace  suspSex  vicAgeGroup         vicRace vicSex  \
0      UNKNOWN  BLACK HIS

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cmplt.drop(columns=cols[1:], inplace=True)


Unnamed: 0,id,timeId,crimeStatus,jurisdictionDescription,lawCategory,spatialContext,offenseDescription,policeDescription,premisesDescription,reportDate,suspAgeGroup,suspRace,suspSex,vicAgeGroup,vicRace,vicSex,start_datetime,end_datetime
0,298725583,7:00:00,COMPLETED,N.Y. POLICE DEPT,FELONY,FRONT OF,ROBBERY,"ROBBERY,NECKCHAIN/JEWELRY",STREET,01/01/2025,UNKNOWN,BLACK HISPANIC,M,18-24,BLACK HISPANIC,M,2025-01-01 07:00:00,2025-01-01 07:10:00
1,298902928,20:00:00,COMPLETED,N.Y. POLICE DEPT,MISDEMEANOR,FRONT OF,PETIT LARCENY,"LARCENY,PETIT FROM BUILDING,UNATTENDED, PACKAG...",RESIDENCE-HOUSE,01/02/2025,UNKNOWN,UNKNOWN,UNKNOWN,45-64,WHITE,M,2025-01-01 20:00:00,2025-01-02 17:00:00
2,298725764,12:30:00,COMPLETED,N.Y. POLICE DEPT,FELONY,INSIDE,CRIMINAL MISCHIEF & RELATED OF,"MISCHIEF,CRIMINAL, UNCL 2ND",RESIDENCE-HOUSE,01/01/2025,18-24,BLACK,M,45-64,BLACK,M,2025-01-01 12:30:00,2025-01-01 12:40:00
3,298712881,5:10:00,COMPLETED,MTA POLICE DEPT,MISDEMEANOR,INSIDE,DANGEROUS WEAPONS,"WEAPONS, POSSESSION, ETC",TRANSIT FACILITY (OTHER),01/01/2025,18-24,WHITE,M,UNKNOWN,UNKNOWN,E,2025-01-01 05:10:00,2025-01-01 05:23:00
4,298949885,0:00:00,COMPLETED,N.Y. POLICE DEPT,MISDEMEANOR,FRONT OF,CRIMINAL MISCHIEF & RELATED OF,"MISCHIEF, CRIMINAL 4, OF MOTOR",RESIDENCE-HOUSE,01/05/2025,25-44,UNKNOWN,F,45-64,WHITE,F,2025-01-01 00:00:00,2025-01-01 00:15:00


In [21]:
# timeId is redundant with cmplntStartTime, so we can drop it
cmplt.drop(columns=['timeId'], inplace=True)
# Move start and end datetime after id
cmplt.insert(1, 'start_datetime', cmplt.pop('start_datetime'))
cmplt.insert(2, 'end_datetime', cmplt.pop('end_datetime'))
cmplt.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cmplt.drop(columns=['timeId'], inplace=True)


Unnamed: 0,id,start_datetime,end_datetime,crimeStatus,jurisdictionDescription,lawCategory,spatialContext,offenseDescription,policeDescription,premisesDescription,reportDate,suspAgeGroup,suspRace,suspSex,vicAgeGroup,vicRace,vicSex
0,298725583,2025-01-01 07:00:00,2025-01-01 07:10:00,COMPLETED,N.Y. POLICE DEPT,FELONY,FRONT OF,ROBBERY,"ROBBERY,NECKCHAIN/JEWELRY",STREET,01/01/2025,UNKNOWN,BLACK HISPANIC,M,18-24,BLACK HISPANIC,M
1,298902928,2025-01-01 20:00:00,2025-01-02 17:00:00,COMPLETED,N.Y. POLICE DEPT,MISDEMEANOR,FRONT OF,PETIT LARCENY,"LARCENY,PETIT FROM BUILDING,UNATTENDED, PACKAG...",RESIDENCE-HOUSE,01/02/2025,UNKNOWN,UNKNOWN,UNKNOWN,45-64,WHITE,M
2,298725764,2025-01-01 12:30:00,2025-01-01 12:40:00,COMPLETED,N.Y. POLICE DEPT,FELONY,INSIDE,CRIMINAL MISCHIEF & RELATED OF,"MISCHIEF,CRIMINAL, UNCL 2ND",RESIDENCE-HOUSE,01/01/2025,18-24,BLACK,M,45-64,BLACK,M
3,298712881,2025-01-01 05:10:00,2025-01-01 05:23:00,COMPLETED,MTA POLICE DEPT,MISDEMEANOR,INSIDE,DANGEROUS WEAPONS,"WEAPONS, POSSESSION, ETC",TRANSIT FACILITY (OTHER),01/01/2025,18-24,WHITE,M,UNKNOWN,UNKNOWN,E
4,298949885,2025-01-01 00:00:00,2025-01-01 00:15:00,COMPLETED,N.Y. POLICE DEPT,MISDEMEANOR,FRONT OF,CRIMINAL MISCHIEF & RELATED OF,"MISCHIEF, CRIMINAL 4, OF MOTOR",RESIDENCE-HOUSE,01/05/2025,25-44,UNKNOWN,F,45-64,WHITE,F


In [22]:
arrest.head()

Unnamed: 0,id,type,precinct,borough,dateId,jurisdictionCode,offenseCode,lawCategory,offenseDescription,nypdCode,policeDescription,keyCode,lawCode,perpAgeGroup,perpSex,perpRace,xCoord,yCoord,longitude,latitude
131320,298725428,ARREST,75,BROOKLYN,1/1/2025,0,0,MISDEMEANOR,ASSAULT 3 & RELATED OFFENSES,101,ASSAULT 3,344,PL 1200001,25-44,M,BLACK,1020730.0,183173.0,-73.868498,40.669378
131321,298736076,ARREST,83,BROOKLYN,1/1/2025,0,0,MISDEMEANOR,VEHICLE AND TRAFFIC LAWS,922,"TRAFFIC,UNCLASSIFIED MISDEMEAN",348,VTL0511001,45-64,M,WHITE HISPANIC,1010783.0,191425.0,-73.904319,40.692065
131322,298746553,ARREST,41,BRONX,1/1/2025,72,0,MISDEMEANOR,OFFENSES AGAINST PUBLIC ADMINI,759,"PUBLIC ADMINISTATION,UNCLASS M",359,PL 2052002,18-24,M,BLACK HISPANIC,1017940.0,232184.0,-73.878308,40.803914
131323,298706302,ARREST,18,MANHATTAN,1/1/2025,0,0,FELONY,FELONY ASSAULT,105,STRANGULATION 1ST,106,PL 1211200,25-44,M,BLACK,985893.0,216732.0,-73.994069,40.761556
131324,298725497,ARREST,109,QUEENS,1/1/2025,0,0,MISDEMEANOR,ASSAULT 3 & RELATED OFFENSES,101,ASSAULT 3,344,PL 1200001,25-44,F,WHITE HISPANIC,1033086.0,213033.0,-73.82374,40.751279


In [23]:
arrest.drop(columns=cols[1:], inplace=True)
arrest.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  arrest.drop(columns=cols[1:], inplace=True)


Unnamed: 0,id,lawCategory,offenseDescription,policeDescription,keyCode,lawCode,perpAgeGroup,perpSex,perpRace,xCoord,yCoord
131320,298725428,MISDEMEANOR,ASSAULT 3 & RELATED OFFENSES,ASSAULT 3,344,PL 1200001,25-44,M,BLACK,1020730.0,183173.0
131321,298736076,MISDEMEANOR,VEHICLE AND TRAFFIC LAWS,"TRAFFIC,UNCLASSIFIED MISDEMEAN",348,VTL0511001,45-64,M,WHITE HISPANIC,1010783.0,191425.0
131322,298746553,MISDEMEANOR,OFFENSES AGAINST PUBLIC ADMINI,"PUBLIC ADMINISTATION,UNCLASS M",359,PL 2052002,18-24,M,BLACK HISPANIC,1017940.0,232184.0
131323,298706302,FELONY,FELONY ASSAULT,STRANGULATION 1ST,106,PL 1211200,25-44,M,BLACK,985893.0,216732.0
131324,298725497,MISDEMEANOR,ASSAULT 3 & RELATED OFFENSES,ASSAULT 3,344,PL 1200001,25-44,F,WHITE HISPANIC,1033086.0,213033.0


In [24]:
shooting.head()

Unnamed: 0,id,type,precinct,borough,dateId,timeId,jurisdictionCode,offenseCode,spatialContext,nypdCode,...,perpAgeGroup,perpSex,perpRace,xCoord,yCoord,locationType,venueType,fatalFlag,longitude,latitude
202562,298756519,SHOOTING,103,QUEENS,1/1/2025,23:18:00,0,0,OUTSIDE,0,...,UNKNOWN,UNKNOWN,UNKNOWN,1036944.0,194475.0,COMMERCIAL,BAR/NIGHT CLUB,False,-73.809959,40.700318
202563,298756516,SHOOTING,40,BRONX,1/1/2025,21:22:00,2,0,OUTSIDE,0,...,UNKNOWN,UNKNOWN,UNKNOWN,1000688.0,236005.0,HOUSE,HOUSE,False,-73.940614,40.814448
202564,298756517,SHOOTING,44,BRONX,1/1/2025,5:15:00,0,0,OUTSIDE,0,...,45-64,M,BLACK,1009255.0,244278.0,STREET,UNKNOWN,True,-73.909635,40.837127
202565,298756513,SHOOTING,88,BROOKLYN,1/1/2025,0:16:00,0,0,INSIDE,0,...,UNKNOWN,UNKNOWN,UNKNOWN,990117.0,192144.0,HOUSE,HOUSE,False,-73.978842,40.694076
202566,298756517,SHOOTING,44,BRONX,1/1/2025,5:15:00,0,0,OUTSIDE,0,...,45-64,M,BLACK,1009255.0,244278.0,STREET,UNKNOWN,False,-73.909635,40.837127


In [25]:
shooting.drop(columns=cols[1:], inplace=True)
shooting.rename(columns = {'timeId':'time'}, inplace=True)
shooting.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  shooting.drop(columns=cols[1:], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  shooting.rename(columns = {'timeId':'time'}, inplace=True)


Unnamed: 0,id,time,spatialContext,vicAgeGroup,vicRace,vicSex,perpAgeGroup,perpSex,perpRace,xCoord,yCoord,locationType,venueType,fatalFlag
202562,298756519,23:18:00,OUTSIDE,<18,BLACK,F,UNKNOWN,UNKNOWN,UNKNOWN,1036944.0,194475.0,COMMERCIAL,BAR/NIGHT CLUB,False
202563,298756516,21:22:00,OUTSIDE,<18,WHITE HISPANIC,M,UNKNOWN,UNKNOWN,UNKNOWN,1000688.0,236005.0,HOUSE,HOUSE,False
202564,298756517,5:15:00,OUTSIDE,18-24,BLACK,M,45-64,M,BLACK,1009255.0,244278.0,STREET,UNKNOWN,True
202565,298756513,0:16:00,INSIDE,25-44,BLACK,M,UNKNOWN,UNKNOWN,UNKNOWN,990117.0,192144.0,HOUSE,HOUSE,False
202566,298756517,5:15:00,OUTSIDE,18-24,BLACK,M,45-64,M,BLACK,1009255.0,244278.0,STREET,UNKNOWN,False


- Now, we need to check duplicates so that id can serve as primary keys

In [26]:
cmplt_dup = cmplt[cmplt['id'].duplicated(keep=False)].sort_values('id')
cmplt_dup.head()


Unnamed: 0,id,start_datetime,end_datetime,crimeStatus,jurisdictionDescription,lawCategory,spatialContext,offenseDescription,policeDescription,premisesDescription,reportDate,suspAgeGroup,suspRace,suspSex,vicAgeGroup,vicRace,vicSex


In [27]:
arrest_dup = arrest[arrest['id'].duplicated(keep=False)].sort_values('id')
arrest_dup.head()

Unnamed: 0,id,lawCategory,offenseDescription,policeDescription,keyCode,lawCode,perpAgeGroup,perpSex,perpRace,xCoord,yCoord


In [28]:
shooting_dup = shooting[shooting['id'].duplicated(keep=False)].sort_values('id')
shooting_dup.head()

Unnamed: 0,id,time,spatialContext,vicAgeGroup,vicRace,vicSex,perpAgeGroup,perpSex,perpRace,xCoord,yCoord,locationType,venueType,fatalFlag
202564,298756517,5:15:00,OUTSIDE,18-24,BLACK,M,45-64,M,BLACK,1009255.0,244278.0,STREET,UNKNOWN,True
202566,298756517,5:15:00,OUTSIDE,18-24,BLACK,M,45-64,M,BLACK,1009255.0,244278.0,STREET,UNKNOWN,False
202569,298756517,5:15:00,OUTSIDE,45-64,BLACK,M,45-64,M,BLACK,1009255.0,244278.0,STREET,UNKNOWN,True
202571,298756517,5:15:00,OUTSIDE,45-64,BLACK,M,45-64,M,BLACK,1009255.0,244278.0,STREET,UNKNOWN,False
202562,298756519,23:18:00,OUTSIDE,<18,BLACK,F,UNKNOWN,UNKNOWN,UNKNOWN,1036944.0,194475.0,COMMERCIAL,BAR/NIGHT CLUB,False


In [29]:
main_dup = main[main['id'].duplicated(keep=False)].sort_values('id')
main_dup.head(30)
diff_cols = (
    main_dup.groupby('id')
       .nunique(dropna=False)
       .gt(1)
       .any()
)

diff_cols = diff_cols[diff_cols].index.tolist()
diff_cols

['type', 'offenseCode']

- there are no duplicates within arrest or complaints, but there exists multiple shooting records that corresponds to same id. We will need to add self-increment PK for this table
- As for the records in main. we can see one complaint may be related to multiple shooting record, making there duplicate records in the main. It means that we will have to one-hot encode type. On the other hand, offense code may also be different, with 0 and non-zero values. In this case, set the record to the non-zero value.

In [30]:
main['type'] = main['type'].astype(str).str.strip().str.upper()
main['offenseCode'] = pd.to_numeric(main['offenseCode'], errors='coerce')
main.loc[main['offenseCode'] == 0, 'offenseCode'] = np.nan
types = ['COMPLAINT', 'ARREST', 'SHOOTING']
flags = (
    pd.crosstab(main['id'], main['type'])
      .reindex(columns=types, fill_value=0)
      .clip(upper=1)
      .astype(int)
      .rename(columns={
          'COMPLAINT': 'is_complaint',
          'ARREST': 'is_arrest',
          'SHOOTING': 'is_shooting'
      })
      .reset_index()
)
cols_keep = [c for c in main.columns if c not in ['type']]
agg_dict = {c: 'first' for c in cols_keep}
agg_dict['offenseCode'] = 'max'
main_event = (
    main.groupby('id', as_index=False)
        .agg(agg_dict)
        .merge(flags, on='id', how='left')
)
main_event['offenseCode'] = main_event['offenseCode'].fillna(0).astype('Int64')
main_event.head()

Unnamed: 0,id,dateId,borough,precinct,nypdCode,latitude,longitude,jurisdictionCode,offenseCode,is_complaint,is_arrest,is_shooting
0,298704321,1/1/2025,QUEENS,101,792,40.59527,-73.756726,0,118,1,0,0
1,298704329,1/1/2025,BROOKLYN,62,419,40.602472,-74.003462,0,0,0,1,0
2,298704338,1/1/2025,MANHATTAN,9,639,40.726287,-73.987795,0,0,0,1,0
3,298704341,1/1/2025,QUEENS,101,792,40.59527,-73.756726,0,0,0,1,0
4,298704516,1/1/2025,MANHATTAN,14,406,40.758228,-73.989071,0,109,1,0,0


In [31]:
main_dup = main_event[main_event['id'].duplicated(keep=False)].sort_values('id')
print(main_dup)

Empty DataFrame
Columns: [id, dateId, borough, precinct, nypdCode, latitude, longitude, jurisdictionCode, offenseCode, is_complaint, is_arrest, is_shooting]
Index: []


In [32]:
main_event.to_csv('event.csv', index=False)
cmplt.to_csv('complaint.csv', index=False)
arrest.to_csv('arrest.csv', index=False)
shooting.to_csv('shooting.csv', index=False)