## NYPD Dataset

Dataset description at 
https://data.cityofnewyork.us/Public-Safety/NYPD-Complaint-Data-Historic/qgea-i56i



| Column | Description |
|--------|-------------------|
| CMPLNT_NUM | Randomly generated persistent ID for each complaint  |
| CMPLNT_FR_DT | Exact date of occurrence for the reported event (or starting date of occurrence, if CMPLNT_TO_DT exists) |
| CMPLNT_FR_TM | Exact time of occurrence for the reported event (or starting time of occurrence, if CMPLNT_TO_TM exists) |
| CMPLNT_TO_DT | Ending date of occurrence for the reported event, if exact time of occurrence is unknown |
| CMPLNT_TO_TM | Ending time of occurrence for the reported event, if exact time of occurrence is unknown |
| RPT_DT | Date event was reported to police  |
| KY_CD | Three digit offense classification code |
| OFNS_DESC | Description of offense corresponding with key code |
| PD_CD | Three digit internal classification code (more granular than Key Code) |
| PD_DESC | Description of internal classification corresponding with PD code (more granular than Offense Description) |
| CRM_ATPT_CPTD_CD | Indicator of whether crime was successfully completed or attempted, but failed or was interrupted prematurely |
| LAW_CAT_CD | Level of offense: felony, misdemeanor, violation  |
| JURIS_DESC | Jurisdiction responsible for incident. Either internal, like Police, Transit, and Housing; or external, like Correction, Port Authority, etc. |
| BORO_NM | The name of the borough in which the incident occurred |
| ADDR_PCT_CD | The precinct in which the incident occurred |
| LOC_OF_OCCUR_DESC | Specific location of occurrence in or around the premises; inside, opposite of, front of, rear of |
| PREM_TYP_DESC | Specific description of premises; grocery store, residence, street, etc. |
| PARKS_NM | Name of NYC park, playground or greenspace of occurrence, if applicable (state parks are not included) |
| HADEVELOPT | Name of NYCHA housing development of occurrence, if applicable |
| X_COORD_CD | X-coordinate for New York State Plane Coordinate System, Long Island Zone, NAD 83, units feet (FIPS 3104) |
| Y_COORD_CD | Y-coordinate for New York State Plane Coordinate System, Long Island Zone, NAD 83, units feet (FIPS 3104) |
| Latitude | Latitude coordinate for Global Coordinate System, WGS 1984, decimal degrees (EPSG 4326)  |
| Longitude | Longitude coordinate for Global Coordinate System, WGS 1984, decimal degrees (EPSG 4326) |
|--------|-------------------|

In [None]:
# From https://data.cityofnewyork.us/Public-Safety/NYPD-Complaint-Data-Historic/qgea-i56i/data
# !curl 'https://data.cityofnewyork.us/api/views/qgea-i56i/rows.csv?accessType=DOWNLOAD' -o nypd.csv

In [None]:
import pandas as pd
import numpy as np

In [None]:
df = pd.read_csv("nypd.csv", low_memory = False)

In [None]:
len(df)

In [None]:
# df [ df.CMPLNT_FR_DT.str.contains('1015') == True ]

In [None]:
# There are a few rows that contain year 1015, 1016, ... that trigger an error during date conversion
# We replace all years written as 10XX with 20XX
# Note the usage of regular expressions
df.CMPLNT_FR_DT.replace(to_replace = '(\d\d)/(\d\d)/10(\d\d)', value=r'\1/\2/20\3', inplace = True, regex=True )
df.CMPLNT_TO_DT.replace(to_replace = '(\d\d)/(\d\d)/10(\d\d)', value=r'\1/\2/20\3', inplace = True, regex=True )

In [None]:
# Similarly, a few hours are written as 24:00:00, which also triggers errors.
# We fix these hours
df.CMPLNT_FR_TM.replace(to_replace = '24:00:00', value='00:00:00', inplace = True)
df.CMPLNT_TO_TM.replace(to_replace = '24:00:00', value='00:00:00', inplace = True)

In [None]:
# Convert the two separate date and time columns into single datetime columns
df['CMPLNT_FR'] = pd.to_datetime(df.CMPLNT_FR_DT + ' ' + df.CMPLNT_FR_TM, format='%m/%d/%Y %H:%M:%S')
df['CMPLNT_TO'] = pd.to_datetime(df.CMPLNT_TO_DT + ' ' + df.CMPLNT_TO_TM, format='%m/%d/%Y %H:%M:%S')

# Convert RPT_DT to date
df.RPT_DT = pd.to_datetime(df.RPT_DT, format="%m/%d/%Y")

In [None]:
# MISD means MISDEMEANOR
df.LAW_CAT_CD.replace(to_replace = 'MISD', value='MISDEMEANOR', inplace = True )

# Replace ' ' with NULL
df.LOC_OF_OCCUR_DESC.replace(to_replace = ' ', value=np.nan, inplace = True)

##### We should have a discussion about data exploration/cleaning here

In [None]:
# Find the unique values in each column
# 
# df.describe(include = [np.object, 'category']).T['unique']
df.describe(include = 'all').T['unique'].sort_values()

In [None]:
df.CRM_ATPT_CPTD_CD.value_counts()

In [None]:
df.LAW_CAT_CD.value_counts()

In [None]:
df.BORO_NM.value_counts()

In [None]:
df.LOC_OF_OCCUR_DESC.value_counts()

In [None]:
# We will mark these as categorical, and we will make these ENUMs in the database
# As a matter of personal preference, I convert to ENUMS datatypes with cardinality less than 10
# as these are typically the attributes that do not have additional values appearing
# 
df.CRM_ATPT_CPTD_CD = pd.Categorical(df.CRM_ATPT_CPTD_CD) # COMPLETED / ATTEMPTED
df.LAW_CAT_CD = pd.Categorical(df.LAW_CAT_CD) # MISDEMEANOR / FELONY / VIOLATION / MISD     
df.BORO_NM = pd.Categorical(df.BORO_NM) 
df.LOC_OF_OCCUR_DESC = pd.Categorical(df.LOC_OF_OCCUR_DESC) 

In [None]:
# Find unique values and maximum length of various columns
for column in df.columns.values:
    datatype = df[column].dtype.name
    unique_values = len(df[column].value_counts())
    print(column, '\t', datatype, '\t', unique_values)
    if datatype == 'object' or datatype =='category':
        m = max([len(str(x)) for x in df[column].value_counts().index.values])
        print("Max length:", m)


In [None]:
df.KY_CD = pd.Categorical(df.KY_CD) #these are codes, not numbers
df.PD_CD = pd.Categorical(df.PD_CD) #these are codes, not numbers
df.ADDR_PCT_CD = pd.Categorical(df.ADDR_PCT_CD) #these are codes, not numbers

In [None]:
# These columns are redundant
# 
df.drop('Lat_Lon', axis=1, inplace=True)
df.drop('CMPLNT_FR_DT', axis=1, inplace=True)
df.drop('CMPLNT_TO_DT', axis=1, inplace=True)
df.drop('CMPLNT_FR_TM', axis=1, inplace=True)
df.drop('CMPLNT_TO_TM', axis=1, inplace=True)

In [None]:
 df.dtypes

The fields 

PD_CD, PD_DESC    
KY_CD, OFNS_DESC  
JURIS_DESC    
PREM_TYP_DESC    
HADEVELOPT    
PARKS_NM                     

would be better off as foreign keys or enums. They take too much space as strings.

### Writing a Pandas Dataframe in a MySQL Table

Now we will connect to our MySQL server. We will use the SQLAlchemy library of Python.

If you do not have the library, you need to install it by typing in the shell:

In [None]:
!sudo -H pip3 install -U sqlalchemy

In [None]:
from sqlalchemy import create_engine

conn_string = 'mysql://{user}:{password}@{host}/?charset=utf8mb4'.format(
    host = 'db.ipeirotis.org', 
    user = 'root',
    password = 'sO5wAOyHFg2b8pf2',
    encoding = 'utf8mb4')

engine = create_engine(conn_string)
con = engine.connect()

Once we have connected successfully, we need to create our database:

In [None]:
# Query to create a database
db_name = 'nypd_complaints'
create_db_query = "CREATE DATABASE IF NOT EXISTS {db} DEFAULT CHARACTER SET 'utf8mb4'".format(db=db_name)

# Create a database
engine.execute(create_db_query)


In [None]:
# And lets switch to the database
engine.execute("USE {db}".format(db=db_name))

In [None]:
# In principle, we can let Pandas create the table, but we want to be a bit more predise
# with the data types, and we want to add documentation for each column
# from https://data.cityofnewyork.us/Public-Safety/NYPD-Complaint-Data-Historic/qgea-i56i

create_table_sql = '''
CREATE TABLE nypd (
  CMPLNT_NUM bigint(20) COMMENT 'Randomly generated persistent ID for each complaint ',
  CMPLNT_FR datetime COMMENT 'Exact date/time of occurrence for the reported event  (or starting date/time of occurrence, if CMPLNT_TO_DTTM exists)',
  CMPLNT_TO datetime  COMMENT 'Ending date/time of occurrence for the reported event, if exact time of occurrence is unknown',
  RPT_DT date COMMENT 'Date event was reported to police',
  KY_CD char(3)  COMMENT 'Three digit offense classification code',
  OFNS_DESC varchar(60)  COMMENT 'Description of offense corresponding with key code',
  PD_CD char(3) COMMENT 'Three digit internal classification code (more granular than Key Code)',
  PD_DESC varchar(60)  COMMENT 'Description of internal classification corresponding with PD code (more granular than Offense Description',
  CRM_ATPT_CPTD_CD enum('COMPLETED','ATTEMPTED')  COMMENT 'Indicator of whether crime was successfully completed or attempted, but failed or was interrupted prematurely',
  LAW_CAT_CD enum('FELONY','MISDEMEANOR','VIOLATION')  COMMENT 'Level of offense: felony, misdemeanor, violation',
  JURIS_DESC varchar(60)  COMMENT 'Jurisdiction responsible for incident. Either internal, like Police, Transit, and Housing; or external, like Correction, Port Authority, etc.',
  BORO_NM enum('BRONX','BROOKLYN','MANHATTAN','QUEENS','STATEN ISLAND') COMMENT 'The name of the borough in which the incident occurred',
  ADDR_PCT_CD char(3)  COMMENT 'The precinct in which the incident occurred',
  LOC_OF_OCCUR_DESC enum('FRONT OF','INSIDE','OPPOSITE OF','OUTSIDE','REAR OF') COMMENT 'Specific location of occurrence in or around the premises; inside, opposite of, front of, rear of',
  PREM_TYP_DESC varchar(60)  COMMENT 'Specific description of premises; grocery store, residence, street, etc.',
  PARKS_NM varchar(80)  COMMENT 'Name of NYC park, playground or greenspace of occurrence, if applicable (state parks are not included)',
  HADEVELOPT varchar(60)  COMMENT 'Name of NYCHA housing development of occurrence, if applicable',
  X_COORD_CD char(9)  COMMENT 'X-coordinate for New York State Plane Coordinate System, Long Island Zone, NAD 83, units feet (FIPS 3104)',
  Y_COORD_CD char(9)  COMMENT 'Y-coordinate for New York State Plane Coordinate System, Long Island Zone, NAD 83, units feet (FIPS 3104)',
  Latitude double COMMENT 'Latitude coordinate for Global Coordinate System, WGS\n1984, decimal degrees (EPSG 4326)',
  Longitude double  COMMENT 'Longitude coordinate for Global Coordinate System, WGS 1984, decimal degrees (EPSG 4326)',
  PRIMARY KEY (CMPLNT_NUM)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
'''
engine.execute(create_table_sql)

In [None]:
# Create a table
# See http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_sql.html for the documentation
from tqdm import tqdm
batchsize = 50000
batches = len(df) // batchsize + 1

t = tqdm(range(batches))

table_name = 'nypd'

for i in t:
    start = batchsize * i
    end = batchsize * (i+1)
    df[start:end].to_sql(
        name = table_name, 
        schema = db_name, 
        con = engine,
        if_exists = 'append',
        index = False, 
        chunksize = 1000)

In [None]:
# Once we have the data in the table, we also specify a primary key
# If we had FOREIGN KEYS we can add them in the same way
# add_key_query = 'ALTER TABLE nypd ADD PRIMARY KEY(CMPLNT_NUM)'
# engine.execute(add_key_query)

In [None]:
query =  "SELECT * FROM nypd LIMIT 100"
pd.read_sql(query, con=engine).head(5)


In [None]:
query =  "show full columns from nypd_complaints.nypd;"
pd.read_sql(query, con=engine)[['Field', 'Type', 'Comment']]


And remember that from Pandas it is also possible to export in other formats, such as Excel of CSV.

In [None]:
# The necessary library to write in Excel
# !sudo pip3 install -U xlwt

In [None]:
# Data quality issues to fix: KY_CD, OFNS_DESC

query = '''
SELECT KY_CD, OFNS_DESC, COUNT(*)
FROM nypd WHERE KY_CD IN (
SELECT KY_CD
FROM nypd
WHERE OFNS_DESC IS NOT NULL
GROUP BY KY_CD
HAVING COUNT(DISTINCT OFNS_DESC)>1)
GROUP BY KY_CD, OFNS_DESC
'''

df = pd.read_sql(query, con=engine)
df

In [None]:
# Data quality issues to fix: PD_CD, PD_DESC

query = '''
SELECT PD_CD, PD_DESC, COUNT(*)
FROM nypd WHERE PD_DESC IN (
SELECT PD_DESC
FROM nypd
WHERE PD_DESC IS NOT NULL
GROUP BY PD_DESC
HAVING COUNT(DISTINCT PD_CD)>1)
GROUP BY PD_CD, PD_DESC
'''

df = pd.read_sql(query, con=engine)
df