# A Solution Approach to Crime Classification

**Author**: Jhon Adrián Cerón-Guzmán.<br>
**Date**: March 2020.<br>
**Description**: This is my solution approach to the [San Francisco Crime Classification](https://www.kaggle.com/c/sf-crime/) challenge proposed on Kaggle.

# 0. Requirements

To encourage reproducibility, the technologies used are listed below, along with their respective versions:

1. Python **3.7.2**.
2. NumPy **1.17.2**.
3. SciPy **1.3.1**.
4. Scikit-learn **0.21.3**.
5. pandas **0.25.1**.
6. Matplotlib **3.1.1**.
7. Numba **0.48.0**.

In [None]:
import copy
import datetime
import os
import re

import numba
import numpy as np
import pandas as pd

In [None]:
# Settings shared by the cells below.
RANDOM_STATE = 91
DATE_FORMAT = '%Y-%m-%d %H:%M:%S'

# Data files are looked up in a "data" folder inside the working directory.
CURRENT_PATH = os.path.abspath(os.getcwd())
DATA_PATH = os.path.join(CURRENT_PATH, 'data')

In [None]:
# Logical dataset name -> full path inside DATA_PATH.
FNAMES = {
    key: os.path.join(DATA_PATH, fname)
    for key, fname in (
        ('original-train', 'train.csv'),
        ('train', 'projected-train.csv'),
        ('original-test', 'test.csv'),
        ('test', 'projected-test.csv'),
        ('sample-submission', 'sampleSubmission.csv'),
        ('crime-dataset', 'crime-dataset.csv'),
    )
}

# 1. Data Preprocessing

Let's map each crime category to its numerical representation.

In [None]:
# Only the first line (the header) of the sample submission is needed.
with open(FNAMES['sample-submission']) as f:
    first_line = f.readline()

if not first_line:
    # Fail here instead of leaving CRIME_CATEGORY undefined for later cells.
    raise ValueError(
        'Empty sample submission file: {}'.format(FNAMES['sample-submission']))

# The first header column is skipped (presumably the record id -- confirm);
# the remaining columns are numbered from 0 in the order they appear.
CRIME_CATEGORY = {
    category: idx
    for idx, category in enumerate(first_line.rstrip('\n').split(',')[1:])
}

In [None]:
# Columns kept when the raw datasets are projected.
PROJECTION = [
    'Id',
    'Dates',
    'Category',
    'DayOfWeek',
    'PdDistrict',
    'X',
    'Y',
]

Let's project the datasets, i.e., keep only the columns listed in the variable `PROJECTION`.

In [None]:
def dataset_projection(in_fname, out_fname, projection=PROJECTION):
    """Read a CSV file and keep only the columns named in ``projection``.

    Parameters
    ----------
    in_fname : str
        Path of the CSV file to read; its first row is taken as the header.
    out_fname : str
        Not used by this function; kept so existing callers keep working.
    projection : list
        Names of the columns to keep. Columns come out in the order in which
        they appear in the file, not in the order of this list.

    Returns
    -------
    header : list
        Names of the kept columns, preceded by 'Id' when no 'Id' column is
        among them.
    data : list
        One list of stripped string values per well-formed row. When 'Id' was
        added to the header, each record starts with its zero-based row
        number (skipped malformed rows still consume a number).
    """
    # Local import: the standard parser handles quoted fields (embedded
    # commas, escaped quotes, literal '|') that a plain split on ',' mangles.
    import csv

    header, data = [], []
    valid_columns = []
    insert_id = False

    with open(in_fname, newline='') as f:
        reader = csv.reader(f)
        for i, row in enumerate(reader):
            if i == 0:
                for j, col in enumerate(row):
                    if col in projection:
                        header.append(col)
                        valid_columns.append(j)

                insert_id = 'Id' not in header
                continue

            record = [row[j].strip() for j in valid_columns if j < len(row)]

            # Rows too short to provide every kept column are reported and skipped
            if len(record) != len(valid_columns):
                print('({}), Malformed columns at line {}'.format(in_fname, reader.line_num))
                continue

            if insert_id:
                record.insert(0, i-1)

            data.append(record)

    if insert_id:
        header.insert(0, 'Id')

    return header, data

In [None]:
# Project the raw train/test files; an output file that already exists is
# left untouched.
datasets = [
    [FNAMES['original-train'], FNAMES['train']],
    [FNAMES['original-test'], FNAMES['test']],
]
for in_fname, out_fname in datasets:
    if not os.path.isfile(out_fname):
        header, data = dataset_projection(in_fname, out_fname)

        # 'Dates' is parsed so the rows can be ordered chronologically, then
        # written back as text in the same format.
        df = (
            pd.DataFrame(data, columns=header)
            .assign(Dates=lambda frame: pd.to_datetime(frame['Dates'], format=DATE_FORMAT))
            .sort_values(by=['Dates'])
            .assign(Dates=lambda frame: frame['Dates'].dt.strftime(DATE_FORMAT))
        )

        df.to_csv(out_fname, index=False)

Finally, let's append the test set to the training one.

In [None]:
# Build the merged train+test file once; an existing file is reused.
if not os.path.isfile(FNAMES['crime-dataset']):
    # Both datasets share the same columns: 'Category' is dropped and a
    # 'Dataset' marker ('train' / 'test') is placed first.
    columns = ['Dataset'] + [col for col in PROJECTION if col != 'Category']

    frames = []
    for in_fname, dataset_type in [[FNAMES['train'], 'train'], [FNAMES['test'], 'test']]:
        dataset = pd.read_csv(in_fname)
        dataset['Dataset'] = dataset_type
        dataset = dataset[columns]

        frames.append(dataset)

    # A single pd.concat replaces repeated DataFrame.append calls, which were
    # removed in pandas 2.0 and copied the accumulated frame on every call.
    df = pd.concat(frames)

    # Order chronologically; ties are broken by dataset and record id.
    df['Dates'] = pd.to_datetime(df['Dates'], format=DATE_FORMAT)
    df = df.sort_values(by=['Dates', 'Dataset', 'Id'])
    df['Dates'] = df['Dates'].dt.strftime(DATE_FORMAT)

    df = df[columns]

    df.to_csv(FNAMES['crime-dataset'], index=False)

# 2. Feature Engineering

In [None]:
# Load the merged dataset and parse the textual 'Dates' column into timestamps.
df = pd.read_csv(FNAMES['crime-dataset']).assign(
    Dates=lambda frame: pd.to_datetime(frame['Dates'], format=DATE_FORMAT)
)

In [None]:
# Dataset labels are numbered in the order in which they first appear;
# both lookup directions are kept.
IDX_TO_DATASET = dict(enumerate(df['Dataset'].unique()))
DATASET_TO_IDX = {dataset: i for i, dataset in IDX_TO_DATASET.items()}

# Replace the labels by their numbers.
df['Dataset'] = df['Dataset'].map(DATASET_TO_IDX)

In [None]:
# Police districts are numbered in the order in which they first appear;
# both lookup directions are kept.
IDX_TO_DISTRICT = dict(enumerate(df['PdDistrict'].unique()))
DISTRICT_TO_IDX = {district: i for i, district in IDX_TO_DISTRICT.items()}

# Replace the district names by their numbers.
df['PdDistrict'] = df['PdDistrict'].map(DISTRICT_TO_IDX)

In [None]:
# Put 'Dataset' first and drop 'Category' and 'DayOfWeek'. 'Dataset' is
# filtered out of the old list as well, so re-running this cell does not
# prepend it a second time (which would duplicate the column and shift the
# column positions of the matrix built below).
PROJECTION = (['Dataset']
              + [col for col in PROJECTION
                 if col not in ['Dataset', 'Category', 'DayOfWeek']])

df = df[PROJECTION]
df = df.sort_values(by=['Dates', 'Dataset', 'Id'])

# Numeric matrix with the columns of PROJECTION, where 'Dates' is replaced by
# 'ts': the timestamp in whole seconds (nanoseconds floor-divided by 10 ** 9).
crimes = df.copy(deep=True)
crimes['ts'] = crimes['Dates'].values.astype(np.int64) // 10 ** 9
crimes = crimes[[('ts' if col == 'Dates' else col) for col in PROJECTION]].to_numpy().astype(float)

In [None]:
# Look-back windows, in hours, used when aggregating past crimes.
TIME_WINDOWS = [
    12,
    24,
    72,
    168,
    336,
]

In [None]:
@numba.jit(nopython=True)
def compute_distance(
        lat_1, lon_1,
        lat_2, lon_2):
    """Distance between two locations, using the haversine formula.

    Parameters
    ----------
    lat_1, lon_1 : float
        Latitude and longitude of the first location, in degrees.
    lat_2, lon_2 : float
        Latitude and longitude of the second location, in degrees.

    Returns
    -------
    float
        Distance in KM.

    Source: <https://stackoverflow.com/questions/19412462/>
    """
    phi_1 = np.radians(lat_1)
    phi_2 = np.radians(lat_2)

    half_lat_diff = (phi_2 - phi_1) / 2
    half_lon_diff = (np.radians(lon_2) - np.radians(lon_1)) / 2

    haversine = (np.square(np.sin(half_lat_diff))
                 + np.cos(phi_1)
                 * np.cos(phi_2)
                 * np.square(np.sin(half_lon_diff)))

    central_angle = 2 * np.arctan2(np.sqrt(haversine), np.sqrt(1-haversine))

    # 6373.0 is the approximate radius of earth in KM
    return 6373.0 * central_angle

In [None]:
@numba.jit(nopython=True)
def compute_aggregated_features(data, time_window, crime_radius, max_records=10000):
    """Compute aggregated features.

    For each processed record, count the records that happened strictly
    within the preceding time window: in total, in the same police district,
    and within each given radius around the record's location.

    Parameters
    ----------
    data : np.ndarray of floats
        A Numpy-like array of shape "(n, m)", where "n" is the number
        of records and "m" is the number of columns (or attributes).
        The strict order of the columns is presented below:
            Dataset,
            Id,
            Dates (as a timestamp in seconds),
            PdDistrict,
            X - Longitude,
            Y - Latitude
    time_window : int
        Time window (in hours).
    crime_radius : sequence of int
        Each value represents a radius in kilometers.
    max_records : int, optional
        Upper bound on the number of leading records of "data" for which
        features are computed (default 10000). Every processed record is
        compared against all of "data", so the cost grows with both sizes.

    Returns
    -------
    list
        One list of integers per processed record, laid out as:
            Dataset,
            Id,
            number of crimes within the time window,
            number of those crimes in the same police district,
            number of those crimes within each radius (one entry per radius)
    """
    n = len(data)

    # Never index past the end of "data": out-of-bounds reads are not checked
    # in nopython mode.
    n_records = min(n, max_records)

    # Let's transform the time window into seconds
    time_window = time_window * 60 * 60

    aggregated_features = []
    for i in range(n_records):
        ts = data[i,2]

        lower_ts = ts - time_window

        # Both bounds are exclusive, so records sharing the current timestamp
        # (the record itself included) are not counted.
        mask = ((lower_ts < data[:,2])
                & (data[:,2] < ts))

        historical_data = data[mask]
        m = len(historical_data)

        police_district = data[i,3]

        feature_vector = [
            int(data[i,0]),
            int(data[i,1]),
            m, # number of crimes within the time window
            0 # number of crimes attended by the same police department district
            ]
        feature_vector = feature_vector + [0 for j in crime_radius]

        lat_1 = data[i,5]
        lon_1 = data[i,4]

        for j in range(m):
            feature_vector[3] += 1 if police_district == historical_data[j,3] else 0

            lat_2 = historical_data[j,5]
            lon_2 = historical_data[j,4]

            # Let's compute the number of crimes within each given radius
            distance = compute_distance(lat_1, lon_1, lat_2, lon_2)

            for k, rad in enumerate(crime_radius):
                feature_vector[4+k] += 1 if distance <= rad else 0

        aggregated_features.append(feature_vector)

    return aggregated_features

In [None]:
def feature_engineering(
        df, data, time_windows,
        idx_to_dataset, idx_to_district,
        crime_radius=(1, 2, 4, 8, 16)):
    """Compute the process of feature engineering.

    Currently only the first time window is processed and the computed
    features are discarded; nothing is returned or written.

    Parameters
    ----------
    df : pd.DataFrame
        Not used yet.
    data : np.ndarray
        Numeric matrix handed over to "compute_aggregated_features".
    time_windows : list
        Time windows, in hours.
    idx_to_dataset : dict
        Not used yet.
    idx_to_district : dict
        Not used yet.
    crime_radius : sequence of int
        Radii in kilometers. The default is a tuple so that no mutable
        object is shared between calls.
    """
    # NOTE(review): output file name templates, not used yet -- presumably
    # meant for saving the per-window results; confirm.
    agg_ds_fname = os.path.join(DATA_PATH, 'agg-dataset-{}Hrs.csv')
    cum_ds_fname = os.path.join(DATA_PATH, 'agg-dataset-{}Hrs-cumulative.csv')

    crime_radius = np.array(crime_radius, dtype=int)

    for time_window in time_windows:
        compute_aggregated_features(data, time_window, crime_radius)

        # Only the first time window is processed for now
        break

In [None]:
# Run the feature-engineering step on the merged dataset.
feature_engineering(
    df=df,
    data=crimes,
    time_windows=TIME_WINDOWS,
    idx_to_dataset=IDX_TO_DATASET,
    idx_to_district=IDX_TO_DISTRICT,
)