This notebook merges the two SFPD crime datasets into a single table with a common set of columns, and then stores the result in a .csv file.

In [1]:
import os
import time
import glob
from datetime import datetime

import numpy as np
import pandas as pd

import seaborn as sns
from pprint import pprint
import matplotlib.pyplot as plt

In [2]:
# Read the two raw SFPD exports from disk.
dataset1 = pd.read_csv("../data/SFPD_Crime_Data_2008_2018.csv")
dataset2 = pd.read_csv("../data/SFPD_Crime_Data_2018_Present.csv")

# Columns to keep from each raw dataset. The two lists describe the same
# fields, but under each dataset's own column names.
subset1 = [
    "Category",
    "DayOfWeek",
    "Date",
    "Time",
    "PdDistrict",
    "X",
    "Y",
]
subset2 = [
    "Incident Category",
    "Incident Day of Week",
    "Incident Date",
    "Incident Time",
    "Police District",
    "Latitude",
    "Longitude",
]

# Column names (and their order) shared by both subsets once they are renamed.
master_columns = [
    "Category",
    "Day of Week",
    "Date",
    "Time",
    "District",
    "Longitude",
    "Latitude",
]

# Old-name -> master-name mappings used to rename the columns of each subset.
# Columns that already carry their master name are simply left out.
subset1_rename = {
    "DayOfWeek": "Day of Week",
    "PdDistrict": "District",
    "X": "Longitude",
    "Y": "Latitude",
}
subset2_rename = {
    "Incident Category": "Category",
    "Incident Day of Week": "Day of Week",
    "Incident Date": "Date",
    "Incident Time": "Time",
    "Police District": "District",
}

#### Modifying Dataset 1

In [3]:
# Build the cleaned subset of dataset 1 as a new frame. `dataset1` itself is
# left untouched, so re-running this cell always starts from the loaded data
# and no assignment is made on a slice of another frame.
dataset1_subset = (
    dataset1
    .dropna(subset=subset1)           # discard rows missing any required field
    .loc[:, subset1]                  # keep only the columns of interest
    .rename(columns=subset1_rename)   # switch to the master column names
)

# Remove the rows whose District is "Out of SF", if there are any.
# Filtering the rows (rather than assigning a filtered Series back to the
# column) matters: the assignment would align on the index and leave those
# rows in place with a NaN District, re-introducing the nulls dropped above.
# NOTE(review): the comparison is case-sensitive and the sample rows show
# upper-case district names -- confirm the exact spelling in the raw data.
dataset1_subset = dataset1_subset.loc[dataset1_subset["District"] != "Out of SF"].copy()

# Upper-case the "Day of Week" column.
dataset1_subset["Day of Week"] = dataset1_subset["Day of Week"].str.upper()

dataset1_subset.head()

Unnamed: 0,Category,Day of Week,Date,Time,District,Longitude,Latitude
0,WARRANTS,FRIDAY,07/01/2011,08:00,NORTHERN,-122.439758,37.802151
1,DRUG/NARCOTIC,TUESDAY,10/18/2005,14:30,TENDERLOIN,-122.414318,37.779944
2,VEHICLE THEFT,SATURDAY,01/29/2005,13:45,BAYVIEW,-122.388799,37.737576
3,NON-CRIMINAL,THURSDAY,06/02/2011,02:52,CENTRAL,-122.414354,37.803109
4,VEHICLE THEFT,SATURDAY,02/01/2003,08:00,BAYVIEW,-122.401097,37.724556


#### Modifying Dataset 2

In [5]:
# Build the cleaned subset of dataset 2 as a new frame. `dataset2` itself is
# left untouched, so re-running this cell always starts from the loaded data
# and no assignment is made on a slice of another frame.
dataset2_subset = (
    dataset2
    .dropna(subset=subset2)           # discard rows missing any required field
    .loc[:, subset2]                  # keep only the columns of interest
    .rename(columns=subset2_rename)   # switch to the master column names
)

# Convert the dates from YYYY/MM/DD to MM/DD/YYYY (the format shown in the
# dataset 1 preview), parsing the whole column at once instead of row by row.
dataset2_subset["Date"] = (
    pd.to_datetime(dataset2_subset["Date"], format="%Y/%m/%d")
    .dt.strftime("%m/%d/%Y")
)

# Upper-case the text columns so their values are formatted like dataset 1's.
for text_column in ["Day of Week", "Category", "District"]:
    dataset2_subset[text_column] = dataset2_subset[text_column].str.upper()

# NOTE(review): unlike the dataset 1 cell, no "Out of SF" district filtering
# is applied here -- confirm whether this dataset needs it as well.

# Reorder the columns to the master layout (Longitude before Latitude).
dataset2_subset = dataset2_subset[master_columns]

dataset2_subset.head()

Unnamed: 0,Category,Day of Week,Date,Time,District,Longitude,Latitude
3,ASSAULT,SUNDAY,08/16/2020,03:13,BAYVIEW,-122.397729,37.754827
4,MALICIOUS MISCHIEF,SUNDAY,08/16/2020,03:38,MISSION,-122.422044,37.76654
5,NON-CRIMINAL,SUNDAY,08/16/2020,13:40,SOUTHERN,-122.403712,37.784044
6,WEAPONS OFFENSE,SUNDAY,08/16/2020,16:18,TARAVAL,-122.507416,37.751003
7,MISSING PERSON,WEDNESDAY,08/12/2020,22:00,NORTHERN,-122.43214,37.780496


### Merge the Datasets Together

In [10]:
# Stack the two cleaned subsets on top of each other. `ignore_index=True`
# discards the row labels carried over from the source frames and numbers
# the combined rows from 0.
datasets = [dataset1_subset, dataset2_subset]
dataset = pd.concat(objs=datasets, ignore_index=True)

In [11]:
# Show the last five rows as a quick check of the end of the merged frame.
dataset.tail(n=5)

Unnamed: 0,Category,Day of Week,Date,Time,District,Longitude,Latitude
2534373,LARCENY THEFT,SATURDAY,08/15/2020,20:44,CENTRAL,-122.411886,37.788808
2534374,NON-CRIMINAL,SATURDAY,08/15/2020,08:00,NORTHERN,-122.436204,37.792263
2534375,BURGLARY,SATURDAY,08/15/2020,15:47,CENTRAL,-122.408402,37.788293
2534376,RECOVERED VEHICLE,SATURDAY,08/15/2020,21:52,TARAVAL,-122.474494,37.741234
2534377,BURGLARY,SATURDAY,08/15/2020,23:50,CENTRAL,-122.408402,37.788293


The two datasets have been successfully concatenated. We can now save the result as a .csv file so that it can be loaded later.

In [13]:
# Write the merged frame to disk. The row index is written as well (the
# pandas default), so the file starts with an extra, unnamed column.
# NOTE(review): pass index=False if readers of this file do not need it.
dataset.to_csv(path_or_buf="../data/SFPD_Crime_Data_Concatenated.csv")