In [1]:
import os
import time

import pickle
import pandas as pd

In [2]:
# Load the data.
# NOTE(review): unpickling can execute arbitrary code, so only load files that
# come from a trusted source.
data_path = "/Users/administrator/Documents/Projects/abq_crime/data/processed_data.pkl"
# Open through a context manager so the file handle is closed as soon as the
# load finishes instead of being left for the garbage collector.
with open(data_path, "rb") as data_file:
    extracted_data = pickle.load(data_file)

# Create the Pandas DataFrame.
data_columns = ["Object_ID", "Location", "Description", "Date", "Latitude", "Longitude"]
dataset = pd.DataFrame(extracted_data, columns=data_columns)

In [3]:
# Convert the date into a more readable format.
# Each raw value is divided by 1000 before being passed to time.ctime, i.e. it
# is treated as a count of milliseconds since the epoch.
# NOTE(review): time.ctime formats in the machine's local timezone, so the
# calendar date produced here can depend on where the notebook runs -- confirm
# this is intended.
# NOTE: this overwrites "Date" in place with strings, so re-running the cell
# without reloading the data fails on the division.
# perf_counter is the clock intended for measuring elapsed time; unlike
# time.time it is not affected by system clock adjustments.
time_start = time.perf_counter()
dataset["Date"] = dataset["Date"].apply(lambda t: time.ctime(t / 1000.0))
time_end = time.perf_counter()

print(dataset["Date"].head())
print("Entire operation took {} seconds.".format(time_end - time_start))

0    Tue Sep 22 17:00:00 2020
1    Tue Sep 22 17:00:00 2020
2    Tue Sep 22 17:00:00 2020
3    Tue Sep 22 17:00:00 2020
4    Tue Sep 22 17:00:00 2020
Name: Date, dtype: object
Entire operation took 0.0959169864654541 seconds.


The date contains the day of the week along with a timestamp (which is always set to 17:00:00). We need to split off the day of the week, extract the (year, month, day) data, and then put it in a format that is easy to read (e.g. 2020/09/22).

In [4]:
# Derive the full weekday name from the abbreviated one that leads each date string.
conversion_key = {
    "Mon": "Monday",
    "Tue": "Tuesday",
    "Wed": "Wednesday",
    "Thu": "Thursday",
    "Fri": "Friday",
    "Sat": "Saturday",
    "Sun": "Sunday",
}
dataset["Day of Week"] = dataset["Date"].apply(
    # The first whitespace-separated token is the abbreviated weekday.
    lambda full_date: conversion_key[full_date.split()[0]]
)

In [5]:
def pad_day(day):
    """Zero-pads the day to two digits.

    Parameters
    ----------
    day : string or int
        Day of the month, e.g. "5", "05" or 22.

    Returns
    -------
    day : string
        Two-digit day string, e.g. "05" or "22".

    Raises
    ------
    ValueError
        If `day` is a string that does not represent an integer.

    """
    # Normalise through int() so that input which is already padded ("05") is
    # not padded a second time, and so that integer input is always returned
    # as a string.
    return "{:02d}".format(int(day))

def extract_date_from_full(full_date, month_key):
    """Builds a YYYY/MM/DD string from a whitespace-separated full date.

    The input is split on whitespace; the second token is taken as the month
    name, the third as the day and the last as the year.

    Parameters
    ----------
    full_date : string
        String containing the full date data.
    month_key : dict
        Dictionary mapping the month name found in `full_date` to the value
        that should appear in the month position of the result.

    Returns
    -------
    extracted_date : string
        String containing the converted date.

    """
    tokens = full_date.split()
    year, month_name, day = tokens[-1], tokens[1], tokens[2]
    month_number = month_key[month_name]

    return "{}/{}/{}".format(year, month_number, pad_day(day))

In [6]:
# Extract the date information from the entire date data.
month_key = {"Jan": "01", "Feb": "02", "Mar": "03", "Apr": "04", "May": "05", "Jun": "06", "Jul": "07", "Aug": "08", "Sep": "09", "Oct": "10", "Nov": "11", "Dec": "12"}

# NOTE: this overwrites "Date" in place, so the cell cannot be re-run on its
# own output; reload and reconvert the data first.
# perf_counter is the clock intended for measuring elapsed time; unlike
# time.time it is not affected by system clock adjustments.
time_start = time.perf_counter()
dataset["Date"] = dataset["Date"].apply(extract_date_from_full, args=(month_key,))
time_end = time.perf_counter()

print("Entire operation took {} seconds.".format(time_end - time_start))

Entire operation took 0.06806111335754395 seconds.


In [7]:
# Rearrange the columns so that "Day of Week" sits directly before "Date".
data_columns = [
    "Object_ID",
    "Location",
    "Description",
    "Day of Week",
    "Date",
    "Latitude",
    "Longitude",
]
dataset = dataset.reindex(data_columns, axis="columns")

In [8]:
# Preview the first five rows of the DataFrame.
dataset.head(n=5)

Unnamed: 0,Object_ID,Location,Description,Day of Week,Date,Latitude,Longitude
0,45420062,I25 NORTHBOUND SE / COAL AV SE,TRAFFIC STOP,Tuesday,2020/09/22,35.078682,-106.63735
1,45420063,COORS BL NW / SEQUOIA RD NW,TRAFFIC STOP,Tuesday,2020/09/22,35.121313,-106.701621
2,45420064,COORS BL NW / EAGLE RANCH RD NW,DIRECT TRAFFIC,Tuesday,2020/09/22,35.174464,-106.673685
3,45420065,BRYN MAWR DR NE / MENAUL BL NE,TRAFFIC STOP,Tuesday,2020/09/22,35.109201,-106.60912
4,45420066,I25 NORTHBOUND SE / COAL AV SE,TRAFFIC STOP,Tuesday,2020/09/22,35.078682,-106.63735


In [11]:
# Persist the processed DataFrame to disk in pickle format.
save_path = "/Users/administrator/Documents/Projects/abq_crime/data/processed_dataframe.pkl"
dataset.to_pickle(path=save_path)