# SF Crime Data Preparation

- Authors: Henry Gräser, Jonas Müller, Thomas Wolff, Hannes Harnisch
- Created on: June 28, 2024
- Description: Preparing the data of the kaggle dataset SF Crime
- Kaggle competition: [SF Crime Classification](https://www.kaggle.com/c/sf-crime/data) 

## Libraries

In [1]:
import pandas as pd
# For holidays
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
# For timezone and sun position
import pytz
from astral import LocationInfo
from astral.sun import sun

import re
import os

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

## Data import

In [2]:
full_train = pd.read_csv('data/train.csv')

full_train.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [3]:
print(f"Number of data points: {full_train.shape[0]}")

Number of data points: 878049


## Train Test Split

Train test split should have following configuration:
- 20% of the full_train dataset should be test data
- All data points in the year of 2015 should be included to the test dataset and won't be in the train dataset
- The remaining necessary data points to reach 20% will be filled randomly

In [4]:
def filter_2015_data_points(df, filter: bool):
    df_copy = df.copy()
    # Create a temporary column to convert 'Dates' to datetime
    df_copy['TempDates'] = pd.to_datetime(df['Dates'])
    # Filter rows where the year in 'TempDates' column is 2015
    if filter:
        df_2015 = df_copy[df_copy['TempDates'].dt.year != 2015]
    else:
        df_2015 = df_copy[df_copy['TempDates'].dt.year == 2015]
    # Drop the temporary column
    df_2015 = df_2015.drop(columns=['TempDates'])
    return df_2015

In [5]:
train = filter_2015_data_points(full_train, True)
test_2015 = filter_2015_data_points(full_train, False)

In [6]:
print(f"Number of data points: {train.shape[0]}")

Number of data points: 850465


In [7]:
print(f"Number of data points: {test_2015.shape[0]}")

Number of data points: 27584


In [8]:
num_test_points = 148026 # Number of data points which must be added to test set randomly
test_size = num_test_points / len(train) # Determine the test size as a proportion of the total dataset
train_df, test_df = train_test_split(train, test_size=test_size, random_state=42)


In [9]:
train = train_df
test = pd.concat([test_2015, test_df], ignore_index=True)

In [10]:
print(f"Number of total data points: {full_train.shape[0]}")
print(f"Number of train data points: {train.shape[0]}")
print(f"Number of test data points: {test.shape[0]}")

Number of total data points: 878049
Number of train data points: 702439
Number of test data points: 175610


In [11]:
train.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
457329,2008-11-29 00:17:00,OTHER OFFENSES,TRAFFIC VIOLATION,Saturday,RICHMOND,"ARREST, CITED",GEARY BL / 4TH AV,-122.462141,37.78111
562937,2007-06-01 08:00:00,BAD CHECKS,"CHECKS, NON-SUFFICIENT FUNDS (FELONY)",Friday,CENTRAL,COMPLAINANT REFUSES TO PROSECUTE,0 Block of POST ST,-122.403183,37.78901
780045,2004-04-27 06:29:00,OTHER OFFENSES,BEYOND PARENTAL CONTROL,Tuesday,BAYVIEW,NONE,0 Block of ERVINE ST,-122.40837,37.717568
784974,2004-04-01 15:49:00,ASSAULT,THREATENING PHONE CALL(S),Thursday,NORTHERN,NONE,3100 Block of FRANKLIN ST,-122.426594,37.803485
760313,2004-07-25 16:19:00,ASSAULT,INFLICT INJURY ON COHABITEE,Sunday,TENDERLOIN,"ARREST, BOOKED",EDDY ST / MASON ST,-122.409313,37.784348


In [12]:
test.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


## Data cleaning

### Removing Duplicates

In [13]:
train.drop_duplicates(inplace=True)
train

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
457329,2008-11-29 00:17:00,OTHER OFFENSES,TRAFFIC VIOLATION,Saturday,RICHMOND,"ARREST, CITED",GEARY BL / 4TH AV,-122.462141,37.781110
562937,2007-06-01 08:00:00,BAD CHECKS,"CHECKS, NON-SUFFICIENT FUNDS (FELONY)",Friday,CENTRAL,COMPLAINANT REFUSES TO PROSECUTE,0 Block of POST ST,-122.403183,37.789010
780045,2004-04-27 06:29:00,OTHER OFFENSES,BEYOND PARENTAL CONTROL,Tuesday,BAYVIEW,NONE,0 Block of ERVINE ST,-122.408370,37.717568
784974,2004-04-01 15:49:00,ASSAULT,THREATENING PHONE CALL(S),Thursday,NORTHERN,NONE,3100 Block of FRANKLIN ST,-122.426594,37.803485
760313,2004-07-25 16:19:00,ASSAULT,INFLICT INJURY ON COHABITEE,Sunday,TENDERLOIN,"ARREST, BOOKED",EDDY ST / MASON ST,-122.409313,37.784348
...,...,...,...,...,...,...,...,...,...
286762,2011-06-14 14:15:00,ASSAULT,BATTERY,Tuesday,PARK,NONE,1700 Block of EDDY ST,-122.436443,37.780849
393422,2009-10-30 22:15:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Friday,SOUTHERN,NONE,5TH ST / HOWARD ST,-122.404934,37.781499
159516,2013-04-04 08:30:00,NON-CRIMINAL,LOST PROPERTY,Thursday,SOUTHERN,NONE,3RD ST / MARKET ST,-122.403434,37.787643
698739,2005-06-17 15:00:00,LARCENY/THEFT,PETTY THEFT WITH PRIOR,Friday,SOUTHERN,NONE,300 Block of 4TH ST,-122.399834,37.781043


### Coordinate Cleaning

Renaming Coordinate Columns and removing wrong data that is not in SF in train dataset

In [14]:
train = train.rename(columns={'X': 'long', 'Y': 'lat'})
test = test.rename(columns={'X': 'long', 'Y': 'lat'})
train = train[train['lat'] != train['lat'].max()]

### Setting Types

In [15]:
for df in [train, test]:
    df['DayOfWeek'] = df['DayOfWeek'].astype('category')
    df['PdDistrict'] = df['PdDistrict'].astype('category')

## Feature Development

### Temporal Features

We want to format the Dates collumn and also extract temporal features such as Year, Month, Day, Hour, Minute

In [16]:
train["Dates"] = pd.to_datetime(train["Dates"], format="%Y-%m-%d %H:%M:%S")
test["Dates"] = pd.to_datetime(test["Dates"], format="%Y-%m-%d %H:%M:%S")

In [17]:
def create_column(df, datetime_column, part_name):
    df[part_name.capitalize()] = df[datetime_column].map(lambda x: getattr(x, part_name, None))

for df in [train, test]:
    for part in ["year", "month", "day", "hour", "minute"]:
        create_column(df, "Dates", part)
train.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,long,lat,Year,Month,Day,Hour,Minute
457329,2008-11-29 00:17:00,OTHER OFFENSES,TRAFFIC VIOLATION,Saturday,RICHMOND,"ARREST, CITED",GEARY BL / 4TH AV,-122.462141,37.78111,2008,11,29,0,17
562937,2007-06-01 08:00:00,BAD CHECKS,"CHECKS, NON-SUFFICIENT FUNDS (FELONY)",Friday,CENTRAL,COMPLAINANT REFUSES TO PROSECUTE,0 Block of POST ST,-122.403183,37.78901,2007,6,1,8,0
780045,2004-04-27 06:29:00,OTHER OFFENSES,BEYOND PARENTAL CONTROL,Tuesday,BAYVIEW,NONE,0 Block of ERVINE ST,-122.40837,37.717568,2004,4,27,6,29
784974,2004-04-01 15:49:00,ASSAULT,THREATENING PHONE CALL(S),Thursday,NORTHERN,NONE,3100 Block of FRANKLIN ST,-122.426594,37.803485,2004,4,1,15,49
760313,2004-07-25 16:19:00,ASSAULT,INFLICT INJURY ON COHABITEE,Sunday,TENDERLOIN,"ARREST, BOOKED",EDDY ST / MASON ST,-122.409313,37.784348,2004,7,25,16,19


### Holiday Feature

Enriching data with information of Holidays:

In [18]:
cal = calendar()
for df in [train, test]:
    holidays = cal.holidays(start=df['Dates'].min(), end=df['Dates'].max())
    df['Holiday'] = (df['Dates'].dt.date.astype('datetime64[ns]').isin(holidays)).astype(int)
    print(df['Holiday'].value_counts())

Holiday
0    680434
1     20349
Name: count, dtype: int64
Holiday
0    170338
1      5272
Name: count, dtype: int64


### Night Feature

Enriching data with information on If it is at night or during the daylight:

In [19]:
def get_all_sunset_sunrise_sf(x):
    city = LocationInfo("San Francisco", "USA", "America/Los_Angeles", 37.7749, -122.4194)
    timezone = pytz.timezone(city.timezone)
    return {
        f"{day['Day']}-{day['Month']}-{day['Year']}": sun(
            city.observer,
            date=pd.Timestamp(year=day['Year'], month=day['Month'], day=day['Day'], tz=timezone).date(),
            tzinfo=city.timezone
        )
        for _, day in x.iterrows()
    }

def is_at_night(date, sun_info):
    dusk = sun_info['dusk'].replace(tzinfo=None)
    dawn = sun_info['dawn'].replace(tzinfo=None)
    # Keine Änderung hier, da die Logik korrekt ist, aber stellen Sie sicher, dass 'date' auch ohne Zeitzone ist
    if dawn < dusk:  # Für Fälle, in denen der Sonnenaufgang als am nächsten Tag betrachtet wird
        return date > dusk or date < dawn
    else:
        return dusk < date < dawn

for df in [train, test]:
    unique_days = df[['Day', 'Month', 'Year']].drop_duplicates()
    sun_info = get_all_sunset_sunrise_sf(unique_days)
    df['Night'] = df['Dates'].map(lambda x: int(is_at_night(x, sun_info[f"{x.day}-{x.month}-{x.year}"])))
    print(df['Night'].value_counts()) 

# Checking that there is no values during the middle of the day that are marked as night
train[(train['Night'] == True) & (train['Hour'] < 17) & (train['Hour'] > 7)].shape

Night
0    434989
1    265794
Name: count, dtype: int64
Night
0    108155
1     67455
Name: count, dtype: int64


(0, 16)

### Season Feature

In [20]:
seasons = { 1: 'Winter', 2: 'Spring', 3: 'Summer', 4: 'Fall' }
for df in [train, test]:
    df['Season'] = df['Month'].map(lambda x: seasons[(x%12 + 3)//3])

train.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,long,lat,Year,Month,Day,Hour,Minute,Holiday,Night,Season
457329,2008-11-29 00:17:00,OTHER OFFENSES,TRAFFIC VIOLATION,Saturday,RICHMOND,"ARREST, CITED",GEARY BL / 4TH AV,-122.462141,37.78111,2008,11,29,0,17,0,1,Fall
562937,2007-06-01 08:00:00,BAD CHECKS,"CHECKS, NON-SUFFICIENT FUNDS (FELONY)",Friday,CENTRAL,COMPLAINANT REFUSES TO PROSECUTE,0 Block of POST ST,-122.403183,37.78901,2007,6,1,8,0,0,0,Summer
780045,2004-04-27 06:29:00,OTHER OFFENSES,BEYOND PARENTAL CONTROL,Tuesday,BAYVIEW,NONE,0 Block of ERVINE ST,-122.40837,37.717568,2004,4,27,6,29,0,0,Spring
784974,2004-04-01 15:49:00,ASSAULT,THREATENING PHONE CALL(S),Thursday,NORTHERN,NONE,3100 Block of FRANKLIN ST,-122.426594,37.803485,2004,4,1,15,49,0,0,Spring
760313,2004-07-25 16:19:00,ASSAULT,INFLICT INJURY ON COHABITEE,Sunday,TENDERLOIN,"ARREST, BOOKED",EDDY ST / MASON ST,-122.409313,37.784348,2004,7,25,16,19,0,0,Summer


### Address Features

In [21]:
def get_block(address):
    match = re.search(r'(\d+)\s+block of', address, re.IGNORECASE)
    if match:
        # The block number is divided by 100 because they always increase by 100 and then increased by 1 to leave 0 for no block
        return int(match.group(1)) // 100 + 1
    return 0

def get_street_type(address):
    # See also data-understanding.ipynb
    street_types = ['AV', 'ST', 'CT', 'PZ', 'LN', 'DR', 'PL', 'HY', 'FY', 'WY', 'TR', 'RD', 'BL', 'WAY', 'CR', 'AL', 'I-80', 'RW', 'WK']
    match = re.findall(r'\b(?:' + '|'.join(street_types) + r')\b', address, re.IGNORECASE)
    if len(match) > 1 and '/' in address:
        return "INT"
    if len(match) == 1:
        return match[0]
    return "OTHER"


for df in [train, test]:
    df['Block'] = df['Address'].map(get_block)
    df['StreetType'] = df['Address'].map(get_street_type)

In [22]:
train['Block'].value_counts()

Block
0     208025
1      60994
2      41569
9      41313
3      30638
       ...  
82         6
79         4
84         3
81         3
80         1
Name: count, Length: 85, dtype: int64

In [23]:
train['StreetType'].value_counts()

StreetType
ST       356911
INT      207870
AV        97544
BL        11835
DR         9521
WY         4182
RD         2922
CT         2253
PZ         1871
HY         1819
LN         1328
TR          877
PL          875
CR          284
I-80        266
OTHER       228
AL          147
WAY          44
WK            5
RW            1
Name: count, dtype: int64

In [24]:
for df in [train, test]:
    df['Season'] = df['Season'].astype('category')
    df['StreetType'] = df['StreetType'].astype('category')

In [25]:
# Check that an intersection has no block number
train[(train['Block'] != 0) & (train['StreetType'] == 'INT')].shape

(0, 19)

### Dropping unnecesary Dimensions

In [26]:
# Sort the columns to be more organized
sorded_column_sequence = ['DayOfWeek', 'Day', 'Month', 'Hour', 'Season', 'Night', 'Holiday', 'Block', 'StreetType', 'PdDistrict', 'lat', 'long', 'Category']

train = train[sorded_column_sequence]
test = test[sorded_column_sequence]

train.head()

Unnamed: 0,DayOfWeek,Day,Month,Hour,Season,Night,Holiday,Block,StreetType,PdDistrict,lat,long,Category
457329,Saturday,29,11,0,Fall,1,0,0,INT,RICHMOND,37.78111,-122.462141,OTHER OFFENSES
562937,Friday,1,6,8,Summer,0,0,1,ST,CENTRAL,37.78901,-122.403183,BAD CHECKS
780045,Tuesday,27,4,6,Spring,0,0,1,ST,BAYVIEW,37.717568,-122.40837,OTHER OFFENSES
784974,Thursday,1,4,15,Spring,0,0,32,ST,NORTHERN,37.803485,-122.426594,ASSAULT
760313,Sunday,25,7,16,Summer,0,0,0,INT,TENDERLOIN,37.784348,-122.409313,ASSAULT


Summarize categories with low occuences in test dataset

In [27]:
def get_unique_categories_count(df):
    return len(df["Category"].value_counts().reset_index())

In [28]:
print(get_unique_categories_count(train))
print(train["Category"].value_counts())

39
Category
LARCENY/THEFT                  137749
OTHER OFFENSES                 101023
NON-CRIMINAL                    73124
ASSAULT                         61520
DRUG/NARCOTIC                   43867
VEHICLE THEFT                   43292
VANDALISM                       35862
WARRANTS                        33879
BURGLARY                        29398
SUSPICIOUS OCC                  25080
MISSING PERSON                  20635
ROBBERY                         18461
FRAUD                           13287
FORGERY/COUNTERFEITING           8631
SECONDARY CODES                  7928
WEAPON LAWS                      6755
PROSTITUTION                     6093
TRESPASS                         5823
STOLEN PROPERTY                  3598
DISORDERLY CONDUCT               3511
SEX OFFENSES FORCIBLE            3485
DRUNKENNESS                      3463
RECOVERED VEHICLE                2562
KIDNAPPING                       1871
DRIVING UNDER THE INFLUENCE      1822
RUNAWAY                          1555


Summarizing crime categories with count below 1000 in train dataset.

In [29]:
# Train
train['Category'] = train['Category'].replace('TREA', 'OTHER OFFENSES')
train['Category'] = train['Category'].replace('PORNOGRAPHY/OBSCENE MAT', 'OTHER OFFENSES')
train['Category'] = train['Category'].replace('GAMBLING', 'OTHER OFFENSES')
train['Category'] = train['Category'].replace('SEX OFFENSES NON FORCIBLE', 'OTHER OFFENSES')
train['Category'] = train['Category'].replace('EXTORTION', 'OTHER OFFENSES')
train['Category'] = train['Category'].replace('BRIBERY', 'OTHER OFFENSES')
train['Category'] = train['Category'].replace('BAD CHECKS', 'OTHER OFFENSES')
train['Category'] = train['Category'].replace('FAMILY OFFENSES', 'OTHER OFFENSES')
train['Category'] = train['Category'].replace('SUICIDE', 'OTHER OFFENSES')

# Test
test['Category'] = test['Category'].replace('TREA', 'OTHER OFFENSES')
test['Category'] = test['Category'].replace('PORNOGRAPHY/OBSCENE MAT', 'OTHER OFFENSES')
test['Category'] = test['Category'].replace('GAMBLING', 'OTHER OFFENSES')
test['Category'] = test['Category'].replace('SEX OFFENSES NON FORCIBLE', 'OTHER OFFENSES')
test['Category'] = test['Category'].replace('EXTORTION', 'OTHER OFFENSES')
test['Category'] = test['Category'].replace('BRIBERY', 'OTHER OFFENSES')
test['Category'] = test['Category'].replace('BAD CHECKS', 'OTHER OFFENSES')
test['Category'] = test['Category'].replace('FAMILY OFFENSES', 'OTHER OFFENSES')
test['Category'] = test['Category'].replace('SUICIDE', 'OTHER OFFENSES')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Category'] = test['Category'].replace('TREA', 'OTHER OFFENSES')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Category'] = test['Category'].replace('PORNOGRAPHY/OBSCENE MAT', 'OTHER OFFENSES')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Category'] = test['Category'].replace('G

In [30]:
get_unique_categories_count(train)

30

### Export prepared data

In [31]:
tmp_dir = 'data/tmp'

# Check if the directory exists, and create it if it doesn't
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir)
    
train.to_csv('data/tmp/prepared_train.csv', index=True)
test.to_csv('data/tmp/prepared_test.csv', index=True)

## Data encoding

In [32]:
encoded_train = train.copy() 
encoded_test = test.copy()

In [33]:
# Transforming Categorical attributes -> Nummerical Attributes according to Slide 25 Data Perparation
def create_columns_for_unique_values(train_df, test_df, column):
    unique_train_values = train_df[column].unique()
    unique_test_values = test_df[column].unique()
    unique_values = list(set(list(unique_train_values) + list(unique_test_values)))
    print(unique_values)
    for value in unique_values:
        train_df[column + "-" + value] = (train_df[column] == value).astype(int)
        test_df[column + "-" + value] = (test_df[column] == value).astype(int)

columns = ['DayOfWeek', 'PdDistrict', 'StreetType', 'Season']

for column in columns:
    create_columns_for_unique_values(encoded_train, encoded_test, column)

encoded_train.drop(columns=columns, inplace=True)
encoded_test.drop(columns=columns, inplace=True)

['Wednesday', 'Thursday', 'Saturday', 'Tuesday', 'Friday', 'Sunday', 'Monday']
['CENTRAL', 'NORTHERN', 'TARAVAL', 'BAYVIEW', 'MISSION', 'RICHMOND', 'SOUTHERN', 'TENDERLOIN', 'INGLESIDE', 'PARK']
['AV', 'DR', 'HY', 'WAY', 'RD', 'INT', 'WY', 'RW', 'AL', 'CR', 'PZ', 'ST', 'OTHER', 'PL', 'TR', 'I-80', 'LN', 'WK', 'BL', 'CT']
['Winter', 'Spring', 'Summer', 'Fall']


In [34]:
encoded_test.head()

Unnamed: 0,Day,Month,Hour,Night,Holiday,Block,lat,long,Category,DayOfWeek-Wednesday,...,StreetType-TR,StreetType-I-80,StreetType-LN,StreetType-WK,StreetType-BL,StreetType-CT,Season-Winter,Season-Spring,Season-Summer,Season-Fall
0,13,5,23,1,0,0,37.774599,-122.425892,WARRANTS,1,...,0,0,0,0,0,0,0,1,0,0
1,13,5,23,1,0,0,37.774599,-122.425892,OTHER OFFENSES,1,...,0,0,0,0,0,0,0,1,0,0
2,13,5,23,1,0,0,37.800414,-122.424363,OTHER OFFENSES,1,...,0,0,0,0,0,0,0,1,0,0
3,13,5,23,1,0,16,37.800873,-122.426995,LARCENY/THEFT,1,...,0,0,0,0,0,0,0,1,0,0
4,13,5,23,1,0,2,37.771541,-122.438738,LARCENY/THEFT,1,...,0,0,0,0,0,0,0,1,0,0


In [35]:
scaler = StandardScaler()
encoded_train[['lat', 'long']] = scaler.fit_transform(encoded_train[['lat', 'long']])
encoded_test[['lat', 'long']] = scaler.transform(encoded_test[['lat', 'long']])

encoded_train.head()

Unnamed: 0,Day,Month,Hour,Night,Holiday,Block,lat,long,Category,DayOfWeek-Wednesday,...,StreetType-TR,StreetType-I-80,StreetType-LN,StreetType-WK,StreetType-BL,StreetType-CT,Season-Winter,Season-Spring,Season-Summer,Season-Fall
457329,29,11,0,1,0,0,0.584478,-1.557336,OTHER OFFENSES,0,...,0,0,0,0,0,0,0,0,0,1
562937,1,6,8,0,0,1,0.911468,0.775401,OTHER OFFENSES,0,...,0,0,0,0,0,0,0,0,1,0
780045,27,4,6,0,0,1,-2.045603,0.570183,OTHER OFFENSES,0,...,0,0,0,0,0,0,0,1,0,0
784974,1,4,15,0,0,32,1.510611,-0.150875,ASSAULT,0,...,0,0,0,0,0,0,0,1,0,0
760313,25,7,16,0,0,0,0.718501,0.532851,ASSAULT,0,...,0,0,0,0,0,0,0,0,1,0


### Save encoded data

In [36]:
# Save processed data
encoded_train.to_csv("data/tmp/encoded_train.csv", index=False)
encoded_test.to_csv("data/tmp/encoded_test.csv", index=False)