# SF Crime Prediction

- Authors: Henry Gräser, Jonas Müller, Thomas Wolff, Hannes Harnisch
- Created on: June 28, 2024
- Description: Understanding the data of the kaggle dataset SF Crime
- Kaggle competition: [SF Crime Classification](https://www.kaggle.com/c/sf-crime/data) 

## Libraries

In [1]:
import pandas as pd


## Data Preparation

In [2]:
# Load the training and test splits from the local data/ directory
# (training file first, then the test file).
train, test = (pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv'))

# Preview the first rows of the training data.
train.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [3]:
# TODO: do something else with the wrong longitude and latitude
# Rename X to long (longitude) and Y to lat (latitude) in both splits.
train = train.rename(columns={'X': 'long', 'Y': 'lat'})
test = test.rename(columns={'X': 'long', 'Y': 'lat'})
# Drop training rows whose latitude cannot belong to San Francisco (the city
# lies at roughly 37.7-37.8 degrees north). A fixed plausibility bound is used
# instead of comparing against `lat.max()`: the maximum is only an outlier the
# first time the cell runs, so re-running the cell (or running it on already
# clean data) would otherwise silently delete valid rows.
# NOTE(review): assumes the invalid rows are far-north placeholder values;
# confirm against the raw data.
MAX_PLAUSIBLE_LAT = 38.0
# `~(lat > bound)` rather than `lat <= bound` so rows with a missing latitude
# are kept, exactly as the previous `!=` comparison kept them.
train = train[~(train['lat'] > MAX_PLAUSIBLE_LAT)]

### Temporal Dimensions

We want to format the Dates column and also extract temporal features such as Year, Month, Day, Hour, and Minute.

In [4]:
# Parse the raw "Dates" strings of both splits into pandas datetime values.
for df in [train, test]:
    df["Dates"] = pd.to_datetime(df["Dates"], format="%Y-%m-%d %H:%M:%S")

In [5]:
def create_column(df, datetime_column, part_name):
    """Add one datetime component of ``datetime_column`` to ``df`` in place.

    The new column is named after ``part_name`` with its first letter
    capitalised (e.g. ``"year"`` -> ``"Year"``). Each value is read as the
    attribute ``part_name`` of the corresponding datetime value; values that
    have no such attribute yield ``None``. Nothing is returned.
    """
    def extract_part(timestamp):
        # Attribute lookup with a None fallback for unknown part names.
        return getattr(timestamp, part_name, None)

    target_column = part_name.capitalize()
    df[target_column] = df[datetime_column].map(extract_part)

# Add the Year, Month, Day, Hour and Minute columns (in that order) to both
# the training and the test split.
for part in ["year", "month", "day", "hour", "minute"]:
    for df in [train, test]:
        create_column(df, "Dates", part)
test.head()

Unnamed: 0,Id,Dates,DayOfWeek,PdDistrict,Address,long,lat,Year,Month,Day,Hour,Minute
0,0,2015-05-10 23:59:00,Sunday,BAYVIEW,2000 Block of THOMAS AV,-122.399588,37.735051,2015,5,10,23,59
1,1,2015-05-10 23:51:00,Sunday,BAYVIEW,3RD ST / REVERE AV,-122.391523,37.732432,2015,5,10,23,51
2,2,2015-05-10 23:50:00,Sunday,NORTHERN,2000 Block of GOUGH ST,-122.426002,37.792212,2015,5,10,23,50
3,3,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412,2015,5,10,23,45
4,4,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412,2015,5,10,23,45


Enriching the data with holiday information:

In [6]:
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
cal = calendar()
for df in [train, test]:
    # Work on midnight-normalised dates. The calendar returns holidays as
    # midnight timestamps, so both the range bounds and the values that are
    # looked up must be midnight as well: passing the raw minimum timestamp
    # (which carries a time of day) as `start` would cut off a holiday that
    # falls on the very first day of the range.
    days = df['Dates'].dt.normalize()
    holidays = cal.holidays(start=days.min(), end=days.max())
    # True for every incident that happened on a US federal holiday.
    df['Holiday'] = days.isin(holidays)
    print(df['Holiday'].value_counts())

Holiday
False    852329
True      25653
Name: count, dtype: int64
Holiday
False    865946
True      18316
Name: count, dtype: int64


Enriching the data with information on whether the incident happened at night or during daylight:

In [7]:
import pytz
from astral import LocationInfo
from astral.sun import sun

def get_all_sunset_sunrise_sf(x):
    """Look up the sun times in San Francisco for every day listed in ``x``.

    ``x`` is iterated row by row; each row must provide ``Day``, ``Month``
    and ``Year`` entries. The result is a dict keyed by the string
    ``"<Day>-<Month>-<Year>"`` whose values are whatever astral's ``sun``
    returns for the San Francisco observer on that date, expressed in the
    ``America/Los_Angeles`` timezone.
    """
    city = LocationInfo("San Francisco", "USA", "America/Los_Angeles", 37.7749, -122.4194)
    timezone = pytz.timezone(city.timezone)

    sun_by_day = {}
    for _, day in x.iterrows():
        # Build the calendar date for this row in the city's timezone.
        local_date = pd.Timestamp(
            year=day['Year'], month=day['Month'], day=day['Day'], tz=timezone
        ).date()
        sun_by_day[f"{day['Day']}-{day['Month']}-{day['Year']}"] = sun(
            city.observer,
            date=local_date,
            tzinfo=city.timezone,
        )
    return sun_by_day

def is_at_night(date, sun_info):
    dusk = sun_info['dusk'].replace(tzinfo=None)
    dawn = sun_info['dawn'].replace(tzinfo=None)
    # Keine Änderung hier, da die Logik korrekt ist, aber stellen Sie sicher, dass 'date' auch ohne Zeitzone ist
    if dawn < dusk:  # Für Fälle, in denen der Sonnenaufgang als am nächsten Tag betrachtet wird
        return date > dusk or date < dawn
    else:
        return dusk < date < dawn

for df in [train, test]:
    # Sun times only need to be computed once per distinct calendar day.
    unique_days = df[['Day', 'Month', 'Year']].drop_duplicates()
    sun_info = get_all_sunset_sunrise_sf(unique_days)
    # Flag every incident by comparing its timestamp with that day's sun times.
    df['Night'] = [
        is_at_night(x, sun_info[f"{x.day}-{x.month}-{x.year}"])
        for x in df['Dates']
    ]
    print(df['Night'].value_counts())

# Sanity check: no incident between 08:00 and 16:59 may be flagged as night,
# so the selection below is expected to be empty.
train[train['Night'] & (train['Hour'] > 7) & (train['Hour'] < 17)].shape

Night
False    544154
True     333828
Name: count, dtype: int64
Night
False    547181
True     337081
Name: count, dtype: int64


(0, 16)

In [8]:
# Season index derived from the month:
# 1 = Dec-Feb, 2 = Mar-May, 3 = Jun-Aug, 4 = Sep-Nov.
for df in [train, test]:
    df['Season'] = (df['Month'] % 12 + 3) // 3

train.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,long,lat,Year,Month,Day,Hour,Minute,Holiday,Night,Season
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,2015,5,13,23,53,False,True,2
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,2015,5,13,23,53,False,True,2
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414,2015,5,13,23,33,False,True,2
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873,2015,5,13,23,30,False,True,2
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541,2015,5,13,23,30,False,True,2


In [9]:
# Address extraction
import re

def get_block(address):
    """Return the block index encoded in ``address``.

    Addresses of the form ``"<number> Block of ..."`` (case-insensitive)
    yield ``number // 100 + 1``; any other address yields ``0``.
    """
    block_match = re.search(r'(\d+)\s+block of', address, re.IGNORECASE)
    if block_match is None:
        # 0 is reserved for addresses without a block number.
        return 0
    # Block numbers always increase in steps of 100, so divide by 100 and
    # shift by one to keep 0 free for "no block".
    house_number = int(block_match.group(1))
    return house_number // 100 + 1

def get_street_type(address):
    """Classify ``address`` by the street-type abbreviation it contains.

    Returns:
        ``"INT"`` when more than one known street type occurs and the
        address contains a ``/`` (an intersection), the single matched
        abbreviation in upper case when exactly one occurs, and
        ``"OTHER"`` in every remaining case.
    """
    # See also data-understanding.ipynb
    street_types = ['AV', 'ST', 'CT', 'PZ', 'LN', 'DR', 'PL', 'HY', 'FY', 'WY', 'TR', 'RD', 'BL', 'WAY', 'CR', 'AL', 'I-80', 'RW', 'WK']
    pattern = r'\b(?:' + '|'.join(street_types) + r')\b'
    match = re.findall(pattern, address, re.IGNORECASE)
    if len(match) > 1 and '/' in address:
        return "INT"
    if len(match) == 1:
        # Matching is case-insensitive, so normalise the label to upper case;
        # otherwise "st" and "ST" would end up as two different categories.
        return match[0].upper()
    return "OTHER"


# Derive the block index and the street type from the raw address of
# every row in both splits.
for df in [train, test]:
    addresses = df['Address']
    df['Block'] = addresses.apply(get_block)
    df['StreetType'] = addresses.apply(get_street_type)

In [10]:
# Frequency of each derived block index in the training data
# (0 marks addresses without a block number).
train['Block'].value_counts()

Block
0     260751
1      76325
2      51917
9      51718
3      38407
       ...  
82         7
79         5
81         4
84         4
80         3
Name: count, Length: 85, dtype: int64

In [11]:
# Frequency of each derived street type in the training data.
train['StreetType'].value_counts()

StreetType
ST       447063
INT      260560
AV       122421
BL        14741
DR        11937
WY         5179
RD         3610
CT         2797
PZ         2347
HY         2281
LN         1668
TR         1115
PL         1072
CR          358
I-80        322
OTHER       271
AL          179
WAY          55
WK            5
RW            1
Name: count, dtype: int64

In [12]:
# Sanity check: rows classified as intersections must not carry a block
# number, so this selection is expected to contain zero rows.
train.loc[(train['StreetType'] == 'INT') & (train['Block'] != 0)].shape

(0, 19)

### Dropping unnecessary Dimensions

In [13]:
# Remove Resolution and Descript from the training data: they should not be
# relevant for prediction.
train = train.drop(['Resolution', 'Descript'], axis=1)

In [14]:
# The Dates column is no longer needed in either split: its relevant
# information has already been extracted into separate columns.
train = train.drop('Dates', axis=1)
test = test.drop('Dates', axis=1)

In [15]:
# The Address column is no longer needed in either split: its relevant
# information has already been extracted into separate columns.
train = train.drop('Address', axis=1)
test = test.drop('Address', axis=1)

In [16]:
# Bring the columns of both splits into one common, organized order.
# The training data ends with the target (Category); the test data starts
# with its Id column.
feature_columns = ['DayOfWeek', 'Day', 'Month', 'Year', 'Hour', 'Minute', 'Season', 'Night', 'Holiday', 'Block', 'StreetType', 'PdDistrict', 'lat', 'long']
train = train[feature_columns + ['Category']]
test = test[['Id'] + feature_columns]

train.head()

Unnamed: 0,DayOfWeek,Day,Month,Year,Hour,Minute,Season,Night,Holiday,Block,StreetType,PdDistrict,lat,long,Category
0,Wednesday,13,5,2015,23,53,2,True,False,0,INT,NORTHERN,37.774599,-122.425892,WARRANTS
1,Wednesday,13,5,2015,23,53,2,True,False,0,INT,NORTHERN,37.774599,-122.425892,OTHER OFFENSES
2,Wednesday,13,5,2015,23,33,2,True,False,0,INT,NORTHERN,37.800414,-122.424363,OTHER OFFENSES
3,Wednesday,13,5,2015,23,30,2,True,False,16,ST,NORTHERN,37.800873,-122.426995,LARCENY/THEFT
4,Wednesday,13,5,2015,23,30,2,True,False,2,ST,PARK,37.771541,-122.438738,LARCENY/THEFT


In [17]:
# TODO: Normalization of the long and lat values (the former X and Y columns)

## Data Encoding

In [19]:
# Encode the year as an offset from a base year. The base is taken from the
# training data and reused for the test data, so the same calendar year gets
# the same code in both splits even if their earliest years differ
# (subtracting each split's own minimum would silently shift the encoding).
base_year = train['Year'].min()
train['Year'] = train['Year'] - base_year
test['Year'] = test['Year'] - base_year
train

Unnamed: 0,DayOfWeek,Day,Month,Year,Hour,Minute,Season,Night,Holiday,Block,StreetType,PdDistrict,lat,long,Category
0,Wednesday,13,5,12,23,53,2,True,False,0,INT,NORTHERN,37.774599,-122.425892,WARRANTS
1,Wednesday,13,5,12,23,53,2,True,False,0,INT,NORTHERN,37.774599,-122.425892,OTHER OFFENSES
2,Wednesday,13,5,12,23,33,2,True,False,0,INT,NORTHERN,37.800414,-122.424363,OTHER OFFENSES
3,Wednesday,13,5,12,23,30,2,True,False,16,ST,NORTHERN,37.800873,-122.426995,LARCENY/THEFT
4,Wednesday,13,5,12,23,30,2,True,False,2,ST,PARK,37.771541,-122.438738,LARCENY/THEFT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
878044,Monday,6,1,0,0,15,1,True,False,0,INT,TARAVAL,37.714056,-122.459033,ROBBERY
878045,Monday,6,1,0,0,1,1,True,False,7,ST,INGLESIDE,37.731948,-122.447364,LARCENY/THEFT
878046,Monday,6,1,0,0,1,1,True,False,0,INT,SOUTHERN,37.780266,-122.403390,LARCENY/THEFT
878047,Monday,6,1,0,0,1,1,True,False,0,INT,SOUTHERN,37.780607,-122.390531,VANDALISM


In [21]:
# Integer code for every police district: the position in this list is the
# code (SOUTHERN -> 0, ..., RICHMOND -> 9).
pd_districts = {
    district: code
    for code, district in enumerate([
        'SOUTHERN', 'MISSION', 'NORTHERN', 'CENTRAL', 'BAYVIEW',
        'INGLESIDE', 'TENDERLOIN', 'TARAVAL', 'PARK', 'RICHMOND',
    ])
}

# Replace the district names by their codes in both splits, in place.
for df in [train, test]:
    df.replace({'PdDistrict': pd_districts}, inplace=True)

train

Unnamed: 0,DayOfWeek,Day,Month,Year,Hour,Minute,Season,Night,Holiday,Block,StreetType,PdDistrict,lat,long,Category
0,Wednesday,13,5,12,23,53,2,True,False,0,INT,2,37.774599,-122.425892,WARRANTS
1,Wednesday,13,5,12,23,53,2,True,False,0,INT,2,37.774599,-122.425892,OTHER OFFENSES
2,Wednesday,13,5,12,23,33,2,True,False,0,INT,2,37.800414,-122.424363,OTHER OFFENSES
3,Wednesday,13,5,12,23,30,2,True,False,16,ST,2,37.800873,-122.426995,LARCENY/THEFT
4,Wednesday,13,5,12,23,30,2,True,False,2,ST,8,37.771541,-122.438738,LARCENY/THEFT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
878044,Monday,6,1,0,0,15,1,True,False,0,INT,7,37.714056,-122.459033,ROBBERY
878045,Monday,6,1,0,0,1,1,True,False,7,ST,5,37.731948,-122.447364,LARCENY/THEFT
878046,Monday,6,1,0,0,1,1,True,False,0,INT,0,37.780266,-122.403390,LARCENY/THEFT
878047,Monday,6,1,0,0,1,1,True,False,0,INT,0,37.780607,-122.390531,VANDALISM
