# San Francisco Crime Classification
A Kaggle Playground Prediction Competition ([link](https://www.kaggle.com/c/sf-crime))

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Read the competition's training set, report its (rows, columns) shape,
# and preview the first ten records.
train = pd.read_csv(filepath_or_buffer='./data/train.csv')
print(train.shape)
train.head(n=10)

(878049, 9)


Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541
5,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM UNLOCKED AUTO,Wednesday,INGLESIDE,NONE,0 Block of TEDDY AV,-122.403252,37.713431
6,2015-05-13 23:30:00,VEHICLE THEFT,STOLEN AUTOMOBILE,Wednesday,INGLESIDE,NONE,AVALON AV / PERU AV,-122.423327,37.725138
7,2015-05-13 23:30:00,VEHICLE THEFT,STOLEN AUTOMOBILE,Wednesday,BAYVIEW,NONE,KIRKWOOD AV / DONAHUE ST,-122.371274,37.727564
8,2015-05-13 23:00:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,RICHMOND,NONE,600 Block of 47TH AV,-122.508194,37.776601
9,2015-05-13 23:00:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,CENTRAL,NONE,JEFFERSON ST / LEAVENWORTH ST,-122.419088,37.807802


### Data fields
* **Dates** - timestamp of the crime incident
* **Category** - category of the crime incident (only in train.csv). This is the target variable you are going to predict.
* **Descript** - detailed description of the crime incident (only in train.csv)
* **DayOfWeek** - the day of the week
* **PdDistrict** - name of the Police Department District
* **Resolution** - how the crime incident was resolved (only in train.csv)
* **Address** - the approximate street address of the crime incident 
* **X** - Longitude
* **Y** - Latitude

In [2]:
# Count how many incidents were recorded for each crime category, ordered
# from the most to the least frequent category.
# Series.to_dict() is used instead of dict(series): it is the documented
# conversion and stores the counts as plain Python ints rather than NumPy
# scalars, so the displayed dict stays clean and the values are directly
# serializable.
crime_count = train['Category'].value_counts().to_dict()
crime_count

{'LARCENY/THEFT': 174900,
 'OTHER OFFENSES': 126182,
 'NON-CRIMINAL': 92304,
 'ASSAULT': 76876,
 'DRUG/NARCOTIC': 53971,
 'VEHICLE THEFT': 53781,
 'VANDALISM': 44725,
 'WARRANTS': 42214,
 'BURGLARY': 36755,
 'SUSPICIOUS OCC': 31414,
 'MISSING PERSON': 25989,
 'ROBBERY': 23000,
 'FRAUD': 16679,
 'FORGERY/COUNTERFEITING': 10609,
 'SECONDARY CODES': 9985,
 'WEAPON LAWS': 8555,
 'PROSTITUTION': 7484,
 'TRESPASS': 7326,
 'STOLEN PROPERTY': 4540,
 'SEX OFFENSES FORCIBLE': 4388,
 'DISORDERLY CONDUCT': 4320,
 'DRUNKENNESS': 4280,
 'RECOVERED VEHICLE': 3138,
 'KIDNAPPING': 2341,
 'DRIVING UNDER THE INFLUENCE': 2268,
 'RUNAWAY': 1946,
 'LIQUOR LAWS': 1903,
 'ARSON': 1513,
 'LOITERING': 1225,
 'EMBEZZLEMENT': 1166,
 'SUICIDE': 508,
 'FAMILY OFFENSES': 491,
 'BAD CHECKS': 406,
 'BRIBERY': 289,
 'EXTORTION': 256,
 'SEX OFFENSES NON FORCIBLE': 148,
 'GAMBLING': 146,
 'PORNOGRAPHY/OBSCENE MAT': 22,
 'TREA': 6}

## Analyzing Address

In [3]:
# Show the 30 most frequent addresses together with their incident counts
print(train['Address'].value_counts().head(30))

800 Block of BRYANT ST          26533
800 Block of MARKET ST           6581
2000 Block of MISSION ST         5097
1000 Block of POTRERO AV         4063
900 Block of MARKET ST           3251
0 Block of TURK ST               3228
0 Block of 6TH ST                2884
300 Block of ELLIS ST            2703
400 Block of ELLIS ST            2590
16TH ST / MISSION ST             2504
1000 Block of MARKET ST          2489
1100 Block of MARKET ST          2319
2000 Block of MARKET ST          2168
100 Block of OFARRELL ST         2140
700 Block of MARKET ST           2081
3200 Block of 20TH AV            2035
100 Block of 6TH ST              1887
500 Block of JOHNFKENNEDY DR     1824
TURK ST / TAYLOR ST              1810
200 Block of TURK ST             1800
0 Block of PHELAN AV             1791
0 Block of UNITEDNATIONS PZ      1789
0 Block of POWELL ST             1717
100 Block of EDDY ST             1681
1400 Block of PHELPS ST          1629
300 Block of EDDY ST             1589
100 Block of

In [4]:
# Extract the road name(s) from a raw address value
def get_road(address):
    """Return the road name(s) contained in a raw address string.

    Two address layouts are recognized:

    * ``"<number> Block of <road>"`` -- returns ``<road>`` as a ``str``.
    * ``"<road A> / <road B>"`` -- returns the tuple ``(<road A>, <road B>)``.

    Whitespace around each extracted name is stripped, so the separators may
    be written with or without surrounding spaces. An address that matches
    neither layout is returned as a single stripped road name instead of
    raising ``ValueError``.

    Parameters
    ----------
    address : str
        Raw address value.

    Returns
    -------
    str or tuple of (str, str)
        One road name for a block address (or an unrecognized layout), or a
        pair of road names for an intersection.
    """
    if "Block of" in address:
        # Everything after the "Block of" marker is the road name.
        return address.partition("Block of")[2].strip()
    if "/" in address:
        # Split on the first slash only; anything after it (including any
        # further slashes) belongs to the second road.
        first_road, _, second_road = address.partition("/")
        return first_road.strip(), second_road.strip()
    # No recognizable layout: treat the whole value as one road.
    return address.strip()

In [5]:
# Derive a 'Road' column by passing every raw address through get_road,
# then preview the first rows to check the result
train['Road'] = train.Address.map(get_road)
train.head(n=5)

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y,Road
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,"(OAK ST, LAGUNA ST)"
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,"(OAK ST, LAGUNA ST)"
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414,"(VANNESS AV, GREENWICH ST)"
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873,LOMBARD ST
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541,BRODERICK ST
