In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [2]:
# Load the raw training data. 'Dates' is parsed into datetime64 at read time
# because the feature functions below use the `.dt` accessor on that column.
train_df = pd.read_csv('train.csv', parse_dates=['Dates'])
#test_df = pd.read_csv('test.csv')
print(train_df.shape)
#print(test_df.shape)

(878049, 9)


In [3]:
# Optional: work on a 10% sample of the data for faster iteration.
#train_df = train_df.sample(frac=0.1)

In [None]:
# Report whether any cell of the training frame is null.
# The module is imported as `pd`; the bare name `pandas` is not defined.
print('Any missing values?')
print('In train: ', pd.isnull(train_df).values.any())
#print('In test:  ', pd.isnull(test).values.any())

In [None]:
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
from pandas.tseries.holiday import Holiday, HolidayCalendarFactory, FR, DateOffset

def transform_date(df):
    """
    Split Dates into Date, Year, Month, Day, DayOfYear, Time, Hour and Minutes.

    The frame is modified in place: the new columns are added and the original
    'Dates' column is removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Dates' column holding timestamps (datetime64 values or
        strings that ``pd.to_datetime`` can parse).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for chaining.
    """
    # Convert once up front so the function also works when the CSV was read
    # without date parsing (a no-op for columns that are already datetime64).
    timestamps = pd.to_datetime(df['Dates'])

    df['Date'] = timestamps.dt.normalize()        # calendar day at midnight
    df['Year'] = timestamps.dt.year
    df['Month'] = timestamps.dt.month
    df['Day'] = timestamps.dt.day
    df['DayOfYear'] = timestamps.dt.dayofyear     # the ordinal day of the year
    # Time of day as decimal hours. Built from `df` itself rather than a
    # global frame, so the function works on any frame it is given.
    df['Time'] = timestamps.dt.hour + timestamps.dt.minute / 60
    df['Hour'] = timestamps.dt.hour
    df['Minutes'] = timestamps.dt.minute
    del df['Dates']

    return df

def compute_working_days(df, start='2003-01-01', end='2015-05-13'):
    """
    Check if working day, include information about holidays:
    New Year, Memorial Day, Independence Day, Labor Day, Thanksgiving, Black Friday, Christmas

    Adds an integer column 'isWorkingday' (1 = working day, 0 = weekend or
    holiday) to ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'DayOfWeek' column of English weekday names and a 'Date'
        column of datetime values at midnight.
    start, end : str or datetime-like, optional
        Range over which holiday dates are generated. The defaults are the
        range that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for chaining.
    """
    # These names are not imported at file level, so import them locally.
    from pandas.tseries.holiday import (
        AbstractHolidayCalendar, TH, USLaborDay, USMemorialDay,
        USThanksgivingDay, nearest_workday,
    )

    # Build the rule list explicitly instead of popping entries from
    # USFederalHolidayCalendar.rules by position: that list is shared at class
    # level (so popping changes it for every later call) and its layout
    # differs between pandas versions.
    rules = [
        Holiday("New Year's Day", month=1, day=1, observance=nearest_workday),
        USMemorialDay,
        Holiday('Independence Day', month=7, day=4, observance=nearest_workday),
        USLaborDay,
        USThanksgivingDay,
        # Black Friday is the day after Thanksgiving (4th Thursday of
        # November); this differs from "4th Friday" when November 1st is a Friday.
        Holiday('BlackFriday', month=11, day=1,
                offset=[DateOffset(weekday=TH(4)), DateOffset(days=1)]),
        Holiday('Christmas Day', month=12, day=25, observance=nearest_workday),
    ]
    own_calendar = AbstractHolidayCalendar(name='OwnCalendar', rules=rules)
    holidays = own_calendar.holidays(start=start, end=end)

    # set flag according to whether the day is a holiday, a weekend or not
    is_weekend = df['DayOfWeek'].isin(['Saturday', 'Sunday'])
    is_holiday = df['Date'].isin(holidays)

    # transform from True/False into 1/0
    df['isWorkingday'] = (~is_weekend & ~is_holiday).astype(int)

    return df
    
def compute_night(df):
    """
    Flag each row as night (1) or day (0) from its month and time of day.

    A row is "day" when its 'Time' lies strictly between the month's start and
    end thresholds below; every other row (including months outside 1-12) is
    "night". The thresholds are decimal hours and look like approximate
    sunrise/sunset times -- NOTE(review): presumably for the data's location,
    confirm the source of these values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Month' column (1-12) and a 'Time' column in decimal hours.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with an integer 'Night' column added in place.
    """
    # month -> (day starts after, day ends before), in decimal hours
    daylight = {
        1: (7.3, 17.3),
        2: (7.0, 17.8),
        3: (6.3, 18.3),
        4: (5.5, 18.8),
        5: (5.0, 19.3),
        6: (4.8, 19.5),
        7: (5.0, 19.5),
        8: (5.5, 19.0),
        9: (5.8, 18.3),
        10: (6.3, 17.5),
        11: (6.8, 17.0),
        12: (7.3, 16.9),
    }
    # Look the thresholds up from df's own 'Month' column rather than a global
    # frame, so the function works for any frame it is given.
    day_start = df['Month'].map({month: lo for month, (lo, _) in daylight.items()})
    day_end = df['Month'].map({month: hi for month, (_, hi) in daylight.items()})

    # Unknown months map to NaN; comparisons with NaN are False, so such rows
    # stay flagged as night.
    is_day = (df['Time'] > day_start) & (df['Time'] < day_end)

    # transform from True/False into 1/0
    df['Night'] = (~is_day).astype(int)
    return df
    


In [1]:
def categorize_crimes(df):
    """
    Add one boolean indicator column per coarse crime group.

    Each 'Category' value is matched against the group lists below. The
    indicator columns are added to ``df`` in place. The public-disorder and
    drugs/alcohol groups used to be defined without a matching column; they
    now get 'isPublicDisorder' and 'isDrugsAlcohol'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Category' column of crime category names.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for chaining.
    """
    crime_groups = {
        'isOtherCrimes': ['WARRANTS', 'OTHER OFFENSES', 'WEAPON LAWS', 'SECONDARY CODES'],
        'isFamilyCrimes': ['MISSING PERSON', 'RUNAWAY', 'FAMILY OFFENSES', 'SUICIDE'],
        'isViolentCrimes': ['ASSAULT', 'SEX OFFENSES FORCIBLE', 'ARSON', 'KIDNAPPING'],
        'isTheftBurglary': ['LARCENY/THEFT', 'VEHICLE THEFT', 'ROBBERY', 'BURGLARY',
                            'STOLEN PROPERTY', 'RECOVERED VEHICLE'],
        'isEconomicCrimes': ['FORGERY/COUNTERFEITING', 'FRAUD', 'BRIBERY', 'EMBEZZLEMENT',
                             'BAD CHECKS', 'EXTORTION'],
        'isPublicDisorder': ['VANDALISM', 'NON-CRIMINAL', 'SUSPICIOUS OCC', 'TRESPASS',
                             'PROSTITUTION', 'DISORDERLY CONDUCT', 'LOITERING',
                             'SEX OFFENSES NON FORCIBLE', 'PORNOGRAPHY/OBSCENE MAT',
                             'TREA', 'GAMBLING'],
        'isDrugsAlcohol': ['DRUNKENNESS', 'DRUG/NARCOTIC', 'DRIVING UNDER THE INFLUENCE',
                           'LIQUOR LAWS'],
    }

    # dict order is insertion order, so the five pre-existing columns are
    # still created first and in their original order.
    for column, categories in crime_groups.items():
        df[column] = df['Category'].isin(categories)

    return df

def check_intersection(df):
    """
    Add a 'StreetCorner' column: 1 when the address contains a '/', else 0.

    A single column is enough, since every address is treated as either an
    intersection (1) or a block (0). The column is added to ``df`` in place
    and the same frame is returned.
    """
    def _has_slash(address):
        # a '/' in the address string marks an intersection
        return 1 if '/' in address else 0

    df['StreetCorner'] = df['Address'].map(_has_slash)
    return df


In [None]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

def xy_clean(df):
    """
    Replace implausible coordinates with the median of their police district.

    Rows whose 'Y' is greater than 38 get both 'X' and 'Y' overwritten by the
    per-'PdDistrict' medians, which are computed over all rows of ``df``.
    The frame is modified in place and returned.
    """
    # medians of X and Y by police district
    by_district = df.groupby('PdDistrict')
    median_x = by_district['X'].median()
    median_y = by_district['Y'].median()

    # Rows to repair. Each district is visited once, so fixing this up front
    # selects the same rows as re-testing Y before every assignment.
    out_of_range = df['Y'] > 38

    # impute wrong values with the medians
    for district in df['PdDistrict'].unique():
        fill_x = median_x[district]
        rows = out_of_range & (df['PdDistrict'] == district)
        df.loc[rows, 'X'] = fill_x
        df.loc[rows, 'Y'] = median_y[district]

    return df
        

def xy_scale(df):
    """
    Standardise the 'X' and 'Y' columns with a StandardScaler.

    The scaler is fitted on the very frame it then transforms. The two columns
    are overwritten in place and the same frame is returned.
    """
    coord_cols = ["X", "Y"]
    scaler = StandardScaler()
    # fit() returns the fitted scaler, so this chain is fit-then-transform
    df[coord_cols] = scaler.fit(df[coord_cols]).transform(df[coord_cols])

    return df

def xy_rotate(df):
    """
    Add rotated copies of the coordinates for 45, 30 and 60 degrees.

    For each angle two columns are added in place, 'rot<angle>_X' and
    'rot<angle>_Y', using the rounded coefficients from the table below.
    """
    x, y = df["X"], df["Y"]

    # (angle label, coefficient a, coefficient b), applied as
    #   rot_X = a*X + b*Y   and   rot_Y = a*Y - b*X
    rotations = (
        ("45", 0.707, 0.707),
        ("30", 0.866, 0.5),
        ("60", 0.5, 0.866),
    )
    for label, a, b in rotations:
        df["rot" + label + "_X"] = a * x + b * y
        df["rot" + label + "_Y"] = a * y - b * x
    return df

def xy_distance_from_center(df):
    """
    Compute radial distance from center of map

    Adds a 'radial_r' column holding sqrt(X**2 + Y**2), i.e. the distance of
    each point from the origin of the X/Y coordinates. The column is added to
    ``df`` in place and the same frame is returned.
    """
    # The module is imported as `np`; the bare name `numpy` is not defined.
    df["radial_r"] = np.hypot(df["X"], df["Y"])
    return df


def xy_distance_from_crime_median(df):
    """
    Add, for every crime category, the distance of each row from that
    category's median location.

    For each distinct non-null value ``c`` in 'Category' a column ``c + "_r"``
    is added in place. It holds the Euclidean distance between the row's
    (X, Y) and the median (X, Y) of all rows whose category is ``c``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'Category', 'X' and 'Y' columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for chaining.
    """
    # medians of X and Y by crime category
    by_category = df.groupby('Category')
    crimeX = by_category['X'].median()
    crimeY = by_category['Y'].median()

    # compute distance from crime median; null categories are skipped because
    # groupby drops them and they cannot be turned into a column name
    for crime_name in df['Category'].dropna().unique():
        # The module is imported as `np`; the bare name `numpy` is not defined.
        df[crime_name + "_r"] = np.hypot(df["X"] - crimeX[crime_name],
                                         df["Y"] - crimeY[crime_name])

    return df




In [None]:
def label_encode(df):
    """
    Replace the 'PdDistrict' and 'DayOfWeek' columns with label-encoded values.

    A single LabelEncoder is refitted for each column in turn. Both columns
    are overwritten in place and the same frame is returned.
    """
    encoder = LabelEncoder()
    for column in ('PdDistrict', 'DayOfWeek'):
        # fit_transform refits the encoder on this column before encoding it
        df[column] = encoder.fit_transform(df[column])
    return df

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as a test set; the fixed random_state keeps the
# split the same on every run.
train, test = train_test_split(
    train_df,
    test_size=0.3,
    random_state=1,
)