In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder


In [2]:
# Load the train and test CSVs with identical options; the 'Dates' column
# is parsed into timestamps at read time.
train_df, test_df = (
    pd.read_csv(csv_path, parse_dates=['Dates'])
    for csv_path in ('./data/train.csv', './data/test.csv')
)

In [3]:
# Peek at three randomly chosen training rows (unseeded, so rows differ per run).
train_df.sample(n=3)

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
222561,2012-05-20 12:00:00,NON-CRIMINAL,LOST PROPERTY,Sunday,SOUTHERN,NONE,3RD ST / HOWARD ST,-122.400474,37.785029
653807,2006-02-06 16:31:00,OTHER OFFENSES,TRAFFIC VIOLATION,Monday,MISSION,"ARREST, CITED",MISSION ST / 16TH ST,-122.419672,37.76505
284739,2011-06-19 23:10:00,SUSPICIOUS OCC,INVESTIGATIVE DETENTION,Sunday,MISSION,NONE,3000 Block of 16TH ST,-122.42007,37.764972


In [4]:
# Peek at three randomly chosen test rows (unseeded, so rows differ per run).
test_df.sample(n=3)

Unnamed: 0,Id,Dates,DayOfWeek,PdDistrict,Address,X,Y
147545,147545,2013-06-06 23:18:00,Thursday,MISSION,SOUTH VAN NESS AV / 22ND ST,-122.416569,37.755569
29822,29822,2014-12-21 16:30:00,Sunday,TENDERLOIN,200 Block of MASON ST,-122.409524,37.78576
177770,177770,2013-01-16 04:23:00,Wednesday,TENDERLOIN,0 Block of MCALLISTER ST,-122.412597,37.781119


In [5]:
def engineer_dates_col(df):
    """Derive calendar features from the ``Dates`` column.

    The frame is modified in place (new columns are added to ``df``) and the
    same object is returned so calls can be chained.

    Columns added:
        Year      -- calendar year of the timestamp.
        Month     -- calendar month, 1..12.
        IsDay     -- 1 when the hour is strictly between 6 and 20
                     (i.e. 07:00 through 19:59), otherwise 0.
        MonthSin, MonthCos -- sine / cosine of ``Month * 2*pi / 12``, a cyclic
                     encoding in which December and January are neighbours.

    Parameters:
        df: DataFrame with a ``Dates`` column that ``pd.to_datetime`` can parse.

    Returns:
        The same DataFrame, with the columns above added.
    """
    # Harmless when the column is already datetime64; also lets the function
    # accept frames whose 'Dates' column is still plain strings.
    timestamps = pd.to_datetime(df['Dates'])
    hour = timestamps.dt.hour

    df['Year'] = timestamps.dt.year
    df['Month'] = timestamps.dt.month
    # Vectorised form of `1 if 6 < h < 20 else 0`; avoids calling a Python
    # lambda once per row and yields an int64 column even for empty input.
    df['IsDay'] = ((hour > 6) & (hour < 20)).astype('int64')
    df['MonthSin'] = np.sin((df['Month']*2*np.pi)/12)
    df['MonthCos'] = np.cos((df['Month']*2*np.pi)/12)
    return df

In [6]:
def engineer_dayofweek_col(df):
    """Add a cyclic (sine / cosine) encoding of the ``DayOfWeek`` column.

    Day names are numbered Monday=1 through Sunday=7 and projected onto the
    unit circle, giving ``DayOfWeekSin`` and ``DayOfWeekCos``. Names that are
    not one of the seven listed produce NaN in both columns.

    The frame is modified in place and returned.
    """
    weekday_names = [
        'Monday',
        'Tuesday',
        'Wednesday',
        'Thursday',
        'Friday',
        'Saturday',
        'Sunday',
    ]
    weekday_number = {
        name: position for position, name in enumerate(weekday_names, start=1)
    }

    # Numerator of the angle (day number * 2*pi); divided by 7 below.
    scaled_day = df['DayOfWeek'].map(weekday_number) * 2 * np.pi
    df['DayOfWeekSin'] = np.sin(scaled_day / 7)
    df['DayOfWeekCos'] = np.cos(scaled_day / 7)
    return df

In [7]:
def engineer_category_col(df):
    """Replace the ``Category`` column with integer codes from a LabelEncoder.

    A fresh ``LabelEncoder`` is fitted on this frame's ``Category`` values and
    the column is overwritten with the resulting codes. The frame is modified
    in place and returned. The fitted encoder is not kept, so the code-to-name
    mapping cannot be recovered from this function's result.

    Parameters:
        df: DataFrame with a ``Category`` column.

    Returns:
        The same DataFrame with ``Category`` holding integer codes.
    """
    encoder = LabelEncoder()
    # Assign the encoded array directly so values are placed by position.
    # Wrapping it in a new pd.Series would give it a 0..n-1 index, and column
    # assignment aligns on index labels -- on a frame whose index is not the
    # default RangeIndex that silently yields NaN / misplaced codes.
    df['Category'] = encoder.fit_transform(df['Category'])
    return df

In [8]:
def engineer_pddistrict_col(df):
    """Replace the ``PdDistrict`` column with integer codes from a LabelEncoder.

    A fresh ``LabelEncoder`` is fitted on this frame's ``PdDistrict`` values
    and the column is overwritten with the resulting codes. The frame is
    modified in place and returned.

    NOTE(review): the encoder is fitted on whichever frame is passed in, so
    two frames encoded by separate calls only share the same code for a
    district if both contain the same set of district names -- confirm this
    holds for the train and test data.

    Parameters:
        df: DataFrame with a ``PdDistrict`` column.

    Returns:
        The same DataFrame with ``PdDistrict`` holding integer codes.
    """
    encoder = LabelEncoder()
    # Assign the encoded array directly so values are placed by position.
    # Wrapping it in a new pd.Series would give it a 0..n-1 index, and column
    # assignment aligns on index labels -- on a frame whose index is not the
    # default RangeIndex that silently yields NaN / misplaced codes.
    df['PdDistrict'] = encoder.fit_transform(df['PdDistrict'])
    return df

In [9]:
def engineer_all_cols(df, encode_category=False):
    """Run the feature-engineering steps on ``df`` in a fixed order.

    The date, day-of-week and police-district steps always run, in that
    order. The category step runs last and only when ``encode_category`` is
    true (the test frame shown in this notebook has no ``Category`` column).

    Parameters:
        df: DataFrame to transform; each step receives the previous result.
        encode_category: when True, also label-encode the ``Category`` column.

    Returns:
        The DataFrame returned by the final step.
    """
    pipeline = [
        engineer_dates_col,
        engineer_dayofweek_col,
        engineer_pddistrict_col,
    ]
    if encode_category:
        pipeline.append(engineer_category_col)

    for step in pipeline:
        df = step(df)
    return df

In [10]:
# Only the training frame has its Category column label-encoded; the test
# frame goes through the remaining steps only.
train_df = engineer_all_cols(df=train_df, encode_category=True)
test_df = engineer_all_cols(df=test_df, encode_category=False)

In [11]:
# Spot-check three random training rows after feature engineering.
train_df.sample(n=3)

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y,Year,Month,IsDay,MonthSin,MonthCos,DayOfWeekSin,DayOfWeekCos
255342,2011-12-02 12:50:00,1,BATTERY,Friday,9,"ARREST, BOOKED",500 Block of ELLIS ST,-122.415414,37.784477,2011,12,1,-2.449294e-16,1.0,-0.974928,-0.222521
298152,2011-04-09 01:20:00,5,MAINTAINING A PUBLIC NUISANCE,Saturday,3,"ARREST, BOOKED",500 Block of CAPP ST,-122.417818,37.757888,2011,4,0,0.8660254,-0.5,-0.781831,0.62349
358351,2010-05-21 10:00:00,16,GRAND THEFT FROM A BUILDING,Friday,0,NONE,700 Block of INNES AV,-122.372926,37.73046,2010,5,1,0.5,-0.866025,-0.974928,-0.222521


In [12]:
# Spot-check three random test rows after feature engineering.
test_df.sample(n=3)

Unnamed: 0,Id,Dates,DayOfWeek,PdDistrict,Address,X,Y,Year,Month,IsDay,MonthSin,MonthCos,DayOfWeekSin,DayOfWeekCos
442017,442017,2009-02-28 23:30:00,Saturday,7,0 Block of TANDANGSORA ST,-122.399982,37.781719,2009,2,0,0.866025,0.5,-0.7818315,0.62349
533871,533871,2007-11-10 20:30:00,Saturday,8,1900 Block of 36TH AV,-122.494424,37.750622,2007,11,0,-0.5,0.866025,-0.7818315,0.62349
469322,469322,2008-10-12 01:55:00,Sunday,5,FULTON ST / MASONIC AV,-122.446473,37.775802,2008,10,0,-0.866025,0.5,-2.449294e-16,1.0


In [13]:
# DataFrame.to_csv does not create missing parent directories, so make sure
# the intermediate output folder exists first; exist_ok keeps re-runs safe.
os.makedirs('./data/inter', exist_ok=True)

# Write the engineered frames without the index column.
train_df.to_csv('./data/inter/train_clean.csv', index=False)
test_df.to_csv('./data/inter/test_clean.csv', index=False)
print('Done Outputing !')

Done Outputing !
