In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
# Read the raw train/test CSVs; the 'Dates' column is parsed into datetimes on load.
train_df = pd.read_csv(
    filepath_or_buffer='./data/train.csv',
    parse_dates=['Dates'],
)
test_df = pd.read_csv(
    filepath_or_buffer='./data/test.csv',
    parse_dates=['Dates'],
)

In [3]:
# Show three randomly chosen rows of the raw training frame.
train_df.sample(n=3)

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
460158,2008-11-14 14:30:00,FORGERY/COUNTERFEITING,"CHECKS, FORGERY (FELONY)",Friday,CENTRAL,NONE,700 Block of GRANT AV,-122.406246,37.793611
122414,2013-09-22 09:00:00,OTHER OFFENSES,LOST/STOLEN LICENSE PLATE,Sunday,CENTRAL,NONE,800 Block of WASHINGTON ST,-122.407793,37.795033
566612,2007-05-15 08:00:00,NON-CRIMINAL,LOST PROPERTY,Tuesday,MISSION,NONE,26TH ST / MISSION ST,-122.418137,37.749041


In [4]:
# Show three randomly chosen rows of the raw test frame.
test_df.sample(n=3)

Unnamed: 0,Id,Dates,DayOfWeek,PdDistrict,Address,X,Y
443486,443486,2009-02-25 13:29:00,Wednesday,SOUTHERN,800 Block of HARRISON ST,-122.4008,37.779934
385478,385478,2009-12-18 17:37:00,Friday,INGLESIDE,GENEVA AV / PARIS ST,-122.439438,37.715674
74302,74302,2014-05-22 19:00:00,Thursday,CENTRAL,THE EMBARCADERONORTH ST / WASHINGTON ST,-122.39533,37.796665


In [5]:
def engineer_dates_col(df):
    """Derive calendar features from the 'Dates' column.

    Returns a new DataFrame (the input frame is left untouched) with these
    columns added or overwritten:

    - 'Year', 'Month': calendar year and month number of each timestamp.
    - 'IsDay': 1 when the hour is strictly greater than 6 and strictly less
      than 20 (i.e. 07:00 through 19:59), otherwise 0.
    - 'MonthSin', 'MonthCos': sine/cosine of ``Month * 2*pi / 12`` so the
      month is represented on a 12-step cycle.

    The 'Dates' column itself is not modified; it is converted with
    ``pd.to_datetime`` only for the purpose of extracting the features, so
    it may hold either datetimes or parseable strings.
    """
    dates = pd.to_datetime(df['Dates'])
    month = dates.dt.month
    hour = dates.dt.hour

    # Vectorised form of `1 if (h > 6 and h < 20) else 0`; missing hours
    # compare False on both sides and therefore map to 0, as before.
    is_day = ((hour > 6) & (hour < 20)).astype('int64')

    month_angle = (month * 2 * np.pi) / 12

    # assign() builds a new frame, so the caller's DataFrame is not mutated.
    return df.assign(
        Year=dates.dt.year,
        Month=month,
        IsDay=is_day,
        MonthSin=np.sin(month_angle),
        MonthCos=np.cos(month_angle),
    )

In [6]:
def engineer_dayofweek_col(df):
    """Add a cyclical encoding of the 'DayOfWeek' column.

    Day names are mapped to the integers 1 (Monday) through 7 (Sunday) and
    then projected onto a 7-step cycle, producing two new columns:
    'DayOfWeekSin' and 'DayOfWeekCos'.

    Returns a new DataFrame; the input frame is left untouched. The original
    'DayOfWeek' column is kept as-is.

    Values that are not one of the seven capitalised English day names are
    not matched by the lookup and yield NaN in both output columns.
    """
    day_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                 'Friday', 'Saturday', 'Sunday']
    day_number = {name: position for position, name in enumerate(day_names, start=1)}

    days_int = df['DayOfWeek'].map(day_number)
    day_angle = (days_int * 2 * np.pi) / 7

    # assign() builds a new frame, so the caller's DataFrame is not mutated.
    return df.assign(
        DayOfWeekSin=np.sin(day_angle),
        DayOfWeekCos=np.cos(day_angle),
    )

In [7]:
def engineer_category_col(df):
    """One-hot encode the 'Category' column.

    Returns a new DataFrame in which 'Category' is replaced by one
    ``Category_<value>`` indicator column per distinct value; the input
    frame is not modified.

    The indicator dtype is pinned to uint8 so the columns hold 0/1
    regardless of the installed pandas version (pandas 2.0 changed the
    ``get_dummies`` default from uint8 to bool, which would otherwise be
    written out as True/False).
    """
    return pd.get_dummies(df, columns=['Category'], dtype='uint8')

In [8]:
from sklearn.preprocessing import LabelEncoder
def engineer_pddistrict_col(df):
    """Replace the 'PdDistrict' labels with integer codes.

    A fresh ``LabelEncoder`` is fitted on the 'PdDistrict' values of ``df``
    and the column is replaced by the resulting codes. Returns a new
    DataFrame; the input frame is left untouched.

    NOTE(review): because the encoder is fitted per call, the code given to
    a district depends on which districts are present in ``df``. Codes from
    two separate calls (e.g. train and test) are only comparable when both
    frames contain the same set of districts -- confirm this holds for the
    data, or share one fitted encoder.
    """
    le = LabelEncoder()
    codes = le.fit_transform(df['PdDistrict'])
    # Assign the raw array (positional) rather than wrapping it in a
    # pd.Series: a new Series carries a 0..n-1 index, and column assignment
    # aligns on index labels, which yields NaN / misplaced codes whenever
    # `df` does not have a default RangeIndex (filtered, sampled, re-indexed).
    return df.assign(PdDistrict=codes)

In [9]:
def engineer_all_cols(df, encode_category=False):
    """Run every feature-engineering step on ``df`` and return the result.

    The date, day-of-week and police-district steps are always applied, in
    that order. The 'Category' one-hot encoding is applied last and only
    when ``encode_category`` is true.
    """
    engineered = (
        df.pipe(engineer_dates_col)
          .pipe(engineer_dayofweek_col)
          .pipe(engineer_pddistrict_col)
    )
    if not encode_category:
        return engineered
    return engineer_category_col(engineered)

In [10]:
# Engineer features on both splits. Only the training frame has a
# 'Category' column, so only it is one-hot encoded.
train_df = engineer_all_cols(df=train_df, encode_category=True)
test_df = engineer_all_cols(df=test_df)

In [11]:
# Show three randomly chosen rows of the engineered training frame.
train_df.sample(n=3)

Unnamed: 0,Dates,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y,Year,Month,...,Category_SEX OFFENSES NON FORCIBLE,Category_STOLEN PROPERTY,Category_SUICIDE,Category_SUSPICIOUS OCC,Category_TREA,Category_TRESPASS,Category_VANDALISM,Category_VEHICLE THEFT,Category_WARRANTS,Category_WEAPON LAWS
458426,2008-11-25 16:11:00,"BURGLARY OF STORE, UNLAWFUL ENTRY",Tuesday,9,NONE,0 Block of GRANT AV,-122.404997,37.787196,2008,11,...,0,0,0,0,0,0,0,0,0,0
307047,2011-02-22 15:50:00,"DRIVERS LICENSE, SUSPENDED OR REVOKED",Tuesday,0,"ARREST, CITED",3RD ST / KIRKWOOD AV,-122.389588,37.737957,2011,2,...,0,0,0,0,0,0,0,0,0,0
428529,2009-04-30 14:00:00,STOLEN AUTOMOBILE,Thursday,8,NONE,RIVERA ST / 31ST AV,-122.488805,37.746211,2009,4,...,0,0,0,0,0,0,0,1,0,0


In [12]:
# Show three randomly chosen rows of the engineered test frame.
test_df.sample(n=3)

Unnamed: 0,Id,Dates,DayOfWeek,PdDistrict,Address,X,Y,Year,Month,IsDay,MonthSin,MonthCos,DayOfWeekSin,DayOfWeekCos
792665,792665,2004-03-14 16:00:00,Sunday,0,3RD ST / WILLIAMS AV,-122.392625,37.72928,2004,3,1,1.0,6.123234000000001e-17,-2.449294e-16,1.0
530336,530336,2007-12-04 12:18:00,Tuesday,0,JERROLD AV / BAY SHORE BL,-122.403564,37.747761,2007,12,1,-2.449294e-16,1.0,0.9749279,-0.222521
200388,200388,2012-09-24 16:00:00,Monday,7,THE EMBARCADEROSOUTH ST / BRANNAN ST,-122.38819,37.784773,2012,9,1,-1.0,-1.83697e-16,0.7818315,0.62349


In [13]:
# Write the engineered frames as intermediate CSVs (without the index column).
# to_csv does not create missing directories, so make sure the target exists
# first; exist_ok keeps re-runs of this cell from failing.
os.makedirs('./data/inter', exist_ok=True)
train_df.to_csv('./data/inter/train_clean.csv', index=False)
test_df.to_csv('./data/inter/test_clean.csv', index=False)
print('Done Outputing !')

Done Outputing !
