In [1]:
import pandas as pd
import numpy as np
from workalendar.usa import Illinois
from sklearn.cluster import *

In [2]:
calendar = Illinois()   

n_clusters = 50

def feature_selection(initial_csv):
    columns_to_use = ['Date', 'X Coordinate', 'Y Coordinate']
    data = pd.read_csv(initial_csv, sep=',', encoding='utf8', usecols=columns_to_use)
    return data

def date_decompose(df):
    df = df.copy()
    df['Year'] = df['Date'].dt.year
    df['Month'] = df['Date'].dt.month
    df['Day'] = df['Date'].dt.day
    df['Hour'] = df['Date'].dt.hour
    df['Weekday'] = df['Date'].dt.weekday
    ## Adding day off informations does not seem to be useful eventually
    #df['Weekend'] = (df['Weekday'] < 5).astype(int) ## not pretty effective
    #df['Holiday'] = df['Date'].apply(lambda x: int(calendar.is_holiday(x))) ## same
    return df

def feature_engineering(df):
    df['Date'] = pd.to_datetime(df['Date'], format='%m/%d/%Y %I:%M:%S %p')
    df = df.dropna()
    df = date_decompose(df)
    df['Crimes'] = 1
    coordX = df['X Coordinate']
    coordY = df['Y Coordinate']
    coords = pd.concat([coordX, coordY],axis=1).as_matrix()
    ## The mini batch KMeans is appropriate here as it stays time efficient even for huge dataset
    modCluster = MiniBatchKMeans(n_clusters=n_clusters, init='k-means++', max_iter=100, batch_size=100, verbose=0, compute_labels=True,
                                 random_state=None, tol=0.0, max_no_improvement=10, init_size=None, n_init=3, reassignment_ratio=0.01)
    print('begin clustering...')
    labels = modCluster.fit_predict(coords)
    print('done clustering...')
        
    df['Cluster'] = labels
    df = df[['Year', 'Month', 'Day', 'Weekday', 'Hour', 'Cluster', 'Crimes']]
    df = df.groupby(['Year', 'Month', 'Day', 'Weekday', 'Hour', 'Cluster']).sum().reset_index()
    
    years = df['Year']
    months = df['Month']
    days = df['Day']
    weekday = df['Weekday']
    hours = df['Hour']
    cluster = df['Cluster']
    crimes = df['Crimes']
    
    df = pd.concat([years, months, days, weekday, hours, cluster, crimes], axis=1)
        
    ## Two years of data, from 2008 to 2009 included
    #df = df.query('Year >= 2008 and Year <= 2009')
    ## One year of data, 2008
    df = df.query('Year == 2008')
        
    return df

#inputfile = 'data/stack.csv' ## Sample dataset
inputfile = 'data/Crimes_-_2001_to_present.csv' 

dataSelected = feature_selection(inputfile)
print("done data selection...")
dataTransformed = feature_engineering(dataSelected)
print("done all of feature engineering...")

done data selection...
begin clustering...
done clustering...
done all of feature engineering...


In [3]:
from datetime import datetime
from datetime import timedelta
import math

""" 
    The goal here is to generate new rows matching (date, location) couples for which no crime occured. 
    In order to do so, we have to consider all (date, location) couples in a given range of dates 
    (all dates between the first and last chosen dates) 
"""

years = dataTransformed['Year'].values
years = np.reshape(years, (len(years), 1))
months = pd.get_dummies(dataTransformed['Month']).values
days = dataTransformed['Day'].values
days = np.reshape(days, (len(days), 1))
weekdays = pd.get_dummies(dataTransformed['Weekday']).values
hours = dataTransformed['Hour'].values
hours = np.reshape(hours, (len(hours), 1))
clusters = pd.get_dummies(dataTransformed['Cluster']).values
crimes = dataTransformed['Crimes'].values
crimes = np.reshape(crimes, (len(crimes), 1))

hours_clusters_crimes = np.concatenate((years, months, days, weekdays, hours, clusters, crimes), axis=1)

date_begin = dataTransformed.head(1).ix[:, :3].values[0]
date_end = dataTransformed.tail(1).ix[:, :3].values[0]

date_begin_dt = datetime(date_begin[0], date_begin[1], date_begin[2])
date_end_dt = datetime(date_end[0], date_end[1], date_end[2])

diff = date_end_dt - date_begin_dt
theorical_size = (diff.days + 1) * n_clusters * 24

date_cursor = date_begin_dt
mat = []
compt_hours = 0
for i in range(theorical_size):
    if i%n_clusters == 0 and i != 0:
        date_cursor += timedelta(hours=1)
    
    tok_clusters = i % n_clusters
      
    year = date_cursor.year  
    month = [0.]*(date_cursor.month -1) + [1.] + [0.]*(12 - date_cursor.month)
    day = date_cursor.day
    weekday = [0.]*date_cursor.weekday() + [1.] + [0.]*(7-date_cursor.weekday()-1)
    hour = date_cursor.hour
    vec_cluster = [0.]*tok_clusters + [1.] + [0.]*(n_clusters-tok_clusters-1)
    
    mat.append(np.asarray([year] + month + [day] + weekday + [hour] + vec_cluster + [0.]))
    
mat = np.asarray(mat)

index = []
compt = 0

for i in range(len(mat)):
    if np.array_equal(mat[i, :-1], hours_clusters_crimes[compt, :-1]):
        index.append([i, hours_clusters_crimes[compt, -1]])
        compt += 1
        if compt == len(hours_clusters_crimes):
            break
    else:
        pass
    if i % 100000 == 0:
        state = i / len(mat) * 100
        print("{0:.2f}".format(state) + '%')

for i in range(len(index)):
    mat[index[i][0], -1] = index[i][1]

0.00%
22.77%
45.54%
68.31%
91.07%


In [4]:
outputfile = 'data/complete_grouped_crimes.csv'
pd.DataFrame(mat).to_csv(outputfile, sep=';', mode='w', encoding='utf8', index=False)

In [5]:
import numpy as np
import platform
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import Pipeline

def model(X, y):
    print('Training the model..')
            
    reg = Pipeline([
            ('rfr', RFR(n_estimators=30, criterion='mse', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, 
             max_features='auto', max_leaf_nodes=None, min_impurity_split=1e-07, bootstrap=True, oob_score=False, n_jobs=1, random_state=None, 
             verbose=0, warm_start=False))
            ]) 
    reg.fit(X, y)
    print('Done training the model..')       
    return reg
    
def predict(X, reg):    
    y_pred = reg.predict(X)         
    return y_pred

def parse_file(file):
    X = []
    y = []
    with open(file, 'r') as data:
        data.readline()
        for line in data:
            row = line.split(';')
            for i in range(len(row)):
                row[i] = float(row[i])
            X.append(row[:len(row)-1])
            y.append(row[len(row)-1])
    return X, y

In [6]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.preprocessing import StandardScaler
from datetime import *
from sklearn.metrics import mean_squared_error

from sklearn.svm import SVC 
        
def crossValidationError(file='data/complete_grouped_crimes.csv'):
    p_unlabelled = 0.2
    X, y = parse_file(file)   
    print("Done parsing..")
    X = np.asarray(X)
    Y = np.asarray(y)
    Y = Y.reshape(len(Y),1)
    
    X_lab, X_unlab, y_lab, y_unlab = train_test_split(X, Y, test_size=p_unlabelled, random_state=57)
    print("Done splitting labeled/unlabeled ones..")
    
    mod = model(X_lab, y_lab)
    print('Done training..')
    
    yPred = predict(X_unlab, mod)
    
    mseError = mean_squared_error(y_unlab, yPred)
    return mseError

if __name__ == '__main__':
    crossvalidationError = crossValidationError()
    print('--------------------------') 
    print('Mean Squared Error = ' + str(crossvalidationError))
    print('--------------------------') 

Done parsing..
Done splitting labeled/unlabeled ones..
Training the model..


  self._final_estimator.fit(Xt, y, **fit_params)


Done training the model..
Done training..
--------------------------
Mean Squared Error = 1.18908343453
--------------------------
