In [None]:
#import libraries
import numpy as np
import scipy as scp
import matplotlib as mpl
import sklearn as skl
import matplotlib.pyplot as plt
import pandas
from pandas import DataFrame
import gzip
import re
import pydot
from sklearn import svm
import time

%matplotlib inline

In [None]:
# Load the raw training and test sets.
# Forward slashes are used as path separators because Python accepts them on every
# platform (Windows included), whereas the backslash form only resolves on Windows.
train_data_frame = pandas.read_csv("data/train.csv", sep=",", quotechar='"')
test = pandas.read_csv("data/test.csv", sep=",", quotechar='"')

#extract year, month and hour and create "Year", "Month", "Hour" columns in train_data_frame and test


#
def parse_date(date):
    """
    Extract the year, month and hour from a timestamp string.

    :param date: timestamp formatted as in the training set, i.e. "YYYY-MM-DD HH:MM:SS"
    :return: tuple of ints (year, month, hour); the day, minutes and seconds are discarded
    :raises ValueError: if date does not match the expected format
    """
    mo = re.search(r'^([0-9]{4})-([0-9]{2})-[0-9]{2}\s+([0-9]{2}):[0-9]{2}:[0-9]{2}$', date)
    if mo is None:
        # Fail with an explicit message instead of an AttributeError on None.
        raise ValueError("date %r is not formatted as 'YYYY-MM-DD HH:MM:SS'" % (date,))
    # The pattern has exactly three capture groups: year, month and hour.
    # A tuple (rather than a lazy map object) can be indexed and iterated more than once.
    return tuple(int(part) for part in mo.groups())

# Derive the 'Year', 'Month' and 'Hour' columns from 'Dates' on both the training
# and the test frame (training frame first), for later use as features.
for frame in (train_data_frame, test):
    frame['Year'], frame['Month'], frame['Hour'] = zip(*frame['Dates'].map(parse_date))

## Geographically, no anomaly was detected apart from the outliers below,
## which (per the original exploration) are the same instances on both axes:
#outliersX = np.where(train_data_frame.X >-121)[0]
#outliersY = np.where(train_data_frame.Y > 38)[0]

# Clean data: suppress the outliers by keeping only the rows whose Y is below 38
train_data_frame_ret = train_data_frame.loc[train_data_frame["Y"] < 38]
#train_data_frame_ret.shape

In [None]:
#converts a one-dimensional ndarray of category indices into a 2-dimensional ndarray
#with the same number of rows and one column per category, filled with 0 except for a 1 at the position given by the input

def convertToSubmittable(prediction, n_categories=None):
    """
    One-hot encode a column of predicted Crime Category indices.

    Each row of the result is filled with 0s except for a single 1 in the column
    whose index is the predicted category of that row.
    :param prediction: 1 dimensional sequence/ndarray of integer category indices
    :param n_categories: number of columns of the result; when None (the default)
        it is len(crimeCategoryList), a list that must be defined elsewhere in the notebook
    :return submittable: 2 dimensional ndarray with {columns : Crime Categories, rows : Predictions}
    :raises IndexError: if an index is out of range or not an integer
    """
    if n_categories is None:
        n_categories = len(crimeCategoryList)
    labels = np.asarray(prediction)
    submittable = np.zeros((len(labels), n_categories))
    # An empty input yields a float array that cannot be used as an index, so skip it.
    if len(labels):
        # Single vectorised assignment: row i gets a 1 in column labels[i].
        submittable[np.arange(len(labels)), labels] = 1
    return submittable

    
def prepend(filename, line):
    """
    Insert text at the very beginning of a file, directly in front of its current content.

    Trailing carriage returns / newlines are stripped from line and no separator is added,
    so the text ends up in the first column of the first row of a csv file.
    :type filename: String
    :type line: String
    :param filename: path to csv file
    :param line: String to be added
    """
    header = line.rstrip('\r\n')
    # 'r+' opens the existing file for both reading and writing without truncating it
    with open(filename, 'r+') as handle:
        existing = handle.read()
        # rewind to the start so the new text overwrites from the first byte
        handle.seek(0)
        handle.write(header + existing)
        
def tocsv(submission,filename):
    """
    Write a 2d array of per-category values to a csv file named filename.

    Column names are the sorted unique values of train_data_frame["Category"]; they are
    paired, in order, with the columns of submission (i.e. the rows of submission.T).
    :type submission: ndarray
    :param submission: array that contains probabilities of belonging to various Crime Categories
    :param filename: path to csv file
    """
    category_names = np.unique(train_data_frame["Category"])
    columns_by_name = dict(zip(category_names, submission.T))
    frame = DataFrame(columns_by_name)
    frame.to_csv(filename)


def gzipcompress(filename):
    """
    Write a gzip-compressed copy of a csv file in the same folder.

    The output path is filename with its last three characters replaced by "gz"
    (e.g. "submission.csv" gives "submission.gz"); the input file is left untouched.
    :param filename: path to csv file
    """
    archive_path = filename[:-3] + "gz"
    with open(filename, 'rb') as source, gzip.open(archive_path, 'wb') as archive:
        archive.writelines(source)

In [None]:
from sklearn import cross_validation

# Sweep the SVM regularisation parameter C over powers of ten and record, for each value,
# the mean 5-fold cross-validated log-loss score on the get_dummies-encoded
# PdDistrict / Hour features.
# NOTE(review): sklearn.cross_validation and the 'log_loss' scorer name belong to old
# scikit-learn releases (newer ones use sklearn.model_selection / 'neg_log_loss') —
# confirm against the installed version before re-running.
df = pandas.get_dummies(train_data_frame_ret[["PdDistrict", "Hour"]])
# The target vector is the same for every C, so build it once outside the loop.
target = np.ravel(train_data_frame_ret[["Category"]])
start = time.time()
scoresvm = []
paramsvm = []
k = 10e-3  # note: 10e-3 is 0.01
while k <= 10e5:  # note: 10e5 is 1e6, and the bound is inclusive
    svmClassifier = svm.SVC(C=k, probability=True)
    score = cross_validation.cross_val_score(svmClassifier, df, target,
                                             cv=5, scoring='log_loss')
    scoresvm.append(score.mean())
    paramsvm.append(k)
    elapsedtime = time.time() - start
    # Print the per-fold scores of this iteration (previously referenced the
    # undefined name `scores`, which raises NameError on a fresh kernel).
    print(k, score)
    print(elapsedtime)
    k = k * 10


plt.scatter(paramsvm, scoresvm)
# C spans several orders of magnitude; a log axis keeps the points distinguishable.
plt.xscale('log')

plt.title('SVM on District Hour (dummy)')
plt.legend(["logloss"])
plt.xlabel("C")