In [1]:
%matplotlib inline

import accelerate
#import numbapro
import mkl
#import iopro

# General libraries.
from __future__ import division
from collections import Counter
import csv
import dateutil
import os
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt

# SK-learn libraries for preprocessing.
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder # for integer values
from sklearn.feature_extraction import DictVectorizer as DV
from sklearn.cross_validation import StratifiedShuffleSplit

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.mixture import GMM
from sklearn.grid_search import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import TransformerMixin

from copy import deepcopy
# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import log_loss

In [2]:
data_path = "data"
submissions_path = "submissions"
if not data_path or not submissions_path:
    raise Exception("Set the data and submission paths in competition_utilities.py!")

def parse_date_maybe_null(date):
    if date:
        return dateutil.parser.parse(date)
    return None

df_converters = {"Dates": dateutil.parser.parse}

def get_reader(file_name="train.csv"):
    reader = csv.reader(open(os.path.join(data_path, file_name)))
    header = reader.next()
    return reader

def get_header(file_name="train.csv"):
    reader = csv.reader(open(os.path.join(data_path, file_name)))
    header = reader.next()
    return header

def get_dataframe(file_name="train.csv"):
    return pd.io.parsers.read_csv(os.path.join(data_path, file_name), converters = df_converters)

    
def write_submission(file_name, predictions):
    writer = csv.writer(open(os.path.join(submissions_path, file_name), "w"), lineterminator="\n")
    writer.writerows(predictions)    

In [3]:
TestD = get_dataframe("test.csv")
TestD.head()

Unnamed: 0,Id,Dates,DayOfWeek,PdDistrict,Address,X,Y
0,0,2015-05-10 23:59:00,Sunday,BAYVIEW,2000 Block of THOMAS AV,-122.399588,37.735051
1,1,2015-05-10 23:51:00,Sunday,BAYVIEW,3RD ST / REVERE AV,-122.391523,37.732432
2,2,2015-05-10 23:50:00,Sunday,NORTHERN,2000 Block of GOUGH ST,-122.426002,37.792212
3,3,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412
4,4,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412


In [4]:
# make a copy so that you don't have to load the original data if you need to start over
test_data = TestD.copy(deep=True)

In [5]:
TrainD = get_dataframe("train.csv")
TrainD.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [6]:
# make a copy so that you don't have to load the original data if you need to start over
train_data = TrainD.copy(deep=True)
train_data=train_data[abs(train_data["Y"])<100]
train_data.index=range(len(train_data))

In [7]:
number_of_categories = train_data['Category'].nunique()
category_names = sorted(train_data['Category'].unique())
labels = train_data["Category"].astype('category')

print number_of_categories
print category_names

39
['ARSON', 'ASSAULT', 'BAD CHECKS', 'BRIBERY', 'BURGLARY', 'DISORDERLY CONDUCT', 'DRIVING UNDER THE INFLUENCE', 'DRUG/NARCOTIC', 'DRUNKENNESS', 'EMBEZZLEMENT', 'EXTORTION', 'FAMILY OFFENSES', 'FORGERY/COUNTERFEITING', 'FRAUD', 'GAMBLING', 'KIDNAPPING', 'LARCENY/THEFT', 'LIQUOR LAWS', 'LOITERING', 'MISSING PERSON', 'NON-CRIMINAL', 'OTHER OFFENSES', 'PORNOGRAPHY/OBSCENE MAT', 'PROSTITUTION', 'RECOVERED VEHICLE', 'ROBBERY', 'RUNAWAY', 'SECONDARY CODES', 'SEX OFFENSES FORCIBLE', 'SEX OFFENSES NON FORCIBLE', 'STOLEN PROPERTY', 'SUICIDE', 'SUSPICIOUS OCC', 'TREA', 'TRESPASS', 'VANDALISM', 'VEHICLE THEFT', 'WARRANTS', 'WEAPON LAWS']


# Features

In [42]:
def make_features(df):
    
    
    ###############################################################
    # Make regions from lat/long values
    ###############################################################
    
    
    '''
    # fix invalid values:
    df.loc[df['X'] > -122.3, 'X'] = -122.3
    df.loc[df['Y'] > 37.8, 'Y'] = 37.8
    
    
    print "Creating regions..."
    
    grid_size = 20
    grid_width = np.max(df['X']) - np.min(df['X'])
    grid_height = np.max(df['Y']) - np.min(df['Y'])

    x_interval = grid_width/grid_size
    y_interval = grid_height/grid_size

    min_x, max_x = np.min(df['X']), np.max(df['X'])
    min_y, max_y = np.min(df['Y']), np.max(df['Y'])

    df['region'] = (10*(np.ceil((df['X'] - min_x)/x_interval)) + (np.ceil((df['Y'] - min_y)/y_interval))).astype(int)

    print "New max and min values"
    print "---------------------------"
    print (min_x, max_x, min_y, max_y)
    
    '''
    
    ###############################################################
    # PARSE THE DATE
    ###############################################################
    print "Parsing Dates..."
    
    df.loc[:,'Hour'] = (df.loc[:,'Dates']).dt.hour
    df.loc[:,'Month'] = (df.loc[:,'Dates']).dt.month
    df.loc[:,'Year'] = (df.loc[:,'Dates']).dt.year
    df.loc[:,'Day'] = (df.loc[:,'Dates']).dt.day
    
    ###############################################################
    # MAKE SEASONS
    ###############################################################
    print 'Making seasons...'
    def get_season(x):
        summer=0
        fall=0
        winter=0
        spring=0
        if (x in [5, 6, 7]):
            summer=1
        if (x in [8, 9, 10]):
            fall=1
        if (x in [11, 0, 1]):
            winter=1
        if (x in [2, 3, 4]):
            spring=1
        return summer, fall, winter, spring
    
    
    df["Awake"] = df["Hour"].apply(lambda x: 1 if (x==0 or (x>=8 and x<=23)) else 0)
    df["Summer"], df["Fall"], df["Winter"], df["Spring"] = zip(*df["Month"].apply(get_season))
    
    ###############################################################
    # MARK Duplicates
    ###############################################################
    print 'Marking dupes...'
    df["IsDup"] = pd.Series(df.duplicated()|df.duplicated(take_last=True)).apply(int)
    
    return df

In [43]:
train_data = make_features(train_data)

Parsing Dates...
Making seasons...
Marking dupes...




In [44]:
test_data = make_features(test_data)

Parsing Dates...
Making seasons...
Marking dupes...




### ADDRESSES

In [11]:
# MAKE ADDRESS FEATURES ON TRAINING DATA:

addresses=sorted(train_data["Address"].unique())
categories=sorted(train_data["Category"].unique())

C_counts=train_data.groupby(["Category"]).size()
A_C_counts=train_data.groupby(["Address","Category"]).size()

A_counts=train_data.groupby(["Address"]).size()

logodds={}
logoddsPA={}

MIN_CAT_COUNTS=2

default_logodds=np.log(C_counts/len(train_data))-np.log(1.0-C_counts/float(len(train_data)))

for addr in addresses:
    PA=A_counts[addr]/float(len(train_data))
    logoddsPA[addr]=np.log(PA)-np.log(1.-PA)
    logodds[addr]=deepcopy(default_logodds)
    for cat in A_C_counts[addr].keys():
        if (A_C_counts[addr][cat]>MIN_CAT_COUNTS) and A_C_counts[addr][cat]<A_counts[addr]:
            PA=A_C_counts[addr][cat]/float(A_counts[addr])
            logodds[addr][categories.index(cat)]=np.log(PA)-np.log(1.0-PA)
    logodds[addr]=pd.Series(logodds[addr])
    logodds[addr].index=range(len(categories))
    
print "Creating logodds address features..."
address_features=train_data["Address"].apply(lambda x: logodds[x])
address_features.columns=["logodds"+str(x) for x in range(len(address_features.columns))]

train_data["IsInterection"]=train_data["Address"].apply(lambda x: 1 if "/" in x else 0)
train_data["logoddsPA"]=train_data["Address"].apply(lambda x: logoddsPA[x])

train_data = pd.concat([train_data, address_features], axis=1, join='inner')

Creating logodds address features...


In [15]:
train_data.columns

Index([u'Dates', u'Category', u'Descript', u'DayOfWeek', u'PdDistrict',
       u'Resolution', u'Address', u'X', u'Y', u'IsInterection', u'logoddsPA',
       u'logodds0', u'logodds1', u'logodds2', u'logodds3', u'logodds4',
       u'logodds5', u'logodds6', u'logodds7', u'logodds8', u'logodds9',
       u'logodds10', u'logodds11', u'logodds12', u'logodds13', u'logodds14',
       u'logodds15', u'logodds16', u'logodds17', u'logodds18', u'logodds19',
       u'logodds20', u'logodds21', u'logodds22', u'logodds23', u'logodds24',
       u'logodds25', u'logodds26', u'logodds27', u'logodds28', u'logodds29',
       u'logodds30', u'logodds31', u'logodds32', u'logodds33', u'logodds34',
       u'logodds35', u'logodds36', u'logodds37', u'logodds38'],
      dtype='object')

In [47]:
# MAKE ADDRESS FEATURES ON TEST DATA:

new_addresses=sorted(test_data["Address"].unique())
new_A_counts=test_data.groupby("Address").size()
only_new=set(new_addresses+addresses)-set(addresses)
only_old=set(new_addresses+addresses)-set(new_addresses)
in_both=set(new_addresses).intersection(addresses)
in_either=set(new_addresses).union(addresses)

for addr in only_new:
    PA=new_A_counts[addr]/float(len(test_data)+len(train_data))
    logoddsPA[addr]=np.log(PA)-np.log(1.-PA)
    logodds[addr]=deepcopy(default_logodds)
    logodds[addr].index=range(len(categories))
    
for addr in in_both:
    PA=(A_counts[addr]+new_A_counts[addr])/float(len(test_data)+len(train_data))
    logoddsPA[addr]=np.log(PA)-np.log(1.-PA)   
    
print "Creating logodds address features"
test_address_features=test_data["Address"].apply(lambda x: logodds[x])
test_address_features.columns=["logodds"+str(x) for x in range(len(test_address_features.columns))]

test_data["IsInterection"]=test_data["Address"].apply(lambda x: 1 if "/" in x else 0)
test_data["logoddsPA"]=test_data["Address"].apply(lambda x: logoddsPA[x])

test_data = pd.concat([test_data, test_address_features], axis=1, join='inner')

Creating logodds address features


In [13]:
test_data.head()

Unnamed: 0,Id,Dates,DayOfWeek,PdDistrict,Address,X,Y
0,0,2015-05-10 23:59:00,Sunday,BAYVIEW,2000 Block of THOMAS AV,-122.399588,37.735051
1,1,2015-05-10 23:51:00,Sunday,BAYVIEW,3RD ST / REVERE AV,-122.391523,37.732432
2,2,2015-05-10 23:50:00,Sunday,NORTHERN,2000 Block of GOUGH ST,-122.426002,37.792212
3,3,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412
4,4,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412


### MAKE DUMMY VARIABLES

In [15]:
le = preprocessing.LabelEncoder()
days = train_data.loc[:,'DayOfWeek'].unique()
le.fit(days)
train_data.loc[:,'DayOfWeek'] = le.transform(train_data.loc[:,'DayOfWeek']) 


days = test_data.loc[:,'DayOfWeek'].unique()
le.fit(days)
test_data.loc[:,'DayOfWeek'] = le.transform(test_data.loc[:,'DayOfWeek'])


district = train_data.loc[:,'PdDistrict'].unique()
le.fit(district)
train_data.loc[:,'PdDistrict'] = le.transform(train_data.loc[:,'PdDistrict']) 


district = test_data.loc[:,'PdDistrict'].unique()
le.fit(district)
test_data.loc[:,'PdDistrict'] = le.transform(test_data.loc[:,'PdDistrict']) 



In [16]:
test_data.head()

Unnamed: 0,Id,Dates,DayOfWeek,PdDistrict,Address,X,Y,region,Hour,Month,...,logodds29,logodds30,logodds31,logodds32,logodds33,logodds34,logodds35,logodds36,logodds37,logodds38
0,0,2015-05-10 23:59:00,3,0,2000 Block of THOMAS AV,-122.399588,37.735051,116,23,5,...,-8.688077,-5.259591,-7.454398,-2.772589,-11.893691,-4.777894,-2.533697,-2.335375,-2.533697,-4.621396
1,1,2015-05-10 23:51:00,3,0,3RD ST / REVERE AV,-122.391523,37.732432,126,23,5,...,-8.688077,-5.259591,-7.454398,-3.332205,-11.893691,-4.777894,-2.92489,-2.682075,-2.159484,-4.199705
2,2,2015-05-10 23:50:00,3,4,2000 Block of GOUGH ST,-122.426002,37.792212,109,23,5,...,-8.688077,-5.259591,-7.454398,-3.294016,-11.893691,-4.777894,-2.92489,-1.223775,-2.985679,-4.621396
3,3,2015-05-10 23:45:00,3,2,4700 Block of MISSION ST,-122.437394,37.721412,83,23,5,...,-8.688077,-5.259591,-7.454398,-2.888233,-11.893691,-5.095793,-2.373798,-4.240298,-2.929592,-3.608837
4,4,2015-05-10 23:45:00,3,2,4700 Block of MISSION ST,-122.437394,37.721412,83,23,5,...,-8.688077,-5.259591,-7.454398,-2.888233,-11.893691,-5.095793,-2.373798,-4.240298,-2.929592,-3.608837


In [17]:
# make one-hot from columns
def make_one_hot(df, columns=['PdDistrict','region']):
    enc = OneHotEncoder()
    
    df_onehot = df[columns]
    df_onehot = enc.fit_transform(df_onehot) 

    #print enc.n_values_
    #print enc.feature_indices_
    
    return df_onehot

In [18]:
districts_test = make_one_hot(test_data, columns=['PdDistrict'])
districts_test_df = pd.DataFrame(data=districts_test.toarray()) 
districts_test_df.columns = ['PdDistrict_'+ str(col) for col in districts_test_df.columns]

DayOfWeek_test = make_one_hot(test_data, columns=['DayOfWeek'])
DayOfWeek_test_df = pd.DataFrame(data=DayOfWeek_test.toarray()) 
DayOfWeek_test_df.columns = ['DayOfWeek_'+ str(col) for col in DayOfWeek_test_df.columns]

test_data = pd.concat([test_data, districts_test_df, DayOfWeek_test_df], axis=1, join='inner')

In [19]:
districts_train = make_one_hot(train_data, columns=['PdDistrict'])
districts_train_df = pd.DataFrame(data=districts_train.toarray()) 
districts_train_df.columns = ['PdDistrict_'+ str(col) for col in districts_train_df.columns]

DayOfWeek_train = make_one_hot(train_data, columns=['DayOfWeek'])
DayOfWeek_train_df = pd.DataFrame(data=DayOfWeek_train.toarray()) 
DayOfWeek_train_df.columns = ['DayOfWeek_'+ str(col) for col in DayOfWeek_train_df.columns]

train_data = pd.concat([train_data, districts_train_df, DayOfWeek_train_df], axis=1, join='inner')


### REMOVE UNUSED COLUMNS

In [20]:
def remove_unused_cols(df, data_src='train'):
    del df['Dates']
#     del df['X']
#     del df['Y']
    del df['Address']
    del df['DayOfWeek']
    del df['PdDistrict']
        
    if data_src == 'train':
        del df['Descript']
        del df['Resolution']
        del df['Category']
    else:
        del df['Id']
        
    return df 

In [21]:
train_data = remove_unused_cols(train_data,'train')
test_data = remove_unused_cols(test_data,'test')

In [22]:
print train_data.columns.tolist()

train_data.head()

['X', 'Y', 'region', 'Hour', 'Month', 'Year', 'Day', 'Awake', 'Summer', 'Fall', 'Winter', 'Spring', 'IsDup', 'IsInterection', 'logoddsPA', 'logodds0', 'logodds1', 'logodds2', 'logodds3', 'logodds4', 'logodds5', 'logodds6', 'logodds7', 'logodds8', 'logodds9', 'logodds10', 'logodds11', 'logodds12', 'logodds13', 'logodds14', 'logodds15', 'logodds16', 'logodds17', 'logodds18', 'logodds19', 'logodds20', 'logodds21', 'logodds22', 'logodds23', 'logodds24', 'logodds25', 'logodds26', 'logodds27', 'logodds28', 'logodds29', 'logodds30', 'logodds31', 'logodds32', 'logodds33', 'logodds34', 'logodds35', 'logodds36', 'logodds37', 'logodds38', 'PdDistrict_0', 'PdDistrict_1', 'PdDistrict_2', 'PdDistrict_3', 'PdDistrict_4', 'PdDistrict_5', 'PdDistrict_6', 'PdDistrict_7', 'PdDistrict_8', 'PdDistrict_9', 'DayOfWeek_0', 'DayOfWeek_1', 'DayOfWeek_2', 'DayOfWeek_3', 'DayOfWeek_4', 'DayOfWeek_5', 'DayOfWeek_6']


Unnamed: 0,X,Y,region,Hour,Month,Year,Day,Awake,Summer,Fall,...,PdDistrict_7,PdDistrict_8,PdDistrict_9,DayOfWeek_0,DayOfWeek_1,DayOfWeek_2,DayOfWeek_3,DayOfWeek_4,DayOfWeek_5,DayOfWeek_6
0,-122.425892,37.774599,105,23,5,2015,13,1,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,-122.425892,37.774599,105,23,5,2015,13,1,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,-122.424363,37.8,110,23,5,2015,13,1,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,-122.426995,37.8,110,23,5,2015,13,1,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,-122.438738,37.771541,94,23,5,2015,13,1,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


# Make train and dev sets
We have very uneven portions of classes. Therefore a stratified sampling makes sense.

In [23]:
sss = StratifiedShuffleSplit(labels, train_size=0.5)

for train_index, dev_index in sss:
    X_train, X_dev = train_data.iloc[train_index], train_data.iloc[dev_index]
    y_train, y_dev = labels[train_index],labels[dev_index]
    
X_train.index=range(len(X_train))
X_dev.index=range(len(X_dev))

y_train.index=range(len(y_train))
y_dev.index=range(len(y_dev))

train_data.index=range(len(train_data))
labels.index=range(len(labels))


# Basic Classifiers

In [None]:
classifier = RandomForestClassifier()
model = classifier.fit(X_train, y_train)
print model.score(X_dev, y_dev)

In [None]:
classifier = DecisionTreeClassifier()
model = classifier.fit(X_train, y_train)
print model.score(X_dev, y_dev)

In [None]:
classifier = LogisticRegression()
model = classifier.fit(X_train, y_train)
print model.score(X_dev, y_dev)

In [None]:
classifier = BernoulliNB()
model = classifier.fit(X_train, y_train)
print model.score(X_dev, y_dev)

In [None]:
classifier = MultinomialNB()
model = classifier.fit(X_train, y_train)
print model.score(X_dev, y_dev)

# Predict and write file

In [None]:
predictions = model.predict_proba(test_data)
withId  = np.column_stack((map(str,xrange(test_data.shape[0])),predictions))
towrite = np.row_stack((["Id"] + sorted(y_train.unique()),withId))

print(towrite)
write_submission('submission_MultiNB.csv', towrite)

# NEURAL NETWORK

### Scale the data

In [24]:
def scale_data(dfs=[train_data,test_data,X_train,X_dev]):
    for d in dfs:
        collist=d.columns.tolist()
        scaler = preprocessing.StandardScaler()
        scaler.fit(d)
        d[collist]=scaler.transform(d)
    return train_data,test_data,X_train,X_dev

train_data,test_data,X_train,X_dev = scale_data(dfs=[train_data,test_data,X_train,X_dev])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [25]:
from keras.models import Sequential
from keras.optimizers import SGD
from keras.layers.recurrent import LSTM, GRU, SimpleRNN
from keras.layers.advanced_activations import PReLU
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils


Using Theano backend.


In [9]:
y_train = y_train.astype('category')
y_train = y_train.cat.rename_categories(range(len(y_train.unique())))

print y_train.shape
print X_train.shape

y_dev = y_dev.astype('category')
y_dev = y_dev.cat.rename_categories(range(len(y_dev.unique())))


NameError: name 'y_train' is not defined

In [8]:
N_EPOCHS=5
N_HN=10
N_LAYERS=1
DP=0.5
input_dim=X_train.shape[1]
output_dim=len(labels_train.unique())
Y_train=y_train.cat.rename_categories(range(len(y_train.unique())))


model = Sequential()
model.add(Dense(input_dim=input_dim, output_dim=N_HN, init='glorot_uniform'))
model.add(PReLU(input_shape=(N_HN,)))
model.add(Dropout(dp))

for i in range(layers):
    model.add(Dense(input_dim=N_HN, output_dim=N_HN,init='glorot_uniform'))
    model.add(Activation('softmax'))
    model.add(PReLU(input_shape=(N_HN,)))
    model.add(BatchNormalization())
    model.add(Dropout(dp))

model.add(Dense(input_dim=N_HN, output_dim=output_dim,init='glorot_uniform'))
model.add(Activation('softmax'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')    
    

model.fit(X_train.as_matrix(), y_train, nb_epoch=Nepoch, batch_size=NBatch,validation_data=(X_dev.as_matrix(),y_dev))

NameError: name 'X_train' is not defined

In [29]:
print "train", log_loss(y_train, model.predict_proba(X_train.as_matrix(),verbose=0))
print "dev", log_loss(y_dev, model.predict_proba(X_dev.as_matrix(),verbose=0))
print "all", log_loss(labels, model.predict_proba(train_data.as_matrix(),verbose=0))

train 2.28427365966
dev 2.29383120554
all 2.30952260183


In [30]:
train_data.shape


(878049, 71)

In [32]:
X = train_data.as_matrix()
Y = labels.cat.rename_categories(range(len(labels.unique())))

Nlayers = 1
Nepoch = 20
input_dim = train_data.shape[1]
output_dim = 39
hn = 128
NBatch = 64

model = Sequential()
model.add(Dense(input_dim=input_dim, output_dim=hn, init='glorot_uniform'))
model.add(PReLU(input_shape=(hn,)))
model.add(Dropout(dp))

for i in range(Nlayers):
    model.add(Dense(input_dim=hn, output_dim=hn,init='glorot_uniform'))
    model.add(PReLU(input_shape=(hn,)))
    model.add(BatchNormalization())
    model.add(Dropout(dp))

model.add(Dense(input_dim=hn, output_dim=output_dim,init='glorot_uniform'))
model.add(Activation('softmax'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')
    
model.fit(X, Y, nb_epoch=Nepoch, batch_size=NBatch)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x15ae16d90>

In [33]:
print "train", log_loss(y_train, model.predict_proba(X_train.as_matrix(),verbose=0))
print "dev", log_loss(y_dev, model.predict_proba(X_dev.as_matrix(),verbose=0))
print "all", log_loss(labels, model.predict_proba(train_data.as_matrix(),verbose=0))

train 2.313499976
dev 2.31556292382
all 2.28215142557


In [None]:
predDF=pd.DataFrame(model.predict_proba(test_data.as_matrix(),verbose=0),columns=sorted(labels.unique()))

In [None]:
predDF.to_csv(os.path.join(submissions_path, "predictions_x.csv"),index_label="Id",na_rep="0")