In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

### Initial random forest predictions

In [None]:
# Hit tables are indexed by global_id; the wire table is indexed by wire_id.
hits_train, hits_test = (
    pd.read_csv(csv_path, index_col='global_id')
    for csv_path in ("data/train.csv", "data/test.csv")
)
wires = pd.read_csv("data/wires.csv", index_col='wire_id')

In [None]:
# Per-wire lookup tables, one row per row of `wires`:
#   wires_cartesian[:, 0] = rho * cos(phi), wires_cartesian[:, 1] = rho * sin(phi)
#   wires_r[:, 0] = rho,                    wires_r[:, 1] = phi
# Later cells index these arrays positionally with wire_id values
# (assumes wire_id equals the row position in wires.csv -- TODO confirm).
wire_rho = wires['wire_rho']
wire_phi = wires['wire_phi']
wires_cartesian = np.column_stack((wire_rho * np.cos(wire_phi),
                                   wire_rho * np.sin(wire_phi)))
wires_r = np.column_stack((wire_rho, wire_phi))

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Columns fed to the classifier, in order: raw hit measurements,
# wire geometry, then the engineered combinations.
features_list = [
    'energy_deposit', 'relative_time',
    'wire_x', 'wire_y', 'wire_r',
    'x_plus_y', 'x_minus_y', 'xy',
    'log_en', 'sq_time',
]

In [None]:
# Random forest hyper-parameters, spelled out explicitly; random_state is
# fixed so the fitted forest is reproducible between runs.
rf_params = dict(
    n_estimators=700,
    criterion='gini',
    max_depth=None,
    min_weight_fraction_leaf=0.0,
    class_weight=None,
    n_jobs=-1,
    verbose=0,
    random_state=1,
)
clf = RandomForestClassifier(**rf_params)

In [None]:
# Attach wire geometry to every training hit by looking up its wire_id.
train_wire_ids = hits_train.wire_id.values
train_wire_xy = wires_cartesian[train_wire_ids]
hits_train['wire_x'] = train_wire_xy[:, 0]
hits_train['wire_y'] = train_wire_xy[:, 1]
hits_train['wire_r'] = wires_r[train_wire_ids][:, 0]

# Engineered combinations of the wire coordinates.
train_x = hits_train['wire_x'].values
train_y = hits_train['wire_y'].values
hits_train['x_plus_y'] = train_x + train_y
hits_train['x_minus_y'] = train_x - train_y
hits_train['xy'] = train_x * train_y

# log(energy^2) and time^2; log_en is -inf where energy_deposit == 0.
hits_train['log_en'] = np.log(hits_train['energy_deposit'].values ** 2)
hits_train['sq_time'] = hits_train['relative_time'].values ** 2

In [None]:
# Attach wire geometry to every test hit by looking up its wire_id.
test_wire_ids = hits_test.wire_id.values
test_wire_xy = wires_cartesian[test_wire_ids]
hits_test['wire_x'] = test_wire_xy[:, 0]
hits_test['wire_y'] = test_wire_xy[:, 1]
hits_test['wire_r'] = wires_r[test_wire_ids][:, 0]

# Engineered combinations of the wire coordinates.
test_x = hits_test['wire_x'].values
test_y = hits_test['wire_y'].values
hits_test['x_plus_y'] = test_x + test_y
hits_test['x_minus_y'] = test_x - test_y
hits_test['xy'] = test_x * test_y

# log(energy^2) and time^2; log_en is -inf where energy_deposit == 0.
hits_test['log_en'] = np.log(hits_test['energy_deposit'].values ** 2)
hits_test['sq_time'] = hits_test['relative_time'].values ** 2

In [None]:
# Train only on hits with a positive energy deposit; the target is 1 where
# label == 1 and 0 for every other label value.
has_energy = hits_train.energy_deposit > 0
good = hits_train[has_energy]
train_features = good[features_list].values
train_target = (good.label == 1).astype(int)
clf.fit(train_features, train_target)

In [None]:
# Score only test hits with a positive energy deposit.
# .copy() makes `candidates` an independent frame, so adding the
# 'prediction' column below is an unambiguous write (no
# SettingWithCopyWarning) and can never touch hits_test.
candidates = hits_test[hits_test.energy_deposit > 0].copy()
# Pass a plain ndarray, exactly as was done for clf.fit, so training and
# prediction inputs have the same form. Column 1 of predict_proba is the
# probability of the positive class (target == 1).
candidates['prediction'] = clf.predict_proba(candidates[features_list].values)[:, 1]

### Filter predictions by the circle center +/- pi/2

It goes through each event by id, builds a histogram of the wire angles (phi) weighted by the predictions, and finds its maximum, based on the assumption that the maximum should be around the center of the track circle.
Overall, it takes about 2-3 minutes to calculate everything.

In [None]:
# For every event, keep only predictions whose wire angle lies within
# +/- pi/2 of the most populated phi bin; everything else is set to 0.
for e in np.unique(candidates.event_id):
    ev = candidates[candidates.event_id == e]
    # Wire angle of every hit in this event (column 1 of wires_r is wire_phi).
    phi = wires_r[ev.wire_id][:, 1]
    # Only hits predicted with probability > 0.5 vote, weighted by that probability.
    confident = (ev.prediction > .5).values
    a, b = np.histogram(phi[confident],
                        weights=ev.prediction.values[confident], bins=30)
    # Find the maximum of the histogram and create bounds around it.
    # NOTE(review): b[argmax + 1] is the right edge of the fullest bin, not
    # its centre; kept as-is so the results do not change.
    mn = b[np.argmax(a) + 1]
    upper = mn + (np.pi / 2.)
    lower = mn - (np.pi / 2.)
    # phi is expected to lie within [0, 2pi], so the window may wrap around
    # either end of the range: also test it shifted by -2pi and +2pi.
    cond = np.zeros(len(phi), dtype=bool)
    for shift in (-(2 * np.pi), 0., 2 * np.pi):
        cond |= (phi >= lower + shift) & (phi <= upper + shift)
    # Set everything outside the bounds to 0. `~` is the boolean negation;
    # unary minus on a boolean array raises TypeError in current NumPy.
    candidates.loc[ev.index[~cond], 'prediction'] = 0.
    # Progress indicator; the call form works on both Python 2 and 3.
    print(e)

In [None]:
# Persist the filtered probabilities: a single 'prediction' column on the
# same index as `candidates`, written with the index labelled global_id.
prs = candidates.loc[:, ['prediction']].copy()
prs.to_csv('rf_with_filter_700.csv', index_label='global_id')

### Filter the rest by the window around the track

In [None]:
# Reload the filtered predictions from the CSV, indexed by global_id.
preds = pd.read_csv(
    'rf_with_filter_700.csv',
    index_col='global_id',
)