In [1]:
from sklearn.externals import joblib
import numpy as np
import pandas as pd
df = pd.read_pickle('data/trains.pkl')

# This is the time sampling interval used- smaller intervals increase model accuracy, but increase processing time
period = "1S"

estimators = 10
max_features = 4

In [2]:
# Setup filter for rows where it was turned off
bad = df[df['duration'] < 0]

enter_bad_count = bad.start.value_counts()
exit_bad_count = bad.end.value_counts()
bad2 = pd.concat([enter_bad_count, exit_bad_count], axis=1, keys=["turned_off", "turned_on"])
bad3 = bad2["turned_off"].resample(period).fillna(method='ffill')
bad3 = bad3.fillna(0)

enter_count = df.start.value_counts().clip(upper=1)
exit_count = df.end.value_counts()
df2 = pd.concat([enter_count, exit_count], axis=1, keys=["train_present", "exit"])

# df2['train_present'][df2['train_present'] == 2] = 1
counts = df2["train_present"].resample(period).fillna(method='ffill')
counts = counts.fillna(0)

In [3]:
df3 = counts.to_frame()
#Remove rows where tracking was off
df3 = df3.join(bad3)
df3 = df3[df3['turned_off'] != 1]
del df3['turned_off']
df3['dow'] = df3.index.dayofweek
df3['minute'] = df3.index.minute + (60*df3.index.hour)
df3['hour'] = df3.index.hour
df3['5min'] = np.floor(df3['minute'] / 5)
df3['2min'] = np.floor(df3['minute'] / 2)
df3['15min'] = np.floor(df3['minute'] / 15)
df3['30min'] = np.floor(df3['minute'] / 30)
print(len(df3))

2507018


In [4]:
from sklearn.utils import resample
df_majority = df3[df3.train_present==0]
print(df_majority.dow.count())
df_minority = df3[df3.train_present==1]
# Upsample minority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=df_majority.dow.count(),    # to match majority class
                                 random_state=123) # reproducible results
# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
df3 = df_upsampled.copy()

2081809


In [5]:
df3['is_training'] = np.random.uniform(0, 1, len(df3)) <= .75
# Create two new dataframes, one with the training rows, one with the test rows
train, test = df3[df3['is_training']==True], df3[df3['is_training']==False]
# Show the number of observations for the test and training dataframes
print('Number of observations in the training data:', len(train))
print('Number of observations in the test data:',len(test))
y = pd.factorize(train['train_present'])[0]
features = df3.columns[1:8]
print("features:", features)

Number of observations in the training data: 3122680
Number of observations in the test data: 1040938
features: Index(['dow', 'minute', 'hour', '5min', '2min', '15min', '30min'], dtype='object')


In [6]:
#### Random forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

from datetime import datetime

# Create a random forest Classifier. By convention, clf means 'Classifier'
clf = RandomForestClassifier(n_jobs=2, random_state=0,n_estimators=estimators, max_features=max_features)

# Train the Classifier to take the training features and learn how they relate
# to the training y (the species)
clf.fit(train[features], y)
predict=clf.predict(test[features])
joblib.dump(clf, 'models/randomtree-'+str(estimators)+'-'+str(max_features)+'-'+str(datetime.now().date())+'.pkl') 
print('accuracy: ', accuracy_score(predict,test['train_present']))
print(classification_report(predict,test['train_present']))
print('feature importance')
print(list(zip(train[features], clf.feature_importances_)))
x=pd.crosstab(test['train_present'], predict, rownames=['Actual'], colnames=['Predicted'])
print(x[0][0])
x

accuracy:  0.756280393261
             precision    recall  f1-score   support

          0       0.59      0.88      0.71    349065
          1       0.92      0.69      0.79    691873

avg / total       0.81      0.76      0.76   1040938

feature importance
[('dow', 0.32781286300283391), ('minute', 0.36271088460332895), ('hour', 0.0028835875575442156), ('5min', 0.077362398090605633), ('2min', 0.202304825747395), ('15min', 0.01943952094156786), ('30min', 0.0074859200567244603)]
307826


Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,307826,212458
1.0,41239,479415
