In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, mean_absolute_error
from sklearn import cross_validation, preprocessing
from sklearn.ensemble import ExtraTreesRegressor



In [2]:
# Load the raw records: one comma-separated record per line of the input file.
input_file = 'traffic_data.txt'
rows = []
with open('./data/' + input_file, 'r') as f:
    # Iterate the file lazily instead of materialising it with readlines().
    for line in f:
        # Strip only the line terminator. Slicing with line[:-1] would chop a
        # real character off the final record when the file does not end
        # with a newline.
        line = line.rstrip('\n')
        if not line:
            # A blank line would otherwise become the one-field row [''].
            continue
        rows.append(line.split(','))

# Convert the list of field lists to an array so columns can be sliced.
# NOTE(review): assumes every record has the same number of fields -- confirm
# against the data file.
data = np.array(rows)

In [3]:
# Turn every column of `data` into numbers. Whether a column is numeric is
# decided from its value in the first row only: digit-only columns are copied
# as they are, all others are label-encoded. The fitted encoders are kept in
# `label_encoder`, in column order, so the same mapping can be reused later.
label_encoder = []
X_encoded = np.empty(data.shape)
for col, first_value in enumerate(data[0]):
    column = data[:, col]
    if not first_value.isdigit():
        encoder = preprocessing.LabelEncoder()
        label_encoder.append(encoder)
        column = encoder.fit_transform(column)
    X_encoded[:, col] = column

In [4]:
# Split the encoded matrix: every column except the last goes into X, the
# last column goes into y. Both are cast to integers.
features, target = X_encoded[:, :-1], X_encoded[:, -1]
X = features.astype(int)
y = target.astype(int)

In [5]:
# Hold out 25% of the samples for testing; random_state is fixed so the split
# is the same on every run.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=5,
)

In [6]:
# Keyword arguments for the ExtraTreesRegressor constructor.
params = dict(n_estimators=100, max_depth=4, random_state=0)

In [7]:
# Build the (not yet fitted) extra-trees regressor from the settings in `params`.
regressor = ExtraTreesRegressor(
    **params
)

In [8]:
# Fit on the training split. The call is left as the cell's last expression
# so the notebook displays the fitted estimator's repr.
regressor.fit(X=X_train, y=y_train)

ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=4,
          max_features='auto', max_leaf_nodes=None,
          min_impurity_decrease=0.0, min_impurity_split=None,
          min_samples_leaf=1, min_samples_split=2,
          min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
          oob_score=False, random_state=0, verbose=0, warm_start=False)

In [9]:
# Predict on the held-out split and report the mean absolute error, rounded
# to two decimals.
y_pred = regressor.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print('Mean absolute error : ', round(mae, 2))

Mean absolute error :  7.42


In [10]:
# One unseen sample, given as raw string values (four features, no target).
# NOTE(review): the values must be in the same column order as the training
# file -- confirm against the data.
test_datapoint = [
    'Saturday',
    '10:20',
    'Atlanta',
    'no',
]

In [11]:
# `count` is the index of the next fitted label encoder to use.
# `test_datapoint_encoded` starts as one -1 placeholder per feature; the
# encoding loop overwrites each slot.
count = 0
test_datapoint_encoded = [-1 for _ in test_datapoint]

In [12]:
# Encode the query point with the encoders fitted on the training data.
# Digit-only values are converted with int(); every other value is mapped by
# the next label encoder in `label_encoder` (encoders are stored in the order
# of the non-numeric training columns).
#
# `count` is reset and the result list is rebuilt here, so the cell gives the
# same result when it is re-run on its own. Without the reset a second run
# would index past the end of `label_encoder`.
count = 0
encoded_values = []
for item in test_datapoint:
    if item.isdigit():
        encoded_values.append(int(item))
    else:
        # transform() returns a one-element array; take element 0 explicitly.
        # Calling int() on the array itself relies on size-1 array-to-scalar
        # conversion, which NumPy has deprecated.
        encoded_values.append(int(label_encoder[count].transform([item])[0]))
        count += 1
test_datapoint_encoded = np.array(encoded_values)

In [13]:
# Predict for the single encoded sample (wrapped in a list so the regressor
# receives a batch of one) and print the value truncated to an integer.
prediction = regressor.predict([test_datapoint_encoded])
print('Predicted traffic : ', int(prediction[0]))

Predicted traffic :  26
