In [1]:
import MySQLdb as db
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas.tools.plotting import scatter_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.cross_validation import cross_val_score
%matplotlib inline

In [2]:
df = pd.read_csv('ABT-Avgs.csv')

### For logistic regression, we need to make the target feature categorical. For this, we can bin the value ranges. 

In [3]:
# http://chrisalbon.com/python/pandas_create_column_using_conditional.html
# Add emtimate column
df['estimate'] = df['Capacity'] * df['PercentageEstimate']
# Bin results into categories for logistic regression. 
bins = [-1, 25, 50, 75, 100, 125, 150, 175, 200, 225, 250]
groups = [ '0-25', '25-50', '50-75', '75-100', '100-125', '125-150', '150-175', '175-200', '200-225', '225-250']
df['occupantEstimate'] = pd.cut(df['estimate'], bins, labels = groups )

In [4]:
df.dtypes

DateTime                object
Room                    object
Capacity                 int64
Module                  object
NumReg                   int64
AvgNumWifiConn         float64
PercentageEstimate     float64
BinaryEstimate           int64
estimate               float64
occupantEstimate      category
dtype: object

In [5]:
df['Room'] = df['Room'].astype('category')
df['Module'] = df['Module'].astype('category')
df['DateTime'] = df['DateTime'].astype('datetime64[ns]')
df.dtypes

DateTime              datetime64[ns]
Room                        category
Capacity                       int64
Module                      category
NumReg                         int64
AvgNumWifiConn               float64
PercentageEstimate           float64
BinaryEstimate                 int64
estimate                     float64
occupantEstimate            category
dtype: object

In [6]:
df.shape

(216, 10)

### For logistic regression, we need to make the training features to be continuous. To do this we need to create dummy values for catergorical features. 


In [7]:
days = ['mon', 'tues', 'wed', 'thurs', 'fri', 'sat', 'sun']
# http://stackoverflow.com/questions/13740672/in-pandas-how-can-i-groupby-weekday-for-a-datetime-column
# http://chrisalbon.com/python/pandas_apply_operations_to_dataframes.html
df['weekday'] = df['DateTime'].apply(lambda dt: dt.weekday())
df.head()


Unnamed: 0,DateTime,Room,Capacity,Module,NumReg,AvgNumWifiConn,PercentageEstimate,BinaryEstimate,estimate,occupantEstimate,weekday
0,2015-11-03 16:00:00,B-002,90,COMP40370P1,27,39.0,0.25,1,22.5,0-25,1
1,2015-11-04 16:00:00,B-002,90,COMP30250P1,22,20.0,0.25,1,22.5,0-25,2
2,2015-11-05 16:00:00,B-002,90,COMP30520P1,60,32.6667,0.0,0,0.0,0-25,3
3,2015-11-06 16:00:00,B-002,90,,0,20.3333,0.5,1,45.0,25-50,4
4,2015-11-09 16:00:00,B-002,90,COMP40660P1,53,72.5,0.25,1,22.5,0-25,0


In [8]:
for i in range(len(days)):
    df[days[i]] =(df.weekday==i)*1
df.drop('weekday', axis=1, inplace=True)
df.drop('sat', axis=1, inplace=True)
df.drop('sun', axis=1, inplace=True)


df.head(10)

Unnamed: 0,DateTime,Room,Capacity,Module,NumReg,AvgNumWifiConn,PercentageEstimate,BinaryEstimate,estimate,occupantEstimate,mon,tues,wed,thurs,fri
0,2015-11-03 16:00:00,B-002,90,COMP40370P1,27,39.0,0.25,1,22.5,0-25,0,1,0,0,0
1,2015-11-04 16:00:00,B-002,90,COMP30250P1,22,20.0,0.25,1,22.5,0-25,0,0,1,0,0
2,2015-11-05 16:00:00,B-002,90,COMP30520P1,60,32.6667,0.0,0,0.0,0-25,0,0,0,1,0
3,2015-11-06 16:00:00,B-002,90,,0,20.3333,0.5,1,45.0,25-50,0,0,0,0,1
4,2015-11-09 16:00:00,B-002,90,COMP40660P1,53,72.5,0.25,1,22.5,0-25,1,0,0,0,0
5,2015-11-10 16:00:00,B-002,90,COMP40370P1,27,35.1667,0.25,1,22.5,0-25,0,1,0,0,0
6,2015-11-11 16:00:00,B-002,90,COMP30250P1,22,23.5,0.25,1,22.5,0-25,0,0,1,0,0
7,2015-11-12 16:00:00,B-002,90,COMP30520P1,60,49.8333,0.25,1,22.5,0-25,0,0,0,1,0
8,2015-11-13 16:00:00,B-002,90,,0,8.6667,0.25,1,22.5,0-25,0,0,0,0,1
9,2015-11-03 09:00:00,B-002,90,,0,2.0,0.0,0,0.0,0-25,0,1,0,0,0


In [9]:
times = ['9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM']
df['time'] = df['DateTime'].apply(lambda dt: dt.hour)

for i in range(0,len(times)):
    df[times[i]] =(df.time==i+9)*1
df.drop('time', axis=1, inplace=True)
df.head(10)

Unnamed: 0,DateTime,Room,Capacity,Module,NumReg,AvgNumWifiConn,PercentageEstimate,BinaryEstimate,estimate,occupantEstimate,...,thurs,fri,9AM,10AM,11AM,12PM,1PM,2PM,3PM,4PM
0,2015-11-03 16:00:00,B-002,90,COMP40370P1,27,39.0,0.25,1,22.5,0-25,...,0,0,0,0,0,0,0,0,0,1
1,2015-11-04 16:00:00,B-002,90,COMP30250P1,22,20.0,0.25,1,22.5,0-25,...,0,0,0,0,0,0,0,0,0,1
2,2015-11-05 16:00:00,B-002,90,COMP30520P1,60,32.6667,0.0,0,0.0,0-25,...,1,0,0,0,0,0,0,0,0,1
3,2015-11-06 16:00:00,B-002,90,,0,20.3333,0.5,1,45.0,25-50,...,0,1,0,0,0,0,0,0,0,1
4,2015-11-09 16:00:00,B-002,90,COMP40660P1,53,72.5,0.25,1,22.5,0-25,...,0,0,0,0,0,0,0,0,0,1
5,2015-11-10 16:00:00,B-002,90,COMP40370P1,27,35.1667,0.25,1,22.5,0-25,...,0,0,0,0,0,0,0,0,0,1
6,2015-11-11 16:00:00,B-002,90,COMP30250P1,22,23.5,0.25,1,22.5,0-25,...,0,0,0,0,0,0,0,0,0,1
7,2015-11-12 16:00:00,B-002,90,COMP30520P1,60,49.8333,0.25,1,22.5,0-25,...,1,0,0,0,0,0,0,0,0,1
8,2015-11-13 16:00:00,B-002,90,,0,8.6667,0.25,1,22.5,0-25,...,0,1,0,0,0,0,0,0,0,1
9,2015-11-03 09:00:00,B-002,90,,0,2.0,0.0,0,0.0,0-25,...,0,0,1,0,0,0,0,0,0,0


In [10]:
for i in range(2, 5):
    df['B00'+str(i)] =(df.Room=='B-00'+str(i))*1
df.head(10)

Unnamed: 0,DateTime,Room,Capacity,Module,NumReg,AvgNumWifiConn,PercentageEstimate,BinaryEstimate,estimate,occupantEstimate,...,10AM,11AM,12PM,1PM,2PM,3PM,4PM,B002,B003,B004
0,2015-11-03 16:00:00,B-002,90,COMP40370P1,27,39.0,0.25,1,22.5,0-25,...,0,0,0,0,0,0,1,1,0,0
1,2015-11-04 16:00:00,B-002,90,COMP30250P1,22,20.0,0.25,1,22.5,0-25,...,0,0,0,0,0,0,1,1,0,0
2,2015-11-05 16:00:00,B-002,90,COMP30520P1,60,32.6667,0.0,0,0.0,0-25,...,0,0,0,0,0,0,1,1,0,0
3,2015-11-06 16:00:00,B-002,90,,0,20.3333,0.5,1,45.0,25-50,...,0,0,0,0,0,0,1,1,0,0
4,2015-11-09 16:00:00,B-002,90,COMP40660P1,53,72.5,0.25,1,22.5,0-25,...,0,0,0,0,0,0,1,1,0,0
5,2015-11-10 16:00:00,B-002,90,COMP40370P1,27,35.1667,0.25,1,22.5,0-25,...,0,0,0,0,0,0,1,1,0,0
6,2015-11-11 16:00:00,B-002,90,COMP30250P1,22,23.5,0.25,1,22.5,0-25,...,0,0,0,0,0,0,1,1,0,0
7,2015-11-12 16:00:00,B-002,90,COMP30520P1,60,49.8333,0.25,1,22.5,0-25,...,0,0,0,0,0,0,1,1,0,0
8,2015-11-13 16:00:00,B-002,90,,0,8.6667,0.25,1,22.5,0-25,...,0,0,0,0,0,0,1,1,0,0
9,2015-11-03 09:00:00,B-002,90,,0,2.0,0.0,0,0.0,0-25,...,0,0,0,0,0,0,0,1,0,0


In [11]:
df.corr()

Unnamed: 0,Capacity,NumReg,AvgNumWifiConn,PercentageEstimate,BinaryEstimate,estimate,mon,tues,wed,thurs,...,10AM,11AM,12PM,1PM,2PM,3PM,4PM,B002,B003,B004
Capacity,1.0,0.468271,0.355578,-0.045083,-0.056617,0.39296,2.6474550000000002e-17,3.7443450000000005e-17,2.194961e-17,2.2595180000000002e-17,...,-9.332719e-17,-1.5419270000000002e-17,6.248864e-17,-2.434622e-18,-2.8403930000000004e-17,3.6519330000000003e-17,3.002701e-17,-0.5,-0.5,1.0
NumReg,0.4682708,1.0,0.736217,0.344868,0.290085,0.562036,-0.04625192,0.02518073,-0.04275289,0.09583169,...,-0.009564565,0.0570458,0.1000863,0.009564565,-0.024253,-0.01161411,-0.1762613,-0.1253355,-0.3429352,0.4682708
AvgNumWifiConn,0.3555784,0.736217,1.0,0.598574,0.454588,0.765715,-0.008559815,0.127076,-0.1088693,0.08762523,...,-0.01575205,0.02496158,0.1411035,-0.005947925,0.1159951,-0.05334014,-0.1693601,-0.2275643,-0.1280142,0.3555784
PercentageEstimate,-0.04508348,0.344868,0.598574,1.0,0.769572,0.816815,0.05071892,0.1150197,-0.1022397,-0.02555993,...,0.0,0.06426169,0.1445888,-0.04819627,0.1285234,0.0,-0.1767197,-0.07889609,0.1239796,-0.04508348
BinaryEstimate,-0.05661669,0.290085,0.454588,0.769572,1.0,0.634861,0.2052355,0.06954706,-0.1711928,0.0213991,...,-0.0491772,0.1324002,0.1929259,-0.07944009,0.1324002,0.04161148,-0.1702288,0.0495396,0.007077086,-0.05661669
estimate,0.39296,0.562036,0.765715,0.816815,0.634861,1.0,0.04876492,0.1268725,-0.1533565,0.04715219,...,0.006579753,0.04302146,0.1017331,-0.007592023,0.09262268,-0.01973926,-0.198911,-0.2540037,-0.1389563,0.39296
mon,2.6474550000000002e-17,-0.046252,-0.00856,0.050719,0.205235,0.048765,1.0,-0.1889822,-0.1889822,-0.1889822,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.301043e-17,1.301043e-17,1.301043e-17
tues,3.7443450000000005e-17,0.025181,0.127076,0.11502,0.069547,0.126872,-0.1889822,1.0,-0.2857143,-0.2857143,...,1.5420550000000002e-17,1.5887840000000002e-17,1.635513e-17,1.3084100000000001e-17,1.355139e-17,1.401868e-17,1.448597e-17,1.3113280000000001e-18,1.3113280000000001e-18,1.3113280000000001e-18
wed,2.194961e-17,-0.042753,-0.108869,-0.10224,-0.171193,-0.153356,-0.1889822,-0.2857143,1.0,-0.2857143,...,1.5420550000000002e-17,1.2149520000000001e-17,1.2616810000000002e-17,1.3084100000000001e-17,1.355139e-17,1.401868e-17,1.448597e-17,1.3113280000000001e-18,1.3113280000000001e-18,1.3113280000000001e-18
thurs,2.2595180000000002e-17,0.095832,0.087625,-0.02556,0.021399,0.047152,-0.1889822,-0.2857143,-0.2857143,1.0,...,1.5420550000000002e-17,1.2149520000000001e-17,1.2616810000000002e-17,1.3084100000000001e-17,1.355139e-17,1.401868e-17,1.448597e-17,1.966992e-18,1.966992e-18,1.966992e-18


In [12]:
df.shape

(216, 26)

In [13]:
intercept = pd.DataFrame({'Intercept':np.ones(len(df))})
intercept

Unnamed: 0,Intercept
0,1.0
1,1.0
2,1.0
3,1.0
4,1.0
5,1.0
6,1.0
7,1.0
8,1.0
9,1.0


In [14]:
x = pd.concat([intercept, df[['Capacity', 'NumReg', 'AvgNumWifiConn', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM', 'B002', 'B003', 'B004']]], axis = 1)
y = df['occupantEstimate']
x

Unnamed: 0,Intercept,Capacity,NumReg,AvgNumWifiConn,mon,tues,wed,thurs,fri,9AM,10AM,11AM,12PM,1PM,2PM,3PM,4PM,B002,B003,B004
0,1.0,90,27,39.0000,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0
1,1.0,90,22,20.0000,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0
2,1.0,90,60,32.6667,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0
3,1.0,90,0,20.3333,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0
4,1.0,90,53,72.5000,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0
5,1.0,90,27,35.1667,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0
6,1.0,90,22,23.5000,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0
7,1.0,90,60,49.8333,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0
8,1.0,90,0,8.6667,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0
9,1.0,90,0,2.0000,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0


In [15]:
log = LogisticRegression().fit(x[['Intercept','Capacity', 'NumReg', 'AvgNumWifiConn', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM']], y)

In [16]:
predictions = log.predict(x[['Intercept','Capacity', 'NumReg', 'AvgNumWifiConn', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM']])
log.score(x[['Intercept','Capacity', 'NumReg', 'AvgNumWifiConn', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM']], y)

0.75462962962962965

In [17]:
# include rooms
log = LogisticRegression().fit(x[['Intercept','Capacity', 'NumReg', 'AvgNumWifiConn', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM', 'B002', 'B003', 'B004']], y)

In [18]:
predictions = log.predict(x[['Intercept','Capacity', 'NumReg', 'AvgNumWifiConn', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM', 'B002', 'B003', 'B004']])
log.score(x[['Intercept','Capacity', 'NumReg', 'AvgNumWifiConn', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM', 'B002', 'B003', 'B004']], y)


0.75

In [19]:
df.columns

Index(['DateTime', 'Room', 'Capacity', 'Module', 'NumReg', 'AvgNumWifiConn',
       'PercentageEstimate', 'BinaryEstimate', 'estimate', 'occupantEstimate',
       'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM',
       '1PM', '2PM', '3PM', '4PM', 'B002', 'B003', 'B004'],
      dtype='object')

## Model evaluation on out-of-sample data - with rooms

In [20]:
# Split the data into train and test sets
# Take a fifth of data samples as test data, rest as training data
X_train, X_test, y_train, y_test = train_test_split(x[['Intercept','Capacity', 'NumReg', 'AvgNumWifiConn', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM', 'B002', 'B003', 'B004']], y, test_size=0.2)

# print("Training data:\n", pd.concat([X_train, y_train], axis=1))
# print("\nTest data:\n", pd.concat([X_test, y_test], axis=1))

In [21]:
# Train on the training sample and test on the test sample.
logreg_train = LogisticRegression().fit(X_train, y_train)
# Print the weights learned for each feature.
print(logreg_train.coef_)

[[  1.04341425e+00   5.32838718e-04   2.06808931e-03  -7.87172593e-02
    3.67380545e-02  -3.31305932e-01   5.78507174e-01   5.40213588e-01
    2.19261362e-01   2.76411864e-01  -7.08856929e-01  -2.08360234e-01
   -2.48371449e-02   6.75241998e-01  -1.80216391e-01   7.33897696e-01
    4.80133388e-01   1.24706130e+00   5.18712553e-01  -7.22359610e-01]
 [ -1.45710768e+00  -2.69338821e-03  -3.90872534e-03   3.34071366e-02
   -1.93877731e-01  -3.61498272e-01  -5.77481914e-01   2.66272548e-01
   -5.90522316e-01  -1.33604467e-01  -6.60327582e-02  -4.27271676e-01
   -2.23438898e-01   3.04271779e-01  -2.81199960e-01  -1.81739843e-01
   -4.48091862e-01  -1.14953744e+00  -1.31631639e+00   1.00874614e+00]
 [  2.38590226e-01  -3.91318590e-02   8.64922920e-03   2.42488932e-02
    1.56103823e-01   5.06246727e-01  -4.72735289e-01   4.28363204e-02
    6.13864430e-03  -1.30564332e-01   4.26806223e-01   1.67225003e-01
    3.72935549e-01  -9.93122070e-01   5.11449077e-01  -1.19840523e-01
    3.70129984e-03

In [22]:
# Estimated class probabilities on test set
print(logreg_train.predict_proba(X_test))

[[  9.33966252e-01   1.54803232e-02   4.42971543e-02   6.25627085e-03]
 [  9.22736969e-01   8.98192043e-03   4.57177279e-02   2.25633825e-02]
 [  8.49440177e-01   1.17675537e-02   9.64128168e-02   4.23794525e-02]
 [  7.81497361e-01   1.93355966e-02   1.46256443e-01   5.29106001e-02]
 [  8.02133358e-01   8.90088988e-03   1.52500836e-01   3.64649154e-02]
 [  8.51053214e-01   8.02415946e-03   6.36288308e-02   7.72937956e-02]
 [  8.49454740e-01   2.12713850e-02   1.02015100e-01   2.72587748e-02]
 [  1.60424644e-01   1.77278056e-01   3.02645738e-03   6.59270842e-01]
 [  8.79865409e-01   8.44831117e-03   8.16614097e-02   3.00248698e-02]
 [  8.59208232e-01   7.42731278e-03   8.25001886e-02   5.08642663e-02]
 [  1.11212755e-07   4.67812962e-01   7.62309015e-02   4.55956025e-01]
 [  4.99657253e-04   3.59093391e-01   1.96072244e-02   6.20799728e-01]
 [  2.54313584e-03   2.96342303e-01   1.47863570e-02   6.86328205e-01]
 [  4.11756824e-04   3.73538357e-01   1.32595808e-02   6.12790305e-01]
 [  9.

In [23]:
# Estimated classes on train set
predicted_train = logreg_train.predict(X_train)
print(predicted_train)

['0-25' '0-25' '50-75' '0-25' '50-75' '0-25' '0-25' '0-25' '0-25' '0-25'
 '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25'
 '50-75' '0-25' '50-75' '0-25' '50-75' '0-25' '0-25' '0-25' '0-25' '0-25'
 '0-25' '50-75' '0-25' '50-75' '0-25' '25-50' '0-25' '0-25' '50-75' '0-25'
 '50-75' '50-75' '0-25' '50-75' '50-75' '0-25' '50-75' '0-25' '0-25' '0-25'
 '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '25-50' '50-75' '0-25'
 '0-25' '100-125' '50-75' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25'
 '0-25' '25-50' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25'
 '0-25' '0-25' '0-25' '0-25' '50-75' '0-25' '50-75' '50-75' '0-25' '0-25'
 '0-25' '0-25' '0-25' '0-25' '0-25' '50-75' '0-25' '0-25' '0-25' '0-25'
 '0-25' '50-75' '0-25' '0-25' '0-25' '0-25' '50-75' '25-50' '0-25' '0-25'
 '0-25' '100-125' '50-75' '50-75' '50-75' '0-25' '0-25' '0-25' '0-25'
 '0-25' '0-25' '0-25' '50-75' '0-25' '0-25' '25-50' '0-25' '0-25' '0-25'
 '25-50' '0-25' '50-75' '0-25' '0-25' '25-50' '0

In [24]:
# Estimated classes on train set
predicted_test = logreg_train.predict(X_test)
print(predicted_test)

['0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '50-75' '0-25' '0-25'
 '100-125' '50-75' '50-75' '50-75' '0-25' '0-25' '50-75' '50-75' '0-25'
 '0-25' '0-25' '50-75' '0-25' '0-25' '50-75' '50-75' '0-25' '0-25' '50-75'
 '0-25' '50-75' '0-25' '0-25' '0-25' '0-25' '50-75' '50-75' '0-25' '0-25'
 '0-25' '0-25' '0-25' '50-75' '0-25']


In [25]:
# accuracy of trained predictions
print(metrics.accuracy_score(y_train, predicted_train))

0.767441860465


In [26]:
# accuracy of trained predictions
print(metrics.accuracy_score(y_test, predicted_test))

0.636363636364


## Model evaluation on out-of-sample data - without rooms

In [30]:
# Split the data into train and test sets
# Take a fifth of data samples as test data, rest as training data
X_train, X_test, y_train, y_test = train_test_split(x[['Intercept','Capacity', 'NumReg', 'AvgNumWifiConn', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM']], y, test_size=0.3)

# print("Training data:\n", pd.concat([X_train, y_train], axis=1))
# print("\nTest data:\n", pd.concat([X_test, y_test], axis=1))

In [31]:
# Train on the training sample and test on the test sample.
logreg_train = LogisticRegression().fit(X_train, y_train)
# Print the weights learned for each feature.
print(logreg_train.coef_)

[[  1.52843100e+00  -9.01529081e-03   6.15727557e-03  -7.28584020e-02
    2.88214814e-01  -2.27631750e-01   7.57717285e-01   6.37148400e-01
    7.29822491e-02  -2.22683480e-01  -1.72183778e-01  -1.67696328e-01
   -2.63955536e-01   7.49753835e-01   1.09661709e-01   4.62921326e-01
    1.03261325e+00]
 [ -2.12698209e+00   1.29286975e-02   4.39821857e-03   1.13906510e-02
   -4.26592580e-01  -3.64073173e-01  -9.89173503e-01  -1.50757817e-01
   -1.96385015e-01  -7.93010387e-01   9.12386283e-03  -3.86575011e-01
   -4.24534069e-01   2.05621701e-01   8.35956254e-02  -1.57244146e-01
   -6.63959666e-01]
 [ -3.83691791e-02  -2.46694784e-01  -3.09897435e-01   4.50962342e-01
   -2.41308165e-03  -3.97128987e-04  -5.67691288e-02   2.37579553e-02
   -2.54779495e-03   2.46639663e-02  -3.45849711e-06  -1.09176079e-05
   -3.57024356e-02  -1.60309043e-02  -6.32880167e-03  -2.53699288e-03
   -2.41963479e-03]
 [  4.15821149e-01  -3.76059491e-02  -2.99492249e-03   2.71315717e-02
    1.35621309e-01   8.4316553

In [32]:
# Estimated class probabilities on test set
print(logreg_train.predict_proba(X_test))

[[  8.00498289e-01   1.07668944e-01   2.22164780e-23   1.82619711e-03
    9.00065707e-02]
 [  7.85584846e-01   2.90802906e-02   3.73567474e-10   7.14407273e-02
    1.13894136e-01]
 [  5.81872338e-01   6.54597550e-02   3.08656179e-11   2.51201391e-01
    1.01466516e-01]
 [  4.92759972e-01   5.87127948e-02   6.35855930e-10   2.07220034e-01
    2.41307198e-01]
 [  7.91164865e-01   1.53758444e-02   1.22463121e-08   1.63960053e-01
    2.94992252e-02]
 [  6.27384383e-01   9.06138902e-02   2.21490767e-08   1.48117154e-01
    1.33884551e-01]
 [  5.85500779e-01   7.19764716e-02   3.31048845e-30   6.29511865e-04
    3.41893238e-01]
 [  7.88196767e-01   1.23466965e-02   2.34578164e-11   6.70396820e-02
    1.32416855e-01]
 [  7.33340090e-01   6.55573375e-02   4.20608242e-04   1.26417044e-01
    7.42649193e-02]
 [  2.55579501e-02   3.18890266e-01   4.79783291e-25   1.07992671e-02
    6.44752516e-01]
 [  4.63500212e-03   3.14933503e-01   3.12361480e-25   1.26070492e-02
    6.67824446e-01]
 [  3.9828

In [33]:
# Estimated classes on train set
predicted_train = logreg_train.predict(X_train)
print(predicted_train)

['50-75' '0-25' '100-125' '0-25' '0-25' '50-75' '0-25' '100-125' '0-25'
 '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '50-75' '0-25' '0-25' '50-75'
 '0-25' '0-25' '0-25' '0-25' '0-25' '25-50' '0-25' '0-25' '0-25' '0-25'
 '0-25' '50-75' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25'
 '50-75' '100-125' '0-25' '0-25' '0-25' '0-25' '25-50' '0-25' '0-25' '0-25'
 '25-50' '0-25' '0-25' '0-25' '0-25' '100-125' '0-25' '50-75' '0-25'
 '100-125' '50-75' '0-25' '50-75' '0-25' '0-25' '0-25' '0-25' '50-75'
 '0-25' '25-50' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '50-75' '0-25'
 '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '50-75' '0-25' '100-125'
 '100-125' '100-125' '50-75' '50-75' '0-25' '50-75' '0-25' '0-25' '0-25'
 '0-25' '0-25' '50-75' '50-75' '200-225' '0-25' '0-25' '0-25' '0-25' '0-25'
 '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '100-125' '50-75'
 '0-25' '50-75' '0-25' '0-25' '0-25' '50-75' '0-25' '0-25' '0-25' '0-25'
 '100-125' '0-25' '0-25' '0-25' '0-25' '50-75' '25-

In [34]:
# Estimated classes on test set
predicted_test = logreg_train.predict(X_test)
print(predicted_test)

['0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '50-75'
 '50-75' '0-25' '0-25' '0-25' '0-25' '50-75' '50-75' '0-25' '0-25' '0-25'
 '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '25-50' '0-25' '100-125' '50-75'
 '0-25' '0-25' '0-25' '25-50' '0-25' '100-125' '0-25' '50-75' '50-75'
 '0-25' '0-25' '0-25' '50-75' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25'
 '0-25' '0-25' '0-25' '0-25' '50-75' '0-25' '0-25' '0-25' '100-125' '0-25'
 '0-25' '0-25' '0-25' '0-25' '0-25' '0-25']


In [35]:
# accuracy of trained predictions
print(metrics.accuracy_score(y_train, predicted_train))

0.761589403974


In [36]:
# accuracy of test predictions
print(metrics.accuracy_score(y_test, predicted_test))

0.753846153846


## Model evaluation on out-of-sample data - with rooms, without capacity

In [37]:
# Split the data into train and test sets
# Take a fifth of data samples as test data, rest as training data
X_train, X_test, y_train, y_test = train_test_split(x[['Intercept', 'NumReg', 'AvgNumWifiConn', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM', 'B002', 'B003', 'B004']], y, test_size=0.3)

# print("Training data:\n", pd.concat([X_train, y_train], axis=1))
# print("\nTest data:\n", pd.concat([X_test, y_test], axis=1))

In [38]:
# Train on the training sample and test on the test sample.
logreg_train = LogisticRegression().fit(X_train, y_train)
# Print the weights learned for each feature.
print(logreg_train.coef_)

[[  1.01177058e+00   7.32012597e-03  -7.98376805e-02   5.57649828e-01
   -5.36130856e-01   4.88792295e-01   4.01815149e-01   9.96441607e-02
    2.40600769e-01  -2.52115042e-01   9.83850121e-02  -7.36357581e-01
    8.56038136e-01  -1.73639747e-01   6.33421675e-01   3.45437355e-01
    8.66638641e-01   7.72899839e-01  -6.27767904e-01]
 [ -1.40864221e+00   2.86557811e-03   1.32556739e-02  -1.25138830e-01
    1.59474343e-01  -5.88646732e-01  -3.87764537e-01  -4.66566452e-01
   -7.20800249e-01  -4.48517714e-01  -4.77506367e-01  -2.22791429e-01
    6.95764665e-01   4.28004873e-01  -3.40830256e-02  -6.28712960e-01
   -1.08149440e+00  -1.18201291e+00   8.54865096e-01]
 [ -1.66664452e+00   2.10960708e-03   6.93407664e-03  -2.80076792e-01
    1.47884553e-01  -4.82424965e-01  -6.06683396e-01  -4.45343917e-01
    2.76149476e-01  -2.19993518e-01  -3.56084034e-01  -2.89633374e-01
   -2.98902909e-01  -2.62652489e-01  -2.56495634e-01  -2.59032033e-01
   -6.73536055e-01  -7.13485674e-01  -2.79622788e-01

In [39]:
# Estimated class probabilities on test set
print(logreg_train.predict_proba(X_test))

[[  1.62522518e-02   3.84212635e-02   2.04806835e-02   3.49781940e-02
    5.14792261e-01   3.75075346e-01]
 [  1.53292480e-03   2.86372710e-01   4.02378226e-02   3.64641571e-03
    1.83702899e-01   4.84507228e-01]
 [  9.52931814e-03   4.79433097e-01   2.59042358e-02   2.13916007e-02
    2.12857018e-02   4.42456046e-01]
 [  8.23875664e-01   4.54676567e-02   1.46085192e-02   1.07471851e-02
    9.56957921e-02   9.60518251e-03]
 [  8.63795486e-01   6.57584972e-03   1.26471630e-02   3.97393151e-03
    6.38010025e-02   4.92065672e-02]
 [  2.31112494e-01   2.81979468e-01   2.31285160e-02   6.85554939e-03
    3.38437175e-02   4.23080255e-01]
 [  6.72655528e-01   4.59089258e-02   9.95554207e-03   3.17691373e-04
    1.75580110e-02   2.53604302e-01]
 [  6.85283111e-01   1.01424217e-02   1.29965199e-02   7.71839879e-03
    2.61509192e-01   2.23503566e-02]
 [  7.38513401e-01   2.31050932e-02   9.78877259e-03   6.01944805e-03
    1.99582313e-01   2.29909724e-02]
 [  7.69168535e-01   1.99259235e-02  

In [40]:
# Estimated classes on train set
predicted_train = logreg_train.predict(X_train)
print(predicted_train)

['0-25' '25-50' '0-25' '25-50' '0-25' '100-125' '0-25' '0-25' '100-125'
 '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '50-75' '0-25' '0-25' '0-25'
 '25-50' '0-25' '0-25' '0-25' '50-75' '25-50' '0-25' '50-75' '50-75' '0-25'
 '50-75' '0-25' '50-75' '50-75' '50-75' '0-25' '0-25' '0-25' '0-25' '0-25'
 '0-25' '0-25' '0-25' '50-75' '0-25' '0-25' '0-25' '0-25' '50-75' '0-25'
 '25-50' '0-25' '0-25' '0-25' '25-50' '0-25' '0-25' '0-25' '0-25' '0-25'
 '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '50-75' '0-25' '0-25'
 '0-25' '0-25' '0-25' '0-25' '50-75' '200-225' '0-25' '0-25' '0-25' '25-50'
 '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25'
 '25-50' '0-25' '0-25' '50-75' '0-25' '25-50' '0-25' '0-25' '0-25' '0-25'
 '0-25' '50-75' '0-25' '50-75' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25'
 '50-75' '50-75' '0-25' '0-25' '0-25' '0-25' '100-125' '0-25' '0-25' '0-25'
 '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '50-75'
 '0-25' '0-25' '0-25' '0-25' '0-25' '50-75' '

In [41]:
# Estimated classes on test set
predicted_test = logreg_train.predict(X_test)
print(predicted_test)

['25-50' '50-75' '100-125' '0-25' '0-25' '50-75' '0-25' '0-25' '0-25'
 '0-25' '50-75' '0-25' '0-25' '0-25' '0-25' '0-25' '50-75' '0-25' '0-25'
 '100-125' '0-25' '0-25' '0-25' '0-25' '0-25' '100-125' '25-50' '0-25'
 '25-50' '0-25' '0-25' '50-75' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25'
 '25-50' '0-25' '0-25' '0-25' '0-25' '25-50' '0-25' '50-75' '0-25' '0-25'
 '100-125' '0-25' '0-25' '0-25' '25-50' '0-25' '0-25' '50-75' '0-25'
 '50-75' '100-125' '100-125' '50-75' '100-125' '0-25' '0-25' '0-25']


In [42]:
# accuracy of trained predictions
print(metrics.accuracy_score(y_train, predicted_train))

0.788079470199


In [43]:
# accuracy of test predictions
print(metrics.accuracy_score(y_test, predicted_test))

0.615384615385


## Model evaluation on out-of-sample data - with rooms, without NumReg

In [44]:
# Split the data into train and test sets
# Take a fifth of data samples as test data, rest as training data
X_train, X_test, y_train, y_test = train_test_split(x[['Intercept', 'Capacity', 'AvgNumWifiConn', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM', 'B002', 'B003', 'B004']], y, test_size=0.3)

# print("Training data:\n", pd.concat([X_train, y_train], axis=1))
# print("\nTest data:\n", pd.concat([X_test, y_test], axis=1))

In [45]:
# Train on the training sample and test on the test sample.
logreg_train = LogisticRegression().fit(X_train, y_train)
# Print the weights learned for each feature.
print(logreg_train.coef_)

[[ 1.05160385 -0.0020333  -0.07137177 -0.35480217 -0.52633745  1.12941007
   0.62535038  0.17798303  0.13558936 -0.10035682  0.07232519 -0.04141183
   0.4290825  -0.00375998 -0.11035107  0.67048651  1.1322192   0.64743374
  -0.72804908]
 [-1.34023989 -0.00189531  0.0190825  -0.60407144 -0.19783791 -0.56835111
  -0.162975    0.19299558 -0.45689897  0.12330056 -0.42843572 -0.13774607
   0.62737041 -0.44689154 -0.13432447 -0.48661409 -1.11857299 -1.14951069
   0.9278438 ]
 [ 0.27172209 -0.04313442  0.04390767  0.49424893  0.96557888 -0.95747762
  -0.19137956 -0.03924854  0.40567399 -0.28423813  0.35693832 -0.01095535
  -0.82328893  0.64959944  0.22365144 -0.2456587   0.22037199  0.2397972
  -0.1884471 ]
 [-1.19427691  0.00247669  0.0304621   0.52387058 -0.4163929  -0.40851087
  -0.51237973 -0.380864   -0.42163946 -0.14710642  0.1205342   0.20203573
  -0.41250893 -0.31019065  0.23722868 -0.46263006 -1.62819462 -0.39290844
   0.82682615]]


In [46]:
# Estimated class probabilities on test set
print(logreg_train.predict_proba(X_test))

[[  7.93132497e-01   7.55794976e-02   8.09184558e-05   1.31207087e-01]
 [  2.65581616e-01   8.47821179e-02   1.25335269e-03   6.48382913e-01]
 [  2.49787632e-01   2.47023176e-01   5.07712055e-04   5.02681480e-01]
 [  8.90553199e-01   1.96654664e-02   7.26849822e-02   1.70963521e-02]
 [  8.16169716e-01   2.12631496e-02   9.85668086e-02   6.40003260e-02]
 [  1.37901242e-02   4.15480273e-01   1.43804148e-03   5.69291561e-01]
 [  7.32365231e-01   7.11821667e-02   1.01830794e-04   1.96350771e-01]
 [  1.14695131e-01   2.19223133e-01   1.02886145e-03   6.65052874e-01]
 [  8.35491584e-01   1.02956774e-02   1.15563860e-01   3.86488786e-02]
 [  3.06437151e-01   2.30427081e-01   4.24816771e-04   4.62710951e-01]
 [  8.19640129e-01   3.51555313e-02   1.12243838e-01   3.29605023e-02]
 [  9.05520667e-01   8.93729299e-03   2.27466027e-02   6.27954374e-02]
 [  1.81889137e-03   2.74135230e-01   1.31158273e-02   7.10930051e-01]
 [  9.33034344e-01   2.91246660e-02   1.90698740e-02   1.87711159e-02]
 [  1.

In [47]:
# Estimated classes on train set
predicted_train = logreg_train.predict(X_train)
print(predicted_train)

['50-75' '0-25' '25-50' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25'
 '0-25' '25-50' '0-25' '50-75' '50-75' '0-25' '0-25' '0-25' '0-25' '0-25'
 '50-75' '0-25' '50-75' '0-25' '0-25' '50-75' '0-25' '50-75' '0-25' '0-25'
 '0-25' '0-25' '0-25' '0-25' '50-75' '50-75' '0-25' '0-25' '50-75' '0-25'
 '0-25' '50-75' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '50-75'
 '0-25' '50-75' '0-25' '0-25' '50-75' '0-25' '0-25' '0-25' '0-25' '0-25'
 '50-75' '0-25' '50-75' '0-25' '25-50' '0-25' '0-25' '50-75' '50-75' '0-25'
 '0-25' '0-25' '0-25' '50-75' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25'
 '50-75' '50-75' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25'
 '0-25' '0-25' '50-75' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25'
 '0-25' '0-25' '0-25' '0-25' '0-25' '25-50' '50-75' '25-50' '0-25' '0-25'
 '0-25' '0-25' '25-50' '0-25' '0-25' '50-75' '0-25' '0-25' '0-25' '0-25'
 '50-75' '0-25' '0-25' '0-25' '50-75' '50-75' '0-25' '0-25' '0-25' '0-25'
 '0-25' '0-25' '0-25' '0-25' '50-75' '0-25' 

In [48]:
# Estimated classes on test set
predicted_test = logreg_train.predict(X_test)
print(predicted_test)

['0-25' '50-75' '50-75' '0-25' '0-25' '50-75' '0-25' '50-75' '0-25' '50-75'
 '0-25' '0-25' '50-75' '0-25' '50-75' '0-25' '0-25' '0-25' '0-25' '50-75'
 '0-25' '0-25' '25-50' '50-75' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25'
 '0-25' '50-75' '0-25' '50-75' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25'
 '0-25' '0-25' '25-50' '50-75' '50-75' '0-25' '0-25' '0-25' '0-25' '50-75'
 '0-25' '0-25' '50-75' '25-50' '0-25' '50-75' '0-25' '25-50' '0-25' '50-75'
 '0-25' '50-75' '0-25' '0-25' '50-75']


In [49]:
# accuracy of trained predictions
print(metrics.accuracy_score(y_train, predicted_train))

0.781456953642


In [50]:
# accuracy of test predictions
print(metrics.accuracy_score(y_test, predicted_test))

0.646153846154


### Model evaluation on out-of-sample data - with rooms, without NumReg, without Capacity

In [51]:
# Split the data into train and test sets
# Take a fifth of data samples as test data, rest as training data
X_train, X_test, y_train, y_test = train_test_split(x[['Intercept', 'Capacity', 'AvgNumWifiConn', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM', 'B002', 'B003', 'B004']], y, test_size=0.3)

# print("Training data:\n", pd.concat([X_train, y_train], axis=1))
# print("\nTest data:\n", pd.concat([X_test, y_test], axis=1))

In [52]:
# Train on the training sample and test on the test sample.
logreg_train = LogisticRegression().fit(X_train, y_train)
# Print the weights learned for each feature.
print(logreg_train.coef_)

[[  1.06633993e+00   6.51642054e-05  -7.40249953e-02   1.55382220e-01
   -1.34731369e-01   8.15264020e-01   4.36525613e-01  -2.06100556e-01
   -4.93726789e-02  -7.61517943e-01  -1.85589629e-01  -2.36480383e-01
    7.77525266e-01  -1.54804054e-01  -1.78810505e-01   1.85538986e+00
    1.13517356e+00   6.69401197e-01  -7.38234833e-01]
 [ -1.31479350e+00  -1.60127347e-03   1.28016726e-02  -5.61945051e-01
    8.62405974e-02  -1.00195721e+00  -2.04125095e-01   3.66993264e-01
   -1.02782919e+00   1.87163055e-01  -6.08425256e-01  -3.11508834e-01
    8.10144593e-01   2.32940208e-01   5.90598287e-03  -6.03184051e-01
   -1.16699193e+00  -1.05803090e+00   9.10229335e-01]
 [ -7.96716591e-01  -2.88362833e-02   2.72928458e-02  -1.69491515e-01
    4.49102014e-01  -3.31909185e-01  -4.80659794e-01  -2.63758112e-01
    4.93691980e-01  -1.55402867e-01  -1.49736122e-01  -2.54549108e-01
   -1.58527041e-01  -1.99986345e-01  -1.91802233e-01  -1.80404855e-01
   -6.91377501e-01  -6.56690297e-01   5.51351207e-01

In [53]:
# Estimated class probabilities on test set
print(logreg_train.predict_proba(X_test))

[[  1.74133592e-02   1.59709617e-01   5.76573445e-03   3.27439685e-28
    6.22160053e-04   8.16489130e-01]
 [  9.14660727e-01   2.86873184e-03   1.19703662e-02   4.84410172e-15
    4.80614521e-02   2.24387230e-02]
 [  3.90147489e-02   9.42386019e-02   2.97514319e-03   6.31777104e-28
    7.96974495e-04   8.62974532e-01]
 [  8.39943395e-01   7.97643901e-02   3.51242933e-04   1.04073348e-40
    6.09296326e-05   7.98800427e-02]
 [  7.86099040e-01   1.93729326e-02   4.11218231e-03   1.43886737e-16
    1.09359094e-01   8.10567511e-02]
 [  4.46625326e-01   6.16427148e-02   1.81782165e-02   4.41191212e-10
    3.09700417e-01   1.63853326e-01]
 [  2.89556330e-02   4.10540489e-01   2.53901442e-03   2.41287657e-30
    2.68493689e-03   5.55279927e-01]
 [  5.89186702e-01   2.93773417e-02   8.88361793e-03   5.59999541e-13
    1.73792083e-01   1.98760256e-01]
 [  7.74468267e-01   1.55429922e-02   1.10314799e-02   2.71502240e-15
    1.18168550e-01   8.07887104e-02]
 [  2.38147091e-02   1.54275652e-01  

In [54]:
# Estimated classes on train set
predicted_train = logreg_train.predict(X_train)
print(predicted_train)

['25-50' '50-75' '0-25' '0-25' '50-75' '0-25' '0-25' '0-25' '0-25' '0-25'
 '0-25' '0-25' '0-25' '0-25' '0-25' '200-225' '50-75' '0-25' '0-25' '0-25'
 '0-25' '0-25' '50-75' '0-25' '50-75' '0-25' '0-25' '0-25' '0-25' '0-25'
 '50-75' '0-25' '0-25' '0-25' '0-25' '0-25' '50-75' '0-25' '50-75' '50-75'
 '0-25' '0-25' '0-25' '100-125' '0-25' '0-25' '0-25' '0-25' '0-25' '50-75'
 '0-25' '0-25' '0-25' '0-25' '0-25' '50-75' '0-25' '0-25' '0-25' '0-25'
 '0-25' '50-75' '0-25' '50-75' '0-25' '50-75' '0-25' '0-25' '25-50' '50-75'
 '0-25' '0-25' '50-75' '0-25' '100-125' '0-25' '100-125' '25-50' '0-25'
 '0-25' '0-25' '50-75' '50-75' '50-75' '50-75' '50-75' '0-25' '0-25' '0-25'
 '0-25' '0-25' '0-25' '50-75' '0-25' '0-25' '100-125' '0-25' '0-25' '50-75'
 '0-25' '0-25' '50-75' '50-75' '0-25' '50-75' '0-25' '50-75' '0-25' '0-25'
 '0-25' '0-25' '50-75' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25'
 '0-25' '0-25' '25-50' '100-125' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25'
 '0-25' '0-25' '0-25' '0-25' '0-25

In [55]:
# Estimated classes on test set
predicted_test = logreg_train.predict(X_test)
print(predicted_test)

['50-75' '0-25' '50-75' '0-25' '0-25' '0-25' '50-75' '0-25' '0-25' '50-75'
 '50-75' '0-25' '50-75' '0-25' '0-25' '0-25' '50-75' '0-25' '0-25' '0-25'
 '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25'
 '0-25' '0-25' '0-25' '0-25' '0-25' '50-75' '0-25' '0-25' '0-25' '0-25'
 '50-75' '25-50' '50-75' '0-25' '0-25' '0-25' '50-75' '0-25' '0-25' '0-25'
 '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25'
 '0-25' '0-25' '50-75' '0-25' '0-25']


In [56]:
# accuracy of trained predictions
print(metrics.accuracy_score(y_train, predicted_train))

0.788079470199


In [57]:
# accuracy of test predictions
print(metrics.accuracy_score(y_test, predicted_test))

0.676923076923
