In [1]:
import MySQLdb as db
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas.tools.plotting import scatter_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.cross_validation import cross_val_score
%matplotlib inline

In [2]:
df = pd.read_csv('ABT-Avgs.csv')

### For logistic regression, we need to make the target feature categorical. For this, we can bin the value ranges. 

In [3]:
# http://chrisalbon.com/python/pandas_create_column_using_conditional.html
# Add emtimate column
df['estimate'] = df['Capacity'] * df['PercentageEstimate']
# Bin results into categories for logistic regression. 
bins = [-1, 25, 50, 75, 100, 125, 150, 175, 200, 225, 250]
groups = [ '0-25', '25-50', '50-75', '75-100', '100-125', '125-150', '150-175', '175-200', '200-225', '225-250']
df['occupantEstimate'] = pd.cut(df['estimate'], bins, labels = groups )

In [4]:
df.dtypes

DateTime                object
Room                    object
Capacity                 int64
Module                  object
NumReg                   int64
AvgNumWifiConn         float64
PercentageEstimate     float64
BinaryEstimate           int64
estimate               float64
occupantEstimate      category
dtype: object

In [5]:
df['Room'] = df['Room'].astype('category')
df['Module'] = df['Module'].astype('category')
df['DateTime'] = df['DateTime'].astype('datetime64[ns]')
df.dtypes

DateTime              datetime64[ns]
Room                        category
Capacity                       int64
Module                      category
NumReg                         int64
AvgNumWifiConn               float64
PercentageEstimate           float64
BinaryEstimate                 int64
estimate                     float64
occupantEstimate            category
dtype: object

In [6]:
df.shape

(216, 10)

### For logistic regression, we need to make the training features to be continuous. To do this we need to create dummy values for catergorical features. 


In [7]:
days = ['mon', 'tues', 'wed', 'thurs', 'fri', 'sat', 'sun']
# http://stackoverflow.com/questions/13740672/in-pandas-how-can-i-groupby-weekday-for-a-datetime-column
# http://chrisalbon.com/python/pandas_apply_operations_to_dataframes.html
df['weekday'] = df['DateTime'].apply(lambda dt: dt.weekday())
df.head()


Unnamed: 0,DateTime,Room,Capacity,Module,NumReg,AvgNumWifiConn,PercentageEstimate,BinaryEstimate,estimate,occupantEstimate,weekday
0,2015-11-03 16:00:00,B-002,90,COMP40370P1,27,39.0,0.25,1,22.5,0-25,1
1,2015-11-04 16:00:00,B-002,90,COMP30250P1,22,20.0,0.25,1,22.5,0-25,2
2,2015-11-05 16:00:00,B-002,90,COMP30520P1,60,32.6667,0.0,0,0.0,0-25,3
3,2015-11-06 16:00:00,B-002,90,,0,20.3333,0.5,1,45.0,25-50,4
4,2015-11-09 16:00:00,B-002,90,COMP40660P1,53,72.5,0.25,1,22.5,0-25,0


In [8]:
for i in range(len(days)):
    df[days[i]] =(df.weekday==i)*1
df.drop('weekday', axis=1, inplace=True)
df.drop('sat', axis=1, inplace=True)
df.drop('sun', axis=1, inplace=True)


df.head(10)

Unnamed: 0,DateTime,Room,Capacity,Module,NumReg,AvgNumWifiConn,PercentageEstimate,BinaryEstimate,estimate,occupantEstimate,mon,tues,wed,thurs,fri
0,2015-11-03 16:00:00,B-002,90,COMP40370P1,27,39.0,0.25,1,22.5,0-25,0,1,0,0,0
1,2015-11-04 16:00:00,B-002,90,COMP30250P1,22,20.0,0.25,1,22.5,0-25,0,0,1,0,0
2,2015-11-05 16:00:00,B-002,90,COMP30520P1,60,32.6667,0.0,0,0.0,0-25,0,0,0,1,0
3,2015-11-06 16:00:00,B-002,90,,0,20.3333,0.5,1,45.0,25-50,0,0,0,0,1
4,2015-11-09 16:00:00,B-002,90,COMP40660P1,53,72.5,0.25,1,22.5,0-25,1,0,0,0,0
5,2015-11-10 16:00:00,B-002,90,COMP40370P1,27,35.1667,0.25,1,22.5,0-25,0,1,0,0,0
6,2015-11-11 16:00:00,B-002,90,COMP30250P1,22,23.5,0.25,1,22.5,0-25,0,0,1,0,0
7,2015-11-12 16:00:00,B-002,90,COMP30520P1,60,49.8333,0.25,1,22.5,0-25,0,0,0,1,0
8,2015-11-13 16:00:00,B-002,90,,0,8.6667,0.25,1,22.5,0-25,0,0,0,0,1
9,2015-11-03 09:00:00,B-002,90,,0,2.0,0.0,0,0.0,0-25,0,1,0,0,0


In [9]:
times = ['9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM']
df['time'] = df['DateTime'].apply(lambda dt: dt.hour)

for i in range(0,len(times)):
    df[times[i]] =(df.time==i+9)*1
df.drop('time', axis=1, inplace=True)
df.head(10)

Unnamed: 0,DateTime,Room,Capacity,Module,NumReg,AvgNumWifiConn,PercentageEstimate,BinaryEstimate,estimate,occupantEstimate,...,thurs,fri,9AM,10AM,11AM,12PM,1PM,2PM,3PM,4PM
0,2015-11-03 16:00:00,B-002,90,COMP40370P1,27,39.0,0.25,1,22.5,0-25,...,0,0,0,0,0,0,0,0,0,1
1,2015-11-04 16:00:00,B-002,90,COMP30250P1,22,20.0,0.25,1,22.5,0-25,...,0,0,0,0,0,0,0,0,0,1
2,2015-11-05 16:00:00,B-002,90,COMP30520P1,60,32.6667,0.0,0,0.0,0-25,...,1,0,0,0,0,0,0,0,0,1
3,2015-11-06 16:00:00,B-002,90,,0,20.3333,0.5,1,45.0,25-50,...,0,1,0,0,0,0,0,0,0,1
4,2015-11-09 16:00:00,B-002,90,COMP40660P1,53,72.5,0.25,1,22.5,0-25,...,0,0,0,0,0,0,0,0,0,1
5,2015-11-10 16:00:00,B-002,90,COMP40370P1,27,35.1667,0.25,1,22.5,0-25,...,0,0,0,0,0,0,0,0,0,1
6,2015-11-11 16:00:00,B-002,90,COMP30250P1,22,23.5,0.25,1,22.5,0-25,...,0,0,0,0,0,0,0,0,0,1
7,2015-11-12 16:00:00,B-002,90,COMP30520P1,60,49.8333,0.25,1,22.5,0-25,...,1,0,0,0,0,0,0,0,0,1
8,2015-11-13 16:00:00,B-002,90,,0,8.6667,0.25,1,22.5,0-25,...,0,1,0,0,0,0,0,0,0,1
9,2015-11-03 09:00:00,B-002,90,,0,2.0,0.0,0,0.0,0-25,...,0,0,1,0,0,0,0,0,0,0


In [12]:
for i in range(2, 5):
    df['B00'+str(i)] =(df.Room=='B-00'+str(i))*1
df.head(10)

Unnamed: 0,DateTime,Room,Capacity,Module,NumReg,AvgNumWifiConn,PercentageEstimate,BinaryEstimate,estimate,occupantEstimate,...,10AM,11AM,12PM,1PM,2PM,3PM,4PM,B002,B003,B004
0,2015-11-03 16:00:00,B-002,90,COMP40370P1,27,39.0,0.25,1,22.5,0-25,...,0,0,0,0,0,0,1,1,0,0
1,2015-11-04 16:00:00,B-002,90,COMP30250P1,22,20.0,0.25,1,22.5,0-25,...,0,0,0,0,0,0,1,1,0,0
2,2015-11-05 16:00:00,B-002,90,COMP30520P1,60,32.6667,0.0,0,0.0,0-25,...,0,0,0,0,0,0,1,1,0,0
3,2015-11-06 16:00:00,B-002,90,,0,20.3333,0.5,1,45.0,25-50,...,0,0,0,0,0,0,1,1,0,0
4,2015-11-09 16:00:00,B-002,90,COMP40660P1,53,72.5,0.25,1,22.5,0-25,...,0,0,0,0,0,0,1,1,0,0
5,2015-11-10 16:00:00,B-002,90,COMP40370P1,27,35.1667,0.25,1,22.5,0-25,...,0,0,0,0,0,0,1,1,0,0
6,2015-11-11 16:00:00,B-002,90,COMP30250P1,22,23.5,0.25,1,22.5,0-25,...,0,0,0,0,0,0,1,1,0,0
7,2015-11-12 16:00:00,B-002,90,COMP30520P1,60,49.8333,0.25,1,22.5,0-25,...,0,0,0,0,0,0,1,1,0,0
8,2015-11-13 16:00:00,B-002,90,,0,8.6667,0.25,1,22.5,0-25,...,0,0,0,0,0,0,1,1,0,0
9,2015-11-03 09:00:00,B-002,90,,0,2.0,0.0,0,0.0,0-25,...,0,0,0,0,0,0,0,1,0,0


In [13]:
df.corr()

Unnamed: 0,Capacity,NumReg,AvgNumWifiConn,PercentageEstimate,BinaryEstimate,estimate,mon,tues,wed,thurs,...,10AM,11AM,12PM,1PM,2PM,3PM,4PM,B002,B003,B004
Capacity,1.0,0.468271,0.355578,-0.045083,-0.056617,0.39296,2.6474550000000002e-17,3.7443450000000005e-17,2.194961e-17,2.2595180000000002e-17,...,-9.332719e-17,-1.5419270000000002e-17,6.248864e-17,-2.434622e-18,-2.8403930000000004e-17,3.6519330000000003e-17,3.002701e-17,-0.5,-0.5,1.0
NumReg,0.4682708,1.0,0.736217,0.344868,0.290085,0.562036,-0.04625192,0.02518073,-0.04275289,0.09583169,...,-0.009564565,0.0570458,0.1000863,0.009564565,-0.024253,-0.01161411,-0.1762613,-0.1253355,-0.3429352,0.4682708
AvgNumWifiConn,0.3555784,0.736217,1.0,0.598574,0.454588,0.765715,-0.008559815,0.127076,-0.1088693,0.08762523,...,-0.01575205,0.02496158,0.1411035,-0.005947925,0.1159951,-0.05334014,-0.1693601,-0.2275643,-0.1280142,0.3555784
PercentageEstimate,-0.04508348,0.344868,0.598574,1.0,0.769572,0.816815,0.05071892,0.1150197,-0.1022397,-0.02555993,...,0.0,0.06426169,0.1445888,-0.04819627,0.1285234,0.0,-0.1767197,-0.07889609,0.1239796,-0.04508348
BinaryEstimate,-0.05661669,0.290085,0.454588,0.769572,1.0,0.634861,0.2052355,0.06954706,-0.1711928,0.0213991,...,-0.0491772,0.1324002,0.1929259,-0.07944009,0.1324002,0.04161148,-0.1702288,0.0495396,0.007077086,-0.05661669
estimate,0.39296,0.562036,0.765715,0.816815,0.634861,1.0,0.04876492,0.1268725,-0.1533565,0.04715219,...,0.006579753,0.04302146,0.1017331,-0.007592023,0.09262268,-0.01973926,-0.198911,-0.2540037,-0.1389563,0.39296
mon,2.6474550000000002e-17,-0.046252,-0.00856,0.050719,0.205235,0.048765,1.0,-0.1889822,-0.1889822,-0.1889822,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.301043e-17,1.301043e-17,1.301043e-17
tues,3.7443450000000005e-17,0.025181,0.127076,0.11502,0.069547,0.126872,-0.1889822,1.0,-0.2857143,-0.2857143,...,1.5420550000000002e-17,1.5887840000000002e-17,1.635513e-17,1.3084100000000001e-17,1.355139e-17,1.401868e-17,1.448597e-17,1.3113280000000001e-18,1.3113280000000001e-18,1.3113280000000001e-18
wed,2.194961e-17,-0.042753,-0.108869,-0.10224,-0.171193,-0.153356,-0.1889822,-0.2857143,1.0,-0.2857143,...,1.5420550000000002e-17,1.2149520000000001e-17,1.2616810000000002e-17,1.3084100000000001e-17,1.355139e-17,1.401868e-17,1.448597e-17,1.3113280000000001e-18,1.3113280000000001e-18,1.3113280000000001e-18
thurs,2.2595180000000002e-17,0.095832,0.087625,-0.02556,0.021399,0.047152,-0.1889822,-0.2857143,-0.2857143,1.0,...,1.5420550000000002e-17,1.2149520000000001e-17,1.2616810000000002e-17,1.3084100000000001e-17,1.355139e-17,1.401868e-17,1.448597e-17,1.966992e-18,1.966992e-18,1.966992e-18


In [14]:
df.shape

(216, 26)

In [18]:
intercept = pd.DataFrame({'Intercept':np.ones(len(df))})
intercept

Unnamed: 0,Intercept
0,1.0
1,1.0
2,1.0
3,1.0
4,1.0
5,1.0
6,1.0
7,1.0
8,1.0
9,1.0


In [28]:
x = pd.concat([intercept, df[['Capacity', 'NumReg', 'AvgNumWifiConn', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM', 'B002', 'B003', 'B004']]], axis = 1)
y = df['occupantEstimate']
x

Unnamed: 0,Intercept,Capacity,NumReg,AvgNumWifiConn,mon,tues,wed,thurs,fri,9AM,10AM,11AM,12PM,1PM,2PM,3PM,4PM,B002,B003,B004
0,1.0,90,27,39.0000,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0
1,1.0,90,22,20.0000,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0
2,1.0,90,60,32.6667,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0
3,1.0,90,0,20.3333,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0
4,1.0,90,53,72.5000,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0
5,1.0,90,27,35.1667,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0
6,1.0,90,22,23.5000,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0
7,1.0,90,60,49.8333,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0
8,1.0,90,0,8.6667,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0
9,1.0,90,0,2.0000,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0


In [22]:
log = LogisticRegression().fit(x[['Intercept','Capacity', 'NumReg', 'AvgNumWifiConn', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM']], y)

In [24]:
predictions = log.predict(x[['Intercept','Capacity', 'NumReg', 'AvgNumWifiConn', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM']])
log.score(x[['Intercept','Capacity', 'NumReg', 'AvgNumWifiConn', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM']], y)

0.75462962962962965

In [32]:
# include rooms
log = LogisticRegression().fit(x[['Intercept','Capacity', 'NumReg', 'AvgNumWifiConn', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM', 'B002', 'B003', 'B004']], y)

In [33]:
predictions = log.predict(x[['Intercept','Capacity', 'NumReg', 'AvgNumWifiConn', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM', 'B002', 'B003', 'B004']])
log.score(x[['Intercept','Capacity', 'NumReg', 'AvgNumWifiConn', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM', 'B002', 'B003', 'B004']], y)


0.75

In [37]:
df.columns

Index(['DateTime', 'Room', 'Capacity', 'Module', 'NumReg', 'AvgNumWifiConn',
       'PercentageEstimate', 'BinaryEstimate', 'estimate', 'occupantEstimate',
       'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM',
       '1PM', '2PM', '3PM', '4PM', 'B002', 'B003', 'B004'],
      dtype='object')

## Model evaluation on out-of-sample data - with rooms

In [46]:
# Split the data into train and test sets
# Take a fifth of data samples as test data, rest as training data
X_train, X_test, y_train, y_test = train_test_split(x[['Intercept','Capacity', 'NumReg', 'AvgNumWifiConn', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM', 'B002', 'B003', 'B004']], y, test_size=0.2)

# print("Training data:\n", pd.concat([X_train, y_train], axis=1))
# print("\nTest data:\n", pd.concat([X_test, y_test], axis=1))

In [47]:
# Train on the training sample and test on the test sample.
logreg_train = LogisticRegression().fit(X_train, y_train)
# Print the weights learned for each feature.
print(logreg_train.coef_)

[[  9.91283415e-01  -1.27528516e-03   3.60816030e-03  -6.70743950e-02
    2.09418930e-02  -4.31671007e-01   6.08399402e-01   6.66591471e-01
    1.27021656e-01   4.18141776e-02   2.16490069e-01  -1.88999628e-01
   -3.86591376e-02   6.15919346e-01  -1.83140579e-01   5.64070446e-03
    5.22218462e-01   1.16973304e+00   5.07833315e-01  -6.86282943e-01]
 [ -1.38355381e+00  -2.11242867e-03   9.22048793e-03   1.13708890e-02
   -6.77560058e-01   1.46213748e-01  -7.04040712e-01  -2.87380240e-01
    1.39213457e-01  -7.49630194e-01   4.43420550e-01  -5.11337545e-01
   -3.29555579e-01   4.50269560e-01   1.23067306e-01  -1.68771391e-01
   -6.41016512e-01  -1.16375792e+00  -1.17762457e+00   9.57828693e-01]
 [ -7.75953859e-01  -6.27018461e-02   7.81572005e-02   5.78545871e-03
   -1.75324186e-01   5.84215744e-01  -5.82421981e-01  -3.67699091e-01
   -2.34724345e-01   5.78937946e-01  -1.13560430e-01  -1.34394415e-01
   -3.31017158e-01  -2.55763898e-01  -1.12367676e-01  -3.28678126e-01
   -7.91101017e-02

In [48]:
# Estimated class probabilities on test set
print(logreg_train.predict_proba(X_test))

[[  5.34295211e-01   8.37874289e-02   1.43259472e-02   3.13314743e-07
    3.06412981e-01   6.11781185e-02]
 [  7.45406033e-01   4.75490714e-03   1.73713817e-04   8.50409736e-08
    7.31447113e-02   1.76520550e-01]
 [  7.78229774e-04   6.06544792e-01   6.78214868e-03   2.72993226e-15
    8.13563294e-03   3.77759197e-01]
 [  8.86198841e-01   4.56982823e-02   1.25481267e-02   2.53373347e-15
    3.83753725e-02   1.71793778e-02]
 [  7.08902789e-01   7.55458574e-02   8.34635065e-04   8.71429405e-39
    2.19617122e-04   2.14497101e-01]
 [  3.95590654e-02   3.86011269e-01   1.37400022e-01   4.99782536e-30
    3.58748231e-03   4.33442161e-01]
 [  7.58214287e-01   5.78412568e-03   4.15102096e-03   1.49750725e-12
    1.22254565e-01   1.09596002e-01]
 [  3.02546211e-01   2.13468499e-01   2.55394207e-04   9.37900703e-29
    1.24591666e-03   4.82483979e-01]
 [  3.26737170e-01   1.58232774e-02   1.05777472e-02   7.90515601e-07
    2.39956645e-01   4.06904369e-01]
 [  7.38595922e-01   1.89736198e-02  

In [49]:
# Estimated classes on train set
predicted_train = logreg_train.predict(X_train)
print(predicted_train)

['0-25' '0-25' '50-75' '0-25' '100-125' '0-25' '0-25' '0-25' '25-50' '0-25'
 '50-75' '0-25' '0-25' '0-25' '0-25' '50-75' '50-75' '0-25' '50-75' '0-25'
 '0-25' '0-25' '0-25' '0-25' '50-75' '0-25' '0-25' '0-25' '0-25' '0-25'
 '50-75' '50-75' '0-25' '0-25' '100-125' '0-25' '0-25' '0-25' '0-25' '0-25'
 '0-25' '50-75' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '25-50' '0-25'
 '0-25' '0-25' '0-25' '0-25' '0-25' '50-75' '0-25' '0-25' '0-25' '0-25'
 '0-25' '50-75' '25-50' '0-25' '0-25' '0-25' '0-25' '0-25' '50-75' '0-25'
 '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '200-225' '0-25'
 '0-25' '0-25' '0-25' '0-25' '0-25' '100-125' '0-25' '0-25' '0-25' '0-25'
 '0-25' '0-25' '0-25' '0-25' '50-75' '0-25' '0-25' '0-25' '0-25' '0-25'
 '0-25' '0-25' '100-125' '50-75' '50-75' '0-25' '0-25' '50-75' '0-25'
 '0-25' '0-25' '25-50' '100-125' '0-25' '100-125' '0-25' '0-25' '50-75'
 '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '25-50' '0-25'
 '0-25' '0-25' '50-75' '0-25' '0-25' '50-75' '0-

In [50]:
# Estimated classes on train set
predicted_test = logreg_train.predict(X_test)
print(predicted_test)

['0-25' '0-25' '100-125' '0-25' '0-25' '50-75' '0-25' '50-75' '50-75'
 '0-25' '0-25' '0-25' '25-50' '0-25' '100-125' '0-25' '0-25' '0-25' '0-25'
 '50-75' '0-25' '0-25' '0-25' '0-25' '50-75' '0-25' '50-75' '0-25' '0-25'
 '0-25' '0-25' '50-75' '0-25' '50-75' '25-50' '0-25' '0-25' '0-25' '0-25'
 '0-25' '0-25' '0-25' '0-25' '0-25']


In [51]:
# accuracy of trained predictions
print(metrics.accuracy_score(y_train, predicted_train))

0.755813953488


In [52]:
# accuracy of trained predictions
print(metrics.accuracy_score(y_test, predicted_test))

0.659090909091


## Model evaluation on out-of-sample data - without rooms

In [54]:
# Split the data into train and test sets
# Take a fifth of data samples as test data, rest as training data
X_train, X_test, y_train, y_test = train_test_split(x[['Intercept','Capacity', 'NumReg', 'AvgNumWifiConn', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM']], y, test_size=0.2)

# print("Training data:\n", pd.concat([X_train, y_train], axis=1))
# print("\nTest data:\n", pd.concat([X_test, y_test], axis=1))

In [55]:
# Train on the training sample and test on the test sample.
logreg_train = LogisticRegression().fit(X_train, y_train)
# Print the weights learned for each feature.
print(logreg_train.coef_)

[[  1.58595155e+00  -9.96461528e-03   4.57427729e-03  -6.46096193e-02
   -4.59946358e-02  -4.22865298e-01   1.14194566e+00   5.58722588e-01
    3.54143233e-01   4.28262491e-01  -2.10313565e-01   6.40431651e-02
   -3.82152611e-01   6.28339440e-01  -1.46531806e-01   1.91837377e-01
    1.01246706e+00]
 [ -2.05715174e+00   8.65588588e-03   9.20951707e-03   8.67202856e-03
   -3.26413938e-01  -1.92083589e-01  -6.90601505e-01  -4.80749936e-01
   -3.67302774e-01  -5.54420886e-01  -5.29685008e-02  -4.48872624e-01
   -2.15131871e-01   1.89748814e-01  -2.98943159e-01   4.19312751e-02
   -7.18494791e-01]
 [ -3.53049088e-02  -2.21364194e-01  -3.14043568e-01   4.29388732e-01
   -3.98541249e-03  -1.96451385e-03  -4.58780766e-02   1.68200267e-02
   -2.96932575e-04   2.27381511e-02  -1.69177095e-03  -1.40334906e-04
   -3.00396928e-02  -2.04557199e-02  -1.66782307e-03  -3.50456721e-05
   -4.01267256e-03]
 [  5.30892967e-01  -4.60292721e-02   2.12158765e-02   1.52024705e-02
   -5.03188680e-02   1.0879352

In [57]:
# Estimated class probabilities on test set
print(logreg_train.predict_proba(X_test))

[[  7.63589956e-01   4.15459326e-02   1.51273741e-14   1.09272035e-01
    8.55920761e-02]
 [  8.66545921e-01   1.91287113e-02   2.08585045e-13   9.05053317e-02
    2.38200365e-02]
 [  1.30287050e-01   3.43232487e-01   2.15803020e-25   3.27448790e-03
    5.23205975e-01]
 [  8.13010057e-01   2.40978108e-02   2.47065719e-11   1.32179296e-01
    3.07128360e-02]
 [  8.39451249e-01   3.64118401e-02   1.55643501e-07   6.09303781e-02
    6.32063770e-02]
 [  7.08737862e-01   8.18659422e-02   1.33276747e-26   3.32950044e-04
    2.09063245e-01]
 [  9.17587536e-01   1.19332553e-02   1.00874368e-08   3.80381690e-02
    3.24410292e-02]
 [  9.34077008e-01   1.01544929e-02   1.97387125e-05   2.54082234e-02
    3.03405374e-02]
 [  6.41001268e-01   2.65916822e-02   1.10288132e-09   2.58543205e-01
    7.38638433e-02]
 [  8.84579012e-01   1.20623668e-02   7.15947069e-07   2.94381952e-02
    7.39197101e-02]
 [  3.99846802e-01   1.63194753e-01   1.25116278e-22   1.18425527e-03
    4.35774189e-01]
 [  7.6902

In [59]:
# Estimated classes on train set
predicted_train = logreg_train.predict(X_train)
print(predicted_train)

['0-25' '50-75' '0-25' '0-25' '50-75' '0-25' '0-25' '200-225' '0-25'
 '50-75' '0-25' '50-75' '50-75' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25'
 '0-25' '0-25' '0-25' '0-25' '25-50' '0-25' '50-75' '25-50' '0-25' '0-25'
 '0-25' '0-25' '0-25' '50-75' '0-25' '0-25' '50-75' '0-25' '0-25' '0-25'
 '50-75' '25-50' '0-25' '50-75' '0-25' '0-25' '0-25' '50-75' '50-75' '0-25'
 '0-25' '0-25' '0-25' '50-75' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25'
 '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '25-50' '0-25'
 '0-25' '50-75' '25-50' '0-25' '0-25' '25-50' '0-25' '0-25' '0-25' '0-25'
 '0-25' '0-25' '0-25' '0-25' '0-25' '50-75' '0-25' '0-25' '25-50' '0-25'
 '0-25' '0-25' '0-25' '0-25' '50-75' '0-25' '0-25' '0-25' '0-25' '0-25'
 '50-75' '0-25' '0-25' '0-25' '50-75' '0-25' '0-25' '50-75' '0-25' '0-25'
 '50-75' '0-25' '0-25' '0-25' '0-25' '50-75' '0-25' '0-25' '0-25' '0-25'
 '0-25' '0-25' '0-25' '0-25' '0-25' '50-75' '0-25' '50-75' '0-25' '0-25'
 '0-25' '50-75' '50-75' '0-25' '50-75' '0-25' '0-25

In [60]:
# Estimated classes on test set
predicted_test = logreg_train.predict(X_test)
print(predicted_test)

['0-25' '0-25' '50-75' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25'
 '50-75' '0-25' '50-75' '0-25' '0-25' '0-25' '0-25' '0-25' '25-50' '50-75'
 '0-25' '0-25' '50-75' '25-50' '0-25' '0-25' '0-25' '0-25' '0-25' '50-75'
 '0-25' '0-25' '50-75' '50-75' '0-25' '0-25' '0-25' '25-50' '50-75' '25-50'
 '50-75' '0-25' '0-25' '0-25']


In [61]:
# accuracy of trained predictions
print(metrics.accuracy_score(y_train, predicted_train))

0.75


In [62]:
# accuracy of test predictions
print(metrics.accuracy_score(y_test, predicted_test))

0.590909090909


## Model evaluation on out-of-sample data - with rooms, without capacity

In [63]:
# Split the data into train and test sets
# Take a fifth of data samples as test data, rest as training data
X_train, X_test, y_train, y_test = train_test_split(x[['Intercept', 'NumReg', 'AvgNumWifiConn', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM', 'B002', 'B003', 'B004']], y, test_size=0.2)

# print("Training data:\n", pd.concat([X_train, y_train], axis=1))
# print("\nTest data:\n", pd.concat([X_test, y_test], axis=1))

In [65]:
# Train on the training sample and test on the test sample.
logreg_train = LogisticRegression().fit(X_train, y_train)
# Print the weights learned for each feature.
print(logreg_train.coef_)

[[  1.05305690e+00   5.35406249e-03  -8.80479823e-02  -7.06910307e-02
   -3.53315467e-01   8.43823195e-01   4.39817489e-01   1.93422709e-01
   -3.75399775e-01  -1.02209286e-01   4.65557163e-01   1.08891405e-02
    6.24794563e-01  -1.96490025e-01  -1.18740563e-01   7.44655678e-01
    8.98580895e-01   8.72772519e-01  -7.18296518e-01]
 [ -1.33436058e+00  -8.96604450e-04   1.03456399e-02  -1.17219861e-01
   -4.41520841e-02  -1.10090423e+00  -8.37581946e-03  -6.37085868e-02
   -5.07314653e-01   7.74326385e-02  -3.26645464e-01  -8.11946814e-01
    4.54504142e-01   3.67800740e-01   7.19180506e-02  -6.60109220e-01
   -1.15061676e+00  -1.22922723e+00   1.04548341e+00]
 [ -1.65853163e+00  -2.29048577e-03   7.60452168e-03  -2.61717460e-01
    4.36346045e-02  -4.47163387e-01  -5.57323694e-01  -4.35961690e-01
    4.30981149e-01  -3.18316620e-01  -2.94516514e-01  -2.91348993e-01
   -2.97243323e-01  -3.19028905e-01  -3.15207567e-01  -2.53850853e-01
   -6.69381301e-01  -7.36045314e-01  -2.53105012e-01

In [67]:
# Estimated class probabilities on test set
print(logreg_train.predict_proba(X_test))

[[  9.43220678e-02   2.84141911e-01   1.92889230e-02   5.73900430e-03
    3.86831119e-02   5.57824982e-01]
 [  2.99453264e-01   2.23471337e-01   2.99510224e-02   8.23687578e-02
    2.65640493e-02   3.38191569e-01]
 [  1.18101494e-02   4.20584847e-01   1.81753301e-02   7.91110982e-03
    1.32018648e-02   5.28316699e-01]
 [  3.37574445e-01   8.87145407e-02   1.43521048e-02   2.08791645e-03
    1.31551963e-02   5.44115797e-01]
 [  7.71336367e-01   1.15824915e-01   3.12187632e-02   2.33163730e-02
    2.00385722e-02   3.82650102e-02]
 [  7.29218888e-01   2.98232829e-02   7.24225917e-03   1.52866014e-02
    1.62175622e-01   5.62533462e-02]
 [  8.25269511e-01   3.38751348e-02   1.45342950e-02   1.28018003e-02
    9.54899144e-02   1.80293448e-02]
 [  1.87308866e-01   2.01695183e-01   2.10796809e-02   3.93583662e-03
    1.99380796e-02   5.66042354e-01]
 [  7.61930185e-01   1.15875267e-02   8.44909690e-03   4.90609311e-03
    1.58038533e-01   5.50885656e-02]
 [  1.92816539e-01   4.71630291e-02  

In [68]:
# Estimated classes on train set
predicted_train = logreg_train.predict(X_train)
print(predicted_train)

['50-75' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25'
 '50-75' '0-25' '0-25' '0-25' '0-25' '0-25' '50-75' '0-25' '0-25' '0-25'
 '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '50-75' '0-25'
 '0-25' '0-25' '50-75' '0-25' '0-25' '50-75' '0-25' '0-25' '50-75' '0-25'
 '0-25' '0-25' '50-75' '0-25' '0-25' '25-50' '0-25' '0-25' '0-25' '25-50'
 '0-25' '50-75' '50-75' '100-125' '50-75' '0-25' '0-25' '50-75' '0-25'
 '25-50' '0-25' '0-25' '0-25' '50-75' '0-25' '0-25' '50-75' '50-75' '0-25'
 '0-25' '0-25' '0-25' '50-75' '0-25' '0-25' '25-50' '50-75' '0-25' '50-75'
 '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '50-75' '0-25' '50-75'
 '0-25' '50-75' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25'
 '50-75' '50-75' '0-25' '0-25' '50-75' '0-25' '0-25' '0-25' '0-25' '0-25'
 '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '50-75' '50-75' '0-25'
 '0-25' '0-25' '0-25' '50-75' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25'
 '50-75' '0-25' '0-25' '0-25' '50-75' '0-25' '0-25

In [69]:
# Estimated classes on test set
predicted_test = logreg_train.predict(X_test)
print(predicted_test)

['50-75' '50-75' '50-75' '50-75' '0-25' '0-25' '0-25' '50-75' '0-25'
 '25-50' '0-25' '0-25' '0-25' '0-25' '50-75' '0-25' '0-25' '50-75'
 '100-125' '0-25' '0-25' '0-25' '0-25' '50-75' '0-25' '0-25' '0-25' '0-25'
 '0-25' '0-25' '50-75' '0-25' '25-50' '0-25' '0-25' '0-25' '0-25' '0-25'
 '0-25' '0-25' '100-125' '0-25' '0-25' '0-25']


In [70]:
# accuracy of trained predictions
print(metrics.accuracy_score(y_train, predicted_train))

0.755813953488


In [74]:
# accuracy of test predictions
print(metrics.accuracy_score(y_test, predicted_test))

0.636363636364


## Model evaluation on out-of-sample data - with rooms, without NumReg

In [75]:
# Split the data into train and test sets
# Take a fifth of data samples as test data, rest as training data
X_train, X_test, y_train, y_test = train_test_split(x[['Intercept', 'Capacity', 'AvgNumWifiConn', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM', 'B002', 'B003', 'B004']], y, test_size=0.2)

# print("Training data:\n", pd.concat([X_train, y_train], axis=1))
# print("\nTest data:\n", pd.concat([X_test, y_test], axis=1))

In [76]:
# Train on the training sample and test on the test sample.
logreg_train = LogisticRegression().fit(X_train, y_train)
# Print the weights learned for each feature.
print(logreg_train.coef_)

[[  9.34775640e-01   1.25157173e-03  -7.25362031e-02  -1.31556667e-02
   -5.24012723e-01   7.58734932e-01   6.33776709e-01   7.94323896e-02
    8.77601334e-02  -4.55509231e-02   3.63792346e-02  -3.66364052e-01
    9.82672328e-01   6.45092481e-02   1.97664360e-01  -2.22946888e-02
    1.03312315e+00   5.48795230e-01  -6.47142739e-01]
 [ -1.35889289e+00  -4.22515276e-03   2.11519208e-02  -5.05614849e-01
   -7.53465495e-01  -4.27526323e-01   1.66162294e-01   1.61551485e-01
   -1.58055324e-01   4.90070710e-01  -7.73283036e-01  -2.43201030e-01
    4.30344310e-01  -7.57488849e-01   2.90952322e-02  -3.76374902e-01
   -1.09066940e+00  -1.20896298e+00   9.40739499e-01]
 [ -8.78990826e-01  -4.30433544e-02   6.53491552e-02  -1.26278042e-01
    2.92714060e-01  -4.28841101e-01  -3.37737659e-01  -2.78848084e-01
    7.14603724e-01  -1.93810656e-01  -2.37918939e-01  -1.95754169e-01
   -3.96753337e-01  -2.34660518e-01  -2.38546723e-01  -9.61502077e-02
   -5.54499921e-01  -9.32691913e-01   6.08201008e-01

In [78]:
# Estimated class probabilities on test set
print(logreg_train.predict_proba(X_test))

[[  3.93218386e-01   9.08565072e-03   3.89290892e-02   4.24961040e-01
    1.33805834e-01]
 [  9.02658834e-01   1.04171092e-02   5.98363985e-03   6.89885989e-02
    1.19518185e-02]
 [  2.67016663e-02   6.31146333e-02   2.14504788e-03   5.08562039e-03
    9.02953032e-01]
 [  5.61158008e-01   6.72309607e-03   4.52690045e-03   3.12475547e-01
    1.15116448e-01]
 [  7.94023191e-01   9.80769835e-03   2.40004880e-03   7.17382017e-02
    1.22030861e-01]
 [  8.20157666e-01   4.86640954e-03   1.60619770e-03   5.99605441e-02
    1.13409183e-01]
 [  4.07244674e-02   3.96948057e-01   2.91388763e-03   9.29089322e-04
    5.58484499e-01]
 [  1.68800633e-01   2.24630301e-01   9.71031005e-04   2.52079177e-03
    6.03077243e-01]
 [  1.96426145e-02   1.81069918e-01   2.75661251e-03   4.56536774e-03
    7.91965488e-01]
 [  4.84929971e-01   1.03185326e-02   2.45960244e-02   4.34440059e-01
    4.57154128e-02]
 [  7.57129526e-01   2.32812292e-02   1.08385464e-02   1.79876043e-01
    2.88746553e-02]
 [  5.2204

In [79]:
# Estimated classes on train set
predicted_train = logreg_train.predict(X_train)
print(predicted_train)

['0-25' '0-25' '50-75' '50-75' '0-25' '50-75' '0-25' '0-25' '0-25' '0-25'
 '0-25' '0-25' '0-25' '50-75' '0-25' '50-75' '0-25' '0-25' '0-25' '50-75'
 '0-25' '0-25' '0-25' '50-75' '25-50' '0-25' '0-25' '0-25' '25-50' '0-25'
 '0-25' '0-25' '0-25' '50-75' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25'
 '0-25' '0-25' '50-75' '0-25' '0-25' '50-75' '0-25' '0-25' '0-25' '0-25'
 '0-25' '0-25' '50-75' '0-25' '50-75' '0-25' '0-25' '0-25' '50-75' '0-25'
 '0-25' '50-75' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '50-75' '0-25'
 '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '50-75' '25-50' '0-25'
 '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '50-75' '0-25' '0-25'
 '0-25' '0-25' '50-75' '0-25' '0-25' '0-25' '0-25' '50-75' '50-75' '0-25'
 '25-50' '0-25' '0-25' '0-25' '0-25' '50-75' '0-25' '0-25' '0-25' '25-50'
 '0-25' '50-75' '0-25' '0-25' '50-75' '0-25' '0-25' '0-25' '50-75' '0-25'
 '0-25' '0-25' '25-50' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '50-75'
 '0-25' '0-25' '0-25' '25-50' '0-25' '0-25' '0

In [80]:
# Estimated classes on test set
predicted_test = logreg_train.predict(X_test)
print(predicted_test)

['25-50' '0-25' '50-75' '0-25' '0-25' '0-25' '50-75' '50-75' '50-75' '0-25'
 '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '50-75' '50-75' '25-50' '25-50'
 '150-175' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '50-75' '0-25'
 '0-25' '50-75' '50-75' '25-50' '0-25' '50-75' '0-25' '50-75' '0-25' '0-25'
 '0-25' '0-25' '50-75' '0-25']


In [81]:
# accuracy of trained predictions
print(metrics.accuracy_score(y_train, predicted_train))

0.755813953488


In [82]:
# accuracy of test predictions
print(metrics.accuracy_score(y_test, predicted_test))

0.590909090909


### Model evaluation on out-of-sample data - with rooms, without NumReg, without Capacity

In [84]:
# Split the data into train and test sets
# Take a fifth of data samples as test data, rest as training data
X_train, X_test, y_train, y_test = train_test_split(x[['Intercept', 'Capacity', 'AvgNumWifiConn', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM', 'B002', 'B003', 'B004']], y, test_size=0.2)

# print("Training data:\n", pd.concat([X_train, y_train], axis=1))
# print("\nTest data:\n", pd.concat([X_test, y_test], axis=1))

In [85]:
# Train on the training sample and test on the test sample.
logreg_train = LogisticRegression().fit(X_train, y_train)
# Print the weights learned for each feature.
print(logreg_train.coef_)

[[  9.35242068e-01  -6.39221701e-04  -7.11830373e-02  -1.03722499e-01
   -4.39490128e-01   5.69689866e-01   4.80307613e-01   4.28457216e-01
   -2.46087514e-01  -1.65342678e-02  -8.96068258e-03   6.29981580e-02
    8.28403362e-01  -2.81391929e-02   1.84135127e-01   1.59427078e-01
    1.10801703e+00   4.74705236e-01  -6.47480195e-01]
 [ -1.40062420e+00   6.78548763e-04   1.55861419e-02  -3.48601168e-01
   -2.13301803e-01  -6.81469135e-01  -8.75605424e-02  -6.96915486e-02
   -8.17212967e-01   1.85526194e-01  -4.17452095e-01  -2.21313941e-01
    2.56728829e-01  -3.73518309e-02   1.30674172e-01  -4.80222557e-01
   -1.15914343e+00  -1.21114889e+00   9.69668125e-01]
 [ -8.13253307e-01  -2.95907899e-02   2.79719019e-02  -1.54815485e-01
    4.14349826e-01  -3.50514881e-01  -4.83674922e-01  -2.38597845e-01
    5.65476424e-01  -1.61702135e-01  -2.19411645e-01  -2.69671608e-01
   -2.08004876e-01  -2.31916028e-01  -1.73189671e-01  -1.14833769e-01
   -6.25069565e-01  -7.50977641e-01   5.62793899e-01

In [86]:
# Estimated class probabilities on test set
print(logreg_train.predict_proba(X_test))

[[  1.52764336e-01   3.57702233e-01   2.44218247e-03   1.42280805e-33
    1.30146292e-03   4.85789786e-01]
 [  3.94877085e-01   4.78984479e-02   1.90367342e-02   2.19966477e-09
    4.91954024e-01   4.62337062e-02]
 [  3.55334583e-01   7.41346594e-02   3.07810149e-02   5.21745533e-06
    5.01276073e-01   3.84684525e-02]
 [  8.21241344e-01   6.67228895e-03   1.80415413e-02   7.29822561e-19
    1.26112171e-01   2.79326550e-02]
 [  3.84135388e-01   4.70438808e-02   1.73131997e-02   5.84836874e-10
    5.03438673e-01   4.80688576e-02]
 [  8.50762807e-01   4.10668330e-03   7.86613219e-03   6.13938046e-19
    6.36800201e-02   7.35843574e-02]
 [  8.31667777e-01   7.91240262e-03   7.69238702e-03   5.74028067e-18
    1.02902095e-01   4.98253384e-02]
 [  7.33956268e-01   9.36882252e-03   6.52414727e-03   1.15687111e-16
    1.46923501e-01   1.03227262e-01]
 [  7.83027348e-01   7.53790424e-03   1.57371696e-02   4.27226912e-16
    1.54030836e-01   3.96667427e-02]
 [  7.22491896e-02   1.62687055e-01  

In [87]:
# Estimated classes on train set
predicted_train = logreg_train.predict(X_train)
print(predicted_train)

['0-25' '50-75' '0-25' '0-25' '0-25' '50-75' '25-50' '50-75' '0-25' '0-25'
 '100-125' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '50-75'
 '0-25' '0-25' '0-25' '25-50' '50-75' '50-75' '50-75' '0-25' '0-25' '0-25'
 '0-25' '0-25' '0-25' '0-25' '25-50' '0-25' '0-25' '0-25' '0-25' '50-75'
 '50-75' '0-25' '0-25' '50-75' '0-25' '0-25' '50-75' '50-75' '25-50'
 '50-75' '0-25' '0-25' '0-25' '0-25' '50-75' '100-125' '0-25' '0-25'
 '50-75' '50-75' '0-25' '50-75' '0-25' '0-25' '0-25' '0-25' '50-75' '0-25'
 '0-25' '0-25' '25-50' '100-125' '0-25' '50-75' '0-25' '0-25' '0-25'
 '100-125' '0-25' '50-75' '0-25' '200-225' '0-25' '0-25' '50-75' '0-25'
 '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '25-50' '0-25' '0-25'
 '0-25' '0-25' '50-75' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25'
 '0-25' '0-25' '100-125' '0-25' '0-25' '50-75' '0-25' '25-50' '0-25'
 '25-50' '0-25' '0-25' '0-25' '0-25' '0-25' '25-50' '0-25' '50-75' '0-25'
 '100-125' '0-25' '25-50' '0-25' '25-50' '0-25' '0-25' '0-25'

In [88]:
# Estimated classes on test set
predicted_test = logreg_train.predict(X_test)
print(predicted_test)

['50-75' '25-50' '25-50' '0-25' '25-50' '0-25' '0-25' '0-25' '0-25' '50-75'
 '0-25' '0-25' '0-25' '0-25' '50-75' '0-25' '50-75' '0-25' '25-50' '0-25'
 '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25'
 '50-75' '50-75' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '25-50'
 '0-25' '50-75' '50-75' '0-25']


In [90]:
# accuracy of trained predictions
print(metrics.accuracy_score(y_train, predicted_train))

0.732558139535


In [91]:
# accuracy of test predictions
print(metrics.accuracy_score(y_test, predicted_test))

0.795454545455
