In [1]:
import MySQLdb as db
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas.tools.plotting import scatter_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.cross_validation import cross_val_score
%matplotlib inline

In [2]:
df = pd.read_csv('ABT-Avgs.csv')

### For logistic regression, we need to make the target feature categorical. For this, we can bin the value ranges. 

In [3]:
# http://chrisalbon.com/python/pandas_create_column_using_conditional.html
# Add emtimate column
df['estimate'] = df['Capacity'] * df['PercentageEstimate']
# Bin results into categories for logistic regression. 
bins = [-1, 25, 50, 75, 100, 125, 150, 175, 200, 225, 250]
groups = [ '0-25', '25-50', '50-75', '75-100', '100-125', '125-150', '150-175', '175-200', '200-225', '225-250']
df['occupantEstimate'] = pd.cut(df['estimate'], bins, labels = groups )

In [4]:
df.dtypes

DateTime                object
Room                    object
Capacity                 int64
Module                  object
NumReg                   int64
AvgNumWifiConn         float64
PercentageEstimate     float64
BinaryEstimate           int64
estimate               float64
occupantEstimate      category
dtype: object

In [5]:
df['Room'] = df['Room'].astype('category')
df['Module'] = df['Module'].astype('category')
df['DateTime'] = df['DateTime'].astype('datetime64[ns]')
df.dtypes

DateTime              datetime64[ns]
Room                        category
Capacity                       int64
Module                      category
NumReg                         int64
AvgNumWifiConn               float64
PercentageEstimate           float64
BinaryEstimate                 int64
estimate                     float64
occupantEstimate            category
dtype: object

In [6]:
df.shape

(216, 10)

### For logistic regression, we need to make the training features to be continuous. To do this we need to create dummy values for catergorical features. 


In [7]:
days = ['mon', 'tues', 'wed', 'thurs', 'fri', 'sat', 'sun']
# http://stackoverflow.com/questions/13740672/in-pandas-how-can-i-groupby-weekday-for-a-datetime-column
# http://chrisalbon.com/python/pandas_apply_operations_to_dataframes.html
df['weekday'] = df['DateTime'].apply(lambda dt: dt.weekday())
df.head()


Unnamed: 0,DateTime,Room,Capacity,Module,NumReg,AvgNumWifiConn,PercentageEstimate,BinaryEstimate,estimate,occupantEstimate,weekday
0,2015-11-03 16:00:00,B-002,90,COMP40370P1,27,39.0,0.25,1,22.5,0-25,1
1,2015-11-04 16:00:00,B-002,90,COMP30250P1,22,20.0,0.25,1,22.5,0-25,2
2,2015-11-05 16:00:00,B-002,90,COMP30520P1,60,32.6667,0.0,0,0.0,0-25,3
3,2015-11-06 16:00:00,B-002,90,,0,20.3333,0.5,1,45.0,25-50,4
4,2015-11-09 16:00:00,B-002,90,COMP40660P1,53,72.5,0.25,1,22.5,0-25,0


In [8]:
for i in range(len(days)):
    df[days[i]] =(df.weekday==i)*1
df.drop('weekday', axis=1, inplace=True)
df.drop('sat', axis=1, inplace=True)
df.drop('sun', axis=1, inplace=True)


df.head(10)

Unnamed: 0,DateTime,Room,Capacity,Module,NumReg,AvgNumWifiConn,PercentageEstimate,BinaryEstimate,estimate,occupantEstimate,mon,tues,wed,thurs,fri
0,2015-11-03 16:00:00,B-002,90,COMP40370P1,27,39.0,0.25,1,22.5,0-25,0,1,0,0,0
1,2015-11-04 16:00:00,B-002,90,COMP30250P1,22,20.0,0.25,1,22.5,0-25,0,0,1,0,0
2,2015-11-05 16:00:00,B-002,90,COMP30520P1,60,32.6667,0.0,0,0.0,0-25,0,0,0,1,0
3,2015-11-06 16:00:00,B-002,90,,0,20.3333,0.5,1,45.0,25-50,0,0,0,0,1
4,2015-11-09 16:00:00,B-002,90,COMP40660P1,53,72.5,0.25,1,22.5,0-25,1,0,0,0,0
5,2015-11-10 16:00:00,B-002,90,COMP40370P1,27,35.1667,0.25,1,22.5,0-25,0,1,0,0,0
6,2015-11-11 16:00:00,B-002,90,COMP30250P1,22,23.5,0.25,1,22.5,0-25,0,0,1,0,0
7,2015-11-12 16:00:00,B-002,90,COMP30520P1,60,49.8333,0.25,1,22.5,0-25,0,0,0,1,0
8,2015-11-13 16:00:00,B-002,90,,0,8.6667,0.25,1,22.5,0-25,0,0,0,0,1
9,2015-11-03 09:00:00,B-002,90,,0,2.0,0.0,0,0.0,0-25,0,1,0,0,0


In [9]:
times = ['9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM']
df['time'] = df['DateTime'].apply(lambda dt: dt.hour)

for i in range(0,len(times)):
    df[times[i]] =(df.time==i+9)*1
df.drop('time', axis=1, inplace=True)
df.head(10)

Unnamed: 0,DateTime,Room,Capacity,Module,NumReg,AvgNumWifiConn,PercentageEstimate,BinaryEstimate,estimate,occupantEstimate,...,thurs,fri,9AM,10AM,11AM,12PM,1PM,2PM,3PM,4PM
0,2015-11-03 16:00:00,B-002,90,COMP40370P1,27,39.0,0.25,1,22.5,0-25,...,0,0,0,0,0,0,0,0,0,1
1,2015-11-04 16:00:00,B-002,90,COMP30250P1,22,20.0,0.25,1,22.5,0-25,...,0,0,0,0,0,0,0,0,0,1
2,2015-11-05 16:00:00,B-002,90,COMP30520P1,60,32.6667,0.0,0,0.0,0-25,...,1,0,0,0,0,0,0,0,0,1
3,2015-11-06 16:00:00,B-002,90,,0,20.3333,0.5,1,45.0,25-50,...,0,1,0,0,0,0,0,0,0,1
4,2015-11-09 16:00:00,B-002,90,COMP40660P1,53,72.5,0.25,1,22.5,0-25,...,0,0,0,0,0,0,0,0,0,1
5,2015-11-10 16:00:00,B-002,90,COMP40370P1,27,35.1667,0.25,1,22.5,0-25,...,0,0,0,0,0,0,0,0,0,1
6,2015-11-11 16:00:00,B-002,90,COMP30250P1,22,23.5,0.25,1,22.5,0-25,...,0,0,0,0,0,0,0,0,0,1
7,2015-11-12 16:00:00,B-002,90,COMP30520P1,60,49.8333,0.25,1,22.5,0-25,...,1,0,0,0,0,0,0,0,0,1
8,2015-11-13 16:00:00,B-002,90,,0,8.6667,0.25,1,22.5,0-25,...,0,1,0,0,0,0,0,0,0,1
9,2015-11-03 09:00:00,B-002,90,,0,2.0,0.0,0,0.0,0-25,...,0,0,1,0,0,0,0,0,0,0


In [10]:
for i in range(2, 5):
    df['B00'+str(i)] =(df.Room=='B-00'+str(i))*1
df.head(10)

Unnamed: 0,DateTime,Room,Capacity,Module,NumReg,AvgNumWifiConn,PercentageEstimate,BinaryEstimate,estimate,occupantEstimate,...,10AM,11AM,12PM,1PM,2PM,3PM,4PM,B002,B003,B004
0,2015-11-03 16:00:00,B-002,90,COMP40370P1,27,39.0,0.25,1,22.5,0-25,...,0,0,0,0,0,0,1,1,0,0
1,2015-11-04 16:00:00,B-002,90,COMP30250P1,22,20.0,0.25,1,22.5,0-25,...,0,0,0,0,0,0,1,1,0,0
2,2015-11-05 16:00:00,B-002,90,COMP30520P1,60,32.6667,0.0,0,0.0,0-25,...,0,0,0,0,0,0,1,1,0,0
3,2015-11-06 16:00:00,B-002,90,,0,20.3333,0.5,1,45.0,25-50,...,0,0,0,0,0,0,1,1,0,0
4,2015-11-09 16:00:00,B-002,90,COMP40660P1,53,72.5,0.25,1,22.5,0-25,...,0,0,0,0,0,0,1,1,0,0
5,2015-11-10 16:00:00,B-002,90,COMP40370P1,27,35.1667,0.25,1,22.5,0-25,...,0,0,0,0,0,0,1,1,0,0
6,2015-11-11 16:00:00,B-002,90,COMP30250P1,22,23.5,0.25,1,22.5,0-25,...,0,0,0,0,0,0,1,1,0,0
7,2015-11-12 16:00:00,B-002,90,COMP30520P1,60,49.8333,0.25,1,22.5,0-25,...,0,0,0,0,0,0,1,1,0,0
8,2015-11-13 16:00:00,B-002,90,,0,8.6667,0.25,1,22.5,0-25,...,0,0,0,0,0,0,1,1,0,0
9,2015-11-03 09:00:00,B-002,90,,0,2.0,0.0,0,0.0,0-25,...,0,0,0,0,0,0,0,1,0,0


In [11]:
df.corr()

Unnamed: 0,Capacity,NumReg,AvgNumWifiConn,PercentageEstimate,BinaryEstimate,estimate,mon,tues,wed,thurs,...,10AM,11AM,12PM,1PM,2PM,3PM,4PM,B002,B003,B004
Capacity,1.0,0.468271,0.355578,-0.045083,-0.056617,0.39296,2.6474550000000002e-17,3.7443450000000005e-17,2.194961e-17,2.2595180000000002e-17,...,-9.332719e-17,-1.5419270000000002e-17,6.248864e-17,-2.434622e-18,-2.8403930000000004e-17,3.6519330000000003e-17,3.002701e-17,-0.5,-0.5,1.0
NumReg,0.4682708,1.0,0.736217,0.344868,0.290085,0.562036,-0.04625192,0.02518073,-0.04275289,0.09583169,...,-0.009564565,0.0570458,0.1000863,0.009564565,-0.024253,-0.01161411,-0.1762613,-0.1253355,-0.3429352,0.4682708
AvgNumWifiConn,0.3555784,0.736217,1.0,0.598574,0.454588,0.765715,-0.008559815,0.127076,-0.1088693,0.08762523,...,-0.01575205,0.02496158,0.1411035,-0.005947925,0.1159951,-0.05334014,-0.1693601,-0.2275643,-0.1280142,0.3555784
PercentageEstimate,-0.04508348,0.344868,0.598574,1.0,0.769572,0.816815,0.05071892,0.1150197,-0.1022397,-0.02555993,...,0.0,0.06426169,0.1445888,-0.04819627,0.1285234,0.0,-0.1767197,-0.07889609,0.1239796,-0.04508348
BinaryEstimate,-0.05661669,0.290085,0.454588,0.769572,1.0,0.634861,0.2052355,0.06954706,-0.1711928,0.0213991,...,-0.0491772,0.1324002,0.1929259,-0.07944009,0.1324002,0.04161148,-0.1702288,0.0495396,0.007077086,-0.05661669
estimate,0.39296,0.562036,0.765715,0.816815,0.634861,1.0,0.04876492,0.1268725,-0.1533565,0.04715219,...,0.006579753,0.04302146,0.1017331,-0.007592023,0.09262268,-0.01973926,-0.198911,-0.2540037,-0.1389563,0.39296
mon,2.6474550000000002e-17,-0.046252,-0.00856,0.050719,0.205235,0.048765,1.0,-0.1889822,-0.1889822,-0.1889822,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.301043e-17,1.301043e-17,1.301043e-17
tues,3.7443450000000005e-17,0.025181,0.127076,0.11502,0.069547,0.126872,-0.1889822,1.0,-0.2857143,-0.2857143,...,1.5420550000000002e-17,1.5887840000000002e-17,1.635513e-17,1.3084100000000001e-17,1.355139e-17,1.401868e-17,1.448597e-17,1.3113280000000001e-18,1.3113280000000001e-18,1.3113280000000001e-18
wed,2.194961e-17,-0.042753,-0.108869,-0.10224,-0.171193,-0.153356,-0.1889822,-0.2857143,1.0,-0.2857143,...,1.5420550000000002e-17,1.2149520000000001e-17,1.2616810000000002e-17,1.3084100000000001e-17,1.355139e-17,1.401868e-17,1.448597e-17,1.3113280000000001e-18,1.3113280000000001e-18,1.3113280000000001e-18
thurs,2.2595180000000002e-17,0.095832,0.087625,-0.02556,0.021399,0.047152,-0.1889822,-0.2857143,-0.2857143,1.0,...,1.5420550000000002e-17,1.2149520000000001e-17,1.2616810000000002e-17,1.3084100000000001e-17,1.355139e-17,1.401868e-17,1.448597e-17,1.966992e-18,1.966992e-18,1.966992e-18


In [12]:
df.shape

(216, 26)

In [13]:
intercept = pd.DataFrame({'Intercept':np.ones(len(df))})
intercept

Unnamed: 0,Intercept
0,1.0
1,1.0
2,1.0
3,1.0
4,1.0
5,1.0
6,1.0
7,1.0
8,1.0
9,1.0


In [14]:
x = pd.concat([intercept, df[['Capacity', 'NumReg', 'AvgNumWifiConn', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM', 'B002', 'B003', 'B004']]], axis = 1)
y = df['occupantEstimate']
x

Unnamed: 0,Intercept,Capacity,NumReg,AvgNumWifiConn,mon,tues,wed,thurs,fri,9AM,10AM,11AM,12PM,1PM,2PM,3PM,4PM,B002,B003,B004
0,1.0,90,27,39.0000,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0
1,1.0,90,22,20.0000,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0
2,1.0,90,60,32.6667,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0
3,1.0,90,0,20.3333,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0
4,1.0,90,53,72.5000,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0
5,1.0,90,27,35.1667,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0
6,1.0,90,22,23.5000,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0
7,1.0,90,60,49.8333,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0
8,1.0,90,0,8.6667,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0
9,1.0,90,0,2.0000,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0


In [15]:
log = LogisticRegression().fit(x[['Intercept','Capacity', 'NumReg', 'AvgNumWifiConn', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM']], y)

In [16]:
predictions = log.predict(x[['Intercept','Capacity', 'NumReg', 'AvgNumWifiConn', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM']])
log.score(x[['Intercept','Capacity', 'NumReg', 'AvgNumWifiConn', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM']], y)

0.75462962962962965

In [17]:
# include rooms
log = LogisticRegression().fit(x[['Intercept','Capacity', 'NumReg', 'AvgNumWifiConn', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM', 'B002', 'B003', 'B004']], y)

In [18]:
predictions = log.predict(x[['Intercept','Capacity', 'NumReg', 'AvgNumWifiConn', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM', 'B002', 'B003', 'B004']])
log.score(x[['Intercept','Capacity', 'NumReg', 'AvgNumWifiConn', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM', 'B002', 'B003', 'B004']], y)


0.75

In [19]:
df.columns

Index(['DateTime', 'Room', 'Capacity', 'Module', 'NumReg', 'AvgNumWifiConn',
       'PercentageEstimate', 'BinaryEstimate', 'estimate', 'occupantEstimate',
       'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM',
       '1PM', '2PM', '3PM', '4PM', 'B002', 'B003', 'B004'],
      dtype='object')

## Model evaluation on out-of-sample data - with rooms

In [20]:
# Split the data into train and test sets
# Take a fifth of data samples as test data, rest as training data
X_train, X_test, y_train, y_test = train_test_split(x[['Intercept','Capacity', 'NumReg', 'AvgNumWifiConn', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM', 'B002', 'B003', 'B004']], y, test_size=0.2)

# print("Training data:\n", pd.concat([X_train, y_train], axis=1))
# print("\nTest data:\n", pd.concat([X_test, y_test], axis=1))

In [21]:
# Train on the training sample and test on the test sample.
logreg_train = LogisticRegression().fit(X_train, y_train)
# Print the weights learned for each feature.
print(logreg_train.coef_)

[[  1.06599714e+00  -2.97753706e-04  -2.91095044e-03  -7.01463123e-02
    1.94010722e-01  -3.47611518e-01   4.83205308e-01   6.33509578e-01
    1.02883046e-01   4.97965818e-01  -2.37635111e-01  -3.18332840e-01
   -4.56632183e-01   7.84678626e-01  -1.92646982e-01   4.31168704e-01
    5.57431103e-01   1.09160812e+00   7.12389326e-01  -7.38000308e-01]
 [ -1.39301511e+00  -1.64253730e-03   7.25635985e-03   1.31358796e-02
   -2.00339202e-01  -1.06787352e-01  -9.84253797e-01   6.07296314e-03
   -1.07707719e-01  -9.25971694e-01   6.98256054e-01  -3.22383926e-01
   -6.37783869e-01   6.69411424e-01   5.40844331e-02  -3.47731739e-01
   -5.80895791e-01  -1.19026604e+00  -1.16713150e+00   9.64382439e-01]
 [ -7.54944046e-01  -6.51905141e-02   7.95383316e-02   7.49670106e-03
   -1.01603441e-01   4.76165538e-01  -5.34275231e-01  -3.62978215e-01
   -2.32252697e-01   5.74227883e-01  -7.51222559e-02  -1.49604489e-01
   -3.24103329e-01  -1.96919305e-01  -1.88872752e-01  -3.06291598e-01
   -8.82581997e-02

In [22]:
# Estimated class probabilities on test set
print(logreg_train.predict_proba(X_test))

[[  8.36438088e-01   1.07618921e-02   1.90119793e-02   8.43235594e-12
    6.23240415e-02   7.14639992e-02]
 [  2.80955785e-01   4.70269826e-01   2.11271046e-05   2.07111383e-23
    6.43405010e-04   2.48109857e-01]
 [  2.27492776e-01   3.66211791e-01   1.86085701e-03   7.08989035e-27
    1.31008979e-03   4.03124486e-01]
 [  6.83690146e-01   1.31102912e-01   1.36731937e-07   1.04211083e-22
    7.41848347e-05   1.85132620e-01]
 [  5.31406576e-02   1.49667640e-01   5.23128521e-04   6.36580355e-25
    1.11203857e-03   7.95556536e-01]
 [  5.35426073e-01   2.42238456e-02   5.76220430e-04   3.44903097e-04
    2.71968213e-01   1.67460745e-01]
 [  1.59454113e-01   1.08480759e-01   1.62921785e-01   1.08099280e-02
    3.19121200e-01   2.39212214e-01]
 [  7.97610419e-01   4.37695093e-02   1.96919857e-02   7.53523955e-14
    6.07225807e-02   7.82055057e-02]
 [  9.21039148e-01   7.39201027e-03   9.23244812e-03   1.02585603e-14
    4.52913533e-02   1.70450408e-02]
 [  7.16801177e-01   1.06575092e-02  

In [23]:
# Estimated classes on train set
predicted_train = logreg_train.predict(X_train)
print(predicted_train)

['0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '50-75' '0-25' '0-25'
 '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '50-75' '0-25'
 '0-25' '0-25' '0-25' '50-75' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25'
 '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '100-125'
 '0-25' '0-25' '0-25' '0-25' '0-25' '50-75' '0-25' '0-25' '0-25' '50-75'
 '0-25' '0-25' '0-25' '50-75' '50-75' '0-25' '25-50' '0-25' '100-125'
 '0-25' '0-25' '0-25' '25-50' '50-75' '50-75' '0-25' '100-125' '50-75'
 '0-25' '0-25' '50-75' '0-25' '100-125' '0-25' '0-25' '0-25' '50-75' '0-25'
 '100-125' '50-75' '50-75' '0-25' '0-25' '0-25' '0-25' '100-125' '0-25'
 '0-25' '0-25' '0-25' '50-75' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25'
 '100-125' '0-25' '100-125' '0-25' '25-50' '0-25' '50-75' '0-25' '50-75'
 '50-75' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '50-75'
 '0-25' '0-25' '100-125' '50-75' '0-25' '25-50' '0-25' '0-25' '0-25' '0-25'
 '0-25' '0-25' '0-25' '0-25' '50-75' '25-50' '0-25' '0

In [24]:
# Estimated classes on train set
predicted_test = logreg_train.predict(X_test)
print(predicted_test)

['0-25' '100-125' '50-75' '0-25' '50-75' '0-25' '25-50' '0-25' '0-25'
 '0-25' '0-25' '0-25' '0-25' '50-75' '0-25' '0-25' '100-125' '0-25' '50-75'
 '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '50-75'
 '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '100-125' '50-75' '0-25'
 '50-75' '0-25' '0-25' '0-25' '50-75']


In [25]:
# accuracy of trained predictions
print(metrics.accuracy_score(y_train, predicted_train))

0.761627906977


In [26]:
# accuracy of trained predictions
print(metrics.accuracy_score(y_test, predicted_test))

0.590909090909


## Model evaluation on out-of-sample data - without rooms

In [27]:
# Split the data into train and test sets
# Take a fifth of data samples as test data, rest as training data
X_train, X_test, y_train, y_test = train_test_split(x[['Intercept','Capacity', 'NumReg', 'AvgNumWifiConn', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM']], y, test_size=0.3)

# print("Training data:\n", pd.concat([X_train, y_train], axis=1))
# print("\nTest data:\n", pd.concat([X_test, y_test], axis=1))

In [28]:
# Train on the training sample and test on the test sample.
logreg_train = LogisticRegression().fit(X_train, y_train)
# Print the weights learned for each feature.
print(logreg_train.coef_)

[[  1.83486430e+00  -1.13977952e-02   7.27659130e-03  -8.88075918e-02
    1.03317974e-01   1.26091900e-01   3.59282069e-01   8.72245628e-01
    3.73926725e-01   5.21428218e-01  -2.75296486e-01  -9.49478885e-02
    3.62660305e-01   5.58580786e-01   7.44780688e-02  -2.03561530e-01
    8.91522823e-01]
 [ -2.03469114e+00   9.75306984e-03   9.96146174e-03   3.02530707e-03
   -7.03179110e-01  -2.64868037e-02  -1.04587654e+00  -4.33130483e-01
    1.73981791e-01  -1.05480495e+00   3.69329096e-01  -4.96847155e-01
   -4.07223298e-01   1.67820472e-01   2.92542745e-01  -1.78632053e-01
   -7.26876004e-01]
 [ -9.38134666e-01  -6.82817919e-02   9.33096233e-02   3.16205875e-03
   -1.22033870e-01   4.18433429e-01  -6.72745323e-01  -3.39284869e-01
   -2.22504034e-01   6.34429814e-01  -1.54219454e-01  -2.18915300e-01
   -2.75938222e-01  -3.78522260e-01  -1.05424307e-01  -3.52544651e-01
   -8.70002855e-02]
 [ -3.81784089e-02  -2.42183484e-01  -3.00012264e-01   4.40374866e-01
   -1.39512306e-04  -4.6620143

In [29]:
# Estimated class probabilities on test set
print(logreg_train.predict_proba(X_test))

[[  4.79282113e-02   4.94663403e-01   3.30771209e-04   2.01617695e-25
    1.67423767e-03   4.55403377e-01]
 [  1.63509648e-02   2.24244243e-01   7.14163822e-02   6.11459483e-25
    1.81299243e-03   6.86175418e-01]
 [  3.96160801e-02   3.44271448e-01   1.63973869e-03   1.17041331e-26
    3.25935301e-03   6.11213380e-01]
 [  7.54879926e-01   2.49606686e-02   4.73814274e-03   3.60973588e-12
    1.12463071e-01   1.02958192e-01]
 [  1.67204645e-01   6.64164393e-02   7.30172428e-02   1.74474438e-05
    4.86756172e-01   2.06588053e-01]
 [  9.00811956e-01   1.29989667e-02   8.88897495e-04   6.84013997e-10
    5.26433488e-02   3.26568303e-02]
 [  7.39686509e-01   8.18187223e-02   1.07029025e-02   2.34303859e-12
    1.14177955e-01   5.36139110e-02]
 [  5.09809666e-03   1.27036885e-01   3.22221980e-07   2.23601046e-11
    1.15468319e-02   8.56317864e-01]
 [  1.55898857e-02   2.53573158e-01   1.61676861e-03   1.73577157e-24
    3.33526590e-03   7.25884922e-01]
 [  6.04513093e-01   4.83488750e-02  

In [30]:
# Estimated classes on train set
predicted_train = logreg_train.predict(X_train)
print(predicted_train)

['0-25' '100-125' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '50-75'
 '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '50-75' '0-25' '0-25' '0-25'
 '0-25' '25-50' '50-75' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25'
 '50-75' '0-25' '0-25' '0-25' '0-25' '25-50' '0-25' '0-25' '25-50' '0-25'
 '25-50' '0-25' '0-25' '0-25' '25-50' '0-25' '0-25' '0-25' '0-25' '50-75'
 '0-25' '0-25' '0-25' '50-75' '25-50' '0-25' '50-75' '50-75' '50-75'
 '50-75' '0-25' '25-50' '50-75' '50-75' '0-25' '0-25' '0-25' '50-75'
 '50-75' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '50-75'
 '0-25' '50-75' '50-75' '0-25' '0-25' '0-25' '50-75' '0-25' '0-25' '0-25'
 '0-25' '0-25' '0-25' '0-25' '50-75' '50-75' '50-75' '0-25' '0-25' '0-25'
 '0-25' '0-25' '0-25' '0-25' '200-225' '0-25' '0-25' '0-25' '0-25' '0-25'
 '50-75' '0-25' '50-75' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25'
 '0-25' '0-25' '0-25' '50-75' '0-25' '0-25' '25-50' '0-25' '0-25' '0-25'
 '0-25' '0-25' '0-25' '50-75' '0-25' '50-75' '50-75' 

In [31]:
# Estimated classes on test set
predicted_test = logreg_train.predict(X_test)
print(predicted_test)

['100-125' '50-75' '50-75' '0-25' '25-50' '0-25' '0-25' '50-75' '50-75'
 '0-25' '0-25' '0-25' '0-25' '0-25' '50-75' '50-75' '25-50' '0-25' '0-25'
 '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25'
 '0-25' '25-50' '0-25' '0-25' '0-25' '50-75' '0-25' '0-25' '0-25' '50-75'
 '0-25' '0-25' '0-25' '0-25' '0-25' '50-75' '0-25' '0-25' '0-25' '0-25'
 '0-25' '0-25' '50-75' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25'
 '0-25' '0-25' '50-75' '0-25' '25-50' '0-25']


In [32]:
# accuracy of trained predictions
print(metrics.accuracy_score(y_train, predicted_train))

0.761589403974


In [33]:
# accuracy of test predictions
print(metrics.accuracy_score(y_test, predicted_test))

0.661538461538


## Model evaluation on out-of-sample data - with rooms, without capacity

In [34]:
# Split the data into train and test sets
# Take a fifth of data samples as test data, rest as training data
X_train, X_test, y_train, y_test = train_test_split(x[['Intercept', 'NumReg', 'AvgNumWifiConn', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM', 'B002', 'B003', 'B004']], y, test_size=0.3)

# print("Training data:\n", pd.concat([X_train, y_train], axis=1))
# print("\nTest data:\n", pd.concat([X_test, y_test], axis=1))

In [35]:
# Train on the training sample and test on the test sample.
logreg_train = LogisticRegression().fit(X_train, y_train)
# Print the weights learned for each feature.
print(logreg_train.coef_)

[[ 0.95601432  0.00282982 -0.08177323 -0.26530969 -0.46454855  1.08817025
   0.64563463 -0.04793233 -0.01474651  0.26218484 -0.42504461 -0.28971085
   0.74155561 -0.13814864  0.12931829  0.69060618  1.30035657  0.79119695
  -1.1355392 ]
 [-1.37702793  0.00347751  0.01179643 -0.29000594  0.09011922 -0.93043776
  -0.10577048 -0.14093296 -1.14891037  0.33099803 -0.3288651  -0.24136157
   0.40544443  0.17827463 -0.00629761 -0.56631038 -1.13660479 -1.13930237
   0.89887924]
 [-1.64708644  0.00248698  0.00429414 -0.30439423  0.04706871 -0.41956926
  -0.54146576 -0.4287259   0.38347947 -0.23908062 -0.28402203 -0.27230869
  -0.31059243 -0.33453949 -0.34510962 -0.24491303 -0.69005014 -0.68137605
  -0.27566024]
 [-1.74210207 -0.0299262   0.03826022 -0.32144482 -0.54857894 -0.37015893
  -0.13017038 -0.371749    0.17799037 -0.19177847 -0.23021676 -0.2073399
  -0.2820147  -0.46988989 -0.33550533 -0.20334738 -0.53824151 -0.75653038
  -0.44733018]
 [-0.97413027 -0.00449085  0.00915715  0.12946093  0.

In [36]:
# Estimated class probabilities on test set
print(logreg_train.predict_proba(X_test))

[[  8.29714647e-01   1.16135953e-02   9.02473444e-03   1.27102197e-02
    5.12306537e-02   8.57061499e-02]
 [  7.29314897e-01   2.68600563e-02   1.29996545e-02   7.19646154e-03
    1.54367166e-01   6.92617645e-02]
 [  1.71676265e-02   4.03829497e-01   2.62620510e-02   8.69393044e-03
    2.15702831e-02   5.22476612e-01]
 [  6.81937977e-02   5.14216646e-02   2.91099451e-02   1.63865117e-02
    3.77561977e-01   4.57326104e-01]
 [  8.71413241e-02   1.14191447e-01   2.41477697e-02   4.50416676e-03
    1.26060102e-02   7.57409282e-01]
 [  6.85043002e-01   3.88178407e-02   1.20784685e-02   6.22665441e-03
    1.03169042e-01   1.54664992e-01]
 [  8.61067747e-01   1.03622486e-02   7.92017396e-03   1.33381169e-02
    7.35715519e-02   3.37401613e-02]
 [  1.57116754e-01   2.57897736e-02   1.51691619e-02   1.50817803e-02
    2.79340677e-01   5.07501853e-01]
 [  6.32747234e-02   2.63777495e-01   2.29963560e-02   4.83180910e-03
    2.87340103e-02   6.16385607e-01]
 [  1.02317389e-01   3.47974968e-01  

In [37]:
# Estimated classes on train set
predicted_train = logreg_train.predict(X_train)
print(predicted_train)

['50-75' '100-125' '25-50' '100-125' '50-75' '0-25' '0-25' '0-25' '100-125'
 '0-25' '0-25' '0-25' '50-75' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25'
 '0-25' '50-75' '0-25' '0-25' '0-25' '0-25' '50-75' '0-25' '0-25' '0-25'
 '0-25' '100-125' '0-25' '0-25' '0-25' '50-75' '0-25' '0-25' '200-225'
 '50-75' '0-25' '0-25' '100-125' '0-25' '50-75' '0-25' '0-25' '0-25'
 '100-125' '0-25' '50-75' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25'
 '0-25' '0-25' '50-75' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25'
 '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '50-75' '50-75' '0-25'
 '50-75' '0-25' '0-25' '50-75' '0-25' '0-25' '50-75' '0-25' '50-75' '0-25'
 '0-25' '50-75' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25'
 '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25'
 '0-25' '0-25' '0-25' '50-75' '0-25' '0-25' '0-25' '0-25' '25-50' '0-25'
 '0-25' '50-75' '0-25' '50-75' '50-75' '50-75' '0-25' '0-25' '0-25' '0-25'
 '0-25' '0-25' '50-75' '0-25' '50-75' '50-75' '0-25' 

In [38]:
# Estimated classes on test set
predicted_test = logreg_train.predict(X_test)
print(predicted_test)

['0-25' '0-25' '50-75' '50-75' '50-75' '0-25' '0-25' '50-75' '50-75'
 '50-75' '50-75' '0-25' '0-25' '0-25' '0-25' '0-25' '50-75' '0-25' '0-25'
 '0-25' '0-25' '0-25' '0-25' '50-75' '50-75' '0-25' '50-75' '0-25' '50-75'
 '50-75' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25'
 '100-125' '50-75' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '50-75' '0-25'
 '0-25' '0-25' '0-25' '0-25' '0-25' '50-75' '25-50' '0-25' '0-25' '0-25'
 '50-75' '0-25' '0-25' '50-75' '0-25' '50-75']


In [39]:
# accuracy of trained predictions
print(metrics.accuracy_score(y_train, predicted_train))

0.781456953642


In [40]:
# accuracy of test predictions
print(metrics.accuracy_score(y_test, predicted_test))

0.661538461538


## Model evaluation on out-of-sample data - with rooms, without NumReg

In [41]:
# Split the data into train and test sets
# Take a fifth of data samples as test data, rest as training data
X_train, X_test, y_train, y_test = train_test_split(x[['Intercept', 'Capacity', 'AvgNumWifiConn', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM', 'B002', 'B003', 'B004']], y, test_size=0.3)

# print("Training data:\n", pd.concat([X_train, y_train], axis=1))
# print("\nTest data:\n", pd.concat([X_test, y_test], axis=1))

In [42]:
# Train on the training sample and test on the test sample.
logreg_train = LogisticRegression().fit(X_train, y_train)
# Print the weights learned for each feature.
print(logreg_train.coef_)

[[  8.38766999e-01   1.02206731e-03  -5.98212865e-02   5.85457643e-02
   -5.15704241e-01   3.27430167e-01   9.22906975e-01   4.55883337e-02
   -1.35293817e-01   3.06892769e-01  -6.69433272e-01  -3.17894447e-01
    8.41610229e-01  -2.71214767e-01   7.84608727e-01   2.99491577e-01
    1.23986671e+00   1.79577274e-01  -5.80676984e-01]
 [ -1.42593225e+00  -2.04290950e-03   2.68174753e-02  -3.14377868e-01
   -6.83405144e-01  -6.28554774e-01  -9.78379858e-02   2.98243524e-01
   -4.56418181e-01  -4.48555367e-01  -9.39290694e-01  -2.65350847e-01
    7.24497391e-01   2.69786644e-01   3.06807033e-01  -6.17408228e-01
   -1.14726108e+00  -1.26583932e+00   9.87168150e-01]
 [ -8.86518568e-01  -3.89754831e-02   5.84151279e-02  -2.38084191e-01
    3.36257316e-01  -4.32160028e-01  -3.29952700e-01  -2.22578965e-01
    6.95673000e-01  -1.86585820e-01  -2.38187464e-01  -3.48773933e-01
   -2.44097091e-01  -1.72361765e-01  -1.83789655e-01  -2.08395839e-01
   -6.05980733e-01  -8.93981648e-01   6.13443813e-01

In [43]:
# Estimated class probabilities on test set
print(logreg_train.predict_proba(X_test))

[[  6.05016056e-01   1.35333428e-01   3.04933212e-04   8.48412516e-04
    2.58497171e-01]
 [  8.06839509e-01   8.63608460e-03   2.53437449e-03   1.48407019e-01
    3.35830130e-02]
 [  6.66683658e-01   8.95541735e-02   6.80647925e-05   1.39136052e-03
    2.42302744e-01]
 [  7.15777596e-01   4.23158357e-03   6.03196487e-03   2.02497234e-01
    7.14616214e-02]
 [  8.00161696e-01   1.25634620e-02   4.72226828e-03   1.27595744e-01
    5.49568297e-02]
 [  1.79037326e-03   3.35319538e-01   2.76708692e-02   2.09367496e-02
    6.14282470e-01]
 [  8.39437524e-01   1.52564717e-02   3.66829486e-03   1.16817012e-01
    2.48206977e-02]
 [  4.67585276e-01   3.55614249e-02   4.48408880e-02   3.99931977e-01
    5.20804339e-02]
 [  9.31351011e-01   3.40247301e-02   2.64421446e-03   2.64959733e-02
    5.48407152e-03]
 [  8.08869412e-01   6.58930515e-02   1.44614810e-02   9.87943776e-02
    1.19816784e-02]
 [  6.32298829e-01   2.53759751e-02   1.69730284e-02   2.50400075e-01
    7.49520931e-02]
 [  2.2169

In [44]:
# Estimated classes on train set
predicted_train = logreg_train.predict(X_train)
print(predicted_train)

['0-25' '100-125' '0-25' '0-25' '0-25' '100-125' '0-25' '0-25' '0-25'
 '0-25' '100-125' '0-25' '0-25' '0-25' '0-25' '50-75' '0-25' '0-25' '50-75'
 '0-25' '0-25' '50-75' '0-25' '0-25' '0-25' '0-25' '0-25' '100-125' '0-25'
 '0-25' '0-25' '50-75' '0-25' '100-125' '0-25' '0-25' '0-25' '50-75'
 '25-50' '0-25' '0-25' '0-25' '0-25' '0-25' '50-75' '0-25' '0-25' '0-25'
 '100-125' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '50-75' '0-25'
 '0-25' '0-25' '0-25' '50-75' '25-50' '50-75' '0-25' '50-75' '0-25' '0-25'
 '25-50' '50-75' '0-25' '0-25' '0-25' '50-75' '50-75' '0-25' '0-25' '0-25'
 '50-75' '50-75' '0-25' '0-25' '0-25' '0-25' '50-75' '0-25' '0-25' '50-75'
 '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '25-50' '0-25' '0-25'
 '25-50' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25'
 '0-25' '50-75' '0-25' '0-25' '50-75' '0-25' '50-75' '0-25' '0-25' '0-25'
 '50-75' '0-25' '0-25' '0-25' '0-25' '50-75' '0-25' '0-25' '0-25' '0-25'
 '0-25' '0-25' '50-75' '0-25' '0-25' '0-25' '0

In [45]:
# Estimated classes on test set
predicted_test = logreg_train.predict(X_test)
print(predicted_test)

['0-25' '0-25' '0-25' '0-25' '0-25' '50-75' '0-25' '0-25' '0-25' '0-25'
 '0-25' '50-75' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25'
 '50-75' '0-25' '0-25' '25-50' '0-25' '0-25' '25-50' '0-25' '0-25' '0-25'
 '0-25' '50-75' '0-25' '0-25' '0-25' '0-25' '0-25' '50-75' '0-25' '0-25'
 '0-25' '50-75' '100-125' '50-75' '0-25' '0-25' '0-25' '0-25' '0-25'
 '25-50' '0-25' '50-75' '0-25' '0-25' '0-25' '0-25' '0-25' '100-125'
 '150-175' '0-25' '0-25' '0-25' '0-25' '50-75' '0-25']


In [46]:
# accuracy of trained predictions
print(metrics.accuracy_score(y_train, predicted_train))

0.774834437086


In [47]:
# accuracy of test predictions
print(metrics.accuracy_score(y_test, predicted_test))

0.692307692308


### Model evaluation on out-of-sample data - with rooms, without NumReg, without Capacity

In [48]:
# Split the data into train and test sets
# Take a fifth of data samples as test data, rest as training data
X_train, X_test, y_train, y_test = train_test_split(x[['Intercept', 'Capacity', 'AvgNumWifiConn', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM', 'B002', 'B003', 'B004']], y, test_size=0.3)

# print("Training data:\n", pd.concat([X_train, y_train], axis=1))
# print("\nTest data:\n", pd.concat([X_test, y_test], axis=1))

In [49]:
# Train on the training sample and test on the test sample.
logreg_train = LogisticRegression().fit(X_train, y_train)
# Print the weights learned for each feature.
print(logreg_train.coef_)

[[  1.01462511e+00   3.01201847e-04  -7.16971988e-02  -2.03945470e-02
   -2.41990790e-01   8.80058430e-01   4.50833240e-01  -5.38812233e-02
    3.64837607e-01  -3.75005769e-02  -2.60842889e-01   1.21236739e-01
    8.87054451e-01  -6.88250840e-01   2.01487378e-01   4.26603239e-01
    9.56315080e-01   7.60740480e-01  -7.02430451e-01]
 [ -1.34755865e+00  -1.70923780e-03   1.69996327e-02  -9.24284489e-02
   -5.91912486e-01  -8.99861702e-01   1.81875168e-01   5.47688224e-02
   -6.78324186e-01   3.49284162e-01  -8.39296528e-01  -9.06858557e-02
    3.78593868e-01  -2.71923879e-01   2.95921949e-01  -4.91128176e-01
   -1.10832606e+00  -1.17214466e+00   9.32912069e-01]
 [ -8.96493449e-01  -3.82589940e-02   5.61439998e-02  -1.99417200e-01
    2.53901062e-01  -4.83070898e-01  -2.53222107e-01  -2.14684305e-01
    7.19146634e-01  -2.07862662e-01  -2.24949218e-01  -4.07875031e-01
   -2.11026805e-01  -1.99797778e-01  -1.60274031e-01  -2.03854558e-01
   -6.30669030e-01  -8.86179429e-01   6.20355011e-01

In [50]:
# Estimated class probabilities on test set
print(logreg_train.predict_proba(X_test))

[[  9.17915103e-01   1.56046240e-02   4.71515164e-03   3.22459411e-02
    2.95191799e-02]
 [  8.79841149e-01   6.64059569e-03   4.99890983e-03   7.68518009e-02
    3.16675445e-02]
 [  6.57092349e-01   4.64421590e-02   6.17622369e-04   4.67183172e-04
    2.95380686e-01]
 [  7.08439291e-01   3.18594635e-02   6.51662522e-03   1.37875768e-01
    1.15308853e-01]
 [  7.72692996e-01   7.16784986e-02   6.82059673e-05   9.85544642e-05
    1.55461745e-01]
 [  9.19317717e-01   1.06202323e-02   1.86874960e-03   4.76027986e-02
    2.05905023e-02]
 [  1.38450319e-01   3.83989377e-02   3.40827553e-02   5.22965671e-01
    2.66102317e-01]
 [  6.87552944e-01   1.41699212e-02   8.45272476e-03   1.53404592e-01
    1.36419818e-01]
 [  9.29984031e-01   3.64399566e-03   3.00591255e-03   3.12184019e-02
    3.21476593e-02]
 [  8.09580997e-01   6.35145504e-02   4.43178394e-05   7.78549051e-05
    1.26782280e-01]
 [  8.13385413e-01   1.38561735e-02   1.27278169e-02   1.05013087e-01
    5.50175087e-02]
 [  7.1412

In [51]:
# Estimated classes on train set
predicted_train = logreg_train.predict(X_train)
print(predicted_train)

['0-25' '0-25' '0-25' '0-25' '50-75' '0-25' '0-25' '0-25' '0-25' '0-25'
 '0-25' '0-25' '0-25' '0-25' '50-75' '50-75' '50-75' '0-25' '0-25' '0-25'
 '0-25' '0-25' '0-25' '0-25' '50-75' '0-25' '0-25' '50-75' '0-25' '0-25'
 '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '50-75'
 '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '25-50' '0-25' '0-25'
 '50-75' '50-75' '0-25' '0-25' '0-25' '0-25' '0-25' '50-75' '0-25' '50-75'
 '50-75' '0-25' '50-75' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25'
 '0-25' '50-75' '50-75' '0-25' '0-25' '50-75' '0-25' '25-50' '50-75'
 '50-75' '0-25' '0-25' '0-25' '0-25' '0-25' '50-75' '50-75' '0-25' '0-25'
 '0-25' '0-25' '0-25' '0-25' '0-25' '50-75' '0-25' '0-25' '0-25' '50-75'
 '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25'
 '0-25' '0-25' '0-25' '0-25' '0-25' '50-75' '50-75' '0-25' '0-25' '0-25'
 '0-25' '50-75' '0-25' '0-25' '0-25' '100-125' '0-25' '50-75' '0-25' '0-25'
 '0-25' '0-25' '50-75' '0-25' '25-50' '0-25' '0-25' '

In [52]:
# Estimated classes on test set
predicted_test = logreg_train.predict(X_test)
print(predicted_test)

['0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '25-50' '0-25' '0-25' '0-25'
 '0-25' '0-25' '0-25' '0-25' '50-75' '0-25' '0-25' '50-75' '50-75' '50-75'
 '0-25' '50-75' '0-25' '0-25' '0-25' '0-25' '0-25' '0-25' '50-75' '0-25'
 '0-25' '0-25' '0-25' '50-75' '0-25' '0-25' '0-25' '0-25' '50-75' '0-25'
 '50-75' '50-75' '0-25' '0-25' '50-75' '0-25' '25-50' '50-75' '50-75'
 '25-50' '0-25' '0-25' '0-25' '50-75' '0-25' '0-25' '0-25' '0-25' '0-25'
 '0-25' '0-25' '0-25' '50-75' '50-75' '0-25']


In [53]:
# accuracy of trained predictions
print(metrics.accuracy_score(y_train, predicted_train))

0.76821192053


In [54]:
# accuracy of test predictions
print(metrics.accuracy_score(y_test, predicted_test))

0.646153846154




[ 0.52        0.60869565  0.56521739  0.69565217  0.71428571  0.71428571
  0.8         0.55        0.7         0.6       ]


# References
[1] Wirth, R. and Hipp, J., 2000, April. CRISP-DM: Towards a standard process model for data mining. In Proceedings of the 4th international conference on the practical applications of knowledge discovery and data mining (pp. 29-39).

[2] Ifrim, G., ‘Lecture8-DataUnderstanding-Stats-Visualisation’, (COMP47350 Lecture Notes), University College Dublin, 2016

[3] Ifrim, G., ‘Lecture12-DataUnderstanding-Correlation’, (COMP47350 Lecture Notes), University College Dublin, 2016

[4] Ifrim, G.,  ‘Lecture10-DataUnderstanding-MotorInsurance-handson’, (COMP47350 Lecture Notes), University College Dublin, 2016

[5] Ifrim, G., ‘Lecture15-Regression-LinearRegression-Interpretation-updated’, (COMP47350 Lecture Notes), University College Dublin, 2016

[6] Ifrim, G., ‘Lecture16-LinearRegression-handson, (COMP47350 Lecture Notes)’, University College Dublin, 2016

[7] Ifrim, G., ‘Lecture19-ModelEvaluation-ExperimentDesign, (COMP47350 Lecture Notes)’, University College Dublin, 2016