In [107]:
import MySQLdb as db
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas.tools.plotting import scatter_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.cross_validation import cross_val_score
%matplotlib inline

name = 'DatabaseMain'
conn = db.connect(host = "localhost", user = "root", passwd ="", db=name)
cursor = conn.cursor()

sql = ''' SELECT G.DateTime, W.Room, R.Capacity, T.Module, M.NumReg, W.Associated, G.PercentageEstimate, G.BinaryEstimate
          FROM DatabaseMain.WiFiLogData W, DatabaseMain.GroundTruth G, DatabaseMain.Rooms R, DatabaseMain.TimeModule T,
          DatabaseMain.Modules M
          WHERE W.Room = G.Room AND W.DateTime BETWEEN G.DateTime AND DATE_ADD(G.DateTime, INTERVAL 1 HOUR) 
          AND R.Room = W.Room AND R.Room = G.Room AND T.Room = G.Room AND T.Room = R.Room AND T.Room =  W.Room 
          AND T.DateTime = G.DateTime AND M.ModuleName = T.Module
          ORDER BY G.DateTime'''
df = pd.read_sql_query(sql, conn)
df.head()



Unnamed: 0,DateTime,Room,Capacity,Module,NumReg,Associated,PercentageEstimate,BinaryEstimate
0,2015-11-03 09:00:00,B-002,90,,0,18,0.0,0
1,2015-11-03 09:00:00,B-003,90,,0,1,0.0,0
2,2015-11-03 09:00:00,B-003,90,,0,1,0.0,0
3,2015-11-03 09:00:00,B-004,220,COMP41450 & COMP30120,143,113,0.75,1
4,2015-11-03 09:00:00,B-003,90,,0,2,0.0,0


### For logistic regression, we need to make the target feature categorical. For this, we can bin the value ranges. 

In [110]:
df.dtypes

DateTime              datetime64[ns]
Room                        category
Capacity                       int64
Module                      category
NumReg                         int64
Associated                     int64
PercentageEstimate           float64
BinaryEstimate                 int64
dtype: object

In [111]:
df['Room'] = df['Room'].astype('category')
df['Module'] = df['Module'].astype('category')
df['DateTime'] = df['DateTime'].astype('datetime64[ns]')
df.dtypes

DateTime              datetime64[ns]
Room                        category
Capacity                       int64
Module                      category
NumReg                         int64
Associated                     int64
PercentageEstimate           float64
BinaryEstimate                 int64
dtype: object

In [112]:
df.shape

(2502, 8)

### For logistic regression, we need to make the training features to be continuous. To do this we need to create dummy values for catergorical features. 


In [113]:
days = ['mon', 'tues', 'wed', 'thurs', 'fri', 'sat', 'sun']
# http://stackoverflow.com/questions/13740672/in-pandas-how-can-i-groupby-weekday-for-a-datetime-column
# http://chrisalbon.com/python/pandas_apply_operations_to_dataframes.html
df['weekday'] = df['DateTime'].apply(lambda dt: dt.weekday())
df.head()


Unnamed: 0,DateTime,Room,Capacity,Module,NumReg,Associated,PercentageEstimate,BinaryEstimate,weekday
0,2015-11-03 09:00:00,B-002,90,,0,18,0.0,0,1
1,2015-11-03 09:00:00,B-003,90,,0,1,0.0,0,1
2,2015-11-03 09:00:00,B-003,90,,0,1,0.0,0,1
3,2015-11-03 09:00:00,B-004,220,COMP41450 & COMP30120,143,113,0.75,1,1
4,2015-11-03 09:00:00,B-003,90,,0,2,0.0,0,1


In [114]:
for i in range(len(days)):
    df[days[i]] =(df.weekday==i)*1
df.drop('weekday', axis=1, inplace=True)
df.drop('sat', axis=1, inplace=True)
df.drop('sun', axis=1, inplace=True)


df.head(10)

Unnamed: 0,DateTime,Room,Capacity,Module,NumReg,Associated,PercentageEstimate,BinaryEstimate,mon,tues,wed,thurs,fri
0,2015-11-03 09:00:00,B-002,90,,0,18,0.0,0,0,1,0,0,0
1,2015-11-03 09:00:00,B-003,90,,0,1,0.0,0,0,1,0,0,0
2,2015-11-03 09:00:00,B-003,90,,0,1,0.0,0,0,1,0,0,0
3,2015-11-03 09:00:00,B-004,220,COMP41450 & COMP30120,143,113,0.75,1,0,1,0,0,0
4,2015-11-03 09:00:00,B-003,90,,0,2,0.0,0,0,1,0,0,0
5,2015-11-03 09:00:00,B-004,220,COMP41450 & COMP30120,143,122,0.75,1,0,1,0,0,0
6,2015-11-03 09:00:00,B-003,90,,0,6,0.0,0,0,1,0,0,0
7,2015-11-03 09:00:00,B-004,220,COMP41450 & COMP30120,143,125,0.75,1,0,1,0,0,0
8,2015-11-03 09:00:00,B-004,220,COMP41450 & COMP30120,143,20,0.75,1,0,1,0,0,0
9,2015-11-03 09:00:00,B-002,90,,0,0,0.0,0,0,1,0,0,0


In [115]:
times = ['9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM']
df['time'] = df['DateTime'].apply(lambda dt: dt.hour)

for i in range(0,len(times)):
    df[times[i]] =(df.time==i+9)*1
df.drop('time', axis=1, inplace=True)
df.head(10)

Unnamed: 0,DateTime,Room,Capacity,Module,NumReg,Associated,PercentageEstimate,BinaryEstimate,mon,tues,...,thurs,fri,9AM,10AM,11AM,12PM,1PM,2PM,3PM,4PM
0,2015-11-03 09:00:00,B-002,90,,0,18,0.0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
1,2015-11-03 09:00:00,B-003,90,,0,1,0.0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
2,2015-11-03 09:00:00,B-003,90,,0,1,0.0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
3,2015-11-03 09:00:00,B-004,220,COMP41450 & COMP30120,143,113,0.75,1,0,1,...,0,0,1,0,0,0,0,0,0,0
4,2015-11-03 09:00:00,B-003,90,,0,2,0.0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
5,2015-11-03 09:00:00,B-004,220,COMP41450 & COMP30120,143,122,0.75,1,0,1,...,0,0,1,0,0,0,0,0,0,0
6,2015-11-03 09:00:00,B-003,90,,0,6,0.0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
7,2015-11-03 09:00:00,B-004,220,COMP41450 & COMP30120,143,125,0.75,1,0,1,...,0,0,1,0,0,0,0,0,0,0
8,2015-11-03 09:00:00,B-004,220,COMP41450 & COMP30120,143,20,0.75,1,0,1,...,0,0,1,0,0,0,0,0,0,0
9,2015-11-03 09:00:00,B-002,90,,0,0,0.0,0,0,1,...,0,0,1,0,0,0,0,0,0,0


In [116]:
for i in range(2, 5):
    df['B00'+str(i)] =(df.Room=='B-00'+str(i))*1
df.head(10)

Unnamed: 0,DateTime,Room,Capacity,Module,NumReg,Associated,PercentageEstimate,BinaryEstimate,mon,tues,...,10AM,11AM,12PM,1PM,2PM,3PM,4PM,B002,B003,B004
0,2015-11-03 09:00:00,B-002,90,,0,18,0.0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
1,2015-11-03 09:00:00,B-003,90,,0,1,0.0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
2,2015-11-03 09:00:00,B-003,90,,0,1,0.0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
3,2015-11-03 09:00:00,B-004,220,COMP41450 & COMP30120,143,113,0.75,1,0,1,...,0,0,0,0,0,0,0,0,0,1
4,2015-11-03 09:00:00,B-003,90,,0,2,0.0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
5,2015-11-03 09:00:00,B-004,220,COMP41450 & COMP30120,143,122,0.75,1,0,1,...,0,0,0,0,0,0,0,0,0,1
6,2015-11-03 09:00:00,B-003,90,,0,6,0.0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
7,2015-11-03 09:00:00,B-004,220,COMP41450 & COMP30120,143,125,0.75,1,0,1,...,0,0,0,0,0,0,0,0,0,1
8,2015-11-03 09:00:00,B-004,220,COMP41450 & COMP30120,143,20,0.75,1,0,1,...,0,0,0,0,0,0,0,0,0,1
9,2015-11-03 09:00:00,B-002,90,,0,0,0.0,0,0,1,...,0,0,0,0,0,0,0,1,0,0


In [120]:
# http://chrisalbon.com/python/pandas_create_column_using_conditional.html

df['estimate'] = df['Capacity'] * df['PercentageEstimate']

# for i, row in df.iterrows():
#     if row.estimate>=220:
#         row.estimate = row.NumReg

df[df['estimate']>200] = df['NumReg']
# df['estimateAsPercent'] = df['estimate'] / df['Capacity']
# groups = [ '0%', '25%', '50%', '75%', '100%',]
# bins = [-0.01, 0.00, 0.25, 0.50, 0.75, 1.00]
# df['PercentagePred'] = pd.cut(df['estimateAsPercent'], bins, labels = groups )
df.loc[549:580]


ValueError: Must have equal len keys and value when setting with an iterable

In [72]:
# df['CapacityString'] = ' of '+df['Capacity'].map(str)
df['PercentageCat'] = df[['PercentagePred', 'Room']].apply(lambda x: ''.join(x), axis=1)
df = df.drop(['PercentagePred', 'estimateAsPercent'], axis = 1)
df

KeyError: "['PercentagePred'] not in index"

In [24]:
df.corr()

Unnamed: 0,Capacity,NumReg,Associated,PercentageEstimate,BinaryEstimate,mon,tues,wed,thurs,fri,...,11AM,12PM,1PM,2PM,3PM,4PM,B002,B003,B004,estimate
Capacity,1.0,0.455778,0.298461,-0.069604,-0.064116,0.002427,0.003646,0.003634,-0.012892,0.003697,...,0.002543,0.0026,0.002557,0.0026,0.002614,0.002628,-0.498185,-0.498185,1.0,0.384364
NumReg,0.455778,1.0,0.676914,0.320714,0.287019,-0.040382,0.033913,-0.038447,0.074718,-0.039002,...,0.06395,0.105279,0.01495,-0.015179,-0.008359,-0.174846,-0.11704,-0.337083,0.455778,0.5448
Associated,0.298461,0.676914,1.0,0.531765,0.453885,0.007461,0.145069,-0.096445,0.023109,-0.076906,...,0.031759,0.15642,0.020023,0.155363,-0.045115,-0.172875,-0.199146,-0.098231,0.298461,0.656843
PercentageEstimate,-0.069604,0.320714,0.531765,1.0,0.783154,0.060094,0.120773,-0.097147,-0.057357,-0.012222,...,0.073088,0.154783,-0.046817,0.137695,0.012406,-0.175446,-0.070137,0.139488,-0.069604,0.809299
BinaryEstimate,-0.064116,0.287019,0.453885,0.783154,1.0,0.210188,0.065323,-0.168189,0.011449,-0.067954,...,0.135403,0.195034,-0.08407,0.132563,0.048919,-0.168321,0.051192,0.012692,-0.064116,0.6658
mon,0.002427,-0.040382,0.007461,0.060094,0.210188,1.0,-0.189583,-0.188921,-0.188259,-0.192229,...,-0.003702,0.0027,-0.00499,0.0027,0.001406,0.000124,-0.001209,-0.001209,0.002427,0.064208
tues,0.003646,0.033913,0.145069,0.120773,0.065323,-0.189583,1.0,-0.283835,-0.28284,-0.288805,...,0.005409,-0.002418,0.003427,-0.011143,-0.004335,0.002419,-0.001817,-0.001817,0.003646,0.146927
wed,0.003634,-0.038447,-0.096445,-0.097147,-0.168189,-0.188921,-0.283835,1.0,-0.281852,-0.287796,...,-0.002388,-0.001317,-0.00433,0.007426,-0.003231,-0.00513,-0.00181,-0.00181,0.003634,-0.150976
thurs,-0.012892,0.074718,0.023109,-0.057357,0.011449,-0.188259,-0.28284,-0.281852,1.0,-0.286787,...,-0.001309,-0.000211,0.01449,-0.000211,0.006601,0.004669,0.006422,0.006422,-0.012892,0.000456
fri,0.003697,-0.039002,-0.076906,-0.012222,-0.067954,-0.192229,-0.288805,-0.287796,-0.286787,1.0,...,0.001082,0.001878,-0.009676,0.001878,-8.3e-05,-0.002028,-0.001842,-0.001842,0.003697,-0.04511


In [25]:
df.shape

(2490, 26)

In [26]:
intercept = pd.DataFrame({'Intercept':np.ones(len(df))})
intercept

Unnamed: 0,Intercept
0,1.0
1,1.0
2,1.0
3,1.0
4,1.0
5,1.0
6,1.0
7,1.0
8,1.0
9,1.0


In [27]:
x = pd.concat([intercept, df[['Capacity', 'NumReg', 'Associated', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM', 'B002', 'B003', 'B004' ]]], axis = 1)
y = df['PercentageCat']

# Test 1: All features but modules. 

In [28]:
log = LogisticRegression().fit(x[['Intercept','Capacity', 'NumReg', 'Associated', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM', 'B002', 'B003', 'B004']], y)

In [29]:
predictions = log.predict(x[['Intercept','Capacity', 'NumReg', 'Associated', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM', 'B002', 'B003', 'B004']])
log.score(x[['Intercept','Capacity', 'NumReg', 'Associated', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM', 'B002', 'B003', 'B004']], y)

0.71044176706827311

In [30]:
x_train, x_test, y_train, y_test = train_test_split(x[['Intercept','Capacity', 'NumReg', 'Associated', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM', 'B002', 'B003', 'B004']], y, test_size=0.2)

In [31]:
log_train = LogisticRegression().fit(x_train, y_train)

pred = log_train.predict(x_train)
print(metrics.accuracy_score(y_train, pred))

pred = log_train.predict(x_test)
print(metrics.accuracy_score(y_test, pred))

0.701807228916
0.696787148594


# Test 2: Now without rooms

In [32]:
log2 = LogisticRegression().fit(x[['Intercept','Capacity', 'NumReg', 'Associated', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM']], y)

In [33]:
predictions2 = log2.predict(x[['Intercept','Capacity', 'NumReg', 'Associated', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM']])
log2.score(x[['Intercept','Capacity', 'NumReg', 'Associated', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM']], y)

0.61485943775100405

In [34]:
x_train2, x_test2, y_train2, y_test2 = train_test_split(x[['Intercept','Capacity', 'NumReg', 'Associated', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM']], y, test_size=0.2)
log_train2 = LogisticRegression().fit(x_train2, y_train2)

pred2 = log_train2.predict(x_train2)
print(metrics.accuracy_score(y_train2, pred2))

pred2 = log_train2.predict(x_test2)
print(metrics.accuracy_score(y_test2, pred2))

0.600401606426
0.622489959839


# Test 3: Now without rooms and Capacity

In [35]:
log3 = LogisticRegression().fit(x[['Intercept', 'NumReg', 'Associated', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM']], y)

In [36]:
predictions3 = log3.predict(x[['Intercept', 'NumReg', 'Associated', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM']])
log3.score(x[['Intercept', 'NumReg', 'Associated', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM']], y)

0.44216867469879517

In [37]:
x_train3, x_test3, y_train3, y_test3 = train_test_split(x[['Intercept', 'NumReg', 'Associated', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM']], y, test_size=0.3)
log_train3 = LogisticRegression().fit(x_train3, y_train3)

pred3 = log_train3.predict(x_train3)
print(metrics.accuracy_score(y_train3, pred3))

pred3 = log_train3.predict(x_test3)
print(metrics.accuracy_score(y_test3, pred3))

0.438324727481
0.389558232932


# Test 4: Now without rooms and Number registered

In [38]:
log4 = LogisticRegression().fit(x[['Intercept', 'Capacity', 'Associated', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM']], y)

In [39]:
predictions4 = log4.predict(x[['Intercept', 'Capacity', 'Associated', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM']])
log4.score(x[['Intercept', 'Capacity', 'Associated', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM']], y)

0.54779116465863453

In [40]:
x_train4, x_test4, y_train4, y_test4 = train_test_split(x[['Intercept', 'Capacity', 'Associated', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM']], y, test_size=0.3)
log_train4 = LogisticRegression().fit(x_train4, y_train4)

pred4 = log_train4.predict(x_train4)
print(metrics.accuracy_score(y_train4, pred4))

pred4 = log_train4.predict(x_test4)
print(metrics.accuracy_score(y_test4, pred4))

0.562822719449
0.511378848728


# Test 5: Now without Number Registered or capacity

In [41]:
log5 = LogisticRegression().fit(x[['Intercept', 'Associated', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM',  'B002', 'B003', 'B004']], y)

In [42]:
predictions5 = log5.predict(x[['Intercept',  'Associated', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM',  'B002', 'B003', 'B004']])
log5.score(x[['Intercept', 'Associated', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM', 'B002', 'B003', 'B004']], y)

0.70441767068273098

In [43]:
x_train5, x_test5, y_train5, y_test5 = train_test_split(x[['Intercept', 'Associated', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM',  'B002', 'B003', 'B004']], y, test_size=0.3)
log_train5 = LogisticRegression().fit(x_train5, y_train5)

pred5 = log_train5.predict(x_train5)
print(metrics.accuracy_score(y_train5, pred5))

pred5 = log_train5.predict(x_test5)
print(metrics.accuracy_score(y_test5, pred5))

0.716580608147
0.668005354752


# Test 6: With just Wifi Log data

In [44]:
log6 =  LogisticRegression().fit(x[['Intercept', 'Associated']], y)

In [45]:
predictions6 = log6.predict(x[['Intercept',  'Associated']])
log6.score(x[['Intercept', 'Associated']], y)

0.27028112449799196

In [46]:
x_train6, x_test6, y_train6, y_test6 = train_test_split(x[['Intercept', 'Associated']], y, test_size=0.3)
log_train6 = LogisticRegression().fit(x_train6, y_train6)

pred6 = log_train6.predict(x_train6)
print(metrics.accuracy_score(y_train6, pred6))

pred6 = log_train6.predict(x_test6)
print(metrics.accuracy_score(y_test6, pred6))

0.273092369478
0.275769745649


### Other stuff

In [47]:
x_best = pd.concat([intercept, df[[  'Associated', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM',  'B002', 'B003', 'B004' ]]], axis = 1)


In [48]:
model_scores = cross_val_score(LogisticRegression(), x_best, y, scoring = 'accuracy', cv = 10)
print(model_scores)
print(model_scores.mean())

[ 0.55642023  0.54117647  0.46825397  0.7250996   0.68548387  0.41463415
  0.51219512  0.67755102  0.55102041  0.48979592]
0.56216307601
