In [12]:
import MySQLdb as db
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas.tools.plotting import scatter_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.cross_validation import cross_val_score
%matplotlib inline

name = 'DatabaseMain'
conn = db.connect(host = "localhost", user = "root", passwd ="", db=name)
cursor = conn.cursor()

sql = ''' SELECT G.DateTime, W.Room, R.Capacity, T.Module, M.NumReg, W.Associated, G.PercentageEstimate, G.BinaryEstimate
          FROM DatabaseMain.WiFiLogData W, DatabaseMain.GroundTruth G, DatabaseMain.Rooms R, DatabaseMain.TimeModule T,
          DatabaseMain.Modules M
          WHERE W.Room = G.Room AND W.DateTime BETWEEN G.DateTime AND DATE_ADD(G.DateTime, INTERVAL 1 HOUR) 
          AND R.Room = W.Room AND R.Room = G.Room AND T.Room = G.Room AND T.Room = R.Room AND T.Room =  W.Room 
          AND T.DateTime = G.DateTime AND M.ModuleName = T.Module
          ORDER BY G.DateTime'''
df = pd.read_sql_query(sql, conn)
df.head()



Unnamed: 0,DateTime,Room,Capacity,Module,NumReg,Associated,PercentageEstimate,BinaryEstimate
0,2015-11-03 09:00:00,B-002,90,,0,1,0.0,0
1,2015-11-03 09:00:00,B-003,90,,0,1,0.0,0
2,2015-11-03 09:00:00,B-004,220,COMP41450 & COMP30120,143,124,0.75,1
3,2015-11-03 09:00:00,B-002,90,,0,0,0.0,0
4,2015-11-03 09:00:00,B-003,90,,0,1,0.0,0


In [13]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Capacity,2502.0,133.333333,61.294838,90.0,90.0,90.0,220.0,220.0
NumReg,2502.0,38.288569,40.911113,0.0,0.0,29.0,60.0,143.0
Associated,2502.0,30.284972,28.965944,0.0,10.0,22.0,42.0,232.0
PercentageEstimate,2502.0,0.249101,0.217957,0.0,0.0,0.25,0.25,1.0
BinaryEstimate,2502.0,0.68745,0.463625,0.0,0.0,1.0,1.0,1.0


In [14]:
import seaborn as sns
sns.pairplot(df, diag_kind = 'hist')

ImportError: No module named 'seaborn'

In [15]:
df.corr()

Unnamed: 0,Capacity,NumReg,Associated,PercentageEstimate,BinaryEstimate
Capacity,1.0,0.463803,0.307442,-0.043771,-0.059141
NumReg,0.463803,1.0,0.672959,0.348959,0.290459
Associated,0.307442,0.672959,1.0,0.561168,0.429123
PercentageEstimate,-0.043771,0.348959,0.561168,1.0,0.77078
BinaryEstimate,-0.059141,0.290459,0.429123,0.77078,1.0


### For logistic regression, we need to make the target feature categorical. For this, we can bin the value ranges. 

In [16]:
df.dtypes

DateTime              datetime64[ns]
Room                          object
Capacity                       int64
Module                        object
NumReg                         int64
Associated                     int64
PercentageEstimate           float64
BinaryEstimate                 int64
dtype: object

In [17]:
df['Room'] = df['Room'].astype('category')
df['Module'] = df['Module'].astype('category')
df.dtypes

DateTime              datetime64[ns]
Room                        category
Capacity                       int64
Module                      category
NumReg                         int64
Associated                     int64
PercentageEstimate           float64
BinaryEstimate                 int64
dtype: object

In [18]:
df.shape

(2502, 8)

### For logistic regression, we need to make the training features to be continuous. To do this we need to create dummy values for catergorical features. 


In [19]:
days = ['mon', 'tues', 'wed', 'thurs', 'fri', 'sat', 'sun']
# http://stackoverflow.com/questions/13740672/in-pandas-how-can-i-groupby-weekday-for-a-datetime-column
# http://chrisalbon.com/python/pandas_apply_operations_to_dataframes.html
df['weekday'] = df['DateTime'].apply(lambda dt: dt.weekday())
df.head()


Unnamed: 0,DateTime,Room,Capacity,Module,NumReg,Associated,PercentageEstimate,BinaryEstimate,weekday
0,2015-11-03 09:00:00,B-002,90,,0,1,0.0,0,1
1,2015-11-03 09:00:00,B-003,90,,0,1,0.0,0,1
2,2015-11-03 09:00:00,B-004,220,COMP41450 & COMP30120,143,124,0.75,1,1
3,2015-11-03 09:00:00,B-002,90,,0,0,0.0,0,1
4,2015-11-03 09:00:00,B-003,90,,0,1,0.0,0,1


In [20]:
for i in range(len(days)):
    df[days[i]] =(df.weekday==i)*1
df.drop('weekday', axis=1, inplace=True)
df.drop('sat', axis=1, inplace=True)
df.drop('sun', axis=1, inplace=True)


df.head(10)

Unnamed: 0,DateTime,Room,Capacity,Module,NumReg,Associated,PercentageEstimate,BinaryEstimate,mon,tues,wed,thurs,fri
0,2015-11-03 09:00:00,B-002,90,,0,1,0.0,0,0,1,0,0,0
1,2015-11-03 09:00:00,B-003,90,,0,1,0.0,0,0,1,0,0,0
2,2015-11-03 09:00:00,B-004,220,COMP41450 & COMP30120,143,124,0.75,1,0,1,0,0,0
3,2015-11-03 09:00:00,B-002,90,,0,0,0.0,0,0,1,0,0,0
4,2015-11-03 09:00:00,B-003,90,,0,1,0.0,0,0,1,0,0,0
5,2015-11-03 09:00:00,B-004,220,COMP41450 & COMP30120,143,119,0.75,1,0,1,0,0,0
6,2015-11-03 09:00:00,B-002,90,,0,2,0.0,0,0,1,0,0,0
7,2015-11-03 09:00:00,B-004,220,COMP41450 & COMP30120,143,62,0.75,1,0,1,0,0,0
8,2015-11-03 09:00:00,B-003,90,,0,9,0.0,0,0,1,0,0,0
9,2015-11-03 09:00:00,B-002,90,,0,18,0.0,0,0,1,0,0,0


In [21]:
times = ['9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM']
df['time'] = df['DateTime'].apply(lambda dt: dt.hour)

for i in range(0,len(times)):
    df[times[i]] =(df.time==i+9)*1
df.drop('time', axis=1, inplace=True)
df.head(10)

Unnamed: 0,DateTime,Room,Capacity,Module,NumReg,Associated,PercentageEstimate,BinaryEstimate,mon,tues,...,thurs,fri,9AM,10AM,11AM,12PM,1PM,2PM,3PM,4PM
0,2015-11-03 09:00:00,B-002,90,,0,1,0.0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
1,2015-11-03 09:00:00,B-003,90,,0,1,0.0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
2,2015-11-03 09:00:00,B-004,220,COMP41450 & COMP30120,143,124,0.75,1,0,1,...,0,0,1,0,0,0,0,0,0,0
3,2015-11-03 09:00:00,B-002,90,,0,0,0.0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
4,2015-11-03 09:00:00,B-003,90,,0,1,0.0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
5,2015-11-03 09:00:00,B-004,220,COMP41450 & COMP30120,143,119,0.75,1,0,1,...,0,0,1,0,0,0,0,0,0,0
6,2015-11-03 09:00:00,B-002,90,,0,2,0.0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
7,2015-11-03 09:00:00,B-004,220,COMP41450 & COMP30120,143,62,0.75,1,0,1,...,0,0,1,0,0,0,0,0,0,0
8,2015-11-03 09:00:00,B-003,90,,0,9,0.0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
9,2015-11-03 09:00:00,B-002,90,,0,18,0.0,0,0,1,...,0,0,1,0,0,0,0,0,0,0


In [22]:
for i in range(2, 5):
    df['B00'+str(i)] =(df.Room=='B-00'+str(i))*1
df.head(10)

Unnamed: 0,DateTime,Room,Capacity,Module,NumReg,Associated,PercentageEstimate,BinaryEstimate,mon,tues,...,10AM,11AM,12PM,1PM,2PM,3PM,4PM,B002,B003,B004
0,2015-11-03 09:00:00,B-002,90,,0,1,0.0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
1,2015-11-03 09:00:00,B-003,90,,0,1,0.0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
2,2015-11-03 09:00:00,B-004,220,COMP41450 & COMP30120,143,124,0.75,1,0,1,...,0,0,0,0,0,0,0,0,0,1
3,2015-11-03 09:00:00,B-002,90,,0,0,0.0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
4,2015-11-03 09:00:00,B-003,90,,0,1,0.0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
5,2015-11-03 09:00:00,B-004,220,COMP41450 & COMP30120,143,119,0.75,1,0,1,...,0,0,0,0,0,0,0,0,0,1
6,2015-11-03 09:00:00,B-002,90,,0,2,0.0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
7,2015-11-03 09:00:00,B-004,220,COMP41450 & COMP30120,143,62,0.75,1,0,1,...,0,0,0,0,0,0,0,0,0,1
8,2015-11-03 09:00:00,B-003,90,,0,9,0.0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
9,2015-11-03 09:00:00,B-002,90,,0,18,0.0,0,0,1,...,0,0,0,0,0,0,0,1,0,0


In [23]:
df.corr()

Unnamed: 0,Capacity,NumReg,Associated,PercentageEstimate,BinaryEstimate,mon,tues,wed,thurs,fri,...,10AM,11AM,12PM,1PM,2PM,3PM,4PM,B002,B003,B004
Capacity,1.0,0.463803,0.307442,-0.043771,-0.059141,-5.888996e-19,-9.666792000000001e-18,-8.677957e-18,-1.046443e-17,-4.7465760000000003e-17,...,-7.927556000000001e-18,-5.397626e-18,-7.787863e-18,-6.1880570000000004e-18,-7.822786e-18,-8.904165000000001e-18,-9.422837000000001e-18,-0.5,-0.5,1.0
NumReg,0.4638035,1.0,0.672959,0.348959,0.290459,-0.04409793,0.02678603,-0.04435085,0.09593623,-0.04500949,...,-0.01001189,0.05833151,0.09888495,0.01010139,-0.01961437,-0.01293042,-0.1767349,-0.123762,-0.3400415,0.4638035
Associated,0.3074415,0.672959,1.0,0.561168,0.429123,-0.002828365,0.1169769,-0.1016912,0.07128029,-0.0842454,...,-0.01667288,0.01872531,0.131432,0.008037062,0.1304752,-0.05119701,-0.166992,-0.199392,-0.1080495,0.3074415
PercentageEstimate,-0.04377113,0.348959,0.561168,1.0,0.77078,0.05244817,0.1083507,-0.103067,-0.02422544,-0.02081782,...,-0.00396269,0.06477813,0.1439347,-0.05164132,0.127348,0.005704586,-0.1766626,-0.07976073,0.1235319,-0.04377113
BinaryEstimate,-0.05914106,0.290459,0.429123,0.77078,1.0,0.2087426,0.06347829,-0.1696136,0.01740534,-0.06958479,...,-0.05080598,0.1340036,0.193519,-0.08516232,0.1311382,0.04760828,-0.169323,0.04877614,0.01036493,-0.05914106
mon,-5.888996e-19,-0.044098,-0.002828,0.052448,0.208743,1.0,-0.1884884,-0.1878311,-0.1898022,-0.1911151,...,0.003345779,-0.003066062,0.003345779,-0.004349591,0.003345779,0.002056942,0.0007785118,-1.570016e-18,-1.1588210000000001e-18,-7.476265e-20
tues,-9.666792000000001e-18,0.026786,0.116977,0.108351,0.063478,-0.1884884,1.0,-0.2820897,-0.2850501,-0.2870218,...,-0.001442298,0.006355108,-0.001442298,0.004381059,-0.0101588,-0.00335175,0.00340055,4.7670420000000004e-18,5.618299e-18,4.7670420000000004e-18
wed,-8.677957e-18,-0.044351,-0.101691,-0.103067,-0.169614,-0.1878311,-0.2820897,1.0,-0.284056,-0.2860209,...,0.008387993,-0.001437491,-0.0003455727,-0.003371751,0.008387993,-0.002252782,-0.004144744,7.505733000000001e-18,1.5864390000000002e-17,1.182722e-17
thurs,-1.046443e-17,0.095936,0.07128,-0.024225,0.017405,-0.1898022,-0.2850501,-0.284056,1.0,-0.2890225,...,-0.003623071,-0.00463679,-0.003623071,0.01100527,-0.003623071,0.003110704,0.00117734,9.271182e-18,1.1815100000000001e-17,9.271182e-18
fri,-4.7465760000000003e-17,-0.045009,-0.084245,-0.020818,-0.069585,-0.1911151,-0.2870218,-0.2860209,-0.2890225,1.0,...,-0.005787361,0.002045945,0.002862566,-0.008695199,0.002862566,0.0009090104,-0.001028835,5.856975e-18,6.307512e-18,5.856975e-18


In [24]:
df.shape

(2502, 24)

In [25]:
intercept = pd.DataFrame({'Intercept':np.ones(2502)})
intercept

Unnamed: 0,Intercept
0,1.0
1,1.0
2,1.0
3,1.0
4,1.0
5,1.0
6,1.0
7,1.0
8,1.0
9,1.0


In [26]:
x = pd.concat([intercept, df[['Capacity', 'NumReg', 'Associated', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM', 'B002', 'B003', 'B004' ]]], axis = 1)
y = df['BinaryEstimate']

# Test 1: All features but modules. 

In [27]:
log = LogisticRegression().fit(x[['Intercept','Capacity', 'NumReg', 'Associated', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM', 'B002', 'B003', 'B004']], y)

In [28]:
predictions = log.predict(x[['Intercept','Capacity', 'NumReg', 'Associated', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM', 'B002', 'B003', 'B004']])
log.score(x[['Intercept','Capacity', 'NumReg', 'Associated', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM', 'B002', 'B003', 'B004']], y)

0.82573940847322147

In [29]:
x_train, x_test, y_train, y_test = train_test_split(x[['Intercept','Capacity', 'NumReg', 'Associated', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM', 'B002', 'B003', 'B004']], y, test_size=0.2)

In [30]:
log_train = LogisticRegression().fit(x_train, y_train)

pred = log_train.predict(x_train)
print(metrics.accuracy_score(y_train, pred))

pred = log_train.predict(x_test)
print(metrics.accuracy_score(y_test, pred))

0.826586706647
0.820359281437


# Test 2: Now without rooms

In [31]:
log2 = LogisticRegression().fit(x[['Intercept','Capacity', 'NumReg', 'Associated', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM']], y)

In [32]:
predictions2 = log2.predict(x[['Intercept','Capacity', 'NumReg', 'Associated', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM']])
log2.score(x[['Intercept','Capacity', 'NumReg', 'Associated', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM']], y)

0.82773780975219824

In [48]:
x_train2, x_test2, y_train2, y_test2 = train_test_split(x[['Intercept','Capacity', 'NumReg', 'Associated', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM']], y, test_size=0.2)
log_train2 = LogisticRegression().fit(x_train2, y_train2)

pred2 = log_train2.predict(x_train2)
print(metrics.accuracy_score(y_train2, pred2))

pred2 = log_train2.predict(x_test2)
print(metrics.accuracy_score(y_test2, pred2))

0.829085457271
0.832335329341


# Test 3: Now without rooms and Capacity

In [34]:
log3 = LogisticRegression().fit(x[['Intercept', 'NumReg', 'Associated', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM']], y)

In [35]:
predictions3 = log3.predict(x[['Intercept', 'NumReg', 'Associated', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM']])
log3.score(x[['Intercept', 'NumReg', 'Associated', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM']], y)

0.81814548361310946

In [49]:
x_train3, x_test3, y_train3, y_test3 = train_test_split(x[['Intercept', 'NumReg', 'Associated', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM']], y, test_size=0.3)
log_train3 = LogisticRegression().fit(x_train3, y_train3)

pred3 = log_train3.predict(x_train3)
print(metrics.accuracy_score(y_train3, pred3))

pred3 = log_train3.predict(x_test3)
print(metrics.accuracy_score(y_test3, pred3))

0.805825242718
0.829560585885


# Test 4: Now without rooms and Number registered

In [37]:
log4 = LogisticRegression().fit(x[['Intercept', 'Capacity', 'Associated', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM']], y)

In [38]:
predictions4 = log4.predict(x[['Intercept', 'Capacity', 'Associated', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM']])
log4.score(x[['Intercept', 'Capacity', 'Associated', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM']], y)

0.82254196642685851

In [50]:
x_train4, x_test4, y_train4, y_test4 = train_test_split(x[['Intercept', 'Capacity', 'Associated', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM']], y, test_size=0.3)
log_train4 = LogisticRegression().fit(x_train4, y_train4)

pred4 = log_train4.predict(x_train4)
print(metrics.accuracy_score(y_train4, pred4))

pred4 = log_train4.predict(x_test4)
print(metrics.accuracy_score(y_test4, pred4))

0.822958309537
0.833555259654


# Test 5: Now without Number Registered or capacity

In [40]:
log5 = LogisticRegression().fit(x[['Intercept', 'Associated', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM',  'B002', 'B003', 'B004']], y)

In [41]:
predictions5 = log5.predict(x[['Intercept',  'Associated', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM',  'B002', 'B003', 'B004']])
log5.score(x[['Intercept', 'Associated', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM', 'B002', 'B003', 'B004']], y)

0.8229416466826539

In [51]:
x_train5, x_test5, y_train5, y_test5 = train_test_split(x[['Intercept', 'Associated', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM',  'B002', 'B003', 'B004']], y, test_size=0.3)
log_train5 = LogisticRegression().fit(x_train5, y_train5)

pred5 = log_train5.predict(x_train5)
print(metrics.accuracy_score(y_train5, pred5))

pred5 = log_train5.predict(x_test5)
print(metrics.accuracy_score(y_test5, pred5))

0.818389491719
0.82689747004


# Test 6: With just Wifi Log data

In [43]:
log6 =  LogisticRegression().fit(x[['Intercept', 'Associated']], y)

In [44]:
predictions6 = log6.predict(x[['Intercept',  'Associated']])
log6.score(x[['Intercept', 'Associated']], y)

0.79336530775379699

In [53]:
x_train6, x_test6, y_train6, y_test6 = train_test_split(x[['Intercept', 'Associated']], y, test_size=0.3)
log_train6 = LogisticRegression().fit(x_train6, y_train6)

pred6 = log_train6.predict(x_train6)
print(metrics.accuracy_score(y_train6, pred6))

pred6 = log_train6.predict(x_test6)
print(metrics.accuracy_score(y_test6, pred6))

0.802398629355
0.784287616511


### Other stuff

In [46]:
x_best = pd.concat([intercept, df[['Capacity', 'NumReg', 'Associated', 'mon', 'tues', 'wed', 'thurs', 'fri', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM' ]]], axis = 1)


In [47]:
model_scores = cross_val_score(LogisticRegression(), x_best, y, scoring = 'accuracy', cv = 10)
print('Logistic regression, Target feature: Binary ground Truth estimate\n')
print(model_scores)
print('Mean score: ',model_scores.mean())

Logistic regression, Target feature: Binary ground Truth estimate

[ 0.84860558  0.64143426  0.78        0.748       0.876       0.812       0.756
  0.616       0.764       0.604     ]
Mean score:  0.744603984064
