In [1]:
import MySQLdb as db
import pandas as pd
import numpy as np
from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Recipe adapted from:
# http://stackoverflow.com/questions/464864/python-code-to-pick-out-all-possible-combinations-from-a-list
import itertools

# Candidate predictors; `subsets` ends up holding every non-empty combination
# of them as a tuple, ordered by size (all singles first, the full set last).
features = ['Day', 'Time', 'Module', 'Room', 'NumReg', 'Capacity']
subsets = []
for size in range(1, len(features) + 1):
    subsets.extend(itertools.combinations(features, size))

In [3]:
# Connect to Database
# NOTE(review): host, user and the empty password are hard-coded below —
# consider reading them from the environment before sharing this notebook.
name = "DatabaseMain"
con = db.connect(host="localhost", user="root", passwd='', db=name)
# NOTE(review): no cell visible in this notebook uses this cursor; the queries
# below are executed by pandas through `con` itself.
cursor = con.cursor()

# ABT - all wifi data into a df
# Every ground-truth row (room + DateTime) is joined to each WiFi log reading
# for the same room whose timestamp lies between that DateTime and one hour
# later (BETWEEN includes both ends), plus the room's capacity, the module
# timetabled for that room/DateTime and the module's NumReg. The ground-truth
# columns are therefore repeated once per matching WiFi reading.
sql = """SELECT G.DateTime, W.Room, R.Capacity, T.Module, M.NumReg, W.Associated, G.PercentageEstimate, G.BinaryEstimate
        FROM DatabaseMain.WifiLogData W, DatabaseMain.GroundTruth G, DatabaseMain.Rooms R, DatabaseMain.TimeModule T, DatabaseMain.Modules M
        WHERE W.Room = G.Room AND W.DateTime BETWEEN G.DateTime AND DATE_ADD(G.DateTime, INTERVAL 1 HOUR) AND R.Room = W.Room AND R.Room = G.Room AND T.Room = G.Room AND T.Room = R.Room AND T.Room =  W.Room AND T.DateTime = G.DateTime AND M.ModuleName = T.Module"""
df = pd.read_sql_query(sql, con)

# ABT - wifi averages into a df
# Same join as above, but against AverageNumWifiConnections and matching on an
# exact DateTime instead of a one-hour window.
# NOTE(review): presumably this yields a single row per room and hour — confirm
# against the contents of AverageNumWifiConnections.
sql_avgs = """SELECT G.DateTime, W.Room, R.Capacity, T.Module, M.NumReg, W.AvgNumWifiConn, G.PercentageEstimate, G.BinaryEstimate
        FROM DatabaseMain.AverageNumWifiConnections W, DatabaseMain.GroundTruth G, DatabaseMain.Rooms R, DatabaseMain.TimeModule T, DatabaseMain.Modules M
        WHERE W.Room = G.Room AND W.DateTime = G.DateTime AND W.Room = G.Room AND W.Room = R.Room AND W.Room = T.Room AND R.Room = W.Room AND R.Room = G.Room AND T.Room = G.Room AND T.Room = R.Room AND T.Room =  W.Room AND T.DateTime = G.DateTime AND M.ModuleName = T.Module
        ORDER BY W.Room"""
df_avgs = pd.read_sql_query(sql_avgs, con)

In [91]:
# Split the timestamp into a day-of-week and a time-of-day column, then mark
# the nominal columns as categorical so the modelling code can tell them apart
# from the numeric ones.
categories = ['Day', 'Time', 'Module', 'Room']

for frame in (df, df_avgs):
    stamps = frame['DateTime']
    frame['Day'] = stamps.dt.dayofweek
    frame['Time'] = stamps.dt.time
    for column in categories:
        frame[column] = frame[column].astype('category')

df.dtypes

DateTime              datetime64[ns]
Room                        category
Capacity                       int64
Module                      category
NumReg                         int64
Associated                     int64
PercentageEstimate           float64
BinaryEstimate                 int64
Day                         category
Time                        category
dtype: object

In [87]:
# Find best model: can only use tues-fri wk1, mon-fri wk2, 9-5
def FeatureEngineering(df, target, wifiData, feature_subsets=None):
    """Cross-validate a logistic regression for every candidate feature subset.

    Each model uses an intercept column, the WiFi column and the columns of one
    subset; categorical columns are expanded with ``pd.get_dummies``.

    Parameters
    ----------
    df : pandas.DataFrame
        Holds the target, the WiFi column and every column named in the subsets.
    target : str
        Name of the column to predict.
    wifiData : str
        Name of the WiFi measurement column, included in every model.
    feature_subsets : iterable of tuples of str, optional
        Column-name combinations to try. Defaults to the notebook-level
        ``subsets`` list.

    Returns
    -------
    pandas.DataFrame
        One row per subset with columns ``Features`` (list starting with
        ``wifiData``), ``CrossValScore`` (mean 10-fold accuracy of the logistic
        regression) and ``NullAccuracy`` (mean 10-fold accuracy of a
        most-frequent-class dummy classifier).
    """
    if feature_subsets is None:
        feature_subsets = subsets

    y = df[target]
    # Give the intercept df's own index: concat(axis=1) aligns on index labels,
    # so a fresh 0..n-1 index would inject NaN rows whenever df is indexed
    # differently.
    intercept = pd.DataFrame({'Intercept': np.ones(len(df))}, index=df.index)
    base = pd.concat([intercept, df[wifiData]], axis=1)

    # The most-frequent-class baseline never looks at the feature values, so it
    # is scored once instead of once per subset.
    null_accuracy = cross_val_score(DummyClassifier(strategy='most_frequent'),
                                    base, y, scoring='accuracy', cv=10).mean()

    rows = []
    for sub in feature_subsets:
        parts = [base]
        for s in sub:
            # Compare the dtype name rather than calling a private pandas helper.
            if df[s].dtype.name == 'category':
                parts.append(pd.get_dummies(df[s], prefix=s))
            else:
                parts.append(df[s])
        x = pd.concat(parts, axis=1)
        score = cross_val_score(LogisticRegression(), x, y,
                                scoring='accuracy', cv=10).mean()
        rows.append(([wifiData] + list(sub), score, null_accuracy))

    # Build the frame in one go so the score columns come out numeric.
    return pd.DataFrame(rows, columns=['Features', 'CrossValScore', 'NullAccuracy'])

def PredictOccupancy(results, df, target, wifiData):
    """Fit the best-scoring model on ``df`` and return its predictions.

    Parameters
    ----------
    results : pandas.DataFrame
        Output of ``FeatureEngineering``; the ``Features`` list of the row with
        the highest ``CrossValScore`` is used.
    df : pandas.DataFrame
        Holds the target, the selected feature columns and ``DateTime``/``Room``.
    target : str
        Name of the column to predict.
    wifiData : str
        Not read here: the WiFi column is already the first entry of every
        stored ``Features`` list. Kept so existing calls continue to work.

    Returns
    -------
    pandas.DataFrame
        Columns ``DateTime``, ``Room`` and ``Predictions``, indexed like ``df``.
        The model is fitted and evaluated on the same rows, so these are
        in-sample predictions.
    """
    best_row = results['CrossValScore'].idxmax()
    features = results.loc[best_row, 'Features']
    y = df[target]

    # Reuse df's index everywhere: concat(axis=1) aligns on index labels, so a
    # fresh 0..n-1 index would misalign rows whenever df is indexed differently.
    parts = [pd.DataFrame({'Intercept': np.ones(len(df))}, index=df.index)]
    for feat in features:
        # Compare the dtype name rather than calling a private pandas helper.
        if df[feat].dtype.name == 'category':
            parts.append(pd.get_dummies(df[feat], prefix=feat))
        else:
            parts.append(df[feat])
    x = pd.concat(parts, axis=1)

    logreg = LogisticRegression().fit(x, y)
    predictions = pd.DataFrame(logreg.predict(x), columns=["Predictions"], index=df.index)
    return pd.concat([df['DateTime'], df['Room'], predictions], axis=1)

In [62]:
# Column to predict, and the WiFi measurement column of each table
target = 'BinaryEstimate'
wifiData = 'Associated'
wifiDataAvgs = 'AvgNumWifiConn'

# FeatureEngineering returns one row per candidate model: the features used,
# the model's cross-validation score and the null-accuracy cross-validation
# score to compare it against.
res = FeatureEngineering(df=df, target=target, wifiData=wifiData)
res_avgs = FeatureEngineering(df=df_avgs, target=target, wifiData=wifiDataAvgs)

In [80]:
# Report the best cross-validation score of each search and the feature list
# of the model that achieved it.
scores_all = res['CrossValScore']
scores_avgs = res_avgs['CrossValScore']

print("All wifi data: ", scores_all.max())
print("Averages: ", scores_avgs.max())
print("Features in best model (all wifi data): ", res.loc[scores_all.idxmax(), 'Features'])
print("Features in best model (averages): ", res_avgs.loc[scores_avgs.idxmax(), 'Features'])

All wifi data:  0.799403984064
Averages:  0.852077922078
Features in best model (all wifi data):  ['Associated', 'Time', 'Room', 'Capacity']
Features in best model (averages):  ['AvgNumWifiConn', 'Room', 'Capacity']


In [88]:
# Refit the top-scoring model of each search and collect its predictions
predictions = PredictOccupancy(results=res, df=df, target=target, wifiData=wifiData)
predictions_avgs = PredictOccupancy(results=res_avgs, df=df_avgs, target=target, wifiData=wifiDataAvgs)

In [89]:
# Want only one prediction for each hour.
# The all-wifi-data model predicts for every five minutes of the hour, so take
# the most common prediction within each hour as that hour's final estimate.

# all datetimes and all rooms present in the data
DateTimes = df['DateTime'].unique()
Rooms = df['Room'].unique()

# Collect the rows in a plain list and build the frame once at the end, rather
# than growing a DataFrame one row at a time.
hourly_rows = []
for r in Rooms:
    # Filter by room once instead of re-scanning every prediction for each hour.
    room_predictions = predictions.loc[predictions['Room'] == r]
    for dt in DateTimes:
        # all predictions for this room in this hour
        # http://stackoverflow.com/questions/17071871/select-rows-from-a-dataframe-based-on-values-in-a-column-in-pandas
        group = room_predictions.loc[room_predictions['DateTime'] == dt, 'Predictions']
        if group.empty:
            # This room has no readings for this hour; idxmax() on an empty
            # count would raise, so there is simply nothing to report.
            continue
        # most frequent value: http://stackoverflow.com/questions/15138973/how-to-get-the-number-of-the-most-frequent-value-in-a-column
        # Stored as float so the Estimate column keeps the 0.0 / 1.0 form that
        # earlier runs of this cell produced.
        hourly_rows.append((pd.Timestamp(dt), r, float(group.value_counts().idxmax())))

final_predictions = pd.DataFrame(hourly_rows, columns=['DateTime', 'Room', 'Estimate'])

print(final_predictions)

               DateTime   Room  Estimate
0   2015-11-03 09:00:00  B-002       0.0
1   2015-11-03 10:00:00  B-002       1.0
2   2015-11-03 11:00:00  B-002       1.0
3   2015-11-03 12:00:00  B-002       1.0
4   2015-11-03 13:00:00  B-002       0.0
5   2015-11-03 14:00:00  B-002       1.0
6   2015-11-03 15:00:00  B-002       1.0
7   2015-11-03 16:00:00  B-002       1.0
8   2015-11-04 09:00:00  B-002       0.0
9   2015-11-04 10:00:00  B-002       1.0
10  2015-11-04 11:00:00  B-002       1.0
11  2015-11-04 12:00:00  B-002       1.0
12  2015-11-04 13:00:00  B-002       1.0
13  2015-11-04 14:00:00  B-002       1.0
14  2015-11-04 15:00:00  B-002       1.0
15  2015-11-04 16:00:00  B-002       1.0
16  2015-11-05 09:00:00  B-002       0.0
17  2015-11-05 10:00:00  B-002       1.0
18  2015-11-05 11:00:00  B-002       1.0
19  2015-11-05 12:00:00  B-002       1.0
20  2015-11-05 13:00:00  B-002       0.0
21  2015-11-05 14:00:00  B-002       1.0
22  2015-11-05 15:00:00  B-002       1.0
23  2015-11-05 1

In [90]:
# Add results to table in database
# NOTE(review): host, user and the empty password are hard-coded — consider
# reading them from the environment.
name = 'DatabaseMain'
conn = db.connect(host = "localhost", user = "root", passwd ="", db=name)
try:
    # Write through the connection opened just above (previously the older
    # `con` from the data-loading cell was used and this one sat idle).
    # if_exists='replace' drops and recreates both tables on every run.
    final_predictions.to_sql(con=conn, name='BinaryPredictions', if_exists='replace', flavor='mysql')
    predictions_avgs.to_sql(con=conn, name='BinaryPredictionsAvgs', if_exists='replace', flavor='mysql')
finally:
    # Release the connection even if one of the writes fails.
    conn.close()

  chunksize=chunksize, dtype=dtype)
