In [1]:
import MySQLdb as db
import pandas as pd
import numpy as np
from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier

In [2]:
# Connect to Database
name = "DatabaseMain"
con = db.connect(host="localhost", user="root", passwd='', db=name)
cursor = con.cursor()

In [3]:
# ABT - wifi averages into a df
sql_avgs = """SELECT G.DateTime, W.Room, R.Capacity, T.Module, M.NumReg, W.AvgNumWifiConn, G.PercentageEstimate, G.BinaryEstimate
        FROM DatabaseMain.AverageNumWifiConnections W, DatabaseMain.GroundTruth G, DatabaseMain.Rooms R, DatabaseMain.TimeModule T, DatabaseMain.Modules M
        WHERE W.Room = G.Room AND W.DateTime = G.DateTime AND W.Room = G.Room AND W.Room = R.Room AND W.Room = T.Room AND R.Room = W.Room AND R.Room = G.Room AND T.Room = G.Room AND T.Room = R.Room AND T.Room =  W.Room AND T.DateTime = G.DateTime AND M.ModuleName = T.Module
        ORDER BY W.Room"""
df_avgs = pd.read_sql_query(sql_avgs, con)

In [4]:
# Index DateTime column - want separate day of week and time
# so can be separated by category (e.g day of week or time)

df_avgs['Day'] = df_avgs['DateTime'].dt.dayofweek
df_avgs['Time'] = df_avgs['DateTime'].dt.time

# BINARY MODEL

In [16]:
x = pd.concat([pd.DataFrame({'Intercept':np.ones(len(df_avgs))}), df_avgs[['AvgNumWifiConn', 'Capacity']],pd.get_dummies(df_avgs['Room'], prefix='Room')], axis=1)
y = df_avgs['BinaryEstimate']

In [17]:
logreg = LogisticRegression().fit(x, y)

In [31]:
sql = """SELECT W.Room, W.AvgNumWifiConn, R.Capacity
        FROM DatabaseMain.ExtraAverageNumWifiConnections W, Rooms R
        WHERE R.Room=W.Room AND W.DateTime BETWEEN '2016-07-18 09:00:00' AND '2016-07-18 17:00:00' """
test_data = pd.read_sql_query(sql, con)

In [36]:
sql = """SELECT BinaryEstimate, DateTime, Room
            FROM GroundTruthExtra"""
test_results = pd.read_sql_query(sql, con)

In [34]:
x_test = pd.concat([pd.DataFrame({'Intercept':np.ones(len(test_data))}), test_data[['AvgNumWifiConn', 'Capacity']],pd.get_dummies(test_data['Room'], prefix='Room')], axis=1)

In [42]:
predictions = logreg.predict(x_test)

In [43]:
accuracy = 0 
for i in range(0,len(predictions)):
    if (predictions[i] == test_results['BinaryEstimate'][i]):
        accuracy += 1

accuracy = accuracy/len(predictions)        
print('Accuracy Linear Binary: ', accuracy)

Accuracy Linear Binary:  0.7037037037037037


# PERCENTAGE MODEL

In [5]:
# Add estimate column
df_avgs['Estimate'] = df_avgs['Capacity'] * df_avgs['PercentageEstimate']
# Handle outliers - replace them with the NumReg
df_avgs['Estimate'].loc[df_avgs['Estimate'] > 200] = df_avgs['NumReg']

df_avgs['EstimateAsPercent'] = df_avgs['Estimate'] / df_avgs['Capacity']
groups = [ '0%', '25%', '50%', '75%', '100%',]
bins = [-0.01, 0.00, 0.25, 0.50, 0.75, 1.00]
df_avgs['PercentagePred'] = pd.cut(df_avgs['EstimateAsPercent'], bins, labels = groups )
df_avgs['PercentageCat'] = df_avgs[['PercentagePred', 'Room']].apply(lambda x: ''.join(x), axis=1)
df_avgs = df_avgs.drop(['PercentagePred', 'EstimateAsPercent'], axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [6]:
# Model: avgNumconn, Day, Room, NumReg
x = pd.concat([pd.DataFrame({'Intercept':np.ones(len(df_avgs))}), df_avgs[['AvgNumWifiConn', 'NumReg']],pd.get_dummies(df_avgs['Room'], prefix='Room'), pd.get_dummies(df_avgs['Day'], prefix='Day')], axis=1)
y = df_avgs['PercentageCat']

In [7]:
logreg = LogisticRegression().fit(x, y)

In [8]:
sql = """SELECT W.Room, W.AvgNumWifiConn, W.DateTime
        FROM DatabaseMain.ExtraAverageNumWifiConnections W, Rooms R
        WHERE R.Room=W.Room AND W.DateTime BETWEEN '2016-07-18 09:00:00' AND '2016-07-18 17:00:00' """
test_data = pd.read_sql_query(sql, con)

In [9]:
test_data = pd.concat([test_data, pd.DataFrame({'NumReg':np.zeros(len(test_data))})], axis=1)

In [10]:
test_data['Day'] = test_data['DateTime'].dt.dayofweek

In [11]:
sql = """SELECT PercentageEstimate, DateTime, Room
            FROM GroundTruthExtra"""
test_results = pd.read_sql_query(sql, con)

In [12]:
x_test = pd.concat([pd.DataFrame({'Intercept':np.ones(len(test_data))}), test_data[['AvgNumWifiConn', 'NumReg']],pd.get_dummies(test_data['Room'], prefix='Room'), pd.get_dummies(test_data['Day'], prefix='Day')], axis=1)
x_test = pd.concat([x_test, pd.DataFrame({'Day_1':np.zeros(len(test_data))}), pd.DataFrame({'Day_2':np.zeros(len(test_data))}), pd.DataFrame({'Day_3':np.zeros(len(test_data))}), pd.DataFrame({'Day_4':np.zeros(len(test_data))})], axis=1)

In [13]:
predictions = logreg.predict(x_test)

In [14]:
predictions = [int(predictions[i].split('%')[0])/100 for i in range(0, len(predictions))]

In [15]:
accuracy = 0 
for i in range(0,len(predictions)):
    if (predictions[i] == test_results['PercentageEstimate'][i]):
        accuracy += 1

accuracy = accuracy/len(predictions)        
print('Accuracy Linear Binary: ', accuracy)

Accuracy Linear Binary:  0.4444444444444444


# ESTIMATE MODEL


In [18]:
# RUN SECTION 2-4 AGAIN FIRST TO GET ORIGINAL ABTS
# Bin results into categories for logistic regression. 
bins = [-1, 25, 50, 75, 100, 125, 150, 175, 200, 220]
groups = [ '0-25', '25-50', '50-75', '75-100', '100-125', '125-150', '150-175', '175-200', '200-220']
df_avgs['OccupantEstimate'] = pd.cut(df_avgs['Estimate'], bins, labels = groups )

In [19]:
# Model: 'AvgNumWifiConn', 'Day', 'Module', 'NumReg', 'Capacity'
x = pd.concat([pd.DataFrame({'Intercept':np.ones(len(df_avgs))}), df_avgs[['AvgNumWifiConn', 'NumReg', 'Capacity']],pd.get_dummies(df_avgs['Module'], prefix='Module'), pd.get_dummies(df_avgs['Day'], prefix='Day')], axis=1)
y = df_avgs['OccupantEstimate']

In [20]:
logreg = LogisticRegression().fit(x, y)

In [27]:
sql = """SELECT W.AvgNumWifiConn, W.DateTime, R.Capacity
        FROM DatabaseMain.ExtraAverageNumWifiConnections W, Rooms R
        WHERE R.Room=W.Room AND W.DateTime BETWEEN '2016-07-18 09:00:00' AND '2016-07-18 17:00:00' """
test_data = pd.read_sql_query(sql, con)

In [37]:
test_data['Day'] = test_data['DateTime'].dt.dayofweek
test_data = pd.concat([test_data, pd.DataFrame({'Module': 'None', 'NumReg':np.zeros(len(test_data)) })],axis=1)

In [34]:
sql = """SELECT G.PercentageEstimate, G.DateTime, G.Room, R.Capacity
            FROM GroundTruthExtra G, Rooms R
            WHERE R.Room = G.Room"""
test_results = pd.read_sql_query(sql, con)
test_results['Estimate'] = test_results['Capacity'] * test_results['PercentageEstimate']
test_results['OccupantEstimate'] = pd.cut(test_results['Estimate'], bins, labels = groups )

In [56]:
x_test = pd.concat([pd.DataFrame({'Intercept':np.ones(len(test_data))}), test_data[['AvgNumWifiConn', 'NumReg', 'Capacity']],pd.get_dummies(test_data['Day'], prefix='Day')], axis=1)
x_test = pd.concat([x_test, pd.DataFrame({'Day_1':np.zeros(len(test_data))}), pd.DataFrame({'Day_2':np.zeros(len(test_data))}), pd.DataFrame({'Day_3':np.zeros(len(test_data))}), pd.DataFrame({'Day_4':np.zeros(len(test_data))})], axis=1)

In [57]:
for i in range(4,len(x.columns)-5):
    if x.columns[i] == 'Module_None':
        x_test = pd.concat([x_test, pd.DataFrame({x.columns[i]:np.ones(len(test_data))})], axis=1)
    else:
        x_test = pd.concat([x_test, pd.DataFrame({x.columns[i]:np.zeros(len(test_data))})], axis=1)                    

In [59]:
predictions = logreg.predict(x_test)


In [62]:
accuracy = 0 
for i in range(0,len(predictions)):
    if (predictions[i] == test_results['OccupantEstimate'][i]):
        accuracy += 1

accuracy = accuracy/len(predictions)        
print('Accuracy Linear Binary: ', accuracy)

Accuracy Linear Binary:  0.9629629629629629


# References
[1] Wirth, R. and Hipp, J., 2000, April. CRISP-DM: Towards a standard process model for data mining. In Proceedings of the 4th international conference on the practical applications of knowledge discovery and data mining (pp. 29-39).

[2] Ifrim, G., ‘Lecture8-DataUnderstanding-Stats-Visualisation’, (COMP47350 Lecture Notes), University College Dublin, 2016

[3] Ifrim, G., ‘Lecture12-DataUnderstanding-Correlation’, (COMP47350 Lecture Notes), University College Dublin, 2016

[4] Ifrim, G.,  ‘Lecture10-DataUnderstanding-MotorInsurance-handson’, (COMP47350 Lecture Notes), University College Dublin, 2016

[5] Ifrim, G., ‘Lecture15-Regression-LinearRegression-Interpretation-updated’, (COMP47350 Lecture Notes), University College Dublin, 2016

[6] Ifrim, G., ‘Lecture16-LinearRegression-handson, (COMP47350 Lecture Notes)’, University College Dublin, 2016

[7] Ifrim, G., ‘Lecture19-ModelEvaluation-ExperimentDesign, (COMP47350 Lecture Notes)’, University College Dublin, 2016