In [1]:
import time
import numpy as np
import pandas as pd

In [2]:
def prepareDataFrame(data_file):
    #Takes about one minute to load
    data=pd.read_csv(data_conv,header=None,sep=" ")
    data.columns=["Date","Hour","Sensor","Value","Voltage"]
    data=data.sort_values(['Date','Hour']).reset_index(drop=True)
    
    data['datetime']=pd.to_datetime(data.Date+' '+data.Hour)
    data['relative_datetime']=data['datetime']-data['datetime'][0]
    data['seconds']=data['relative_datetime'].dt.total_seconds()
    
    sensorId_type=data.Sensor.str.split("-",expand=True)
    sensorId_type.columns=['SensorId','Type']
    data['SensorId']=sensorId_type['SensorId'].astype(int)
    data['Type']=sensorId_type['Type'].astype(int)
    
    #Drop features not needed for the simulation
    data=data.drop(['datetime','relative_datetime','Sensor','Date','Hour','Voltage'],axis=1)
    return data

In [3]:
data_conv = "../data/data.conv.txt"
data = prepareDataFrame(data_conv)

In [60]:
temp_1=data[(data.SensorId==1) & (data.Type==0) & (data.seconds<=8*86400)]
temp_1=temp_1.reset_index(drop=True)

In [61]:
temp_24=data[(data.SensorId==24) & (data.Type==0) & (data.seconds<=8*86400)]
temp_24=temp_24.reset_index(drop=True)

In [62]:
# 5 closest neighbors of sensor 1 are sensors 2, 3, 33, 34, 35
neighbors_1 = [data[(data.SensorId==2) & (data.Type==0) & (data.seconds<=8*86400)],\
               data[(data.SensorId==3) & (data.Type==0) & (data.seconds<=8*86400)],\
               data[(data.SensorId==33) & (data.Type==0) & (data.seconds<=8*86400)],\
               data[(data.SensorId==34) & (data.Type==0) & (data.seconds<=8*86400)],\
               data[(data.SensorId==35) & (data.Type==0) & (data.seconds<=8*86400)]]

In [63]:
# 5 closest neighbors of sensor 24 are sensors 22, 23, 25, 26, 27
neighbors_24 = [data[(data.SensorId==22) & (data.Type==0) & (data.seconds<=8*86400)],\
                data[(data.SensorId==23) & (data.Type==0) & (data.seconds<=8*86400)],\
                data[(data.SensorId==25) & (data.Type==0) & (data.seconds<=8*86400)],\
                data[(data.SensorId==26) & (data.Type==0) & (data.seconds<=8*86400)],\
                data[(data.SensorId==27) & (data.Type==0) & (data.seconds<=8*86400)]]

In [64]:
def convertTimeToSlots(dataframe):
    """
    Add a column 'slot' to the dataframe and divides the relative time in slots
    Also replace the 'seconds' value by the amount of seconds at the center of the slot
    :param dataframe: dataframe on which the operation are performed
    :return: modified dataframe
    """
    interval_slot = 30
    #divide data in slots of 30sec, add each slot value to each entry
    dataframe["slot"] = (dataframe["seconds"]//interval_slot).astype(int)
    #transform the seconds so that for each slot, its corresponding 'seconds' value is at the center of this slot (usefull for plots)
    dataframe["seconds"] = interval_slot*(dataframe["slot"] + dataframe["slot"]+1) / 2
    
    #Take care of the potential multiple value appearing within the same slot -> average them
    dataframe = dataframe.groupby(["slot"]).agg("mean")
    dataframe.reset_index(level=0, inplace=True)
    return dataframe

def fillMissingRows(dataframe):
    """
    Fill missing rows of the dataframe to ensure that there is a value at each time step (slot)
    so a prediction and a correction can be performed
    :param dataframe: dataframe on which the operation are performed
    :return: completed dataframe
    """
    interval_slot = 30
    nb_slots = 23040
    sensor_type = dataframe["SensorId"].values[0]
    missing_values = {"slot": [], "seconds": [], "SensorId": [], "Type": []}
    for i in range (nb_slots): #total nb of slots for 8 days
        if i not in dataframe["slot"].values:
            seconds = interval_slot*(i + i+1) / 2
            missing_values["slot"].append(i)
            missing_values["seconds"].append(seconds)
            missing_values["SensorId"].append(sensor_type)
            missing_values["Type"].append(0)
    #Build DataFrame with missing values
    temp_missing = pd.DataFrame(missing_values)
    #Merge the two Dataframe and sort them by values of the 'slot' column
    #At this point, the temperature values are still missing -> NaN
    complete_temp = dataframe.append(temp_missing).sort_values('slot')
    #Replace NaN by values extracted from a linear method based on the neighbors
    complete_temp["Value"] = complete_temp["Value"].interpolate(limit_direction="both")    
    return complete_temp

def preprocessDataFrames(output_df, input_dfs):
    """
    Preprocess the different dataframes to add their time slots and their missing values
    :param output_df: dataframe containing information of the desired sensor
    :param input_dfs: list of dataframes containing information of the neighbors sensors
    :return: complete output_df and merged list of complete neighbors df
    """
    output_df = fillMissingRows(convertTimeToSlots(output_df))
    output_df.reset_index(level=0, inplace=True)
    for i in range (len(input_dfs)):
        input_dfs[i] = fillMissingRows(convertTimeToSlots(input_dfs[i]))
    merged_inputs_dfs = pd.concat(input_dfs) 
    merged_inputs_dfs["slot"] = merged_inputs_dfs["slot"].astype(int)
    merged_inputs_dfs.reset_index(level=0, inplace=True)
    return output_df, merged_inputs_dfs


In [66]:
complete_temp_1, merged_neighbors_1 = preprocessDataFrames(temp_1, neighbors_1)
complete_temp_24, merged_neighbors_24 = preprocessDataFrames(temp_24, neighbors_24)

In [67]:
def applyRLS(output_df, input_dfs):
    """
    Apply the Recursive Least Square algorithm to predict the temperature of the desired sensor
    :param output_df: dataframe containing information of the desired sensor -> what we want to predict
    :param input_dfs: dataframe containing information of the neighbors -> features to learn
    """
    first_day_slots = 2880
    nb_slots = 23040
    # init weight, covariance matrix and forgetting parameter
    nb_features = len(input_dfs[input_dfs.slot == 0])
    betas = np.zeros(nb_features)
    covar_matrix = np.diag(np.zeros(nb_features)+1)
    forgetting_factor = 1.0
    
    predictions = []
    for slot in range (nb_slots-1):
        # truth value of next slot
        output = output_df[output_df.slot == slot+1].Value
        # features for current slot (neighbors values)
        inputs = np.asarray(input_dfs[input_dfs.slot == slot].Value)
        # compute signal
        signal = np.dot(inputs.T,betas)
        predictions.append(signal)
        
        # compute error and update weight
        error = output - signal
        delta_weight = covar_matrix.dot(inputs.T) *  error.values[0] #error.values[0] because error is a Series, we want a value only
        
        # update weight and covariance matrix
        betas = betas + delta_weight
        # update covariance matrix
        numerator = np.dot(np.dot(np.dot(covar_matrix, inputs), inputs), covar_matrix)
        denominator = forgetting_factor + np.dot(np.dot(inputs.T, covar_matrix), inputs)
        covar_matrix = (1/forgetting_factor) * (covar_matrix - (numerator/denominator))

    return predictions[first_day_slots-1:]  #Only need to have the predictions starting from day 2 (day 1 is used to bootstrap the prediction system)

In [68]:
predictions_1 = applyRLS(complete_temp_1, merged_neighbors_1)
predictions_24 = applyRLS(complete_temp_24, merged_neighbors_24)

In [69]:
truth_1 = complete_temp_1.Value
truth_1 = truth_1.loc[2880:len(truth_1)]
seconds_1 = complete_temp_1.seconds
seconds_1 = seconds_1.loc[2880:len(seconds_1)]

truth_24 = complete_temp_24.Value
truth_24 = truth_24.loc[2880:len(truth_24)]
seconds_24 = complete_temp_24.seconds
seconds_24 = seconds_24.loc[2880:len(seconds_24)]

In [71]:
def computeMSE(truth, predictions, sensor_id):
    MSE=np.mean((np.array(truth)-np.array(predictions))**2)
    print("MSE of model on day for sensor ", sensor_id, " for the 8 days: "+str(MSE))
    return MSE

MSE_1 = computeMSE(truth_1, predictions_1, 1)
MSE_24 = computeMSE(truth_24, predictions_24, 24)

MSE of model on day for sensor  1  for the 8 days: 0.24023144022324755
MSE of model on day for sensor  24  for the 8 days: 0.44776196574744676


In [72]:
#For plots
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode()

In [73]:
def plotPersistenceModel(sensor_id, truth, predictions, seconds, MSE):
    trace_truth = go.Scatter(
        y = truth,
        x = seconds,
        name="Truth"
    )

    trace_predictions = go.Scatter(
        y = predictions,
        x = seconds,
        name="Predictions"
    )

    layout= go.Layout(
        title= 'Truth and predictions for sensor ' + str(sensor_id) + ', for the 8 days<br>Persistence model <br>'+\
                'Mean square error: '+str(MSE),
        xaxis= dict(
            title= 'Time (seconds)',
        ),
        yaxis=dict(
            title= 'Temperature',
        ),
        showlegend= True
    )

    fig= go.Figure(data=[trace_truth,trace_predictions], layout=layout)
    iplot(fig)

In [74]:
plotPersistenceModel(1,truth_1, predictions_1, seconds_1, MSE_1)
plotPersistenceModel(24,truth_24, predictions_24, seconds_24, MSE_24)