In [1]:
import time
import numpy as np
import pandas as pd

In [2]:
def prepareDataFrame(data_file):
    #Takes about one minute to load
    data=pd.read_csv(data_conv,header=None,sep=" ")
    data.columns=["Date","Hour","Sensor","Value","Voltage"]
    data=data.sort_values(['Date','Hour']).reset_index(drop=True)
    
    data['datetime']=pd.to_datetime(data.Date+' '+data.Hour)
    data['relative_datetime']=data['datetime']-data['datetime'][0]
    data['seconds']=data['relative_datetime'].dt.total_seconds()
    
    sensorId_type=data.Sensor.str.split("-",expand=True)
    sensorId_type.columns=['SensorId','Type']
    data['SensorId']=sensorId_type['SensorId'].astype(int)
    data['Type']=sensorId_type['Type'].astype(int)
    
    #Drop features not needed for the simulation
    data=data.drop(['datetime','relative_datetime','Sensor','Date','Hour','Voltage'],axis=1)
    return data

In [3]:
data_conv = "../data/data.conv.txt"
data = prepareDataFrame(data_conv)

# DO NOT RUN ABOVE

In [4]:
temp_1=data[(data.SensorId==1) & (data.Type==0) & (data.seconds<=8*86400)]
temp_1=temp_1.reset_index(drop=True)

In [5]:
temp_24=data[(data.SensorId==24) & (data.Type==0) & (data.seconds<=8*86400)]
temp_24=temp_24.reset_index(drop=True)

In [6]:
# 5 closest neighbors of sensor 1 are sensors 2, 3, 33, 34, 35
neighbors_1 = [data[(data.SensorId==2) & (data.Type==0) & (data.seconds<=8*86400)],\
               data[(data.SensorId==3) & (data.Type==0) & (data.seconds<=8*86400)],\
               data[(data.SensorId==33) & (data.Type==0) & (data.seconds<=8*86400)],\
               data[(data.SensorId==34) & (data.Type==0) & (data.seconds<=8*86400)],\
               data[(data.SensorId==35) & (data.Type==0) & (data.seconds<=8*86400)]]

In [7]:
# 5 closest neighbors of sensor 24 are sensors 22, 23, 25, 26, 27
neighbors_24 = [data[(data.SensorId==22) & (data.Type==0) & (data.seconds<=8*86400)],\
                data[(data.SensorId==23) & (data.Type==0) & (data.seconds<=8*86400)],\
                data[(data.SensorId==25) & (data.Type==0) & (data.seconds<=8*86400)],\
                data[(data.SensorId==26) & (data.Type==0) & (data.seconds<=8*86400)],\
                data[(data.SensorId==27) & (data.Type==0) & (data.seconds<=8*86400)]]

In [8]:
def convertTimeToSlots(dataframe):
    """
    Add a column 'slot' to the dataframe and divides the relative time in slots
    Also replace the 'seconds' value by the amount of seconds at the center of the slot
    :param dataframe: dataframe on which the operation are performed
    :return: modified dataframe
    """
    interval_slot = 30
    #divide data in slots of 30sec, add each slot value to each entry
    dataframe["slot"] = (dataframe["seconds"]//interval_slot).astype(int)
    #transform the seconds so that for each slot, its corresponding 'seconds' value is at the center of this slot (usefull for plots)
    dataframe["seconds"] = interval_slot*(dataframe["slot"] + dataframe["slot"]+1) / 2
    
    #Take care of the potential multiple value appearing within the same slot -> average them
    dataframe = dataframe.groupby(["slot"]).agg("mean")
    dataframe.reset_index(level=0, inplace=True)
    return dataframe

def fillMissingRows(dataframe):
    """
    Fill missing rows of the dataframe to ensure that there is a value at each time step (slot)
    so a prediction and a correction can be performed
    :param dataframe: dataframe on which the operation are performed
    :return: completed dataframe
    """
    interval_slot = 30
    nb_slots = 23040
    sensor_type = dataframe["SensorId"].values[0]
    missing_values = {"slot": [], "seconds": [], "SensorId": [], "Type": []}
    for i in range (nb_slots): #total nb of slots for 8 days
        if i not in dataframe["slot"].values:
            seconds = interval_slot*(i + i+1) / 2
            missing_values["slot"].append(i)
            missing_values["seconds"].append(seconds)
            missing_values["SensorId"].append(sensor_type)
            missing_values["Type"].append(0)
    #Build DataFrame with missing values
    temp_missing = pd.DataFrame(missing_values)
    #Merge the two Dataframe and sort them by values of the 'slot' column
    #At this point, the temperature values are still missing -> NaN
    complete_temp = dataframe.append(temp_missing).sort_values('slot')
    #Replace NaN by values extracted from a linear method based on the neighbors
    complete_temp["Value"] = complete_temp["Value"].interpolate(limit_direction="both")    
    return complete_temp

def preprocessDataFrames(output_df, input_dfs):
    """
    Preprocess the different dataframes to add their time slots and their missing values
    :param output_df: dataframe containing information of the desired sensor
    :param input_dfs: list of dataframes containing information of the neighbors sensors
    :return: complete output_df and merged list of complete neighbors df
    """
    output_df = fillMissingRows(convertTimeToSlots(output_df))
    output_df.reset_index(level=0, inplace=True)
    for i in range (len(input_dfs)):
        input_dfs[i] = fillMissingRows(convertTimeToSlots(input_dfs[i]))
    merged_inputs_dfs = pd.concat(input_dfs) 
    merged_inputs_dfs["slot"] = merged_inputs_dfs["slot"].astype(int)
    merged_inputs_dfs.reset_index(level=0, inplace=True)
    return output_df, merged_inputs_dfs



In [9]:
complete_temp_1, merged_neighbors_1 = preprocessDataFrames(temp_1, neighbors_1)
complete_temp_24, merged_neighbors_24 = preprocessDataFrames(temp_24, neighbors_24)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/

# SGD FROM HERE

In [11]:
print(complete_temp_1.head(5))
print(merged_neighbors_1.head(5))
print(complete_temp_24.head(5))
print(merged_neighbors_24.head(5))

   index  SensorId  Type      Value  seconds  slot
0      0         1     0  19.243600     15.0     0
1      1         1     0  19.243600     45.0     1
2      0         1     0  19.243600     75.0     2
3      2         1     0  19.237067    105.0     3
4      3         1     0  19.230533    135.0     4
        index  SensorId  Type    Value   seconds   slot
115195  15828        35     0  23.1538  691065.0  23035
115196  15829        35     0  23.1538  691095.0  23036
115197   7207        35     0  23.1538  691125.0  23037
115198  15830        35     0  23.1538  691155.0  23038
115199  15831        35     0  23.1636  691185.0  23039
   index  SensorId  Type    Value  seconds  slot
0      0        24     0  18.1362     15.0     0
1      0        24     0  18.1362     45.0     1
2      1        24     0  18.1166     75.0     2
3      2        24     0  18.1264    105.0     3
4      3        24     0  18.1362    135.0     4
   index  SensorId  Type    Value  seconds  slot
0      0       

In [14]:
day_1 = complete_temp_1.drop(complete_temp_1[complete_temp_1.slot >= 2880].index)
print(day_1)

   index  SensorId  Type      Value  seconds  slot
0      0         1     0  19.243600     15.0     0
1      1         1     0  19.243600     45.0     1
2      0         1     0  19.243600     75.0     2
3      2         1     0  19.237067    105.0     3
4      3         1     0  19.230533    135.0     4
      index  SensorId  Type     Value  seconds  slot
2875   2230         1     0  19.22400  86265.0  2875
2876   2231         1     0  18.47920  86295.0  2876
2877   2232         1     0  18.83690  86325.0  2877
2878   2233         1     0  19.22400  86355.0  2878
2879    645         1     0  19.06916  86385.0  2879
      index  SensorId  Type      Value  seconds  slot
0         0         1     0  19.243600     15.0     0
1         1         1     0  19.243600     45.0     1
2         0         1     0  19.243600     75.0     2
3         2         1     0  19.237067    105.0     3
4         3         1     0  19.230533    135.0     4
5         1         1     0  19.224000    165.0     

In [17]:
Y = day_1['Value']
bias = pd.Series(1,index=range(len(Y))) 
print(Y)

0       19.243600
1       19.243600
2       19.243600
3       19.237067
4       19.230533
5       19.224000
6       19.214200
7       19.199500
8       19.184800
9       19.189700
10      19.194600
11      19.184800
12      19.175000
13      19.184800
14      19.184800
15      19.184800
16      19.175000
17      19.168467
18      19.161933
19      19.155400
20      19.155400
21      19.148050
22      19.140700
23      19.133350
24      19.126000
25      19.145600
26      19.126000
27      19.129267
28      19.132533
29      19.135800
          ...    
2850    18.873650
2851    18.871200
2852    18.871200
2853    18.866300
2854    18.866300
2855    19.273000
2856    19.072100
2857    18.871200
2858    19.263200
2859    18.469400
2860    18.861400
2861    18.459600
2862    18.653150
2863    18.846700
2864    18.841800
2865    19.224000
2866    18.727467
2867    18.866300
2868    19.224000
2869    19.224000
2870    18.846700
2871    18.469400
2872    19.214200
2873    19.204400
2874    18

In [216]:
data = day_1.drop(["index", "SensorId", "Type", "slot", "seconds"], axis = 1)
data["Bias"] = bias
print(data)

          Value  Bias
0     19.243600     1
1     19.243600     1
2     19.243600     1
3     19.237067     1
4     19.230533     1
5     19.224000     1
6     19.214200     1
7     19.199500     1
8     19.184800     1
9     19.189700     1
10    19.194600     1
11    19.184800     1
12    19.175000     1
13    19.184800     1
14    19.184800     1
15    19.184800     1
16    19.175000     1
17    19.168467     1
18    19.161933     1
19    19.155400     1
20    19.155400     1
21    19.148050     1
22    19.140700     1
23    19.133350     1
24    19.126000     1
25    19.145600     1
26    19.126000     1
27    19.129267     1
28    19.132533     1
29    19.135800     1
...         ...   ...
2850  18.873650     1
2851  18.871200     1
2852  18.871200     1
2853  18.866300     1
2854  18.866300     1
2855  19.273000     1
2856  19.072100     1
2857  18.871200     1
2858  19.263200     1
2859  18.469400     1
2860  18.861400     1
2861  18.459600     1
2862  18.653150     1
2863  18.8

In [217]:
Header_X_Bias = list(data.columns.values)
Header_X_Bias = Header_X_Bias[:-1]
Header_X_Bias.insert(0,"Bias")

print(Header_X_Bias)
data = data[Header_X_Bias] 
print(data)

['Bias', 'Value']
      Bias      Value
0        1  19.243600
1        1  19.243600
2        1  19.243600
3        1  19.237067
4        1  19.230533
5        1  19.224000
6        1  19.214200
7        1  19.199500
8        1  19.184800
9        1  19.189700
10       1  19.194600
11       1  19.184800
12       1  19.175000
13       1  19.184800
14       1  19.184800
15       1  19.184800
16       1  19.175000
17       1  19.168467
18       1  19.161933
19       1  19.155400
20       1  19.155400
21       1  19.148050
22       1  19.140700
23       1  19.133350
24       1  19.126000
25       1  19.145600
26       1  19.126000
27       1  19.129267
28       1  19.132533
29       1  19.135800
...    ...        ...
2850     1  18.873650
2851     1  18.871200
2852     1  18.871200
2853     1  18.866300
2854     1  18.866300
2855     1  19.273000
2856     1  19.072100
2857     1  18.871200
2858     1  19.263200
2859     1  18.469400
2860     1  18.861400
2861     1  18.459600
2862     1  18

In [218]:
X = np.array(data)
Y = np.array(Y)
Y = Y.reshape(len(Y),1)
Theta = [0,0]
Theta = np.array(Theta)
Theta = Theta.reshape(2,1)

In [219]:
print(X)

[[ 1.      19.2436 ]
 [ 1.      19.2436 ]
 [ 1.      19.2436 ]
 ...
 [ 1.      18.8369 ]
 [ 1.      19.224  ]
 [ 1.      19.06916]]


In [220]:
print(Y)

[[19.2436 ]
 [19.2436 ]
 [19.2436 ]
 ...
 [18.8369 ]
 [19.224  ]
 [19.06916]]


In [221]:
print(Theta)

[[0]
 [0]]


In [222]:
def cost(X,Y,Theta):    
    Hypothesis = np.dot(X,Theta)    
    Error = Y - Hypothesis    
    #Matrix method for calculating Cost    
    Cost = np.dot(Error.T,Error)/(2*len(Error))    
    return Cost[0][0]

In [223]:
alpha = 0.01
Iterations = 1500

In [224]:
def gradient(X,Y,Theta,Iterations,alpha):    
    for i in range(Iterations):        
        Loss = Y - np.dot(X,Theta) + (np.dot(Theta.T,Theta)*0.001)
        Cost = cost(X,Y,Theta)        
        Loss = Loss*(-1)        
        dJ = (np.dot(X.T,Loss)*2)/len(Y)
        Theta = Theta - (alpha*dJ)
    return Theta

In [225]:
Theta_Iterated = gradient(X,Y,Theta,Iterations,alpha)

Theta_Normal = np.dot(np.dot(np.linalg.inv(np.dot(X.T,X)),X.T),Y)

  This is separate from the ipykernel package so we can avoid doing imports until


In [226]:
print(Theta_Iterated)

[[nan]
 [nan]]


In [227]:
print(Theta_Normal)

[[6.8567374e-13]
 [1.0000000e+00]]


In [228]:
prediction = np.dot(X, Theta_Normal)
real = Y

In [229]:
for i in range(len(prediction)):
    print("pred: {} |real : {}".format(prediction[i][0], real[i][0]))

pred: 19.243600000000335 |real : 19.2436
pred: 19.243600000000335 |real : 19.2436
pred: 19.243600000000335 |real : 19.2436
pred: 19.237066666667 |real : 19.237066666666667
pred: 19.230533333333668 |real : 19.230533333333334
pred: 19.224000000000334 |real : 19.224
pred: 19.214200000000336 |real : 19.2142
pred: 19.199500000000334 |real : 19.1995
pred: 19.184800000000333 |real : 19.1848
pred: 19.189700000000336 |real : 19.189700000000002
pred: 19.194600000000335 |real : 19.1946
pred: 19.184800000000337 |real : 19.184800000000003
pred: 19.175000000000335 |real : 19.175
pred: 19.184800000000333 |real : 19.1848
pred: 19.184800000000333 |real : 19.1848
pred: 19.184800000000333 |real : 19.1848
pred: 19.175000000000335 |real : 19.175
pred: 19.168466666667 |real : 19.168466666666667
pred: 19.161933333333668 |real : 19.161933333333334
pred: 19.155400000000334 |real : 19.1554
pred: 19.155400000000334 |real : 19.1554
pred: 19.14805000000033 |real : 19.148049999999998
pred: 19.140700000000333 |real 

pred: 20.870400000000302 |real : 20.8704
pred: 20.870400000000302 |real : 20.8704
pred: 23.516400000000257 |real : 23.5164
pred: 23.506600000000255 |real : 23.5066
pred: 23.496800000000256 |real : 23.4968
pred: 23.467400000000257 |real : 23.4674
pred: 20.772400000000307 |real : 20.7724
pred: 21.660933333333624 |real : 21.660933333333336
pred: 22.54946666666694 |real : 22.549466666666667
pred: 23.438000000000258 |real : 23.438000000000002
pred: 22.08560000000028 |real : 22.0856
pred: 22.095400000000282 |real : 22.0954
pred: 22.095400000000282 |real : 22.0954
pred: 22.08560000000028 |real : 22.0856
pred: 22.09050000000028 |real : 22.0905
pred: 22.09540000000028 |real : 22.095399999999998
pred: 22.09050000000028 |real : 22.0905
pred: 20.713600000000305 |real : 20.7136
pred: 20.718500000000304 |real : 20.7185
pred: 20.723400000000307 |real : 20.7234
pred: 21.161133333333634 |real : 21.161133333333336
pred: 21.598866666666957 |real : 21.598866666666666
pred: 22.03660000000028 |real : 22.036

In [265]:
def online_train(Xn, Yn, Theta):
    loss = Yn - np.dot(Xn,Theta) + (np.dot(Theta.T,Theta)*0.001)
    loss = loss*(-1)        
    dJ = (np.dot(Xn.T,loss)*2)/len(Y)
    Theta = Theta - (alpha*dJ)
    return Theta

# APPROACH FOR ALGO

In [378]:
SLOTS_PER_DAY = 2880
SIM_DAYS = 8

alpha = 0.01
Iterations = 1500

def applySGD(complete_temp_1):
    Y_per_day = []
    
    # extract data for each day
    for i in range(0, SIM_DAYS):
        
        # cut into slots / day
        day = complete_temp_1.drop(complete_temp_1[complete_temp_1.slot >= (i + 1) * SLOTS_PER_DAY].index)
        day = day.drop(day[day.slot < i * SLOTS_PER_DAY].index)
        
        # extract X, Y
        Y = day['Value']
        
        # append to lists
        Y_per_day.append(Y)
        
    # initialize bias terms
    bias = pd.Series(1,index=range(len(Y))) 
    
    # initialize weights
    Theta = [0,0]
    Theta = np.array(Theta)
    Theta = Theta.reshape(2,1)
    
    # predict starting from day 1
    for i in range(1, SIM_DAYS):
        
        # data for training = values observed previous day (Y - 1)
        X = Y_per_day[i-1].to_frame()
        X = X.reset_index()
        bias = pd.Series(1,index=range(len(Y))) 
        X["Bias"] = bias
        X = X.drop(["index"], axis=1)
        
        # put bias first
        Header_X_Bias = list(X.columns.values)
        Header_X_Bias = Header_X_Bias[:-1]
        Header_X_Bias.insert(0,"Bias")
        X = X[Header_X_Bias]
        
        # goal = values current day
        Y = Y_per_day[i]
        
        # convert
        X = np.array(X)
        Y = np.array(Y)
        Y = Y.reshape(len(Y),1)
        
        # update weights & normalize for pricrion
        Theta = online_train(X,Y, Theta)
        Theta_Norm = np.dot(np.dot(np.linalg.inv(np.dot(X.T,X)),X.T),Y)
        
        prediction = np.dot(X, Theta_Norm)
        
        # compare on first 5
        for i in range(5):
            print("supposed: {}| predicted: {}".format(Y[i], prediction[i]))
        print("-- --")

In [379]:
applySGD(complete_temp_1)

supposed: [18.91432]| predicted: [20.3946819]
supposed: [18.75948]| predicted: [20.3946819]
supposed: [18.60464]| predicted: [20.3946819]
supposed: [18.4498]| predicted: [20.38659025]
supposed: [18.4449]| predicted: [20.3784986]
-- --
supposed: [19.3269]| predicted: [19.57842349]
supposed: [19.33425]| predicted: [19.45993885]
supposed: [19.3416]| predicted: [19.34145421]
supposed: [19.3465]| predicted: [19.22296957]
supposed: [19.3514]| predicted: [19.21922005]
-- --
supposed: [20.057]| predicted: [20.26615093]
supposed: [20.0668]| predicted: [20.27299488]
supposed: [20.06026667]| predicted: [20.27983884]
supposed: [20.05373333]| predicted: [20.28440147]
supposed: [20.0472]| predicted: [20.28896411]
-- --
supposed: [20.5666]| predicted: [20.04769379]
supposed: [20.5666]| predicted: [20.05691903]
supposed: [20.56415]| predicted: [20.05076887]
supposed: [20.5617]| predicted: [20.04461871]
supposed: [20.55925]| predicted: [20.03846855]
-- --
supposed: [20.4196]| predicted: [20.87012482]
s