In [19]:
import numpy as np
import pandas as pd

In [20]:
# Load the raw input CSV into `data` and preview its first five rows.
data = pd.read_csv(filepath_or_buffer="inputdata.csv")
data.head(n=5)

Unnamed: 0,timestamp,current,lampIsRunning,voltage
0,2019-11-05 09:19:17,0.0,False,0.0
1,2019-11-05 09:19:17,0.0,False,0.0
2,2019-11-05 09:19:17,138.933486,False,34.535565
3,2019-11-05 09:19:17,138.933486,True,34.535565
4,2019-11-05 09:19:17,139.860172,True,33.039042


In [21]:
# normalize data
#
# 1. Put the rows in descending index-label order, i.e. the reverse of the
#    order in which a freshly loaded frame (labels 0..n-1) is stored.
#    Sorting by label instead of slicing with [::-1] yields the same frame on
#    the first execution, but does not flip the order back again when this
#    cell is executed a second time.
# 2. Scale `voltage` and `current` by their column maximum so that the largest
#    value of each column becomes 1.0.

data = data.sort_index(ascending=False)

# Compute each maximum once: it is both reported and used as the divisor.
voltage_max = data["voltage"].max()
current_max = data["current"].max()

print(voltage_max)
print(current_max)

data["voltage"] = data["voltage"] / voltage_max
data["current"] = data["current"] / current_max

76.10825644869111
177.8291001434133


In [22]:
# Preview the first five rows of the reordered, normalized frame.
data.head(n=5)

Unnamed: 0,timestamp,current,lampIsRunning,voltage
10759,2019-11-04 22:48:20,0.670455,,
10758,2019-11-04 22:48:20,0.670455,,0.219111
10757,2019-11-04 22:48:22,0.672966,,0.219111
10756,2019-11-04 22:48:22,0.672966,,0.21445
10755,2019-11-04 22:48:24,0.674372,,0.21445


In [23]:
# remove NaN rows and add the TWX Analytics entityId feature
# (it is necessary for TWX Analytics)

# dropna() returns a new frame; rebinding `data` avoids mutating in place a
# frame that earlier cells have already displayed.
data = data.dropna()

# DataFrame.insert raises ValueError when the column already exists, so guard
# it to keep this cell safe to execute more than once.
if "entityId" not in data.columns:
    data.insert(0, "entityId", "Test1")

data

Unnamed: 0,entityId,timestamp,current,lampIsRunning,voltage
10675,Test1,2019-11-04 22:50:20,0.670266,False,0.834433
10674,Test1,2019-11-04 22:50:22,0.000000,False,0.000000
10673,Test1,2019-11-04 22:50:56,0.674805,True,0.210227
10672,Test1,2019-11-04 22:50:58,0.675624,True,0.201216
10671,Test1,2019-11-04 22:51:00,0.674761,True,0.194146
...,...,...,...,...,...
4,Test1,2019-11-05 09:19:17,0.786486,True,0.434106
3,Test1,2019-11-05 09:19:17,0.781275,True,0.453769
2,Test1,2019-11-05 09:19:17,0.781275,False,0.453769
1,Test1,2019-11-05 09:19:17,0.000000,False,0.000000


In [24]:
# Write the normalized frame to disk, leaving out the index column.
data.to_csv("normalizedData.csv", index=False, encoding="utf-8", sep=",")

# Build the list of index labels (in row order) of all rows whose
# `lampIsRunning` value equals False; these rows are treated as failures.
falses = data.loc[data["lampIsRunning"].eq(False)].index.tolist()
falses

[10675,
 10674,
 10633,
 10632,
 10255,
 10254,
 10115,
 10114,
 10044,
 10043,
 9986,
 9985,
 9984,
 9983,
 9982,
 2,
 1,
 0]

In [25]:
# add ttf feature (initialized with zeros)
#
# Plain column assignment appends `ttf` as the last column when it does not
# exist yet and simply resets it to zero when it does. Unlike
# insert(5, ...), it neither depends on the frame having exactly five columns
# nor raises when the cell is executed again.

data["ttf"] = 0
data

Unnamed: 0,entityId,timestamp,current,lampIsRunning,voltage,ttf
10675,Test1,2019-11-04 22:50:20,0.670266,False,0.834433,0
10674,Test1,2019-11-04 22:50:22,0.000000,False,0.000000,0
10673,Test1,2019-11-04 22:50:56,0.674805,True,0.210227,0
10672,Test1,2019-11-04 22:50:58,0.675624,True,0.201216,0
10671,Test1,2019-11-04 22:51:00,0.674761,True,0.194146,0
...,...,...,...,...,...,...
4,Test1,2019-11-05 09:19:17,0.786486,True,0.434106,0
3,Test1,2019-11-05 09:19:17,0.781275,True,0.453769,0
2,Test1,2019-11-05 09:19:17,0.781275,False,0.453769,0
1,Test1,2019-11-05 09:19:17,0.000000,False,0.000000,0


In [26]:
# calculate TTF for each row
#
# Walk the rows in their current order with a running counter:
#   * every row receives the current counter value as its `ttf`;
#   * after a failure row (an index label listed in `falses`) the counter is
#     reset to 0, otherwise it is incremented by one.
# Consequently the row that follows a failure gets ttf == 0, while the failure
# row itself keeps the count accumulated since the previous reset.
#
# NOTE(review): the counter grows with the distance *after* a failure row in
# the frame's current row order -- confirm that this is the intended direction
# for a "time to failure" feature.
#
# `falses` is only read here (set lookup), never consumed, so the cell can be
# executed repeatedly and does not fail when rows remain after the last
# failure.

failure_indexes = set(falses)

ttf_values = []
rows_since_reset = 0
for index in data.index:
    ttf_values.append(rows_since_reset)
    rows_since_reset = 0 if index in failure_indexes else rows_since_reset + 1

data["ttf"] = ttf_values

data

Unnamed: 0,entityId,timestamp,current,lampIsRunning,voltage,ttf
10675,Test1,2019-11-04 22:50:20,0.670266,False,0.834433,0
10674,Test1,2019-11-04 22:50:22,0.000000,False,0.000000,0
10673,Test1,2019-11-04 22:50:56,0.674805,True,0.210227,0
10672,Test1,2019-11-04 22:50:58,0.675624,True,0.201216,1
10671,Test1,2019-11-04 22:51:00,0.674761,True,0.194146,2
...,...,...,...,...,...,...
4,Test1,2019-11-05 09:19:17,0.786486,True,0.434106,9977
3,Test1,2019-11-05 09:19:17,0.781275,True,0.453769,9978
2,Test1,2019-11-05 09:19:17,0.781275,False,0.453769,9979
1,Test1,2019-11-05 09:19:17,0.000000,False,0.000000,0


In [27]:
# add TTF Label for each row (time to failure label)
#
# Rows whose ttf exceeds TTF_MAX are discarded; every remaining row gets a
# textual label chosen by the first matching rule below.
#
# NOTE(review): the bounds are expressed in rows, not seconds. The upper bound
# of the "1-2min" class (120) is four times that of "<1min" (30) -- confirm
# the thresholds against the actual sampling interval.

TTF_MAX = 400  # rows with a larger ttf are dropped from the data set

ttf_labels = ["failure", "<30sec", "<1min", "1-2min", ">2min"]

# Filter in one step instead of dropping rows one by one while iterating over
# the same frame; .copy() gives an independent frame to add the label to.
data = data.loc[data["ttf"] <= TTF_MAX].copy()

ttf = data["ttf"].to_numpy()

# np.select evaluates the conditions in order and takes the first match, which
# mirrors an if / elif chain; rows matching no condition get the default.
# Assigning the column appends it after `ttf` on the first run and overwrites
# it on later runs.
data["ttf_label"] = np.select(
    [ttf == 0, ttf <= 15, ttf <= 30, ttf <= 120],
    ttf_labels[:4],
    default=ttf_labels[4],
)

In [28]:
# Store the results as CSV. Two files are written because the index must be
# continuous in the training file:
#   1. reverse the row order and write the frame without its index;
#   2. read that file back, so the rows get a fresh default index, and write
#      it again with the index included as the first column.

data = data.iloc[::-1]
data.to_csv("postprocessed.csv", index=False, encoding="utf-8", sep=",")

data = pd.read_csv(filepath_or_buffer="postprocessed.csv")
data.to_csv("traindata.csv", index=True, encoding="utf-8", sep=",")


In [29]:
# Inspect the last 50 rows of the frame that was written to traindata.csv.
data.tail(n=50)

Unnamed: 0,entityId,timestamp,current,lampIsRunning,voltage,ttf,ttf_label
1047,Test1,2019-11-04 22:59:36,0.676179,True,0.199451,5,<30sec
1048,Test1,2019-11-04 22:59:34,0.673441,True,0.203553,4,<30sec
1049,Test1,2019-11-04 22:59:32,0.671376,True,0.207906,3,<30sec
1050,Test1,2019-11-04 22:59:30,0.672471,True,0.213103,2,<30sec
1051,Test1,2019-11-04 22:59:28,0.674805,True,0.210227,1,<30sec
1052,Test1,2019-11-04 22:59:28,0.0,True,0.0,0,failure
1053,Test1,2019-11-04 22:51:50,0.0,False,0.0,0,failure
1054,Test1,2019-11-04 22:51:48,0.060894,False,1.0,40,1-2min
1055,Test1,2019-11-04 22:51:48,0.060894,True,1.0,39,1-2min
1056,Test1,2019-11-04 22:51:46,0.063681,True,0.998578,38,1-2min
