# SmartBeds - Preprocesado de One-Class

## Grado en Ingeniería Informática
## Universidad de Burgos
##### José Luis Garrido Labrador

In [1]:
import pandas as pd # se importa pandas como pd
import numpy as np  #numpy como np
import matplotlib.pyplot as plt
import sys
sys.path.append('..')
from transformers import *
import pickle as pk
from utils import dibujado, start_end

In [2]:
# Install a global 'ignore' filter so that no warnings are displayed.
# NOTE(review): this also hides warnings raised by pandas/numpy and the
# project transformers — confirm that silencing everything is intended.
import warnings
warnings.filterwarnings('ignore')

### Carga de los datos

In [3]:
# Load the pickled collection of cleaned recordings and report its size.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
trozos_limpios = None
with open('../data/trozos_limpios.pkl', 'rb') as archivo:
    trozos_limpios = pk.load(archivo)
print(len(trozos_limpios))

98


Extraemos los días que tienen crisis

In [4]:
# Keep only the recordings that contain at least one row flagged as crisis.
# The vectorised `.any()` on the 'target' column replaces the previous
# row-by-row `iterrows()` scan, which is far slower on large frames and
# yields the same selection for a boolean column.
dias_crisis = [tl for tl in trozos_limpios if tl['target'].any()]
len(dias_crisis)

3

In [5]:
# Preview the first rows of the first recording that contains a crisis.
dias_crisis[0].head(n=5)

Unnamed: 0,DateTime,P1,P2,P3,P4,P5,P6,target
4089271,2018-11-09 21:03:27,4.098361,18.852459,21.311475,22.131148,13.114754,6.557377,False
4089270,2018-11-09 21:03:27,4.098361,18.852459,22.131148,22.131148,13.114754,6.557377,False
4089272,2018-11-09 21:03:28,4.918033,19.672131,22.131148,22.95082,13.934426,7.377049,False
4089273,2018-11-09 21:03:29,4.098361,18.852459,21.311475,22.131148,13.114754,6.557377,False
4089274,2018-11-09 21:03:30,0.0,18.852459,22.131148,22.95082,13.114754,6.557377,False


In [6]:
# Persist the recordings in which a crisis was detected.
with open('data/crisis_separadas.pkl', 'wb') as salida:
    pk.dump(dias_crisis, salida)

### Operaciones sobre los datos

In [7]:
# Filters applied directly to the sensor columns
# (presumably Savitzky-Golay and Butterworth — see the transformers module).
svg = SavgolTransformer(15)
btr = ButterTransformer(3, 0.05)

# Windowed statistics (window=25), each chained with the same Normalizer instance.
nor = Normalizer(max_=100)
avg, std, ran = [
    PipelineTransformer(StatisticsTransformer(mode=estadistico, window=25), nor)
    for estadistico in ('mean', 'std', 'range')
]

# NOTE(review): `var` is created here but is not part of any pipeline below.
var = VarianceThresholdPD(threshold=0.5)
con = ConcatenateTransformer(avg, std, ran)

# Pipelines: filter followed by statistics, statistics only, and filter only.
pps, ppb = PipelineTransformer(svg, con), PipelineTransformer(btr, con)
ppr = PipelineTransformer(con)
ppr_s, ppr_b = PipelineTransformer(svg), PipelineTransformer(btr)

### Preprocesamiento de las crisis por separado

#### Crisis del 10 de Noviembre de 2018
Se considera que la crisis comienza a las 2018-11-10 03:36:10 y termina a las 2018-11-10 03:40:37. El inicio se toma cuando P1 comienza a valer 0 y el final cuando P1 baja de 8.

In [8]:
# Restrict the crisis label to the [start, end] interval chosen for this
# recording: every row whose timestamp lies outside it is set to False.
start = pd.to_datetime("2018-11-10 03:36:10")
end = pd.to_datetime("2018-11-10 03:40:37")
crisis_analisis = dias_crisis[0]

# Work on a copy so the entry in dias_crisis keeps its original labels.
crisis_10_nov = crisis_analisis.copy()
instantes = crisis_10_nov['DateTime']
mask = (instantes > end) | (instantes < start)
crisis_10_nov.loc[mask, 'target'] = False

In [9]:
# Store the relabelled (otherwise untransformed) recording.
with open('data/crisis_18-nov-10-raw.pdd', 'wb') as salida:
    pk.dump(crisis_10_nov, salida)

In [10]:
# Separate timestamps, labels and the sensor columns that sit between them.
targets_10_nov = crisis_10_nov['target']
datetimes_10_nov = crisis_10_nov['DateTime']
# Positional slice: drops the first (DateTime) and last (target) columns.
data_10_nov = crisis_10_nov.iloc[:, 1:-1]
data_10_nov.head()

Unnamed: 0,P1,P2,P3,P4,P5,P6
4089271,4.098361,18.852459,21.311475,22.131148,13.114754,6.557377
4089270,4.098361,18.852459,22.131148,22.131148,13.114754,6.557377
4089272,4.918033,19.672131,22.131148,22.95082,13.934426,7.377049
4089273,4.098361,18.852459,21.311475,22.131148,13.114754,6.557377
4089274,0.0,18.852459,22.131148,22.95082,13.114754,6.557377


In [11]:
# Transformations: fit and apply each pipeline to the sensor columns,
# in the order filter-only (svg, btr), stats-only, then filter + stats.
(lsavgol_10_nov,
 butterworth_10_nov,
 stats_10_nov,
 savgol_stats_10_nov,
 butter_stats_10_nov) = [
    tuberia.fit_transform(data_10_nov)
    for tuberia in (ppr_s, ppr_b, ppr, pps, ppb)
]


In [12]:
# Write one pickle per transformed variant, with the timestamp and label
# columns re-attached on either side of the transformed data.
name = "data/crisis_18-nov-10-"
names = ['savgol','butter','stats','savgol-stats','butter-stats']
datos = [
    lsavgol_10_nov,
    butterworth_10_nov,
    stats_10_nov,
    savgol_stats_10_nov,
    butter_stats_10_nov,
]
for sufijo, transformado in zip(names, datos):
    with open(name + sufijo + ".pdd", 'wb') as fil:
        pk.dump(pd.concat((datetimes_10_nov, transformado, targets_10_nov), axis=1), fil)

#### Crisis del 28 de Enero de 2019
Se considera que la crisis va desde *2019-01-29 06:12:04* (cuando P4 baja por debajo de 18) hasta *2019-01-29 06:15:37*, cuando se estabiliza alrededor de 20.

In [13]:
# Restrict the crisis label to the [start, end] interval chosen for this
# recording: every row whose timestamp lies outside it is set to False.
start = pd.to_datetime("2019-01-29 06:12:04")
end = pd.to_datetime("2019-01-29 06:15:37")
crisis_analisis = dias_crisis[1]

# Work on a copy so the entry in dias_crisis keeps its original labels.
crisis_28_ene = crisis_analisis.copy()
instantes = crisis_28_ene['DateTime']
mask = (instantes > end) | (instantes < start)
crisis_28_ene.loc[mask, 'target'] = False

In [14]:
# Store the relabelled (otherwise untransformed) recording.
with open('data/crisis_19-ene-28-raw.pdd', 'wb') as salida:
    pk.dump(crisis_28_ene, salida)

In [15]:
# Separate timestamps, labels and the sensor columns that sit between them.
targets_28_ene = crisis_28_ene['target']
datetimes_28_ene = crisis_28_ene['DateTime']
# Positional slice: drops the first (DateTime) and last (target) columns.
data_28_ene = crisis_28_ene.iloc[:, 1:-1]
data_28_ene.head()

Unnamed: 0,P1,P2,P3,P4,P5,P6
6931232,0.0,20.491803,18.032787,21.311475,9.016393,6.557377
6931233,0.0,20.491803,18.032787,21.311475,9.016393,6.557377
6931234,0.0,20.491803,18.032787,21.311475,9.016393,6.557377
6931235,0.0,19.672131,18.032787,21.311475,9.836066,6.557377
6931236,0.0,20.491803,18.852459,22.131148,9.836066,6.557377


In [16]:
# Transformations: fit and apply each pipeline to the sensor columns,
# in the order filter-only (svg, btr), stats-only, then filter + stats.
(lsavgol_28_ene,
 butterworth_28_ene,
 stats_28_ene,
 savgol_stats_28_ene,
 butter_stats_28_ene) = [
    tuberia.fit_transform(data_28_ene)
    for tuberia in (ppr_s, ppr_b, ppr, pps, ppb)
]


In [17]:
# Write one pickle per transformed variant, with the timestamp and label
# columns re-attached on either side of the transformed data.
name = "data/crisis_19-ene-28-"
names = ['savgol','butter','stats','savgol-stats','butter-stats']
datos = [
    lsavgol_28_ene,
    butterworth_28_ene,
    stats_28_ene,
    savgol_stats_28_ene,
    butter_stats_28_ene,
]
for sufijo, transformado in zip(names, datos):
    with open(name + sufijo + ".pdd", 'wb') as fil:
        pk.dump(pd.concat((datetimes_28_ene, transformado, targets_28_ene), axis=1), fil)

### Concatenación de las crisis 1 y 2

In [18]:
# Stack crisis 1 on top of crisis 2 (row-wise, the default axis of
# pd.concat) for the timestamps, the labels and every data variant.
datetimes_full = pd.concat([datetimes_10_nov, datetimes_28_ene])
targets_full = pd.concat([targets_10_nov, targets_28_ene])
data_full = pd.concat([data_10_nov, data_28_ene])
savgol_full = pd.concat([lsavgol_10_nov, lsavgol_28_ene])
butter_full = pd.concat([butterworth_10_nov, butterworth_28_ene])
stats_full = pd.concat([stats_10_nov, stats_28_ene])
savgol_stats_full = pd.concat([savgol_stats_10_nov, savgol_stats_28_ene])
butter_stats_full = pd.concat([butter_stats_10_nov, butter_stats_28_ene])

In [19]:
def prepare(data, datetimes=None, targets=None):
    """Attach the timestamp and label columns to a block of data columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Columns to place between the timestamps and the labels.
    datetimes : pandas.Series, optional
        Timestamp column, placed first. When omitted, the module-level
        ``datetimes_full`` is used (looked up at call time).
    targets : pandas.Series, optional
        Label column, placed last. When omitted, the module-level
        ``targets_full`` is used (looked up at call time).

    Returns
    -------
    pandas.DataFrame
        Column-wise concatenation ``datetimes | data | targets``; rows are
        aligned on the index by ``pd.concat``.
    """
    # Explicit arguments let callers avoid rebinding the globals just to
    # reuse this helper with a different set of timestamps/labels.
    if datetimes is None:
        datetimes = datetimes_full
    if targets is None:
        targets = targets_full
    return pd.concat((datetimes, data, targets), axis=1)

In [20]:
# Wrap every concatenated variant with the timestamp and label columns.
data_raw, data_svg, data_btt, data_sts, data_sst, data_bst = [
    prepare(variante)
    for variante in (
        data_full,
        savgol_full,
        butter_full,
        stats_full,
        savgol_stats_full,
        butter_stats_full,
    )
]

In [21]:
# Store the training sets, one pickle per variant.
name = "data/train-"
names = ['raw','savgol','butter','stats','savgol-stats','butter-stats']
datos = [data_raw, data_svg, data_btt, data_sts, data_sst, data_bst]
for sufijo, conjunto in zip(names, datos):
    with open(name + sufijo + ".pdd", 'wb') as fil:
        pk.dump(conjunto, fil)


### Preparación de la crisis 3 para el testeo
Según los proveedores de los datos, la crisis fue medida correctamente, por lo que no se modificará el intervalo etiquetado como crisis.

In [22]:
# The third crisis recording is used as test data with its labels unchanged.
test = dias_crisis[2]
targets_test = test['target']
datetimes_test = test['DateTime']
# Positional slice: drops the first (DateTime) and last (target) columns.
data_test = test.iloc[:, 1:-1]
test.head()

Unnamed: 0,DateTime,P1,P2,P3,P4,P5,P6,target
2164084,2019-02-06 21:11:50,0.0,14.754098,27.04918,30.327869,10.655738,0.0,False
2164085,2019-02-06 21:11:51,0.0,15.57377,26.229508,24.590164,6.557377,0.0,False
2164087,2019-02-06 21:11:52,0.0,19.672131,29.508197,23.770492,4.918033,0.0,False
2164086,2019-02-06 21:11:52,0.0,18.032787,30.327869,27.04918,7.377049,0.0,False
2164088,2019-02-06 21:11:53,0.0,18.032787,25.409836,17.213115,0.0,0.0,False


#### Operamos sobre los datos de test
Como P1 y P6 tienen una varianza menor de 0.5, que es el umbral con el que generamos el entrenamiento, para no perderlas volvemos a crear los transformadores, pero sin el VarianceThreshold.

In [23]:
# Pipelines for the test data, built without VarianceThreshold.
# NOTE(review): these definitions match the pipelines created earlier in
# the notebook, where `var` was not part of any pipeline either — confirm
# whether the training pipelines were meant to include it.
pps, ppb = PipelineTransformer(svg, con), PipelineTransformer(btr, con)
ppr = PipelineTransformer(con)
ppr_s, ppr_b = PipelineTransformer(svg), PipelineTransformer(btr)

In [24]:
# Transformations: fit and apply each pipeline to the test sensor columns,
# in the order filter-only (svg, btr), stats-only, then filter + stats.
(lsavgol_test,
 butterworth_test,
 stats_test,
 savgol_stats_test,
 butter_stats_test) = [
    tuberia.fit_transform(data_test)
    for tuberia in (ppr_s, ppr_b, ppr, pps, ppb)
]


In [25]:
# Check that every test variant exposes exactly the same columns as the
# corresponding training variant; abort on the first mismatch.
datos_test = [data_test, lsavgol_test, butterworth_test, stats_test, savgol_stats_test, butter_stats_test]
# Fix: the last entry used to be `butter_stats_test`, so the final
# comparison checked the test frame against itself and could never fail.
datos_full = [data_full, savgol_full, butter_full, stats_full, savgol_stats_full, butter_stats_full]
for i, (t, f) in enumerate(zip(datos_test, datos_full)):
    # Columns present in only one of the two frames.
    sym = set(t.columns).symmetric_difference(f.columns)
    if sym:
        raise Exception("Las columnas no coinciden en el índice "+str(i))

In [26]:
# Rebind the globals that prepare() reads so that it attaches the test
# timestamps and labels from here on.
# NOTE(review): this overwrites the training series; re-running earlier
# prepare() calls after this cell would silently use the test values.
datetimes_full, targets_full = datetimes_test, targets_test

In [27]:
# Wrap every test variant with the timestamp and label columns.
test_raw, test_svg, test_btt, test_sts, test_sst, test_bst = [
    prepare(variante)
    for variante in (
        data_test,
        lsavgol_test,
        butterworth_test,
        stats_test,
        savgol_stats_test,
        butter_stats_test,
    )
]

In [28]:
# Store the test sets, one pickle per variant.
name = "data/test-"
names = ['raw','savgol','butter','stats','savgol-stats','butter-stats']
datos = [test_raw, test_svg, test_btt, test_sts, test_sst, test_bst]
for sufijo, conjunto in zip(names, datos):
    with open(name + sufijo + ".pdd", 'wb') as fil:
        pk.dump(conjunto, fil)