# SmartBeds - Preprocesado de Datos

## Grado en Ingeniería Informática
## Universidad de Burgos
##### José Luis Garrido Labrador

In [1]:
import pandas as pd # se importa pandas como pd
import numpy as np  #numpy como np
import matplotlib.pyplot as plt
from transformers import *

In [2]:
import pickle as pk

### Lectura

In [3]:
import loadData as ld

# Read everything under ../data/ and pass it straight through the loader
# module's preprocessing step; then preview the first rows.
datos = ld.preprocess(ld.load(r"../data/"))
datos.head()

Unnamed: 0,DateTime,P1,P2,P3,P4,P5,P6,P7,P8,P9,P10,P11,P12,target
7,2018-10-01 21:24:13,4.0,21.0,27.0,35.0,14.0,5.0,2.0,0.0,0.0,0.0,1.0,0.0,False
8,2018-10-01 21:24:14,4.0,20.0,26.0,35.0,14.0,5.0,2.0,0.0,0.0,0.0,1.0,0.0,False
9,2018-10-01 21:24:15,4.0,21.0,26.0,36.0,14.0,5.0,2.0,0.0,0.0,0.0,1.0,0.0,False
10,2018-10-01 21:24:16,2.0,20.0,25.0,34.0,13.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,False
11,2018-10-01 21:24:17,4.0,21.0,27.0,35.0,14.0,6.0,2.0,0.0,0.0,1.0,1.0,0.0,False


In [4]:
from sys import getsizeof
# Report the size of `datos` as given by sys.getsizeof, converted from bytes to GiB.
print(getsizeof(datos) / (1024 ** 3), 'GiB')

0.4823642037808895 GiB


### Datos manuales

In [5]:
# Manually registered seizures as (start timestamp, duration in seconds).
manual_seizures = [
    ('2018-11-10 3:30:00', 60*20),
    ('2019-02-06 21:41:00', 60*19),
    # The following seizures have no recorded end time; 15 minutes is assumed.
    ('2018-09-29 07:10:00', 60*15),
    ('2019-01-9 23:30:00', 60*15),
    ('2019-01-29 6:12:00', 60*15),
]
for seizure_start, seizure_seconds in manual_seizures:
    datos = ld.newSeizure(datos, seizure_start, seizure_seconds)

### Preprocesado

In [6]:
# Sort the samples chronologically, then pickle the sorted frame to disk.
datos = datos.sort_values('DateTime')
with open('data/datos_raw.pdd', mode='wb') as raw_file:
    pk.dump(datos, raw_file)

In [7]:
# Keep the timestamp and label columns aside; everything between the first
# and the last column is treated as sensor data.
dateTime = datos['DateTime']
target = datos['target']
sensor_columns = datos.columns[1:-1]
only_datos = datos[sensor_columns]
only_datos.head()

Unnamed: 0,P1,P2,P3,P4,P5,P6,P7,P8,P9,P10,P11,P12
5615198,1.0,14.0,11.0,18.0,2.0,9.0,1.0,0.0,0.0,0.0,1.0,0.0
5615265,1.0,14.0,11.0,18.0,2.0,9.0,1.0,0.0,0.0,0.0,1.0,0.0
5615291,1.0,14.0,11.0,19.0,3.0,9.0,1.0,0.0,0.0,0.0,1.0,0.0
5615346,1.0,14.0,11.0,19.0,3.0,10.0,1.0,0.0,0.0,0.0,1.0,0.0
5615404,1.0,14.0,11.0,18.0,3.0,9.0,1.0,0.0,0.0,0.0,1.0,0.0


In [8]:
# Transformers
# These classes are not defined in this notebook; presumably they come from the
# wildcard `transformers` import — TODO confirm their exact semantics there.
nf = NoiseFilter(minimum=5.0)
norm = Normalizer(max_=100)
# NOTE(review): `bt` is constructed but never passed to the pipeline below —
# confirm whether leaving the ButterTransformer step out is intentional.
bt = ButterTransformer(N=3,Wn=0.05)
vt = VarianceThresholdPD(threshold=0.5)
# Chain nf -> norm -> vt and fit/apply the chain to the sensor columns.
clean_data = PipelineTransformer(nf,norm,vt).fit_transform(only_datos)

In [9]:
# Preview the first rows of the pipeline output.
clean_data.head()

Unnamed: 0,P1,P2,P3,P4,P5,P6
5615198,0.0,11.47541,9.016393,14.754098,0.0,7.377049
5615265,0.0,11.47541,9.016393,14.754098,0.0,7.377049
5615291,0.0,11.47541,9.016393,15.57377,0.0,7.377049
5615346,0.0,11.47541,9.016393,15.57377,0.0,8.196721
5615404,0.0,11.47541,9.016393,14.754098,0.0,7.377049


In [10]:
# Re-attach the timestamp and label columns to the cleaned sensor data
# (column-wise) and pickle the result.
datosLimpios = pd.concat([dateTime, clean_data, target], axis='columns')
with open('data/datos_clean.pdd', mode='wb') as clean_file:
    pk.dump(datosLimpios, clean_file)

### Separación por noches
Se localizan los instantes que no tienen datos en la hora anterior y los que no tienen datos en la hora posterior; después, los datos se dividen en noches según ese criterio.

In [None]:
# Split the recording into nights.
#
# A night starts at any sample whose gap to the previous sample is at least
# `margen` (the very first sample always starts one) and ends at the sample
# right before the next start; the last night ends at the final sample.
#
# Fixes over the previous loop:
#   * the "previous timestamp" was never updated, so every row was compared
#     against 1970-01-01 and treated as the start of a new night;
#   * the end of the last night was never recorded, leaving `finales` one
#     element shorter than `inicios`;
#   * the row-by-row `iterrows` scan is replaced by vectorized operations.
#
# Assumes `datosLimpios` is ordered by 'DateTime' (the raw data is sorted
# with sort_values before cleaning) — TODO confirm the order is preserved.
margen = pd.to_timedelta(1,unit='h')
tiempos = pd.to_datetime(datosLimpios['DateTime'])

# Gap to the previous sample is NaT for the first row, which compares False.
es_inicio = (tiempos.diff() >= margen).to_numpy(copy=True)
if es_inicio.size:
    es_inicio[0] = True  # the first sample always opens a night

inicios = tiempos[es_inicio].tolist()

# The sample preceding each start closes the previous night; the first entry
# is the NaT "predecessor" of the very first sample, so it is dropped.
finales = tiempos.shift(1)[es_inicio].tolist()[1:]
if len(tiempos):
    finales.append(tiempos.iloc[-1])  # close the last night

inicios,finales

### Cálculo de las estadísticas móviles
#### Widgets

In [11]:
import gc
# Force a garbage-collection pass; the displayed value is what gc.collect() returns.
gc.collect()

88

In [12]:
import ipywidgets as widgets
from IPython.display import display

# Integer slider (1 to 50, initially 25) whose value is read later as the
# rolling-window size.
slider = widgets.IntSlider(value=25, min=1, max=50, description='Ventana:',
                           readout=True, readout_format='d')
display(slider)

IntSlider(value=25, description='Ventana:', max=50, min=1)

In [13]:
# Window size comes from the slider widget.
window = slider.value
# Statistic transformers: one pipeline per statistic (mean, std, range), each
# followed by the shared `norm` step, then concatenated into a single transformer.
mean, std, ran = (
    PipelineTransformer(StatisticsTransformer(mode=stat_mode, window=window), norm)
    for stat_mode in ('mean', 'std', 'range')
)
cnt = ConcatenateTransformer(mean, std, ran)

In [14]:
# Fit the concatenated statistic pipelines on the cleaned data and transform it in one call.
clean_stats = cnt.fit_transform(clean_data)

In [16]:
# Preview the first rows of the statistics frame.
clean_stats.head()

Unnamed: 0,P1 mean 25,P2 mean 25,P3 mean 25,P4 mean 25,P5 mean 25,P6 mean 25,P1 std 25,P2 std 25,P3 std 25,P4 std 25,P5 std 25,P6 std 25,P1 range 25,P2 range 25,P3 range 25,P4 range 25,P5 range 25,P6 range 25
5615854,0.0,18.013381,14.153371,23.880597,0.0,11.785898,0.0,0.0,0.0,1.445319,0.0,1.067438,0.0,0.0,0.0,0.877193,0.0,0.877193
5615855,0.0,18.013381,14.153371,23.932064,0.0,11.785898,0.0,0.0,0.0,1.426425,0.0,1.067438,0.0,0.0,0.0,0.877193,0.0,0.877193
5615856,0.0,18.013381,14.153371,23.983531,0.0,11.837365,0.0,0.0,0.0,1.397605,0.0,1.164671,0.0,0.0,0.0,0.877193,0.0,0.877193
5615857,0.0,18.013381,14.153371,23.932064,0.0,11.837365,0.0,0.0,0.0,1.426425,0.0,1.164671,0.0,0.0,0.0,0.877193,0.0,0.877193
5615858,0.0,18.013381,14.153371,23.880597,0.0,11.837365,0.0,0.0,0.0,1.445319,0.0,1.164671,0.0,0.0,0.0,0.877193,0.0,0.877193


### Exportación

In [17]:
# Re-attach the timestamp and label columns to the statistics (column-wise).
datosLimpios_Stats = pd.concat([dateTime, clean_stats, target], axis='columns')

In [18]:
# Pickle the frame with the rolling statistics.
with open('data/datos_clean_stats.pdd', mode='wb') as stats_file:
    pk.dump(datosLimpios_Stats, stats_file)