# SmartBeds - Preprocesado de Datos

## Grado en Ingeniería Informática
## Universidad de Burgos
##### José Luis Garrido Labrador

In [19]:
import pandas as pd # se importa pandas como pd
import numpy as np  #numpy como np
import matplotlib.pyplot as plt
from transformers import *

In [20]:
import pickle as pk

### Lectura

In [21]:
import loadData as ld

datos = ld.load(r"../data/")
datos = ld.preprocess(datos)
datos.head()

Unnamed: 0,DateTime,P1,P2,P3,P4,P5,P6,P7,P8,P9,P10,P11,P12,target
1,2018-09-14 21:04:18,0.0,2.0,3.0,4.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,False
4,2018-09-14 21:04:21,0.0,2.0,3.0,4.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,False
7,2018-09-14 21:04:24,0.0,2.0,3.0,4.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,False
8,2018-09-14 21:04:25,0.0,2.0,3.0,5.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,False
9,2018-09-14 21:04:26,0.0,2.0,3.0,5.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,False


In [22]:
from sys import getsizeof
print(getsizeof(datos)/(2**30),'GiB')

0.4823642037808895 GiB


### Datos manuales

In [23]:
datos = ld.newSeizure(datos,'2018-11-10 3:30:00',60*20)
datos = ld.newSeizure(datos,'2019-02-06 21:41:00',60*19)
#Las siguientes crisis no tienen momeento final, se ha considerado 15 minutos
datos = ld.newSeizure(datos,'2018-09-29 07:10:00',60*15)
datos = ld.newSeizure(datos,'2019-01-9 23:30:00',60*15)
datos = ld.newSeizure(datos,'2019-01-29 6:12:00',60*15)

### Preprocesado

In [24]:
# Ordención
datos = datos.sort_values(by='DateTime')
with open('data/datos_raw.pdd','wb') as f:
    pk.dump(datos,f)

In [25]:
dateTime=datos['DateTime']
target=datos['target']
only_datos = datos[datos.columns[1:len(datos.columns)-1]]
only_datos.head()

Unnamed: 0,P1,P2,P3,P4,P5,P6,P7,P8,P9,P10,P11,P12
1509723,1.0,14.0,11.0,18.0,2.0,9.0,1.0,0.0,0.0,0.0,1.0,0.0
5729641,1.0,14.0,11.0,18.0,2.0,9.0,1.0,0.0,0.0,0.0,1.0,0.0
5729667,1.0,14.0,11.0,19.0,3.0,9.0,1.0,0.0,0.0,0.0,1.0,0.0
5729722,1.0,14.0,11.0,19.0,3.0,10.0,1.0,0.0,0.0,0.0,1.0,0.0
5729780,1.0,14.0,11.0,18.0,3.0,9.0,1.0,0.0,0.0,0.0,1.0,0.0


In [26]:
#Transformadores
nf = NoiseFilter(minimum=5.0)
norm = Normalizer(max_=100)
vt = VarianceThresholdPD(threshold=0.5)
clean_data = PipelineTransformer(nf,norm,vt).fit_transform(only_datos)

In [27]:
clean_data.head()

Unnamed: 0,P1,P2,P3,P4,P5,P6
1509723,0.0,11.47541,9.016393,14.754098,0.0,7.377049
5729641,0.0,11.47541,9.016393,14.754098,0.0,7.377049
5729667,0.0,11.47541,9.016393,15.57377,0.0,7.377049
5729722,0.0,11.47541,9.016393,15.57377,0.0,8.196721
5729780,0.0,11.47541,9.016393,14.754098,0.0,7.377049


In [28]:
datosLimpios = pd.concat((dateTime,clean_data,target),axis=1)
with open('data/datos_clean.pdd','wb') as f:
    pk.dump(datosLimpios,f)

### Separación por noches
Se localizan los momentos que no tengan datos una hora antes y los que no tengan datos una hora después, tras esto se parten los datos según ese criterio

In [30]:
inicios = list()
finales = list()

anterior = pd.to_datetime('1970-01-01 00:00:00')
margen = pd.to_timedelta(1,unit='h')
for index,dl in datosLimpios.iterrows():
    hora = dl['DateTime']
    diff = hora-anterior
    if diff >= margen:
        if len(inicios)>len(finales): #Primera vuelta no introduce la fecha anterior
            finales.append(anterior)
        inicios.append(hora)
    anterior = hora

inicios,finales

([Timestamp('2018-08-28 12:04:57'),
  Timestamp('2018-08-28 14:51:57'),
  Timestamp('2018-08-29 05:51:21'),
  Timestamp('2018-08-30 04:31:46'),
  Timestamp('2018-08-30 23:14:40'),
  Timestamp('2018-09-02 15:25:10'),
  Timestamp('2018-09-02 20:18:51'),
  Timestamp('2018-09-03 00:10:35'),
  Timestamp('2018-09-03 15:43:12'),
  Timestamp('2018-09-04 05:53:38'),
  Timestamp('2018-09-07 11:25:27'),
  Timestamp('2018-09-07 17:29:36'),
  Timestamp('2018-09-08 16:19:38'),
  Timestamp('2018-09-11 21:24:39'),
  Timestamp('2018-09-12 21:30:52'),
  Timestamp('2018-09-13 21:25:09'),
  Timestamp('2018-09-14 21:04:18'),
  Timestamp('2018-09-15 21:18:59'),
  Timestamp('2018-09-17 21:13:32'),
  Timestamp('2018-09-18 21:29:33'),
  Timestamp('2018-09-20 13:54:40'),
  Timestamp('2018-09-20 19:08:43'),
  Timestamp('2018-09-21 21:22:02'),
  Timestamp('2018-09-22 21:04:12'),
  Timestamp('2018-09-23 21:07:35'),
  Timestamp('2018-09-24 09:12:01'),
  Timestamp('2018-09-25 07:33:30'),
  Timestamp('2018-09-25 21:1

In [36]:
finales.append(pd.Timestamp.now())

In [38]:
trozos = []
for i in range(len(inicios)):
    ini = inicios[i]
    fin = finales[i]
    mask = (datosLimpios['DateTime'] >= ini) & (datosLimpios['DateTime'] <= fin)
    trozos.append(datosLimpios.loc[mask])
trozos

[                   DateTime   P1         P2        P3         P4   P5  \
 1509723 2018-08-28 12:04:57  0.0  11.475410  9.016393  14.754098  0.0   
 5729641 2018-08-28 12:06:57  0.0  11.475410  9.016393  14.754098  0.0   
 5729667 2018-08-28 12:07:23  0.0  11.475410  9.016393  15.573770  0.0   
 5729722 2018-08-28 12:08:19  0.0  11.475410  9.016393  15.573770  0.0   
 5729780 2018-08-28 12:09:18  0.0  11.475410  9.016393  14.754098  0.0   
 5729781 2018-08-28 12:09:19  0.0  11.475410  9.016393  15.573770  0.0   
 5729799 2018-08-28 12:09:37  0.0  11.475410  9.016393  15.573770  0.0   
 5729802 2018-08-28 12:09:40  0.0  11.475410  9.016393  15.573770  0.0   
 5729803 2018-08-28 12:09:41  0.0  11.475410  9.016393  15.573770  0.0   
 5729804 2018-08-28 12:09:42  0.0  11.475410  9.016393  14.754098  0.0   
 5729805 2018-08-28 12:09:43  0.0  11.475410  9.016393  14.754098  0.0   
 5729962 2018-08-28 12:12:23  0.0  11.475410  9.016393  15.573770  0.0   
 5730119 2018-08-28 12:15:02  0.0  11.

In [39]:
with open('data/trozos_limpios.pkl','wb') as f:
    pk.dump(trozos,f)

### Cálculo de las  estadísticas móviles
#### Widgets

In [40]:
import gc
gc.collect()

0

In [12]:
import ipywidgets as widgets
from IPython.display import display

slider = widgets.IntSlider(
    value=25,
    min=1,
    max=50,
    description='Ventana:',
    readout=True,
    readout_format='d'
)

display(slider)

IntSlider(value=25, description='Ventana:', max=50, min=1)

In [13]:
window = slider.value
#Transformadores estadisticos
mean = PipelineTransformer(StatisticsTransformer(mode='mean',window=window),norm)
std = PipelineTransformer(StatisticsTransformer(mode='std',window=window),norm)
ran = PipelineTransformer(StatisticsTransformer(mode='range',window=window),norm)
cnt = ConcatenateTransformer(mean,std,ran)

In [14]:
clean_stats = cnt.fit_transform(clean_data)

In [16]:
clean_stats.head()

Unnamed: 0,P1 mean 25,P2 mean 25,P3 mean 25,P4 mean 25,P5 mean 25,P6 mean 25,P1 std 25,P2 std 25,P3 std 25,P4 std 25,P5 std 25,P6 std 25,P1 range 25,P2 range 25,P3 range 25,P4 range 25,P5 range 25,P6 range 25
5615854,0.0,18.013381,14.153371,23.880597,0.0,11.785898,0.0,0.0,0.0,1.445319,0.0,1.067438,0.0,0.0,0.0,0.877193,0.0,0.877193
5615855,0.0,18.013381,14.153371,23.932064,0.0,11.785898,0.0,0.0,0.0,1.426425,0.0,1.067438,0.0,0.0,0.0,0.877193,0.0,0.877193
5615856,0.0,18.013381,14.153371,23.983531,0.0,11.837365,0.0,0.0,0.0,1.397605,0.0,1.164671,0.0,0.0,0.0,0.877193,0.0,0.877193
5615857,0.0,18.013381,14.153371,23.932064,0.0,11.837365,0.0,0.0,0.0,1.426425,0.0,1.164671,0.0,0.0,0.0,0.877193,0.0,0.877193
5615858,0.0,18.013381,14.153371,23.880597,0.0,11.837365,0.0,0.0,0.0,1.445319,0.0,1.164671,0.0,0.0,0.0,0.877193,0.0,0.877193


### Exportación

In [17]:
datosLimpios_Stats = pd.concat((dateTime,clean_stats,target),axis=1)

In [18]:
with open('data/datos_clean_stats.pdd','wb') as f:
    pk.dump(datosLimpios_Stats,f)