### Instalar las dependencias

In [1]:
!apt-get install python-dev python-snappy


Reading package lists... Done
Building dependency tree       
Reading state information... Done
python-dev is already the newest version (2.7.15~rc1-1).
python-snappy is already the newest version (0.5-1.1build2).
0 upgraded, 0 newly installed, 0 to remove and 37 not upgraded.


In [2]:
!pip install -q tensorflow_data_validation

### Importar las librerías del proyecto

In [3]:
# A __future__ import has to precede every other statement; placed after the
# regular imports it makes Python reject the code with a SyntaxError.
from __future__ import print_function

import pandas as pd
import tensorflow as tf
import tensorflow_data_validation as tfdv

## Análisis descriptivo

In [4]:
# Load the CSV file into a DataFrame
dataset = pd.read_csv(filepath_or_buffer='pollution-small.csv')

In [14]:
# Preview the first three rows
dataset.head(n=3)

Unnamed: 0,Date,pm10,no2,so2,soot
0,1/1/2009,98.67,14.1,44.38,34.81
1,1/2/2009,52.33,14.1,29.75,33.06
2,1/3/2009,74.67,20.5,36.25,39.25


In [5]:
# (rows, columns) of the loaded dataset
dataset.shape

(2188, 5)

In [6]:
# The first 1600 rows (by position) form the training split
training_data = dataset.iloc[:1600]

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) of the training split
training_data.describe()

Unnamed: 0,pm10,no2,so2,soot
count,1600.0,1600.0,1600.0,1600.0
mean,49.656494,30.980519,16.229981,21.551956
std,35.211906,12.400788,10.621896,12.127354
min,6.38,9.74,4.01,6.0
25%,28.345,22.5675,9.7775,14.4
50%,38.835,28.715,13.275,18.63
75%,58.05,36.37,19.2825,24.0725
max,277.25,138.01,123.13,107.65


In [8]:
# Every row from position 1600 onward forms the test split
test_set = dataset.iloc[1600:]

In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) of the test split
test_set.describe()

Unnamed: 0,pm10,no2,so2,soot
count,588.0,588.0,588.0,588.0
mean,44.648248,37.296922,13.60517,18.44131
std,28.992087,10.94005,5.098944,6.596459
min,11.9,15.07,4.99,8.0
25%,28.3375,29.2175,10.1225,14.41
50%,35.555,35.815,12.345,17.09
75%,50.8125,43.8725,15.855,20.9625
max,273.77,106.03,38.03,87.21


### Análisis descriptivo y validación con TFDV

In [10]:
# Compute descriptive statistics from the training split only, so that the
# schema inferred from them is not influenced by the held-out test rows.
train_stats = tfdv.generate_statistics_from_dataframe(dataframe=training_data)

In [11]:
# Infer a schema from the previously computed statistics
schema = tfdv.infer_schema(train_stats)

In [12]:
# Render the inferred schema as a table
tfdv.display_schema(schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'Date',BYTES,required,,-
'pm10',FLOAT,required,,-
'no2',FLOAT,required,,-
'so2',FLOAT,required,,-
'soot',FLOAT,required,,-


In [13]:
# Compute the descriptive statistics of the test split
test_stats = tfdv.generate_statistics_from_dataframe(test_set)

### Comparar estadísticos del conjunto de test con Schema

In [15]:
# Look for anomalies in the new data by validating its statistics against the schema
anomalies = tfdv.validate_statistics(test_stats, schema)

**Mostrar todas las anomalías detectadas.** Ejemplos de anomalías que este tipo de validación puede señalar:
* Número entero mayor que 10
* Tipo STRING cuando se espera un tipo INT
* Tipo FLOAT cuando se espera un tipo INT
* Número entero menor que 0

In [16]:
# Show the anomalies found when validating the test statistics against the schema
tfdv.display_anomalies(anomalies)

#### Nuevos datos **CON** anomalías

In [18]:
# Work on an independent (deep) copy so the original test split stays untouched
test_set_copy = test_set.copy(deep=True)

In [19]:
# Build the 'soot'-less frame directly from test_set instead of mutating
# test_set_copy in place: the cell can then be re-run without raising a
# KeyError for the already-removed column.
test_set_copy = test_set.drop(columns=['soot'])

In [20]:
# Statistics computed from the data that contains anomalies
test_set_copy_stats = tfdv.generate_statistics_from_dataframe(test_set_copy)
# Validate those statistics against the schema and display what gets flagged
anomalies_new = tfdv.validate_statistics(test_set_copy_stats, schema)
tfdv.display_anomalies(anomalies_new)

Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'soot',Column dropped,Column is completely missing


### Preparar el esquema para subir a producción

In [21]:
# Register the default environments on the schema.  The membership check
# keeps the cell idempotent: running it again does not append duplicate
# entries to the schema.
for env_name in ('TRAINING', 'SERVING'):
    if env_name not in schema.default_environment:
        schema.default_environment.append(env_name)

#### Eliminar la columna objetivo del esquema de producción

In [22]:
# Mark the 'soot' feature as not present in the SERVING environment.  The
# membership check prevents a duplicate entry when the cell is re-executed.
soot_feature = tfdv.get_feature(schema, 'soot')
if 'SERVING' not in soot_feature.not_in_environment:
    soot_feature.not_in_environment.append('SERVING')

#### Comprobar anomalías entre el entorno del servidor y nuevos datos

In [23]:
# Validate the 'soot'-less statistics against the schema under the SERVING environment
serving_env_anomalies = tfdv.validate_statistics(
    statistics=test_set_copy_stats,
    schema=schema,
    environment='SERVING',
)

In [24]:
# Show the anomalies found under the SERVING environment
tfdv.display_anomalies(serving_env_anomalies)

### Congelar el esquema

In [25]:
# Persist the schema to disk in text format
tfdv.write_schema_text(schema, 'pollution_schema.pbtxt')