# Schema from example

In [6]:
oto_data = {
        "_id" : "5e3868eb61e48d0017ab68b0",
        "kind" : "OutlierModel",
        "bt" : 1580755178368101,
        "tt" : 0,
        "modelId" : "5c1a7b648a6df285f82bdd4f",
        "distance" : 204311,
        "closest_centroid" : 82,
        "outlier_score" : 90.9653,
        "centroid_rareness" : 4.26592,
        "is_outlier" : 1,
        "source" : "40FWF4_R-",
        "account" : "analogdevices_dev"
    }


# Using schema

## Validation of a schema

In [2]:
from schema import Schema, And, Use, Optional, SchemaError

schema = Schema([{'name': And(str, len),
                   'age':  And(Use(int), lambda n: 18 <= n <= 99),
                   Optional('gender'): And(str, Use(str.lower),
                                           lambda s: s in ('squid', 'kid'))}])

data = [{'name': 'Sue', 'age': '28', 'gender': 'Squid'},
         {'name': 'Sam', 'age': '42'},
        {'name': 'Sacha', 'age': '20', 'gender': 'KID'}]



In [3]:
validated = schema.validate(data)

assert validated == [{'name': 'Sue', 'age': 28, 'gender': 'squid'},
                      {'name': 'Sam', 'age': 42},
                      {'name': 'Sacha', 'age' : 20, 'gender': 'kid'}]

In [35]:
"""{'_id': str,
 'kind': str,
 'bt': int,
 'tt': int,
 'modelId': str,
 'distance': int,
 'closest_centroid': int,
 'outlier_score': float,
 'centroid_rareness': float,
 'is_outlier': int,
 'source': str,
 'account': str}
"""
schema_otosense= Schema([{'_id': str,
                          'kind':Use(str),
                          'bt': Use(int),
                          'tt':Use(int),
                          'modelId':Use(str),
                          'distance':Use(int),
                          'closest_centroid':Use(int),
                          'outlier_score':Use(int),
                          'centroid_rareness':Use(float),
                          'is_outlier':Use(bool),
                          'source':Use(str),
                          'account':And(str,
                                           lambda s: s in ('analogdevices_dev', 'analogdevices_prod'))
}])
                  

In [13]:
schema_otosense.validate([oto_data])

[{'_id': '5e3868eb61e48d0017ab68b0',
  'kind': 'OutlierModel',
  'bt': 1580755178368101,
  'tt': 0,
  'modelId': '5c1a7b648a6df285f82bdd4f',
  'distance': 204311,
  'closest_centroid': 82,
  'outlier_score': 90,
  'centroid_rareness': 4.26592,
  'is_outlier': True,
  'source': '40FWF4_R-',
  'account': 'analogdevices_dev'}]

# Validation using Great expectations

In [11]:
import great_expectations as ge
import pandas as pd

In [14]:
data_list = [{'_id': '5e3868eb61e48d0017ab68b0',
  'kind': 'OutlierModel',
  'bt': 1580755178368101,
  'tt': 0,
  'modelId': '5c1a7b648a6df285f82bdd4f',
  'distance': 204311,
  'closest_centroid': 82,
  'outlier_score': 90,
  'centroid_rareness': 4.26592,
  'is_outlier': True,
  'source': '40FWF4_R-',
  'account': 'analogdevices_dev'},
  {'_id': '4e3868eb61e48d0017ab6898',
  'kind': 'OutlierModel',
  'bt': 1580755178368156,
  'tt': 0,
  'modelId': '5c1a7b648a6df285f82bdd4f',
  'distance': 204316,
  'closest_centroid': 80,
  'outlier_score': 10,
  'centroid_rareness': 4.26592,
  'is_outlier': True,
  'source': '40FWF4_R-',
  'account': 'analogdevices_dev'},
{'_id': '3e3868eb61e48d0017ab6800',
  'kind': 'OutlierModel',
  'bt': 1580755178368101,
  'tt': 0,
  'modelId': '5c1a7b648a6df285f82bdd4f',
  'distance': 20400,
  'closest_centroid': 17,
  'outlier_score': 9,
  'centroid_rareness': 3.1,
  'is_outlier': True,
  'source': '40FWF4_R-',
  'account': 'analogdevices_prod'}

  ]

In [23]:
df_csv = pd.DataFrame(data_list).to_csv('data_great_expect.csv')
df=ge.read_csv('data_great_expect.csv')

In [24]:
df

Unnamed: 0.1,Unnamed: 0,_id,kind,bt,tt,modelId,distance,closest_centroid,outlier_score,centroid_rareness,is_outlier,source,account
0,0,5e3868eb61e48d0017ab68b0,OutlierModel,1580755178368101,0,5c1a7b648a6df285f82bdd4f,204311,82,90,4.26592,True,40FWF4_R-,analogdevices_dev
1,1,4e3868eb61e48d0017ab6898,OutlierModel,1580755178368156,0,5c1a7b648a6df285f82bdd4f,204316,80,10,4.26592,True,40FWF4_R-,analogdevices_dev
2,2,3e3868eb61e48d0017ab6800,OutlierModel,1580755178368101,0,5c1a7b648a6df285f82bdd4f,20400,17,9,3.1,True,40FWF4_R-,analogdevices_prod


In [27]:
feature_columns = ['kind', 'bt', 'tt','modelId', 'distance','closest_centroid']
for col in feature_columns:
    df.expect_column_to_exist(col)

df.expect_column_values_to_be_of_type('kind', 'str')

{
  "result": {
    "element_count": 3,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "success": true,
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [28]:
df.get_expectation_suite()

	7 expectation(s) included in expectation_suite. result_format settings filtered.


{
  "meta": {
    "great_expectations_version": "0.13.38"
  },
  "expectations": [
    {
      "kwargs": {
        "column": "kind"
      },
      "expectation_type": "expect_column_to_exist",
      "meta": {},
      "ge_cloud_id": null
    },
    {
      "kwargs": {
        "column": "bt"
      },
      "expectation_type": "expect_column_to_exist",
      "meta": {},
      "ge_cloud_id": null
    },
    {
      "kwargs": {
        "column": "tt"
      },
      "expectation_type": "expect_column_to_exist",
      "meta": {},
      "ge_cloud_id": null
    },
    {
      "kwargs": {
        "column": "modelId"
      },
      "expectation_type": "expect_column_to_exist",
      "meta": {},
      "ge_cloud_id": null
    },
    {
      "kwargs": {
        "column": "distance"
      },
      "expectation_type": "expect_column_to_exist",
      "meta": {},
      "ge_cloud_id": null
    },
    {
      "kwargs": {
        "column": "closest_centroid"
      },
      "expectation_type": "expect_colum

# Generating data from a model

In [29]:
from sdv import SDV #synthetic data vault

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  Primitive = Union[np.integer, int, np.float, float, str]


In [34]:
sdv = SDV()
df_data = pd.DataFrame(data_list)
sdv.fit(tables =[df_data])

TypeError: fit() missing 1 required positional argument: 'metadata'

In [33]:
df_data

Unnamed: 0,_id,kind,bt,tt,modelId,distance,closest_centroid,outlier_score,centroid_rareness,is_outlier,source,account
0,5e3868eb61e48d0017ab68b0,OutlierModel,1580755178368101,0,5c1a7b648a6df285f82bdd4f,204311,82,90,4.26592,True,40FWF4_R-,analogdevices_dev
1,4e3868eb61e48d0017ab6898,OutlierModel,1580755178368156,0,5c1a7b648a6df285f82bdd4f,204316,80,10,4.26592,True,40FWF4_R-,analogdevices_dev
2,3e3868eb61e48d0017ab6800,OutlierModel,1580755178368101,0,5c1a7b648a6df285f82bdd4f,20400,17,9,3.1,True,40FWF4_R-,analogdevices_prod
