[![View on GitHub](https://img.shields.io/badge/GitHub-View_on_GitHub-blue?logo=GitHub)](https://github.com/khuyentran1401/Data-science/blob/master/data_science_tools/schema.ipynb)

[<img src="https://deepnote.com/buttons/launch-in-deepnote.svg">](https://deepnote.com/project/Data-science-hxlyJpi-QrKFJziQgoMSmQ/%2FData-science%2Fdata_science_tools%2Fschema.ipynb)

 [![View Article](https://img.shields.io/badge/View%20Article-View%20Full%20Article-red)](https://towardsdatascience.com/introduction-to-schema-a-python-libary-to-validate-your-data-c6d99e06d56a)

# List of Dictionaries

In [None]:
!pip install faker==4.1.1 schema==0.7.4

In [None]:
import pandas as pd 

In [None]:
from faker import Faker 
import json            # To create a json file                 
import numpy as np

# Seed both random sources so the generated records are reproducible:
# Faker produces the text/bool/float fields, NumPy produces the closeness score.
Faker.seed(0)
np.random.seed(0)

fake = Faker() 
def create_data(x): 
    """Generate ``x`` fake friend records as a list of dictionaries.

    Each record has the keys ``name``, ``city``, ``closeness (1-5)``,
    ``extrovert`` and ``favorite_temperature``. Text, boolean and float
    values come from the module-level ``fake`` generator; the closeness
    score is drawn from NumPy's global random state.

    Parameters
    ----------
    x : int
        Number of records to generate.

    Returns
    -------
    list of dict
        One dictionary per record, in generation order.
    """
    data = []
    for _ in range(x):
        data.append({
            'name': fake.name(),
            'city': fake.city(),
            # randint's upper bound is exclusive, so 6 is required for 5 to be drawn.
            'closeness (1-5)': np.random.randint(1, 6),
            'extrovert': fake.pybool(),
            'favorite_temperature': fake.pyfloat(left_digits=2,
                                                 right_digits=2),
        })

    return data
    
# Build a small sample of three records to validate in the sections below.
data = create_data(3)

In [None]:
# Display the three generated records.
data 

[{'name': 'Norma Fisher',
  'city': 'South Richard',
  'closeness (1-5)': 1,
  'extrovert': True,
  'favorite_temperature': -45.74},
 {'name': 'Colleen Taylor',
  'city': 'North Laurenshire',
  'closeness (1-5)': 4,
  'extrovert': False,
  'favorite_temperature': 93.9},
 {'name': 'Melinda Kennedy',
  'city': 'South Cherylside',
  'closeness (1-5)': 3,
  'extrovert': True,
  'favorite_temperature': 66.33}]

## Validate data types

In [None]:
from schema import Schema, And, Or,Use, Optional, SchemaError


In [None]:
# The data must be a list whose items are dicts with these keys,
# each key holding a value of the given type.
schema = Schema(
    [
        {
            'name': str,
            'city': str,
            'closeness (1-5)': int,
            'extrovert': bool,
            'favorite_temperature': float,
        }
    ]
)

In [None]:
# validate() returns the validated data when every record matches the schema.
schema.validate(data)

[{'name': 'Norma Fisher',
  'city': 'South Richard',
  'closeness (1-5)': 1,
  'extrovert': True,
  'favorite_temperature': -45.74},
 {'name': 'Colleen Taylor',
  'city': 'North Laurenshire',
  'closeness (1-5)': 4,
  'extrovert': False,
  'favorite_temperature': 93.9},
 {'name': 'Melinda Kennedy',
  'city': 'South Cherylside',
  'closeness (1-5)': 3,
  'extrovert': True,
  'favorite_temperature': 66.33}]

In [None]:
# is_valid() reports the outcome as a boolean.
schema.is_valid(data)

True

In [None]:
# Deliberately wrong: 'name' is declared as int although the data holds strings.
schema = Schema(
    [
        {
            'name': int,
            'city': str,
            'closeness (1-5)': int,
            'extrovert': bool,
            'favorite_temperature': float,
        }
    ]
)

Validation fails with a `SchemaError` here because the schema expects `name` to be an `int`, but the data contains strings.

In [None]:
# The schema defined just above intentionally mismatches the data, so
# validate() raises. Catch the error and print it so the failure is still
# shown while "Run All" can continue past this cell.
try:
    schema.validate(data)
except SchemaError as err:
    print("SchemaError:", err)

SchemaError: Or({'name': <class 'int'>, 'city': <class 'str'>, 'closeness (1-5)': <class 'int'>, 'extrovert': <class 'bool'>, 'favorite_temperature': <class 'float'>}) did not validate {'name': 'Norma Fisher', 'city': 'South Richard', 'closeness (1-5)': 1, 'extrovert': True, 'favorite_temperature': -45.74}
Key 'name' error:
'Norma Fisher' should be instance of 'int'

## Validate datatype of some columns while ignoring the rest

In [None]:
# Type-check three fields only; the trailing str: object entry accepts any
# other string-keyed field with an arbitrary value.
schema = Schema(
    [
        {
            'name': str,
            'city': str,
            'favorite_temperature': float,
            str: object,
        }
    ]
)

In [None]:
# The fields not listed explicitly are matched by the str: object entry.
schema.is_valid(data)

True

## Validate using function

In [None]:
# A callable can act as a validator: here the closeness score must lie
# between 1 and 5 inclusive.
schema = Schema(
    [
        {
            'name': str,
            'city': str,
            'favorite_temperature': float,
            'closeness (1-5)': lambda score: 1 <= score <= 5,
            str: object,
        }
    ]
)

schema.is_valid(data)

True

## Validate Several Schemas

### And

In [None]:
# And() requires every listed validator to pass. The scores are ints, so the
# float requirement makes validation fail.
schema = Schema(
    [
        {
            'name': str,
            'city': str,
            'favorite_temperature': float,
            'closeness (1-5)': And(lambda score: 1 <= score <= 5, float),
            str: object,
        }
    ]
)

schema.is_valid(data)

False

In [None]:
# Same combination with int instead of float: range check and type check
# must both pass.
schema = Schema(
    [
        {
            'name': str,
            'city': str,
            'favorite_temperature': float,
            'closeness (1-5)': And(lambda score: 1 <= score <= 5, int),
            str: object,
        }
    ]
)

schema.is_valid(data)

True

### Or

In [None]:
# Split the first record's city on whitespace to see its individual words.
data[0]['city'].split()

['South', 'Richard']

In [None]:
# Check that this city name consists of exactly two words.
len(data[0]['city'].split()) == 2

True

In [None]:
# Or() accepts a value as soon as one of its validators passes: a city name
# may consist of either two words or a single word.
schema = Schema(
    [
        {
            'name': str,
            'city': Or(
                lambda city: len(city.split()) == 2,
                lambda city: len(city.split()) == 1,
            ),
            'favorite_temperature': float,
            'closeness (1-5)': int,
            str: object,
        }
    ]
)

schema.is_valid(data)

True

### Combination of And and Or

In [None]:
# And() and Or() nest: the city must be a string AND have one or two words.
schema = Schema(
    [
        {
            'name': str,
            'city': And(
                str,
                Or(
                    lambda city: len(city.split()) == 2,
                    lambda city: len(city.split()) == 1,
                ),
            ),
            'favorite_temperature': float,
            'closeness (1-5)': int,
            str: object,
        }
    ]
)

schema.is_valid(data)

True

## Nested dictionary 

In [None]:

fake = Faker() 
# Re-seed both random sources so this section's records are reproducible:
# Faker produces the text fields, NumPy produces the closeness score.
Faker.seed(0)
np.random.seed(0)

def create_data(x): 
    """Generate ``x`` fake friend records that contain a nested dictionary.

    Each record has the keys ``name``, ``city``, ``closeness (1-5)`` and
    ``detailed_info``; the latter is itself a dictionary with the keys
    ``favorite_color`` and ``phone number``. Text values come from the
    module-level ``fake`` generator; the closeness score is drawn from
    NumPy's global random state.

    Parameters
    ----------
    x : int
        Number of records to generate.

    Returns
    -------
    list of dict
        One dictionary per record, in generation order.
    """
    data = []
    for _ in range(x):
        data.append({
            'name': fake.name(),
            'city': fake.city(),
            # randint's upper bound is exclusive, so 6 is required for 5 to be drawn.
            'closeness (1-5)': np.random.randint(1, 6),
            'detailed_info': {
                'favorite_color': fake.color_name(),
                'phone number': fake.msisdn(),
            },
        })

    return data
    
# Build two records that carry the nested detailed_info dictionary.
data = create_data(2)

In [None]:
# Display the two nested records.
data 

[{'name': 'Norma Fisher',
  'city': 'South Richard',
  'closeness (1-5)': 4,
  'detailed_info': {'favorite_color': 'Pink',
   'phone number': '7593824219489'}},
 {'name': 'Emily Blair',
  'city': 'Suttonview',
  'closeness (1-5)': 4,
  'detailed_info': {'favorite_color': 'Chartreuse',
   'phone number': '9387784080160'}}]

In [None]:
# Shallow check: detailed_info only has to be a dict; its contents are not inspected.
schema = Schema(
    [
        {
            'name': str,
            'city': str,
            'closeness (1-5)': int,
            'detailed_info': dict,
        }
    ]
)
schema.is_valid(data)

True

In [None]:
# Deep check: the nested dictionary gets a sub-schema of its own.
schema = Schema(
    [
        {
            'name': str,
            'city': str,
            'closeness (1-5)': int,
            'detailed_info': {
                'favorite_color': str,
                'phone number': str,
            },
        }
    ]
)
schema.is_valid(data)

True

## Optional

In [None]:
# Add a record that has no 'detailed_info' key, to exercise Optional below.
data.append({
    'name': fake.name(),
    'city': fake.city(),
    # randint's upper bound is exclusive, so 6 is required for 5 to be drawn.
    'closeness (1-5)': np.random.randint(1, 6),
})

In [None]:
# Display the records; the appended one has no detailed_info key.
data 

[{'name': 'Norma Fisher',
  'city': 'South Richard',
  'closeness (1-5)': 4,
  'detailed_info': {'favorite_color': 'Pink',
   'phone number': '7593824219489'}},
 {'name': 'Emily Blair',
  'city': 'Suttonview',
  'closeness (1-5)': 4,
  'detailed_info': {'favorite_color': 'Chartreuse',
   'phone number': '9387784080160'}},
 {'name': 'Samantha Cook', 'city': 'Janeton', 'closeness (1-5)': 3}]

In [None]:
# Optional() lets a record omit 'detailed_info'; when the key is present,
# its value must still match the nested sub-schema.
schema = Schema(
    [
        {
            'name': str,
            'city': str,
            'closeness (1-5)': int,
            Optional('detailed_info'): {
                'favorite_color': str,
                'phone number': str,
            },
        }
    ]
)
schema.is_valid(data)

True

## Data with null

In [None]:
# Add a record whose city is None, to exercise null handling below.
data.append({
    'name': fake.name(),
    'city': None,
    # randint's upper bound is exclusive, so 6 is required for 5 to be drawn.
    'closeness (1-5)': np.random.randint(1, 6),
    'detailed_info': {
        'favorite_color': fake.color_name(),
        'phone number': fake.msisdn(),
    },
})

In [None]:
# Or(None, str) makes the city nullable: the value is either None or a string.
schema = Schema(
    [
        {
            'name': str,
            'city': Or(None, str),
            'closeness (1-5)': int,
            Optional('detailed_info'): {
                'favorite_color': str,
                'phone number': str,
            },
        }
    ]
)
schema.is_valid(data)

True

## Forbidden

In [None]:
from schema import Forbidden, SchemaError

# Forbidden() rejects any record that contains the 'detailed_info' key.
schema = Schema(
    [
        {
            'name': str,
            'city': str,
            'closeness (1-5)': int,
            Forbidden('detailed_info'): dict,
        }
    ]
)

# The data does contain the forbidden key, so validate() raises. Catch the
# error and print it so the failure is still shown while "Run All" can
# continue past this cell.
try:
    schema.validate(data)
except SchemaError as err:
    print("SchemaError:", err)

SchemaError: Or({'name': <class 'str'>, 'city': <class 'str'>, 'closeness (1-5)': <class 'int'>, Forbidden('detailed_info'): <class 'dict'>}) did not validate {'name': 'Norma Fisher', 'city': 'South Richard', 'closeness (1-5)': 4, 'detailed_info': {'favorite_color': 'Pink', 'phone number': '7593824219489'}}
Forbidden key encountered: 'detailed_info' in {'name': 'Norma Fisher', 'city': 'South Richard', 'closeness (1-5)': 4, 'detailed_info': {'favorite_color': 'Pink', 'phone number': '7593824219489'}}

# Json

In [None]:
# Sample JSON-like payload: a dict whose 'data' key holds a list of two
# dicts ('stuff' and 'otherstuff'), each wrapping further nested lists.
# NOTE(review): the name `json` rebinds the `json` module imported in the
# data-generation cell above; after this cell runs the module is no longer
# reachable under that name. Consider renaming here and in the cells below.
json = {"data":[{"stuff":[
    {"onetype":[
        {"id":1,"name":"John Doe"},
        {"id":2,"name":"Don Joeh"}
    ]},
    {"othertype":[
        {"id":2,"company":"ACME"}
    ]}]
},{"otherstuff":[
    {"thing":
        [[1,42],[2,2]]
    }]
}]}

In [None]:
# Shallow check: the payload must be a dict whose 'data' key holds a list.
schema = Schema(
    {
        'data': list,
    }
)
schema.is_valid(json)

True

In [None]:
# One level deeper: each item of the 'data' list must match one of the two
# listed shapes, a dict with a 'stuff' list or a dict with an 'otherstuff' list.
schema = Schema(
    {
        'data': [
            {'stuff': list},
            {'otherstuff': list},
        ],
    }
)
schema.is_valid(json)

True

# Convert dtype

In [None]:
# Use(int) converts the value during validation, so the string '123'
# comes back as the integer 123.
Schema(Use(int)).validate('123')

123

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=87197226-98be-42b2-8527-389082831299' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>