# Data TSA
Your own personal on-demand data defence force.

Data TSA is a generic data profiling tool that provides data type-specific inspection tools for your data. Additionally, metrics can be split over any slicer field, providing insight into how metrics change over periods of time, versions of software, etc.

Here's a quick example using some sample data.

In [1]:
import pandas as pd
from data_tsa.sample_data import SampleData
from data_tsa.profiler import Profiler

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 20)
pd.set_option('display.width', 1000)

# Generate test data
samp = SampleData(1000)
df = samp.get_sample_data()
df.head()

Unnamed: 0,id,partial_null,created_at,duplicate_string,string_slicer,mixed_precision_datetime,date_string,mixed_sign_numbers
0,0,0.0,2018-01-01 00:00:00,test,A,2019-01-01 01:01:00,1/1/2019,-1
1,1,0.0,2018-01-01 06:00:00,Test,A,2019-01-01 01:00:00,1/1/2019,-1
2,2,,2018-01-01 12:00:00,test,B,2019-01-01 00:00:00,1/1/2019 12:00,-1
3,3,,2018-01-01 18:00:00,test,B,2019-01-01 01:01:00,1/1/2019 12:00:00,-1
4,4,,2018-01-02 00:00:00,testing,B,2019-01-01 01:00:00,1/1/2019,1


In [2]:
# Initialize Profiler with no slicer
profiler = Profiler(df)

# Profile dataframe columns
p1_output = profiler.profile()
p1_output.head(20)

Unnamed: 0,inspector,column,slice,measure,measure_value
0,number,id,,row_count,1000
1,number,id,,distinct_count,1000
2,number,id,,null_ratio,0
3,number,id,,min_value,0
4,number,id,,max_value,999
5,number,id,,negative_ratio,0
6,number,id,,mean_value,499.5
7,number,id,,median_value,499.5
8,number,id,,stdev,288.819
9,number,id,,zero_ratio,0.001


In [3]:
# Initialize Profiler with slicer
profiler_slicer = Profiler(df, slicer='string_slicer')

# Profile dataframe columns over slicer partitions
p2_output = profiler_slicer.profile(lags=1)
p2_output.head()

1 / 2
2 / 2


Unnamed: 0,inspector,column,slice,measure,measure_value,l1_measure_value
5,datetime,created_at,A,conversion_error_indicator,0,
5,datetime,created_at,B,conversion_error_indicator,0,0.0
1,datetime,created_at,A,distinct_count,513,
1,datetime,created_at,B,distinct_count,487,513.0
4,datetime,created_at,A,max_value,2018-09-07 18:00:00,


In [4]:
# See how metrics change over slices for any given column
profiler_slicer.show_column_result('duplicate_string')

Unnamed: 0_level_0,slice,measure_value,measure_value,measure_value,measure_value,measure_value,measure_value,measure_value,measure_value
measure,Unnamed: 1_level_1,distinct_count,empty_ratio,null_ratio,redundancy_indicator,row_count,special_character_ratio,strict_distinct_count,trim_required_ratio
0,A,5,0,0,1,513,0,2,0.191033
1,B,5,0,0,1,487,0,2,0.195072


## DataFrameInspector
The __DataFrameInspector__ class is not included by default in the Profiler class, but is a part of the data_tsa package.

Its one and only function at this time is to detect perfect duplicates (i.e. rows that are exactly identical).

In [5]:
from data_tsa.dataframe_inspector import DataFrameInspector

df_dupes = pd.DataFrame({'a': [0, 0], 'b': [0, 0]})
df_dupes

Unnamed: 0,a,b
0,0,0
1,0,0


In [6]:
# Apply DataFrameInspector
insp = DataFrameInspector(df_dupes)
insp.get_duplicate_row_indicator()

True

In [7]:
# DataFrameInspector will also return a dataframe containing only the duplicate rows
insp.get_duplicate_rows()

Unnamed: 0,a,b,6960f7c5-55b3-489a-828a-c59667fcbff0
0,0,0,b4b147bc522828731f1a016bfa72c073
1,0,0,b4b147bc522828731f1a016bfa72c073
