# Data Validation

## Imports

In [1]:
import pandas as pd
import numpy as np

## Get dataframe

[Inspired by Simple Example Dataframes In pandas (chrisalbon.com)](https://chrisalbon.com/python/data_wrangling/pandas_dataframe_examples/)

In [2]:
from random import choice, randrange, seed, uniform

# Fixed seed so that Restart Kernel -> Run All reproduces the same synthetic data.
SEED = 42
seed(SEED)

NAMES = ['Jason', 'Molly', 'Tina', 'Jake', 'Amy']
SURNAMES = ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze']
# Assorted representations of a "missing" value, deliberately of mixed types
# (int, None, float nan, and several strings).
MISSING_VALUES = [0, None, float("nan"), '', '-', '?', 'N']
LEN = 100  # number of generated rows

COLUMNS = ['first_name', 'last_name', 'age', 'score', 'empty_values', 'mixed']

# One inner list per row, in COLUMNS order:
#   first_name / last_name - random picks from NAMES / SURNAMES
#   age                    - integer in [0, 100]
#   score                  - float in [0, 10] rounded to 0-3 decimal places
#   empty_values           - one of the MISSING_VALUES placeholders
#   mixed                  - a random pick among all of the value kinds above
df_list = [[
    choice(NAMES),
    choice(SURNAMES),
    randrange(0, 101, 1),
    round(uniform(0, 10), randrange(0, 4, 1)),
    choice(MISSING_VALUES),
    choice([
        choice(NAMES),
        choice(SURNAMES),
        randrange(0, 101, 1),
        round(uniform(0, 10), randrange(0, 4, 1)),
        choice(MISSING_VALUES)
    ])
] for _ in range(LEN)]

df = pd.DataFrame(df_list, columns=COLUMNS)

# Independent copy kept as a pristine reference of the generated data.
df_original = df.copy()

In [3]:
# Show up to 10 random rows; the size is capped because DataFrame.sample raises
# ValueError when asked for more rows than the frame holds.
display(
    df.sample(n=min(10, len(df)))
)

Unnamed: 0,first_name,last_name,age,score,empty_values,mixed
20,Amy,Milner,99,9.04,,46
49,Molly,Jacobson,59,4.39,?,Ali
73,Jake,Jacobson,74,3.22,0,Jake
14,Jake,Miller,29,1.0,,Amy
65,Amy,Milner,17,6.0,-,-
34,Jake,Miller,95,4.92,,Cooze
41,Jake,Milner,89,0.964,,46
30,Molly,Milner,35,5.0,,41
11,Tina,Cooze,33,0.4,-,21
31,Amy,Cooze,13,2.629,?,82


## Exploration of a feature

In [23]:
def explore_floats(df, feature):
    """Print how many values of ``df[feature]`` are floats, split into nan and real numbers.

    Args:
        df: DataFrame that holds the column.
        feature: Name of the column to inspect.

    Returns:
        None. The summary is written to stdout.
    """
    def _share(part, whole):
        # Percentage of `part` within `whole`; 0.0 (rather than nan or a
        # ZeroDivisionError) when `whole` is empty.
        return part / whole * 100 if whole else 0.0

    column = df[feature]
    # Filter once and reuse the result instead of re-indexing the frame per metric.
    is_float = column.apply(lambda value: isinstance(value, float)).astype(bool)
    floats = column[is_float]

    float_num = len(floats)
    float_nan = floats.isnull().sum()
    float_dec = floats.notnull().sum()
    row_num = df.shape[0]
    print(f"Float values - number of floats: {float_num} (~ {_share(float_num, row_num):.2f}% of rows)",
        f"nan values: {float_nan} (~ {_share(float_nan, float_num):.2f}% of floats ~ {_share(float_nan, row_num):.2f}% of rows)",
        f"Decimal numbers: {float_dec} (~ {_share(float_dec, float_num):.2f}% of floats)\n", sep="\n"
    )
    return None

def explore_strings(df, feature):
    """Print how many values of ``df[feature]`` are strings and what they contain.

    Strings are split into numeric ones (``str.isnumeric()`` is true) and all
    others; the distinct "other" strings are listed as well.

    Args:
        df: DataFrame that holds the column.
        feature: Name of the column to inspect.

    Returns:
        None. The summary is written to stdout.
    """
    def _share(part, whole):
        # Percentage of `part` within `whole`; 0.0 (rather than nan or a
        # ZeroDivisionError) when `whole` is empty.
        return part / whole * 100 if whole else 0.0

    column = df[feature]
    is_string = column.apply(lambda value: isinstance(value, str)).astype(bool)
    # Cast to object so the .str accessor also works when nothing was selected
    # from a non-object (e.g. int64) column.
    strings = column[is_string].astype(object)
    numeric_mask = strings.str.isnumeric().astype(bool)
    other_strings = strings[~numeric_mask]

    string_number = len(strings)
    string_numeric = numeric_mask.sum()
    string_oth = len(other_strings)
    row_num = df.shape[0]
    print(f"String values - number of string values: {string_number} (~ {_share(string_number, row_num):.2f}% of rows)",
            f"Numeric strings: {string_numeric} (~ {_share(string_numeric, string_number):.2f}% of strings)",
            f"Other strings: {string_oth} (~ {_share(string_oth, string_number):.2f}% of strings)\n", sep="\n"
    )
    print("Number of unique other strings: {}, the strings: {}\n".format(
        other_strings.nunique(),
        other_strings.unique()
    ))
    return None

def explore_other(df, feature, feature_type):
    """Print how many values of ``df[feature]`` are exactly of type ``feature_type``.

    The match is on the exact type (``type(x) is feature_type``), mirroring how
    ``explore_feature`` discovers value types with ``type(x)``. An ``isinstance``
    check would count subclasses too, e.g. ``True`` would be reported as an
    ``int`` in addition to being reported as a ``bool``.

    Args:
        df: DataFrame that holds the column.
        feature: Name of the column to inspect.
        feature_type: The type whose occurrences are counted.

    Returns:
        None. The summary is written to stdout.
    """
    type_filter = df[feature].apply(lambda value: type(value) is feature_type).astype(bool)
    type_number = type_filter.sum()
    row_num = df.shape[0]
    # Avoid dividing by zero for an empty frame.
    row_share = type_number / row_num * 100 if row_num else 0.0
    print("{} values - number: {} (~ {:.2f}% of rows)\n".format(
        feature_type, type_number, row_share
    ))
    return None


def explore_feature(df, feature):
    """Print an overview of one column: size, a random sample and a per-type breakdown.

    For every distinct Python type found among the values, the matching helper
    is called: ``explore_floats`` for ``float``, ``explore_strings`` for ``str``
    and ``explore_other`` for anything else.

    Args:
        df: DataFrame that holds the column.
        feature: Name of the column to inspect.

    Returns:
        None. Output goes to stdout and to the notebook's rich display.
    """
    column = df[feature]
    print("Number of rows: {}\nNumber of unique values (nan included): {}\n".format(
        df.shape[0], column.nunique(dropna=False)
    ))

    # Series.sample raises ValueError ("Cannot take a larger sample than
    # population") when n exceeds the number of rows, so cap the sample size.
    display("Sample of how the values look like:",
        column.sample(n=min(10, len(column)))
    )

    value_types = column.apply(lambda value: type(value)).unique()
    print(f"Value types: {value_types}\n")

    for value_type in value_types:
        if value_type is float:
            explore_floats(df, feature)
        elif value_type is str:
            explore_strings(df, feature)
        else:
            explore_other(df, feature, value_type)
    return None

In [24]:
# Work on a copy: a plain assignment would only alias df_original, so any later
# in-place change to df would silently alter the pristine reference frame too.
df = df_original.copy()
feature = "empty_values"

explore_feature(df, feature)


Number of rows: 100
number of unique values (nan included) 7



'Sample of how the values look like:'

4        0
7      NaN
77    None
78       ?
54       N
16       ?
17       N
97     NaN
26       N
2        0
Name: empty_values, dtype: object

Value types: [<class 'str'> <class 'NoneType'> <class 'int'> <class 'float'>]

String values - number of string values: 56 (~ 56.00% of rows)
numeric strings: 0 (~ 0.00% of strings)
other strings: 56 (~ 100.00% of strings)

Number of unique other strings: 4, the strings: ['' '-' '?' 'N']

<class 'NoneType'> values - number: 16 (~ 16.00% of rows)

<class 'int'> values - number: 13 (~ 13.00% of rows)

Float values - number of floats: 15 (~ 15.00% of rows)
nan values: 15 (~ 100.00% of floats)
decimal numbers: 0 (~ 0.00% of floats)



In [None]:



# display("Number of floats (that are not nan): {}".format(
#     (df[FEATURE].apply(lambda x: type(x) == float) & df[FEATURE].notnull()).sum()
# ))


# df[FEATURE].unique()


In [13]:


# Ad-hoc summary of the column named by FEATURE: null / non-null counts, number
# of integers, numeric-looking strings, and a histogram written to disk.
# NOTE(review): neither FEATURE nor px is defined in any cell visible in this
# notebook (the cells above use lowercase `feature`); px is presumably
# plotly.express - confirm and define/import both before running this cell.

# Null count and its share of all rows.
display("Number of null values: {} ~ {:.2f}%".format(
    df[FEATURE].isnull().sum(), df[FEATURE].isnull().sum() / df.shape[0] * 100
))
# Non-null count and its share of all rows.
display("Number of \"real\" values (not null): {} ~ {:.2f}%".format(
    df[FEATURE].notnull().sum(), df[FEATURE].notnull().sum() / df.shape[0] * 100
))
# Values whose exact type is int (bool is excluded because type() is compared, not isinstance).
display("Number of integers: {}".format(
    (df[FEATURE].apply(lambda x: type(x) == int) & df[FEATURE].notnull()).sum()
))
# The regex accepts digits optionally followed by ".digits" groups, whereas
# str.isnumeric() rejects any string containing a dot, so the two counts can
# differ despite the "same as" wording in the message.
# NOTE(review): the `== True` comparison presumably exists because .str methods
# yield NaN for non-string values in a mixed column - confirm.
display("Number of string values comprehensible as a number: {} (regex), same as {} (str.isnumeric)".format((
        df[FEATURE].str.match(r"\d+(\.\d+)*\Z")==True).sum(),
        df[FEATURE].str.isnumeric().sum()
))
# Count of rows where isnumeric() is explicitly False, followed by the distinct
# string values that are not numeric.
display(f"""String values that can not be understood as a number, row count: {
        (df[FEATURE].str.isnumeric()==False).sum()
    }""", f"""list of unique values: {
        df[df[FEATURE].apply(lambda x: type(x) == str) & ~(df[FEATURE].str.isnumeric()==True)][FEATURE].unique()}"""
)
display(f"Histogram of \"{FEATURE}\" values:")
fig = px.histogram(df, x=FEATURE)
fig.show()
# File name is the feature name lower-cased with spaces replaced by underscores.
# NOTE(review): assumes a "figures/" directory already exists - TODO confirm.
fig.write_image(f"figures/{FEATURE.lower().replace(' ', '_')}.png")

ValueError: Cannot take a larger sample than population when 'replace=False'