# Data Validation

## Imports

In [1]:
import pandas as pd
import numpy as np

## Get dataframe

[Inspired by Simple Example Dataframes In pandas (chrisalbon.com)](https://chrisalbon.com/python/data_wrangling/pandas_dataframe_examples/)

In [30]:
from random import choice, randrange, seed, uniform

# Seed the RNG so that every "Restart Kernel -> Run All" builds the same frame.
SEED = 42
seed(SEED)

NAMES = ['Jason', 'Molly', 'Tina', 'Jake', 'Amy']
SURNAMES = ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze']
# Placeholder values drawn for the "empty_values" column (and, sometimes, "mixed").
MISSING_VALUES = [0, None, float("nan"), '', '-', '?', 'N']
LEN = 100  # number of rows to generate

COLUMNS = ['first_name', 'last_name', 'age', 'score', 'empty_values', 'mixed']

# One inner list per row, in COLUMNS order.
df_list = [[
    choice(NAMES),
    choice(SURNAMES),
    randrange(0, 101),                        # age: integer in 0..100
    round(uniform(0, 10), randrange(0, 4)),   # score: 0..10 rounded to 0-3 decimals
    choice(MISSING_VALUES),
    # mixed: a single value picked from five candidates of different kinds
    choice([
        choice(NAMES),
        choice(SURNAMES),
        randrange(0, 101),
        round(uniform(0, 10), randrange(0, 4)),
        choice(MISSING_VALUES),
    ]),
] for _ in range(LEN)]

df = pd.DataFrame(df_list, columns=COLUMNS)

# Independent copy kept as the untouched reference frame.
df_original = df.copy()

In [32]:
# Preview ten randomly selected rows of the generated frame.
preview_rows = df.sample(n=10)
display(preview_rows)

Unnamed: 0,first_name,last_name,age,score,empty_values,mixed
64,Molly,Milner,3,8.3,,
90,Molly,Milner,15,6.0,,N
52,Molly,Jacobson,8,1.0,-,Molly
95,Molly,Cooze,32,8.97,-,Jason
35,Jason,Miller,51,2.467,-,Milner
42,Amy,Miller,18,4.0,0,Jake
9,Amy,Jacobson,61,7.774,,?
25,Molly,Ali,87,0.7,?,Jacobson
73,Tina,Milner,9,6.0,0,
77,Jason,Cooze,57,1.43,,


## Exploration of a feature

In [61]:
# Work on a copy so that nothing done while exploring can leak back into
# the reference frame `df_original`.
df = df_original.copy()
feature = "empty_values"


def _pct(part, whole):
    """Return `part` as a percentage of `whole`; 0.0 when `whole` is zero."""
    return part / whole * 100 if whole else 0.0


values = df[feature]
row_num = df.shape[0]
print(f"Number of rows: {row_num}\n")

# Cap the sample size: Series.sample raises ValueError when n exceeds the
# number of rows (with the default replace=False).
display("Sample of how the values looks like:",
    values.sample(n=min(10, row_num))
)
print(f"Type of \"{feature}\" feature: {df.dtypes[feature]}\n")

print("Value types: {}\n".format(
    values.map(type).unique()
))

# Floats: NaN is a float too, so split them into NaN and actual numbers.
float_filter = values.map(lambda x: isinstance(x, float)).astype(bool)
float_values = values[float_filter]
float_num = float_filter.sum()
float_nan = float_values.isnull().sum()
float_dec = float_values.notnull().sum()
print(f"Float values - number of floats: {float_num} (~ {_pct(float_num, row_num):.2f}% of rows)",
    f"nan values: {float_nan} (~ {_pct(float_nan, float_num):.2f}% of floats)",
    f"decimal numbers: {float_dec} (~ {_pct(float_dec, float_num):.2f}% of floats)\n", sep="\n"
)

# Strings: split into numeric-looking strings and everything else.
string_filter = values.map(lambda x: isinstance(x, str)).astype(bool)
# Cast to object so the `.str` accessor also works when the feature holds no
# strings at all (the empty selection would otherwise keep a numeric dtype).
string_values = values[string_filter].astype(object)
numeric_filter = string_values.str.isnumeric()
other_strings = string_values[~numeric_filter]
string_number = string_filter.sum()
string_numeric = numeric_filter.sum()
string_oth = len(other_strings)
print(f"String values - number of string values: {string_number} (~ {_pct(string_number, row_num):.2f}% of rows)",
        f"numeric strings: {string_numeric} (~ {_pct(string_numeric, string_number):.2f}% of strings)",
        f"other strings: {string_oth} (~ {_pct(string_oth, string_number):.2f}% of strings)\n", sep="\n"
)
print("Number of unique other strings: {}, the strings: {}".format(
    other_strings.nunique(),
    other_strings.unique()
))


Number of rows: 100



'Sample of how the values looks like:'

18    None
53       0
37       N
57        
70     NaN
5     None
24       -
72    None
31       0
36       0
Name: empty_values, dtype: object

Type of "empty_values" feature: object

Value types: [<class 'float'> <class 'str'> <class 'int'> <class 'NoneType'>]

Float values - number of floats: 12 (~ 12.00% of rows)
nan values: 12 (~ 100.00% of floats)
decimal numbers: 0 (~ 0.00% of floats)

String values - number of string values: 59 (~ 59.00% of rows)
strings comprehensible as a numbers: 0 (~ 0.00% of strings)
other strings: 59 (~ 100.00% of strings)

Number of unique other strings: 4, the strings: ['?' '' 'N' '-']


In [13]:


# Null / integer / string statistics and a histogram for the column named by
# `feature` (set in the exploration cell above; the former name `FEATURE` is
# not defined by any visible cell).
# NOTE(review): `px` is not imported in any visible cell - presumably
# plotly.express; confirm it is added to the imports cell.
values = df[feature]
total_rows = df.shape[0]

null_count = values.isnull().sum()
display("Number of null values: {} ~ {:.2f}%".format(
    null_count, null_count / total_rows * 100
))
real_count = values.notnull().sum()
display("Number of \"real\" values (not null): {} ~ {:.2f}%".format(
    real_count, real_count / total_rows * 100
))
# Exact type check on purpose: bool is a subclass of int and must not be counted.
display("Number of integers: {}".format(
    values.map(lambda x: type(x) is int).astype(bool).sum()
))

# Restrict the string checks to actual str values; the object cast keeps the
# `.str` accessor usable even when the column contains no strings at all.
str_mask = values.map(lambda x: isinstance(x, str)).astype(bool)
str_values = values[str_mask].astype(object)
isnumeric_mask = str_values.str.isnumeric()
# An optional single decimal part; the previous `(\.\d+)*` also accepted
# strings such as "1.2.3". Unlike this regex, str.isnumeric rejects "1.5",
# so the two counts can differ when decimal strings are present.
regex_mask = str_values.str.match(r"\d+(\.\d+)?\Z")
display("Number of string values comprehensible as a number: {} (regex), same as {} (str.isnumeric)".format(
    regex_mask.sum(),
    isnumeric_mask.sum()
))

non_numeric = str_values[~isnumeric_mask]
display(
    f"String values that can not be understood as a number, row count: {len(non_numeric)}",
    f"list of unique values: {non_numeric.unique()}"
)

display(f"Histogram of \"{feature}\" values:")
fig = px.histogram(df, x=feature)
fig.show()
# NOTE(review): assumes a figures/ directory already exists - confirm.
fig.write_image(f"figures/{feature.lower().replace(' ', '_')}.png")

ValueError: Cannot take a larger sample than population when 'replace=False'