In [1]:
%load_ext autoreload

%autoreload 2

In [2]:
import pandas as pd

# Sample frame: column "a" is all ones, column "b" mixes numbers with missing cells.
test_df = pd.DataFrame(
    data=[
        [1, -42],   # row 0
        [1, None],  # row 1
        [1, None],  # row 2
        [1, 3.14],  # row 3
    ],
    columns=['a', 'b'],
    # object dtype stops None from being converted to np.nan
    # ref: https://stackoverflow.com/a/48453225
    dtype=object,
)


In [3]:
# Keep only the cells greater than 1 and mask the rest; in the output below
# the single surviving value is 3.14 in column "b".
test_df.where(test_df > 1)

Unnamed: 0,a,b
0,,
1,,
2,,
3,,3.14


In [4]:
# Keep only the cells equal to 1: column "a" survives intact, while every
# cell of column "b" is masked.
test_df.where(test_df == 1)

Unnamed: 0,a,b
0,1,
1,1,
2,1,
3,1,


In [5]:
# Rebuild the sample frame, this time without forcing object dtype, so pandas
# infers the column dtypes itself.
test_df = pd.DataFrame(
    data=[
        [1, -42],   # row 0
        [1, None],  # row 1
        [1, None],  # row 2
        [1, 3.14],  # row 3
    ],
    columns=['a', 'b'],
)

# Cells equal to 1 are kept; every other cell is replaced by its negation.
test_df.where(test_df == 1, other=-test_df)

Unnamed: 0,a,b
0,1,42.0
1,1,
2,1,
3,1,-3.14


In [6]:
# Element-wise comparison yields a boolean frame of the same shape; the
# missing cells in column "b" compare as False (see output).
test_df == 1

Unnamed: 0,a,b
0,True,False
1,True,False
2,True,False
3,True,False


In [7]:
# Iterating over a column yields its cell values one at a time.
[ x + x for x in test_df['a']]

[2, 2, 2, 2]

In [8]:
# Take column "a" of the boolean mask and reduce it with the builtin all():
# True only when every cell of "a" equals 1.
all((test_df == 1)['a'])

True

In [9]:
import validium as V

def sum_eq(n):
    """Return a validator that passes when column "a" of a frame sums to ``n``.

    Args:
        n: the total that ``df['a'].sum()`` must equal.

    Returns:
        A ``V.Validator`` built from the equality predicate and a failure
        message that names ``n``.
    """
    return V.Validator(
        lambda df: df['a'].sum() == n,
        'sum of column "a" must be {}'.format(n)
    )


# Same predicate and message as a hand-written "sum must be 0" validator,
# built through the factory instead of being copied.
sum_eq_zero = sum_eq(0)

# Evaluate the predicate by hand and only warn on failure.
# NOTE(review): calling .validate() here instead would presumably raise on
# this frame -- confirm against validium.
if not sum_eq_zero.predicate(test_df):
    print('warn:', sum_eq_zero.fail_msg)

# Column "a" holds four ones, so this validation is expected to pass silently.
sum_eq(4).validate(test_df)


warn: sum of column "a" must be 0


In [10]:
# `sum_eq` is already defined in the previous cell; re-declaring it here was an
# exact duplicate, so this cell only re-runs the passing validation.
sum_eq(4).validate(test_df)

In [11]:
# Selecting with a list of column labels returns a frame with those columns.
test_df[['a','b']]

Unnamed: 0,a,b
0,1,-42.0
1,1,
2,1,
3,1,3.14


In [12]:
# .loc slices by label and includes both endpoints: rows 2 through 3 and
# columns "a" through "b" all appear in the output.
test_df.loc[2:3,'a':'b']

Unnamed: 0,a,b
2,1,
3,1,3.14


In [13]:
# Rows 0 and 3 are the only rows where both columns hold a number.
nums_df = test_df.loc[[0, 3], ['a', 'b']]

# Three ways of totalling: per-column sums, the sum of those sums, and the sum
# of the underlying array. The dashed rule separates the three printouts.
print(
    nums_df.sum(),
    nums_df.sum().sum(),
    nums_df.values.sum(),
    sep='\n-------\n',
)


a     2.00
b   -38.86
dtype: float64
-------
-36.86
-------
-36.86


In [14]:
# .values exposes the frame's data as a 2-D NumPy array (one row per frame row).
nums_df.values

array([[  1.  , -42.  ],
       [  1.  ,   3.14]])

In [15]:
# Validator: the integer-typed cells of the frame must add up to 4.
#
# The type test has to run per cell. `isinstance(df, int)` tests the DataFrame
# object itself and is always False, which masks every cell and makes the sum 0.
# `applymap` evaluates the check element-wise (newer pandas releases expose the
# same operation as `DataFrame.map`). Note that bool is a subclass of int, so
# boolean cells would be counted as well.
v = V.Validator(
    lambda df: df.where(
        df.applymap(lambda cell: isinstance(cell, int))
    ).sum().sum() == 4,
    'its just 4'
)

In [16]:
# DataFrame.apply hands the function one whole column at a time, so the
# isinstance check runs on each column object rather than on individual cells;
# hence one False per column in the output.
test_df.apply(lambda x : isinstance(x,int))




a    False
b    False
dtype: bool

In [17]:
def foo(bar):
    """Print the digits 0-9 sliced from ``bar[0]`` up to, but excluding, ``bar[1]``.

    Args:
        bar: an indexable whose first two items are the slice start and stop.
    """
    start, stop = bar[0], bar[1]
    digits = [*range(10)]
    print(digits[slice(start, stop)])


foo([2, 7])

[2, 3, 4, 5, 6]


In [18]:
# Mask the frame down to the cells equal to 1, count the non-missing cells per
# column, then total the counts: the four ones in column "a".
test_df[test_df == 1].count().sum()



4

In [29]:
# Let's try a simple one: check that selected cells of a frame all equal 1,
# written three different ways.
# NOTE(review): the recorded output of this cell is a KeyError; which of the
# three validate() calls raises it is not visible from here.
import validframe as vf

df_pass = pd.DataFrame(
      columns = ['a','b','c'],
      data = [
        [1, -42, 1], # row 0
        [1, None, 1], # row 1
        [1, None, 1], # row 2
        [1, 3.14, 1], # row 3
      ]
    )

# This is how it comes out when working with the list of cells rather than the
# dataframe: there's brevity in the logic.
cv = vf.CellsValidator(
    lambda xs: all([x == 1 for x in xs]), 
    'all must equal 1',
    cols=['a', 'c'], rows=[0, 3]
)

cv.validate(df_pass)

# It turns out to be way worse when you have to work directly with the
# dataframe: it's unintuitive and unreadable unless you really, really know
# the DataFrame API.
fv = vf.FrameValidator(
    lambda df: df.loc[[0,3],['a','c']][df == 1].count().sum() == df.loc[[0,3],['a','c']].count().sum(),
    'all must equal 1'
)

fv.validate(df_pass)

# Abstracting away the slicing helps, but it is still pretty unintuitive and
# unreadable.
fv2 = vf.FrameValidator(
    lambda df: df.where(df == 1).count().sum() == df.count().sum(),
    'all must equal 1',
    cols=['a', 'c'], rows=[0, 3]
)

fv2.validate(df_pass)

# FINAL THOUGHTS:
# When working with the dataframe directly, you end up having to write a
# predicate function that does more mathy manipulations of the dataframe
# (e.g. reshaping, pivoting, melting, reducing), because that's just how the
# DataFrame API was designed. For the right developer this might sound like a
# fun and challenging way to write predicate functions for validators, but it
# really does make for hacky and unfriendly code.


KeyError: "None of [Int64Index([1], dtype='int64')] are in the [index]"

In [20]:
# below are the dataframe manipulations step by step to illustrate the point above:

In [25]:
# Three-column sample frame: "a" and "c" are all ones, "b" mixes numbers with
# missing cells.
df = pd.DataFrame(
    data=[
        [1, -42, 1],   # row 0
        [1, None, 1],  # row 1
        [1, None, 1],  # row 2
        [1, 3.14, 1],  # row 3
    ],
    columns=['a', 'b', 'c'],
)

In [26]:
# Step 1: pick rows 0 and 3 and columns "a" and "c" by label.
df.loc[[0,3],['a','c']]

Unnamed: 0,a,c
0,1,1
3,1,1


In [None]:
# Step 2: mask the selection with the full-frame comparison `df == 1`.
# NOTE(review): this cell has no recorded output; presumably pandas aligns the
# full-size mask to the smaller selection by label -- confirm by running it.
df.loc[[0,3],['a','c']][df == 1]
df.loc[[0,3],['a','c']].where(df == 1) # same as prev


In [None]:
# Step 3: count the non-missing cells left after masking, per column, and total
# them. (This cell has no recorded output.)
df.loc[[0,3],['a','c']][df == 1].count().sum()

In [31]:
# Same three-column sample frame, used to compare the selection styles below.
df3 = pd.DataFrame(
    columns=['a', 'b', 'c'],
    data=[
        [1, -42, 1],   # row 0
        [1, None, 1],  # row 1
        [1, None, 1],  # row 2
        [1, 3.14, 1],  # row 3
    ],
)

df3[['a','c']]       # list of column labels -> frame with those columns
df3.loc[[0]]         # list of row labels -> one-row frame
df3.loc[[0],['a']]   # lists on both axes -> 1x1 frame
# Row labels live on `.index`; a DataFrame has no `.rows` attribute, so
# `df3.rows` raised AttributeError.
df3.index
df3.loc[0,'a']       # scalar labels on both axes -> the cell value itself

1

In [34]:
# Row labels come straight from the index; iterating with iterrows() builds a
# Series for every row only to throw it away.
list(df3.index)

[0, 1, 2, 3]