# unit testing

In [1]:
from data_book import DataBook, v_norm_formula, col_names
import pandas as pd
import numpy as np

In [2]:
# Test fixture: column B of 'sheet1', rows 1-5, keyed by '<sheet>!<cell>'.
# B1 is an empty cell (no formula, value, type or normalised formula);
# B2-B5 carry formulas together with their normalised form.
# Missing entries use np.nan: the np.NaN alias was removed in NumPy 2.0 and
# raises AttributeError there, while np.nan works on every NumPy version.
test_df = pd.DataFrame(
    {
        'numRow': [1, 2, 3, 4, 5],
        'numCol': [2, 2, 2, 2, 2],
        'sheetName': ['sheet1', 'sheet1', 'sheet1', 'sheet1', 'sheet1'],
        'cellFormula': [np.nan, 'Z2/K2', 'Z3/K3', 'Z4+K4', 'SUM(B2:B4)'],
        'cellValue': [np.nan, '2', '3', '4.0', '9.0'],
        'cellType': [np.nan, 'int', 'int', 'float', 'float'],
        'vNormFormula': [np.nan, 'Z*/K*', 'Z*/K*', 'Z*+K*', 'SUM(B*:B*)'],
    },
    index=['sheet1!B1', 'sheet1!B2', 'sheet1!B3', 'sheet1!B4', 'sheet1!B5'],
)

In [3]:
# Show the five-row fixture so the expected inputs are visible next to the tests.
test_df

Unnamed: 0,numRow,numCol,sheetName,cellFormula,cellValue,cellType,vNormFormula
sheet1!B1,1,2,sheet1,,,,
sheet1!B2,2,2,sheet1,Z2/K2,2.0,int,Z*/K*
sheet1!B3,3,2,sheet1,Z3/K3,3.0,int,Z*/K*
sheet1!B4,4,2,sheet1,Z4+K4,4.0,float,Z*+K*
sheet1!B5,5,2,sheet1,SUM(B2:B4),9.0,float,SUM(B*:B*)


In [4]:
# Build the DataBook under test from the fixture and pre-process it with the
# default arguments (the inference fixture further down passes
# for_training=False explicitly instead).
db=DataBook()
db.load_data(test_df)
db.pre_process_data()

In [None]:
# After pre-processing, the blank B1 row is expected to be gone and B2-B5 to
# remain in their original order.
df = db.get_data()
expected_keys = ['sheet1!B2', 'sheet1!B3', 'sheet1!B4', 'sheet1!B5']
# Compare plain lists: `df.index == [...]` raises ValueError (lengths must
# match) instead of failing the assertion when a row is missing or extra,
# which hides what the index actually contained.
actual_keys = df.index.tolist()
assert actual_keys == expected_keys, f"unexpected index after pre-processing: {actual_keys}"

In [None]:
# Inspect the pre-processed frame returned by get_data().
df

In [None]:
# Add cases for the B3 key; the assertions further down expect exactly two rows
# under 'sheet1!B3' afterwards.
# NOTE(review): this presumably appends rows on every call, so re-running the
# cell without rebuilding `db` would likely break those count checks — confirm.
db._add_positive_cases(keys=['sheet1!B3'])

In [None]:
# Re-fetch the data after _add_positive_cases and display it.
df2=db.get_data()
df2

In [None]:
# Show the row(s) stored under the B3 key.
# NOTE(review): `df` was fetched before _add_positive_cases ran; whether it
# reflects the added rows depends on whether get_data() returns a copy.
# `df2` may be the frame intended here — confirm.
df.loc['sheet1!B3']

In [None]:
# Re-read the data and check what now sits under the B3 key: two rows, with
# two distinct labels, one distinct 'dw1' flag and two distinct 'up1' flags.
df = db.get_data()
b3_rows = df.loc['sheet1!B3']
assert len(b3_rows) == 2

expected_distinct = {
    'Label': 2,
    'dw1_isWeaklyFormulaConsistent': 1,
    'up1_isWeaklyFormulaConsistent': 2,
}
for column, n_distinct in expected_distinct.items():
    assert len(b3_rows[column].unique()) == n_distinct

In [None]:
# v_norm_formula: the row numbers of a range reference are expected to be
# replaced by '*', leaving the function name and column letters intact.
normalised = v_norm_formula("SUM(D1:D123)")
assert normalised == 'SUM(D*:D*)'

In [None]:
# col_names: position 0 is the 'na' placeholder, then spreadsheet column
# letters follow by position (2 -> 'B', 28 -> 'AB').
for position, expected_name in ((0, 'na'), (2, 'B'), (28, 'AB')):
    assert col_names()[position] == expected_name

In [None]:
# _get_v_cell_ref: presumably (row, column, row offset, sheet name) — TODO confirm.
# Row 10 / column 1 shifted by +2 should give 'sheet!A12'; shifting row 1 by -1
# has no valid target and should give None.
ref_offset_plus2 = db._get_v_cell_ref(10, 1, 2, 'sheet')
ref_offset_minus1 = db._get_v_cell_ref(1, 1, -1, 'sheet')
assert ref_offset_plus2 == 'sheet!A12'
assert ref_offset_minus1 is None

In [None]:
# _get_that_from_this: from a fixture row, an offset of -1 / +1 should return
# the row whose key is one cell up / down; when there is no such row the
# result should be None.
assert db._get_that_from_this(test_df.loc['sheet1!B2'], -1).name == 'sheet1!B1'
assert db._get_that_from_this(test_df.loc['sheet1!B1'], 1).name == 'sheet1!B2'
# Identity check rather than `== None`: if a row (pandas Series) were returned
# by mistake, `==` would compare element-wise and `assert` would raise an
# ambiguous-truth-value ValueError instead of a clean assertion failure.
assert db._get_that_from_this(test_df.loc['sheet1!B1'], -1) is None

In [None]:
# _isBlank: a missing cell and the empty B1 fixture row should count as blank;
# B2, which carries a formula and a value in the fixture, should not.
empty_b1, filled_b2 = test_df.iloc[0], test_df.iloc[1]
assert db._isBlank(None, None)
assert db._isBlank(None, empty_b1)
assert not db._isBlank(None, filled_b2)

In [None]:
# _isFormula: neither a missing cell nor the empty B1 fixture row is a formula;
# B2 ('Z2/K2' in the fixture) is.
empty_b1, formula_b2 = test_df.iloc[0], test_df.iloc[1]
assert not db._isFormula(None, None)
assert not db._isFormula(None, empty_b1)
assert db._isFormula(None, formula_b2)

In [None]:
# _isSameType: a missing cell on either side never matches. In the fixture as
# constructed, B2 and B3 are both 'int' while B4 is 'float'.
empty_b1, int_b2, int_b3, float_b4 = (test_df.iloc[i] for i in range(4))
assert not db._isSameType(None, empty_b1)
assert not db._isSameType(empty_b1, None)
assert db._isSameType(int_b2, int_b3)
assert not db._isSameType(int_b2, float_b4)

In [None]:
# _isWeaklyFormulaConsistent: a missing neighbour on either side is accepted.
# In the fixture, B2 and B3 share the normalised formula 'Z*/K*', whereas B4
# has 'Z*+K*' and should be reported as inconsistent with B2.
row_b2, row_b3, row_b4 = (test_df.iloc[i] for i in (1, 2, 3))
assert db._isWeaklyFormulaConsistent(None, row_b2)
assert db._isWeaklyFormulaConsistent(row_b2, None)
assert db._isWeaklyFormulaConsistent(row_b2, row_b3)
assert not db._isWeaklyFormulaConsistent(row_b2, row_b4)

In [None]:
# _isSum: B5 holds 'SUM(B2:B4)' in the fixture and should be detected as a sum;
# B4 holds 'Z4+K4' and should not.
sum_b5 = test_df.iloc[4]
plain_b4 = test_df.iloc[3]
assert db._isSum(None, sum_b5)
assert not db._isSum(None, plain_b4)

In [None]:
# Inference fixture: the same five cells of column B on 'sheet1' as the
# training fixture, plus the 'workbookName' and 'cellAddress' columns.
# Missing entries use np.nan: the np.NaN alias was removed in NumPy 2.0 and
# raises AttributeError there, while np.nan works on every NumPy version.
test_df_inf = pd.DataFrame(
    {
        'numRow': [1, 2, 3, 4, 5],
        'workbookName': ['wb', 'wb', 'wb', 'wb', 'wb'],
        'numCol': [2, 2, 2, 2, 2],
        'sheetName': ['sheet1', 'sheet1', 'sheet1', 'sheet1', 'sheet1'],
        'cellAddress': ['B1', 'B2', 'B3', 'B4', 'B5'],
        'cellFormula': [np.nan, 'Z2/K2', 'Z3/K3', 'Z4+K4', 'SUM(B2:B4)'],
        'cellValue': [np.nan, '2', '3', '4.0', '9.0'],
        'cellType': [np.nan, 'int', 'int', 'float', 'float'],
        'vNormFormula': [np.nan, 'Z*/K*', 'Z*/K*', 'Z*+K*', 'SUM(B*:B*)'],
    },
    index=['sheet1!B1', 'sheet1!B2', 'sheet1!B3', 'sheet1!B4', 'sheet1!B5'],
)
test_df_inf

In [None]:
# Second DataBook, pre-processed in inference mode (for_training=False); in
# that mode the label column must not be present in the resulting frame.
db_inf = DataBook()
db_inf.load_data(test_df_inf)
db_inf.pre_process_data(for_training=False)

inference_columns = db_inf.df.columns.to_list()
assert db_inf.label not in inference_columns

In [None]:
# Inspect the frame held by the inference-mode DataBook after pre-processing.
db_inf.df

In [None]:
# No filters: four results are expected for the five-cell fixture.
df_inf_1 = db_inf.get_inconsistent_cells()
assert len(df_inf_1) == 4

In [None]:
# sheet_filter='sheet1': every fixture cell is on 'sheet1', so the count is
# expected to match the unfiltered call (4).
df_inf_2 = db_inf.get_inconsistent_cells(sheet_filter='sheet1')
assert len(df_inf_2) == 4

In [None]:
# Adding cell_filter='B2' should narrow the result to a single entry.
df_inf_3 = db_inf.get_inconsistent_cells(sheet_filter='sheet1', cell_filter='B2')
assert len(df_inf_3) == 1