# Inference

To run this you need:

    pip install pandas
    pip install scikit-learn


In [1]:
import joblib
import json
from data_book import DataBook
import pickle

In [2]:
# Path (relative to the working directory) of the workbook export to score;
# it is passed to DataBook.load_file below. The space after "Databook_" is
# part of the actual file name.
raw_data_path = './Data-v1/Standard_Databook_ 06 07 2022.csv.json'

## Loading an Excel file as a DataBook object

In [3]:
# Instantiate a DataBook and load the workbook export found at raw_data_path.
dBook = DataBook()
dBook.load_file(raw_data_path)
# Display the loaded data with all columns; the output shows one row per cell,
# keyed as "<sheetName>!<cellAddress>".
dBook.get_data(all_columns=True)

Unnamed: 0_level_0,workbookName,sheetName,numRow,numCol,cellAddress,cellValue,cellFormula,cellType
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Cover!A1,Standard_Databook_ 06 07 2022.xlsx,Cover,1,1,A1,,,General
Cover!B1,Standard_Databook_ 06 07 2022.xlsx,Cover,1,2,B1,,,Time
Cover!C1,Standard_Databook_ 06 07 2022.xlsx,Cover,1,3,C1,,,Time
Cover!D1,Standard_Databook_ 06 07 2022.xlsx,Cover,1,4,D1,,,Time
Cover!E1,Standard_Databook_ 06 07 2022.xlsx,Cover,1,5,E1,,,Time
...,...,...,...,...,...,...,...,...
Sheet12S!L55,Standard_Databook_ 06 07 2022.xlsx,Sheet12S,55,12,L55,,,Currency
Sheet12S!M55,Standard_Databook_ 06 07 2022.xlsx,Sheet12S,55,13,M55,,,Currency
Sheet12S!N55,Standard_Databook_ 06 07 2022.xlsx,Sheet12S,55,14,N55,,,Currency
Sheet12S!O55,Standard_Databook_ 06 07 2022.xlsx,Sheet12S,55,15,O55,,,Currency


## Add features for inference

We compute features for every cell: for each cell, we process its context, i.e. the cells around it.

In [4]:
# Compute the feature columns in inference mode (for_training=False).
# NOTE(review): the return value is not used, so this presumably updates dBook
# in place -- confirm that re-running this cell is safe.
dBook.pre_process_data(for_training=False)
# Display the processed data; the output shows boolean feature columns per cell.
dBook.get_data()

Unnamed: 0_level_0,sheetName,cellAddress,up1_isBlank,up1_isFormula,up1_isSameType,up1_isWeaklyFormulaConsistent,up2_isWeaklyFormulaConsistent,dw1_isBlank,dw1_isFormula,dw1_isSameType,dw1_isWeaklyFormulaConsistent,dw2_isWeaklyFormulaConsistent,nb1_isWeaklyFormulaConsistent,dw1_isSum
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Cover!AZ1,Cover,AZ1,True,False,False,True,True,False,False,False,False,True,True,False
Lead PL!O1,Lead PL,O1,True,False,False,True,True,True,False,False,True,True,True,False
Lead PL!C7,Lead PL,C7,False,False,False,False,True,False,True,True,True,True,False,False
Lead PL!D7,Lead PL,D7,False,False,False,False,True,False,True,True,True,True,False,False
Lead PL!E7,Lead PL,E7,False,False,False,False,True,False,True,True,True,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ScratchPad_TB!H152,ScratchPad_TB,H152,True,False,False,False,True,True,False,False,False,True,True,False
Sheet8S!A28,Sheet8S,A28,False,False,False,False,True,True,False,False,False,True,True,False
Sheet4S!A28,Sheet4S,A28,False,False,False,False,True,True,False,False,False,True,True,False
Sheet01S!B1,Sheet01S,B1,True,False,False,True,True,False,False,False,False,True,True,False


## Load a model

In [5]:
# Load the previously trained classifier from ./model.pkl.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load model files that come from a trusted source.
# NOTE(review): presumably the installed scikit-learn version should match the
# one used to train and pickle the model -- verify.
with open ('./model.pkl', 'br') as f:
    model = pickle.load(f)
# Show the estimator's repr to confirm which model was loaded.
model

LogisticRegression(multi_class='ovr', random_state=0)

## Get some data to make a prediction

Note that this is sample code, so we felt free to reuse the same data that was already used for training. In a real scenario, inference would be run on data the model has not seen.

#### Score a single cell

Cell *G16* in sheet *PL8-Other income exp* is an example that the model gets wrong: it is wrongly evaluated as non-error.

In [6]:
# Fetch the feature row for a single cell by filtering on both sheet and address.
df_for_inference_cell = dBook.get_inconsistent_cells(
    sheet_filter='PL8-Other income exp',
    cell_filter='G16',
)
df_for_inference_cell

Unnamed: 0_level_0,up1_isBlank,up1_isFormula,up1_isSameType,up1_isWeaklyFormulaConsistent,up2_isWeaklyFormulaConsistent,dw1_isBlank,dw1_isFormula,dw1_isSameType,dw1_isWeaklyFormulaConsistent,dw2_isWeaklyFormulaConsistent,nb1_isWeaklyFormulaConsistent,dw1_isSum
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
PL8-Other income exp!G16,False,True,True,False,True,False,True,True,False,True,True,False


In [7]:
# Score the single selected cell; the displayed result holds one prediction per
# input row (here a one-element array).
model.predict(df_for_inference_cell)

array([False])

#### Score an entire sheet

In [8]:
# Fetch the feature rows for one sheet: filter on the sheet name only, with no
# cell filter.
df_for_inference_sheet = dBook.get_inconsistent_cells(
    sheet_filter='PL8-Other income exp',
)
df_for_inference_sheet

Unnamed: 0_level_0,up1_isBlank,up1_isFormula,up1_isSameType,up1_isWeaklyFormulaConsistent,up2_isWeaklyFormulaConsistent,dw1_isBlank,dw1_isFormula,dw1_isSameType,dw1_isWeaklyFormulaConsistent,dw2_isWeaklyFormulaConsistent,nb1_isWeaklyFormulaConsistent,dw1_isSum
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
PL8-Other income exp!B7,False,False,False,False,True,False,True,True,True,True,False,False
PL8-Other income exp!C7,False,False,False,False,True,False,True,True,True,True,False,False
PL8-Other income exp!D7,False,False,False,False,True,False,True,True,True,True,False,False
PL8-Other income exp!F7,False,False,False,False,True,False,True,True,True,True,False,False
PL8-Other income exp!G7,False,False,False,False,True,False,True,True,True,True,False,False
PL8-Other income exp!H7,False,False,False,False,True,False,True,True,False,True,True,False
PL8-Other income exp!I7,False,False,False,False,True,False,True,True,True,True,False,False
PL8-Other income exp!F8,False,True,True,True,False,False,True,True,False,False,False,False
PL8-Other income exp!G8,False,True,True,True,False,False,True,True,False,False,False,False
PL8-Other income exp!H8,False,True,True,False,False,False,True,True,True,False,False,False


Cells that are evaluated as errors

In [9]:
# Pair each cell key (the DataFrame index) with the model's prediction for that row.
# Materialised as a list: a bare zip() is a one-shot iterator that the
# comprehension below would exhaust, leaving `results` empty for any later use
# in the kernel.
results = list(zip(df_for_inference_sheet.index.to_list(), model.predict(df_for_inference_sheet)))
# Keep only the cells whose prediction is truthy, i.e. evaluated as errors.
[cell for cell, prediction in results if prediction]

['PL8-Other income exp!B17',
 'PL8-Other income exp!C17',
 'PL8-Other income exp!D17',
 'PL8-Other income exp!B20',
 'PL8-Other income exp!C20',
 'PL8-Other income exp!D20']

#### Score an entire workbook

In [10]:
# Feature rows for the whole workbook: no sheet or cell filter is applied.
df_for_inference = dBook.get_inconsistent_cells()
# Pair each cell key (the DataFrame index) with its prediction. Stored as a
# list rather than a bare zip() so `results` is not left as an exhausted
# iterator after the comprehension below has consumed it.
results = list(zip(df_for_inference.index.to_list(), model.predict(df_for_inference)))
# Keep only the cells whose prediction is truthy, i.e. evaluated as errors.
[cell for cell, prediction in results if prediction]

['Lead CF!C9',
 'Lead CF!D9',
 'Lead CF!C14',
 'Lead CF!D14',
 'Lead CF!C15',
 'Lead CF!D15',
 'Lead CF!C16',
 'Lead CF!D16',
 'Lead CF!C17',
 'Lead CF!D17',
 'Lead CF!C20',
 'Lead CF!D20',
 'Lead CF!C21',
 'Lead CF!D21',
 'Lead CF!C22',
 'Lead CF!D22',
 'Lead CF!C23',
 'Lead CF!D23',
 'Lead CF!C24',
 'Lead CF!D24',
 'Lead BS!C18',
 'Lead BS!C28',
 'Lead BS!D28',
 'Lead BS!E28',
 'Lead BS!C29',
 'R1!G12',
 'R1!G19',
 'R2!D9',
 'R2!D10',
 'R2!D11',
 'R2!D14',
 'R2!D15',
 'R2!D16',
 'R2!D17',
 'R2!D18',
 'R2!D20',
 'R2!D21',
 'R2!D22',
 'R3!C9',
 'R3!D9',
 'R3!E9',
 'R3!G9',
 'R3!I9',
 'R3!J9',
 'R3!K9',
 'R3!M9',
 'R3!O9',
 'R3!P9',
 'R3!Q9',
 'R3!S9',
 'R3!G22',
 'R3!M22',
 'R3!S22',
 'R3!G24',
 'R3!M24',
 'R3!S24',
 'R4!F9',
 'R4!N9',
 'R4!R9',
 'R4!T9',
 'R4!F10',
 'R4!N10',
 'R4!R10',
 'R4!T10',
 'R4!N11',
 'R4!T11',
 'R4!N14',
 'R4!T14',
 'R4!N15',
 'R4!T15',
 'R4!N16',
 'R4!T16',
 'R4!N17',
 'R4!T17',
 'R4!C18',
 'R4!D18',
 'R4!F18',
 'R4!G18',
 'R4!M18',
 'R4!N18',
 'R4!R18',
 'R