In [1]:
# from IPython.core.display import display, HTML
# display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import pandas as pd
import json
import os
import json
import sys
import json
import requests
import numpy as np
from shapely.geometry import box
from difflib import SequenceMatcher

class ExtractionDataCollector(object):
    
    def __init__(self, root_url, api_key, doc_suffix,
                 endpoints=['ExtractionUI']):
        """
        Collects data from RememberV2
        """
        self.root_url = root_url
        self.api_key = api_key
        self.endpoints = endpoints
        self.doc_suffix = doc_suffix
        
    def collect(self, docids):
        """
        Collects data from RememberV2 and drop into folders 
        in current working dir for each docid provided
        """
        for docid in docids:
            self.get_data(docid, self.root_url, self.api_key, self.endpoints, self.doc_suffix)
    
    def get_data(self, document_id, root_url, api_key, endpoints, doc_suffix):
        """
        Collects data from RememberV2 and drop into folders 
        in current working dir for the docid provided
        """
        for endpoint in endpoints:
            results = self.query_rememberv2(endpoint, 'DocumentId', document_id)
            #print(results)

            formatted_results = []
            for result in results['Results']:
                formatted_results.append(self.format_result(result, endpoint))

            # create folder for endpoint results if it doesnt exist in cwd
            if not os.path.isdir(os.path.join(os.getcwd(), endpoint)):
                os.mkdir(os.path.join(os.getcwd(), endpoint))
                #print(os.path.join(os.getcwd(), endpoint,doc_suffix))

            with open(f'{endpoint}_{doc_suffix}/{document_id}.json', 'w') as f:
                json.dump(formatted_results, f)
                
    def format_result(self, result, endpoint):
        """
        Formats raw result from RememberV2 into common format
        """
        if endpoint == 'ExtractionUI':
            formatted_result = {
                'doctype': result['DOCUMENT TYPE'] if 'DOCUMENT TYPE' in result else None,
                'document_id': result['ExtractionUI::DocumentId'],
                'label': result['NAME'],
                'entity_type': result['ENTITY TYPE'] if 'ENTITY TYPE' in result else None,
                'text': result['selection_input']['text'] if ('selection_input' in result) and ('text' in result['selection_input']) else None,
                'cloneid': None,
                'coordinates': result['selection_input']['pos'] if ('selection_input' in result) and ('pos' in result['selection_input']) else None
            }
        elif endpoint == 'PageEntityClassification':
            formatted_result = {
                'doctype': result['document_type'],
                'document_id': result['PageEntityClassification::DocumentId'],
                'label': result['Label'],
                'entity_type': None,
                'text': result['value'],
                'cloneid': None,
                'coordinates': result['coordinates']
            }
        elif endpoint == 'TextractDocumentAnalysisResult':
            formatted_result = {
                'doctype': result['document_type'],
                'document_id': result['TextractDocumentAnalysisResult::DocumentId'],
                'label': result['Label'],
                'entity_type': None,
                'text': result['value'],
                'cloneid': None,
                'coordinates': result['coordinates']
            }

        return formatted_result
    
    def query_rememberv2(self, endpoint, query_param, query_value):
        HEADERS = {
            'Content-Type': 'application/json',
            'Authorization': self.api_key
        }
        BASE_URL = f"https://rememberv2.{self.root_url}/latest/query"
        DATA = json.dumps({
            "Index":
                {
                    f"{endpoint}::{query_param}": query_value
                } 
        })
        response = requests.post(url=f"{BASE_URL}", headers=HEADERS, data=DATA)
        response_json = json.loads(response.text)

        return response_json


class ExtractionEvaluator(object):
    
    def __init__(self, y_true_path, y_pred_path, metrics=['string_match']):
        #TODO what if the cloneid is just to group them rather than ordered list?
        
        self.y_true_path = y_true_path
        self.y_pred_path = y_pred_path
        self.metrics = metrics

        self.y_true_files = [f for f in os.listdir(self.y_true_path) if '.json' in f]
        self.y_pred_files = [f for f in os.listdir(self.y_pred_path) if '.json' in f]
        
        self.len_y_true = len(self.y_true_files)
        self.len_y_pred = len(self.y_pred_files)

        if self.len_y_true != self.len_y_pred:
            raise ValueError('Length of y_true files does not equal length of y pred files')
            
        if not all(filename in self.y_pred_files for filename in self.y_true_files):
            raise ValueError('A document id from y true doesnt exist in y pred directory')
        
    def evaluate(self):
        #TODO aggregate doc level metrics (group by doc type and average) == FOR NOW
        
        self.metric_values = [] # one calc per document
        for y_true_file in self.y_true_files:
            with open(os.path.join(self.y_true_path, y_true_file)) as f:
                y_true_labels = json.load(f)
                
            with open(os.path.join(self.y_pred_path, y_true_file)) as f:
                y_pred_labels = json.load(f)
            
            metric_calc = ExtractionMetricsCalculator(y_true_labels, y_pred_labels, self.metrics)
            #print(type(metric_calc)) 
            self.metric_values.append(metric_calc.calculate())
        #print(pd.concat([i for i in self.metric_values]))    
        #return self.metric_values #pd.concat([i for i in self.metric_values])
        return pd.concat([i for i in self.metric_values])

class ExtractionMetricsCalculator(object):
    
    def __init__(self, y_true, y_pred,
                metrics=['string_match']):
        """
        Calculates universal metrics for extraction
        """
        self.y_pred = y_pred
        self.y_true = y_true
        self.metrics = metrics
        
        self.metric_to_match_function_map = {
            'string_match': self.string_match,
            'string_match_partial': self.string_match_partial,
            'coordinate_match': self.coordinate_match,
            'coordinate_match_partial': self.coordinate_match_partial
        }
        
        self.metric_to_match_type_map = {
            'string_match': 'compare_exact_strings',
            'string_match_partial': 'difflib_ratio_on_exact_strings_threshold=0.25',
            'coordinate_match': 'intersection_over_union_on_coordinates_threshold=0.95',
            'coordinate_match_partial': 'intersection_over_union_on_coordinates_threshold=0.25'
        }

        for metric in self.metrics:
            if metric not in self.metric_to_match_function_map:
                raise NotImplementedError('Metric not implemented')
       
        
    def calculate(self):
        """
        Calculates metrics for each docid (file) provided
        """
        calculated_metrics = []
        for metric in self.metrics:
            calculated_metrics.append(self.calculate_metrics(self.y_true, self.y_pred, metric, self.metric_to_match_function_map[metric]))
        #print(calculated_metrics)
        
        return pd.concat([i for i in calculated_metrics])
    
    def calculate_metrics(self, y_true, y_pred, metric, match_function):
        """
        Calculates single metric for predictions and actuals provided
        """
        
        # TODO get match score in df
        # print(y_pred)
        
        if len(y_pred) == 0:
            return self.make_initial_df(y_true, metric, is_labels=True)
        
        df = self.make_initial_df(y_true, metric, is_labels=True)
        
        match_count = 0
        match_candidate_count = len(y_true)
        matched_true_idxs = []
        matched_pred_idxs = []
        
        for true_idx, true_label in enumerate(y_true):
            for pred_idx, pred_label in enumerate(y_pred):
                if (true_idx not in matched_true_idxs) and (pred_idx not in matched_pred_idxs):
                    if true_label['label'] == pred_label['label']:
                        if match_function(true_label, pred_label):
                            df = self.update_for_matches(df, metric, true_idx, pred_label)
                            matched_true_idxs.append(true_idx)
                            matched_pred_idxs.append(pred_idx)
                            match_count += 1
                            break
                         
        unmatched_idxs = [idx for idx in range(len(y_pred)) if idx not in matched_pred_idxs]
        unmatched_preds_count = len(unmatched_idxs) 
        
        if unmatched_idxs:
            unmatched_df = self.make_initial_df(np.array(y_pred)[unmatched_idxs].tolist(), metric, is_labels=False)
            return pd.concat([df, unmatched_df])
        else:
            return df

    
    def update_for_matches(self, df, metric, row_idx, pred):
        """
        Update results df row if a prediction matches an actual label
        """
        df.loc[df.index == row_idx, 'predicted_label'] = pred['label']
        df.loc[df.index == row_idx, 'predicted_entity_type'] = pred['entity_type']
        df.loc[df.index == row_idx, 'predicted_text'] = pred['text']
        df.loc[df.index == row_idx, 'predicted_cloneid'] = pred['cloneid']
        df.loc[df.index == row_idx, 'predicted_coordinates'] = str(pred['coordinates'])
        df.loc[df.index == row_idx, 'is_matched'] = True
        df.loc[df.index == row_idx, 'match_score_type'] = self.metric_to_match_type_map[metric]
        df.loc[df.index == row_idx, 'match_score'] = pred['label']
        df.loc[df.index == row_idx, 'metric'] = metric

        return df
        
    def make_initial_df(self, data, metric, is_labels=True):
        """
        Creates an initial df with either actual labels or predictions
        """
        prefix1 = 'actual' if is_labels else 'predicted' #actual
        prefix2 = 'predicted' if prefix1 == 'actual' else 'actual' #predicted

        col_order = [
            'doctype',
            'document_id',
            'metric',
            'is_true_label',
            'is_matched',
            'match_score_type',
            'match_score',
            'actual_label',
            'actual_entity_type',
            'actual_text',
            'actual_cloneid',
            'actual_coordinates',
            'predicted_label',
            'predicted_entity_type',
            'predicted_text',
            'predicted_cloneid',
            'predicted_coordinates'
        ]
#         print("Data: ",data)
        df = pd.DataFrame(data)
        if df[df["doctype"] !="AUTOMATED UNDERWRITING FEEDBACK - DU CODIFIED FINDINGS"].shape[0] > 0:
            print(df[df["doctype"] !="AUTOMATED UNDERWRITING FEEDBACK - DU CODIFIED FINDINGS"])
        #print("DF: ", df)
        df.columns = [
            'doctype',
            'document_id',
            f'{prefix1}_label',
            f'{prefix1}_entity_type',
            f'{prefix1}_text',
            f'{prefix1}_cloneid',
            f'{prefix1}_coordinates'
        ]
        df[f'{prefix2}_label'] = None
        df[f'{prefix2}_entity_type'] = None
        df[f'{prefix2}_text'] = None
        df[f'{prefix2}_cloneid'] = None
        df[f'{prefix2}_coordinates'] = None
        df['is_matched'] = False
        df['match_score_type'] = self.metric_to_match_type_map[metric]
        df['match_score'] = 0.0
        df['metric'] = metric
        df['is_true_label'] = is_labels
        
        return df[col_order]
        
    def string_match(self, y_true, y_pred):
        """
        Checks if text matches between prediction and actual
        """
        return y_true['text'] == y_pred['text']
        
    def string_match_partial(self, y_true, y_pred, threshold=0.25):
        """
        Checks if text ratio is greater than threshold between prediction and actual
        """
        return self._similarity(y_true['text'], y_pred['text']) > threshold

    def _similarity(self, stra, strb):
        """
        Applies difflib SequenceMatcher.ratio to given strings
        """
        return SequenceMatcher(None, stra, strb).ratio()
    
    def _format_coords(self, coords):
        """
        formats coordinates for use in shapely.geometry (iou calc)
        """
        x0 = coords['x0']
        x1 = coords['x1']
        y0 = coords['y0']
        y1 = coords['y1']

        return x0, y0, x1, y1
    
    def _iou(self, boxA, boxB):
        """
        Calculates intersection over union using shapely.geometry
        """
        # {'y0': 958, 'x0': 275, 'y1': 978, 'x1': 290}
        boxA = self._format_coords(boxA)
        boxB = self._format_coords(boxB)
        
        a = box(*boxA)
        b = box(*boxB)
        
        return a.intersection(b).area / a.union(b).area
    
    def coordinate_match(self, y_true, y_pred, threshold=0.95):
        """
        Checks if coordinates' iou is greater than threshold for prediction and actual (generaly high threshold)
        """
        return self._iou((y_true['coordinates']), y_pred['coordinates']) > threshold
    
    def coordinate_match_partial(self, y_true, y_pred, threshold=0.25):
        """
        Checks if coordinates' iou is greater than threshold for prediction and actual (generally lower threshold)
        """
        return self._iou(y_true['coordinates'], y_pred['coordinates']) > threshold
    



In [3]:
def make_eval_summary(metrics_df):
    summary = {}
    
    metrics = metrics_df.metric.unique().tolist()
    
    for metric in metrics:
        matched_count = metrics_df[
            (metrics_df['metric'] == metric) &
            (metrics_df['is_true_label'] == True) &
            (metrics_df['is_matched'] == True)].doctype.count()
        
        label_count = metrics_df[
            (metrics_df['metric'] == metric) &
            (metrics_df['is_true_label'] == True)].doctype.count()
        
        unmatched_count = metrics_df[
            (metrics_df['metric'] == metric) &
            (metrics_df['is_true_label'] == False)].doctype.count()
        
        pred_count = matched_count + unmatched_count
        
        summary_metrics = {'accuracy': matched_count / label_count, 'false_positive_rate': unmatched_count / pred_count}
        
        summary[metric] = summary_metrics
        summary['doctype'] = metrics_df.doctype.unique()[0]

        
    return summary

In [4]:
def make_summary(metrics_df):

        summary = {}

        metrics = metrics_df.metric.unique().tolist()

        for metric in metrics:
            true_positives = metrics_df[
                (metrics_df['metric'] == metric) &
                (metrics_df['is_true_label'] == True) &
                (metrics_df['is_matched'] == True)].doctype.count()

            label_count = metrics_df[
                (metrics_df['metric'] == metric) &
                (metrics_df['is_true_label'] == True)].doctype.count()

            false_negatives = label_count - true_positives

            false_positives = metrics_df[
                (metrics_df['metric'] == metric) &
                (metrics_df['is_true_label'] == False)].doctype.count()

            pred_count = true_positives + false_positives


            eta = 1e-10

            precision = true_positives / ((true_positives + false_positives) + eta)
            recall = true_positives / ((true_positives + false_negatives) + eta)

            summary_metrics = {
                'accuracy': true_positives / (label_count + eta), 
                'false_positive_rate': false_positives / (pred_count + eta),
                'precision': precision,
                'recall': recall,
                'f1': 2 * ((precision * recall) / ((precision + recall) + eta))
            }

            summary[metric] = summary_metrics

        return summary

In [5]:
# rids_filename = '../Abhi_TaggingExperiments/pipelineSet_rids_UNDERWRITINGTRANSMITTAL-1008.txt'
# with open(rids_filename, 'r') as f:
#     docids = [line.strip() for line in f]
# print("DocIDS: ", docids[:5])
# collectorObject.collect(docids)

In [6]:
# doc_map = [
#     {
#   "doctype" : "UNDERWRITING TRANSMITTAL - 1008",
#     "eval_dir" : "eval_ut1008_10per",
#     "rids_dir" : "pipelineSet_rids_UNDERWRITINGTRANSMITTAL-1008.txt"
# },
# {
#   "doctype" : "1003",
#     "eval_dir" : "eval_1003_50per",
#     "rids_dir" : "pipelineSet_rids_1003.txt"
# },
#     {
#   "doctype" : "DEMOGRAPHIC ADDENDUM",
#     "eval_dir" : "eval_demoAdd",
#         "rids_dir" : "pipelineSet_rids_DEMOGRAPHICADDENDUM.txt"
# },
#     {
#   "doctype" : "CLOSING DISCLOSURE",
#     "eval_dir" : "eval_cd_50per"
# },
# ]

In [7]:
doc_map = [
{
  "doctype" : "AUTOMATED UNDERWRITING FEEDBACK - DU CODIFIED FINDINGS",
    "eval_dir" : "eval_aufdcf"
}
]

In [8]:
#only run if len is not equal
from os import walk
from os import listdir
from os.path import isfile, join

evalLocation = 'eval_aufdcf'
extractionLocation = 'ExtractionUI_AUTOMATED_UNDERWRITING_FEEDBACK_-_DU_CODIFIED_FINDINGS'

evalfiles = [f for f in listdir(evalLocation) if isfile(join(evalLocation, f))]
extractionfiles = [f for f in listdir(extractionLocation) if isfile(join(extractionLocation, f))]

print(len(evalfiles), len(extractionfiles))

if len(evalfiles) < len(extractionfiles):
    for each in extractionfiles:
        if each not in evalfiles:
            fileToRemove = extractionLocation+'/'+each
            print("Missing File: ", fileToRemove)
            os.remove(fileToRemove)
print(len(evalfiles), len(extractionfiles))

655 663
Missing File:  ExtractionUI_AUTOMATED_UNDERWRITING_FEEDBACK_-_DU_CODIFIED_FINDINGS/f018a9a2-144f-4798-983b-0e95d5bbc708.json
Missing File:  ExtractionUI_AUTOMATED_UNDERWRITING_FEEDBACK_-_DU_CODIFIED_FINDINGS/5f8fa6e1-0b4f-450f-8579-dd02cbdb77b5.json
Missing File:  ExtractionUI_AUTOMATED_UNDERWRITING_FEEDBACK_-_DU_CODIFIED_FINDINGS/8088f467-9d14-4da9-8d96-6181fe4b0270.json
Missing File:  ExtractionUI_AUTOMATED_UNDERWRITING_FEEDBACK_-_DU_CODIFIED_FINDINGS/e6eb85a9-bd7b-4e78-af71-123476e6a808.json
Missing File:  ExtractionUI_AUTOMATED_UNDERWRITING_FEEDBACK_-_DU_CODIFIED_FINDINGS/b1df078f-c49a-4484-826d-eaa401d21301.json
Missing File:  ExtractionUI_AUTOMATED_UNDERWRITING_FEEDBACK_-_DU_CODIFIED_FINDINGS/f6a0b746-6e9d-4853-8b7b-95d58a0dc54d.json
Missing File:  ExtractionUI_AUTOMATED_UNDERWRITING_FEEDBACK_-_DU_CODIFIED_FINDINGS/69c2b312-de5a-48e0-acc3-5f3dd96f39cf.json
Missing File:  ExtractionUI_AUTOMATED_UNDERWRITING_FEEDBACK_-_DU_CODIFIED_FINDINGS/8ca1902f-62d4-48f3-a148-f566fd1519

In [None]:
from os import listdir
from os.path import isfile, join
df_calculate = pd.DataFrame()
for each in doc_map:
    print(each)
    collectorObject = ExtractionDataCollector(root_url='usbanktraining.heavywater.com', api_key='361778c1-2fdb-4689-8233-d4df4b97999a'
, doc_suffix = each['doctype'].replace(" ","_"), endpoints=['ExtractionUI'])

In [14]:
from os import walk
from os import listdir
from os.path import isfile, join


list_of_results = []
for each in doc_map:

    docids = [f.split('.')[0] for f in listdir(each['eval_dir']) if isfile(join(each['eval_dir'], f))]
#     for i in docids:
#         #if i == "12727c63-cf2e-4c3c-ae58-5cf8bcb573ca":
#         loc = each['eval_dir'] + '/' + i + '.json'
#         with open(loc) as json_file:
#             json_decoded = json.load(json_file)
#         for each1 in json_decoded:
#             each1['entity_type'] = None
#             each1['doctype'] = each['doctype']
#             each1['document_id'] = str(i)

#         with open(loc, 'w') as json_file:
#             json.dump(json_decoded, json_file)
    #collectorObject.collect(docids)
    y_true = 'ExtractionUI'+'_'+each['doctype'].replace(" ","_")
    y_pred = each['eval_dir']
    print("Len Y-True: ", len(os.listdir(y_true)))
    print("Len Y-Pred: ", len(os.listdir(y_pred)))
    just = ExtractionEvaluator(y_true, y_pred, metrics=['string_match']).evaluate()
    df_calculate = pd.concat([just])
        

Len Y-True:  655
Len Y-Pred:  656


In [19]:
all_false = df_calculate[df_calculate["is_matched"]==False]
all_false

Unnamed: 0,doctype,document_id,metric,is_true_label,is_matched,match_score_type,match_score,actual_label,actual_entity_type,actual_text,actual_cloneid,actual_coordinates,predicted_label,predicted_entity_type,predicted_text,predicted_cloneid,predicted_coordinates
0,AUTOMATED UNDERWRITING FEEDBACK - DU CODIFIED ...,f8e35053-b4bc-4613-b58a-ec6338d87b63,string_match,True,False,compare_exact_strings,0,BORROWER OTHER CREDIT SCORE VALUE 2,TEXT,,,"{'x0': 1540.15185546875, 'y1': 1971.4381103515...",,,,,
1,AUTOMATED UNDERWRITING FEEDBACK - DU CODIFIED ...,f8e35053-b4bc-4613-b58a-ec6338d87b63,string_match,True,False,compare_exact_strings,0,AUTOMATED UNDERWRITING SYSTEM TYPE,LIST,SUMMARY,,"{'x0': 1025.7548828125, 'y1': 240.067153930664...",,,,,
3,AUTOMATED UNDERWRITING FEEDBACK - DU CODIFIED ...,f8e35053-b4bc-4613-b58a-ec6338d87b63,string_match,True,False,compare_exact_strings,0,BORROWER LAST NAME,NAME,son,,"{'x0': 715.8772583007812, 'y1': 1974.536865234...",,,,,
5,AUTOMATED UNDERWRITING FEEDBACK - DU CODIFIED ...,f8e35053-b4bc-4613-b58a-ec6338d87b63,string_match,True,False,compare_exact_strings,0,LTV RATIO PERCENT,PERCENT,. 92.00% o,,"{'x0': 706.5809326171875, 'y1': 549.9448242187...",,,,,
6,AUTOMATED UNDERWRITING FEEDBACK - DU CODIFIED ...,f8e35053-b4bc-4613-b58a-ec6338d87b63,string_match,True,False,compare_exact_strings,0,BORROWER OTHER CREDIT SCORE VALUE 1,TEXT,,,"{'x0': 1459.5836181640625, 'y1': 1918.75891113...",,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10,AUTOMATED UNDERWRITING FEEDBACK - DU CODIFIED ...,12727c63-cf2e-4c3c-ae58-5cf8bcb573ca,string_match,False,False,compare_exact_strings,0,,,,,,TOTAL LOAN AMOUNT,DATE,$124300,,"{'y0': 743, 'x0': 524, 'y1': 776, 'x1': 765}"
11,AUTOMATED UNDERWRITING FEEDBACK - DU CODIFIED ...,12727c63-cf2e-4c3c-ae58-5cf8bcb573ca,string_match,False,False,compare_exact_strings,0,,,,,,SALER PRICE,DATE,,,"{'y0': 743, 'x0': 524, 'y1': 776, 'x1': 765}"
12,AUTOMATED UNDERWRITING FEEDBACK - DU CODIFIED ...,12727c63-cf2e-4c3c-ae58-5cf8bcb573ca,string_match,False,False,compare_exact_strings,0,,,,,,LOAN PURPOSE TYPE,DATE,refinance,,"{'y0': 743, 'x0': 524, 'y1': 776, 'x1': 765}"
13,AUTOMATED UNDERWRITING FEEDBACK - DU CODIFIED ...,12727c63-cf2e-4c3c-ae58-5cf8bcb573ca,string_match,False,False,compare_exact_strings,0,,,,,,APPRISAL AMOUNT,DATE,$149000,,"{'y0': 743, 'x0': 524, 'y1': 776, 'x1': 765}"


In [22]:
all_false[all_false.document_id=="f8e35053-b4bc-4613-b58a-ec6338d87b63"]

Unnamed: 0,doctype,document_id,metric,is_true_label,is_matched,match_score_type,match_score,actual_label,actual_entity_type,actual_text,actual_cloneid,actual_coordinates,predicted_label,predicted_entity_type,predicted_text,predicted_cloneid,predicted_coordinates
0,AUTOMATED UNDERWRITING FEEDBACK - DU CODIFIED ...,f8e35053-b4bc-4613-b58a-ec6338d87b63,string_match,True,False,compare_exact_strings,0,BORROWER OTHER CREDIT SCORE VALUE 2,TEXT,,,"{'x0': 1540.15185546875, 'y1': 1971.4381103515...",,,,,
1,AUTOMATED UNDERWRITING FEEDBACK - DU CODIFIED ...,f8e35053-b4bc-4613-b58a-ec6338d87b63,string_match,True,False,compare_exact_strings,0,AUTOMATED UNDERWRITING SYSTEM TYPE,LIST,SUMMARY,,"{'x0': 1025.7548828125, 'y1': 240.067153930664...",,,,,
3,AUTOMATED UNDERWRITING FEEDBACK - DU CODIFIED ...,f8e35053-b4bc-4613-b58a-ec6338d87b63,string_match,True,False,compare_exact_strings,0,BORROWER LAST NAME,NAME,son,,"{'x0': 715.8772583007812, 'y1': 1974.536865234...",,,,,
5,AUTOMATED UNDERWRITING FEEDBACK - DU CODIFIED ...,f8e35053-b4bc-4613-b58a-ec6338d87b63,string_match,True,False,compare_exact_strings,0,LTV RATIO PERCENT,PERCENT,. 92.00% o,,"{'x0': 706.5809326171875, 'y1': 549.9448242187...",,,,,
6,AUTOMATED UNDERWRITING FEEDBACK - DU CODIFIED ...,f8e35053-b4bc-4613-b58a-ec6338d87b63,string_match,True,False,compare_exact_strings,0,BORROWER OTHER CREDIT SCORE VALUE 1,TEXT,,,"{'x0': 1459.5836181640625, 'y1': 1918.75891113...",,,,,
7,AUTOMATED UNDERWRITING FEEDBACK - DU CODIFIED ...,f8e35053-b4bc-4613-b58a-ec6338d87b63,string_match,True,False,compare_exact_strings,0,LOAN PURPOSE TYPE,LIST,Refinance,,"{'x0': 1800.448974609375, 'y1': 754.4640502929...",,,,,
8,AUTOMATED UNDERWRITING FEEDBACK - DU CODIFIED ...,f8e35053-b4bc-4613-b58a-ec6338d87b63,string_match,True,False,compare_exact_strings,0,LOAN AMOUNT,CURRENCY,$268725.00,,"{'x0': 1818, 'y1': 2820, 'x1': 2033, 'y0': 2780}",,,,,
13,AUTOMATED UNDERWRITING FEEDBACK - DU CODIFIED ...,f8e35053-b4bc-4613-b58a-ec6338d87b63,string_match,True,False,compare_exact_strings,0,OCCUPANCY TYPE,LIST,Primary Residence,,"{'x0': 1735.374755859375, 'y1': 2637.393310546...",,,,,
14,AUTOMATED UNDERWRITING FEEDBACK - DU CODIFIED ...,f8e35053-b4bc-4613-b58a-ec6338d87b63,string_match,True,False,compare_exact_strings,0,TOTAL LIABILITY MONTHLY PAYMENT AMOUNT,CURRENCY,$1392.00,,"{'x0': 1927, 'y1': 543, 'x1': 2097, 'y0': 503}",,,,,
15,AUTOMATED UNDERWRITING FEEDBACK - DU CODIFIED ...,f8e35053-b4bc-4613-b58a-ec6338d87b63,string_match,True,False,compare_exact_strings,0,SUBMISSION TIME,TIME,10:00AM,,"{'x0': 935, 'y1': 393, 'x1': 1094, 'y0': 360}",,,,,


In [None]:
# df_calculate.to_csv("usbank_ut1008_LSTM_evaluation.csv")

In [17]:
import pandas as pd
# df_calculate = pd.read_csv('test_cd_50per.csv')
for each in doc_map:
    print(str(make_eval_summary(df_calculate[df_calculate['doctype'] == each['doctype']])))
    print(str(make_summary(df_calculate[df_calculate['doctype'] == each['doctype']])))

{'string_match': {'accuracy': 0.11437236731255265, 'false_positive_rate': 0.7644691186675919}, 'doctype': 'AUTOMATED UNDERWRITING FEEDBACK - DU CODIFIED FINDINGS'}
{'string_match': {'accuracy': 0.11437236731255228, 'false_positive_rate': 0.7644691186675866, 'precision': 0.2355308813324064, 'recall': 0.11437236731255228, 'f1': 0.1539752749928548}}


# --------------------------------------LEE STUFF-----------------------------------------------

In [None]:
list_to_remove = []
for file in os.listdir(y_true):
    full_path = f'{os.path.abspath(os.getcwd())}/{y_true}/{file}'
    if os.path.getsize(full_path) < 200:
        list_to_remove.append(file.split(".")[0])
        os.remove(full_path)
        

In [None]:
list_to_remove

In [None]:
with open('no_data_rids_2.txt', 'w') as filehandle:
    for listitem in list_to_remove:
        filehandle.write('%s\n' % listitem)