In [1]:
# Silence every warning for the rest of the session. The filter is installed
# before the remaining imports run, so warnings emitted while importing them
# are hidden as well.
import warnings
warnings.filterwarnings("ignore")


# Project-local helper that produces the hard-rule predictions further down
from utils.hard_rules import hard_rules
import pandas as pd
import numpy as np
from sklearn import metrics

# tqdm.pandas() registers `progress_apply` on pandas objects; it is used
# below to show a progress bar during the row-wise hard-rule pass.
from tqdm import tqdm
tqdm.pandas()

In [2]:
# Import data: tab-separated file read with header=None, so pandas labels the
# columns 0, 1 and 2; they are given descriptive names right below.
df = pd.read_csv('Data/_20230622-052634_training.tsv', sep='\t', header=None)

# Rename columns (returns a new frame instead of mutating in place)
df = df.rename(columns={0: 'Value',
                        1: 'Evidence_type',
                        2: 'CCV'})

# Remove ENG instances: collect every CCV label that contains 'ENG' and drop
# the matching rows. Non-string labels (e.g. NaN from an empty field) are
# skipped explicitly; a bare `'ENG' in ccv` raises TypeError on them.
ENG_ccvs = [ccv for ccv in df['CCV'].unique()
            if isinstance(ccv, str) and 'ENG' in ccv]
df = df[~df['CCV'].isin(ENG_ccvs)]

# Remove duplicates (non-inplace: `df` is a filtered slice at this point)
df = df.drop_duplicates()

# Drop CCVs for which no prediction can be made
dropped_CCVs = ['CCV:00004', 'CCV:00025', 'CCV:00035', 'CCV:00036', 'CCV:00042', 'CCV:00043',
    'CCV:00047', 'CCV:00048', 'CCV:00049', 'CCV:00072', 'CCV:00084', 'CCV:00087', 'CCV:00088',
    'CCV:00079', 'CCV:00082', 'CCV:00092', 'CCV:00093', 'CCV:00096']
df = df[~df['CCV'].isin(dropped_CCVs)]

# Reset index: discard the old row labels left over from the filtering above
df = df.reset_index(drop=True)

### Hard rule predictions

In [3]:
# Run the hard rules row by row on (Value, Evidence_type); progress_apply
# behaves like DataFrame.apply but displays a tqdm progress bar.
rule_inputs = df[['Value', 'Evidence_type']]
df['Predictions'] = rule_inputs.progress_apply(
    lambda row: hard_rules(row.Value, row.Evidence_type),
    axis=1,
)

# Keep hard-rule predictions: drop rows whose prediction stringifies to "None"
has_rule_prediction = df['Predictions'].astype('str') != "None"
df = df[has_rule_prediction]


100%|██████████| 17998605/17998605 [20:10<00:00, 14864.23it/s]


In [4]:
# Pull the hard-rule outputs and the true CCV labels out of the filtered frame
Predictions, Actuals = df['Predictions'].values, df['CCV'].values

### Model predictions

In [5]:
import pickle

# NOTE(review): `allow_pickle=True` and `pickle.load` can execute arbitrary
# code while deserialising; only use them on files from a trusted source.

# Open the archive once and close it when done (it used to be opened twice
# and never closed). Indexing reads each array fully into memory, so X and y
# stay usable after the archive is closed.
with np.load('Data/data.npz', allow_pickle=True) as data:
    X = data['X']
    y = data['y']

# Load Label encoder
with open('Model/Label_encoder.pkl', 'rb') as encoder_file:
    encoder = pickle.load(encoder_file)
# Load prediction model
with open('Model/model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)
# Get predictions: the model output is mapped back through the label encoder
pred = encoder.inverse_transform( model.predict(X) )


# Model Evaluation (accuracy as a percentage; precision/recall macro-averaged)
Accuracy = 100*metrics.accuracy_score(y, pred)
Recall = metrics.recall_score(y, pred, average='macro')
Precision = metrics.precision_score(y, pred, average='macro')
CM = metrics.confusion_matrix(y, pred)

print(f'Accuracy: {Accuracy:.3f}%')
print(f'Precision: {Precision:.3f}')
print(f'Recall: {Recall:.3f}')
CM

Accuracy: 96.437%
Precision: 0.991
Recall: 0.991


array([[  2048,      0,      0,      0,      0,      0,      0,      0],
       [     0,  19422,     14,      2,      0,      0,      0,      0],
       [     0,     11, 274050,  12057,      0,      0,      0,      0],
       [     0,      9,   9523, 288390,      0,      0,      0,      0],
       [     0,      0,      0,      0,    100,      0,      0,      0],
       [     0,      0,      0,      0,      0,     15,      0,      0],
       [     0,      0,      0,      0,      0,      0,    641,      0],
       [     0,      0,      0,      0,      0,      0,      0,    335]],
      dtype=int64)

In [6]:
# Per-class precision / recall / F1 for the model predictions.
# No `target_names` is passed: the report names its rows after the labels it
# finds in y and pred. Deriving the names from y alone makes the call raise
# ValueError whenever pred contains a class that never occurs in y.
print( metrics.classification_report(y_true=y, y_pred=pred) )

              precision    recall  f1-score   support

   CCV:00002       1.00      1.00      1.00      2048
   CCV:00003       1.00      1.00      1.00     19438
   CCV:00005       0.97      0.96      0.96    286118
   CCV:00008       0.96      0.97      0.96    297922
   CCV:00011       1.00      1.00      1.00       100
   CCV:00030       1.00      1.00      1.00        15
   CCV:00052       1.00      1.00      1.00       641
   CCV:00065       1.00      1.00      1.00       335

    accuracy                           0.96    606617
   macro avg       0.99      0.99      0.99    606617
weighted avg       0.96      0.96      0.96    606617



### Evaluation

In [7]:
# Combine the hard-rule results with the model results into one evaluation set.
# The arrays are rebuilt from `df` rather than appended to the existing
# `Predictions` / `Actuals`, so re-running this cell cannot append the model
# results a second time.
Predictions = np.concatenate([df['Predictions'].values, pred])
Actuals = np.concatenate([df['CCV'].values, y])

# Every prediction must line up with exactly one true label
assert len(Predictions) == len(Actuals), "Predictions and Actuals differ in length"

In [8]:
# Scores over the combined hard-rule + model predictions
# (accuracy as a percentage, precision and recall macro-averaged)
Accuracy = metrics.accuracy_score(Actuals, Predictions) * 100
Precision = metrics.precision_score(Actuals, Predictions, average='macro')
Recall = metrics.recall_score(Actuals, Predictions, average='macro')

print(
    f'Accuracy: {Accuracy:.3f}%',
    f'Precision: {Precision:.3f}',
    f'Recall: {Recall:.3f}',
    sep='\n',
)

Accuracy: 99.860%
Precision: 0.998
Recall: 0.998


In [9]:
# Confusion matrix over the combined hard-rule + model predictions;
# the bare name on the last line displays it as the cell output.
CM = metrics.confusion_matrix(y_true=Actuals, y_pred=Predictions)
CM

array([[  2048,      0,      0, ...,      0,      0,      0],
       [     0,  19422,     14, ...,      0,      0,      0],
       [     0,     11, 274050, ...,      0,      0,      0],
       ...,
       [     0,      0,      0, ..., 636004,      0,      0],
       [     0,      0,      0, ...,      0, 636004,      0],
       [     0,      0,      0, ...,      0,      0,    999]], dtype=int64)

In [10]:
from utils.pretty_confusion_matrix import pp_matrix_from_data

# Render the combined confusion matrix with the project's plotting helper
plot_options = {"cmap": "Oranges", "figsize": (25, 25)}
pp_matrix_from_data(CM=CM, **plot_options)
