In [1]:
import warnings
# Silences every warning for the whole session. Note that this also hides
# pandas / scikit-learn warnings raised by the cells below.
warnings.filterwarnings("ignore")


from utils.hard_rules import hard_rules
import pandas as pd
import numpy as np
from sklearn import metrics
import pickle
import joblib

from tqdm import tqdm
# Registers `progress_apply` on pandas objects; the hard-rule cell relies on it.
tqdm.pandas()

In [2]:
# Import data: headerless, tab-separated file; columns arrive numbered 0, 1, 2
df = pd.read_csv('Data/_20230622-130921_training.tsv', sep='\t', header=None)

# Rename columns (returns a new frame instead of mutating in place)
df = df.rename(columns={0: 'Value',
                        1: 'Evidence_type',
                        2: 'CCV'})

# CCV labels containing 'ENG' are excluded from the evaluation.
# `dropna()` keeps a missing CCV from crashing the substring test.
ENG_ccvs = [ccv for ccv in df['CCV'].dropna().unique() if 'ENG' in ccv]

# CCVs for which no prediction can be made
dropped_CCVs = ['CCV:00004', 'CCV:00025', 'CCV:00028', 'CCV:00029', 'CCV:00035', 'CCV:00036', 'CCV:00042', 'CCV:00043',
    'CCV:00047', 'CCV:00048', 'CCV:00049', 'CCV:00072', 'CCV:00084', 'CCV:00087', 'CCV:00088',
    'CCV:00079', 'CCV:00082', 'CCV:00092', 'CCV:00093', 'CCV:00096']

# Same step order as before (ENG filter -> de-duplicate -> CCV filter -> fresh
# index), expressed as one chain so that no step mutates a filtered slice in
# place and the old index is discarded directly instead of being materialised
# as a column and then dropped.
df = (
    df[~df['CCV'].isin(ENG_ccvs)]
    .drop_duplicates()
    .loc[lambda frame: ~frame['CCV'].isin(dropped_CCVs)]
    .reset_index(drop=True)
)

### Hard rule predictions

In [3]:
# Evaluate the hard rules once per row (Value, Evidence_type) with a tqdm
# progress bar, storing whatever `hard_rules` returns in a new column.
df['Predictions'] = df[['Value', 'Evidence_type']].progress_apply(
    lambda row: hard_rules(row.Value, row.Evidence_type),
    axis=1,
)

# Keep only the rows whose prediction does not stringify to "None",
# i.e. the rows for which a hard rule produced a prediction.
has_rule_prediction = df['Predictions'].astype('str').ne("None")
df = df[has_rule_prediction]


100%|██████████| 12446035/12446035 [03:28<00:00, 59569.82it/s]


### Model predictions

In [4]:
# Load features and labels from a single open of the archive; the context
# manager closes the underlying file once both arrays have been read.
# NOTE(review): allow_pickle=True, pickle.load and joblib.load can all execute
# arbitrary code embedded in the file -- only use them on trusted artefacts.
with np.load('Data/data.npz', allow_pickle=True) as npz_data:
    X = npz_data['X']
    y = npz_data['y']

# Load Label encoder (file handle is closed as soon as unpickling finishes)
with open('Model/Label_encoder.pkl', 'rb') as encoder_file:
    encoder = pickle.load(encoder_file)
# Load prediction model
model = joblib.load("Model/model.joblib")
# Get predictions: the model output is mapped back through the label encoder
# so that `pred` can be compared directly against `y`
pred = encoder.inverse_transform( model.predict(X) )


# Model Evaluation (accuracy as a percentage; precision/recall macro-averaged)
Accuracy = 100*metrics.accuracy_score(y, pred)
Recall = metrics.recall_score(y, pred, average='macro')
Precision = metrics.precision_score(y, pred, average='macro')
CM = metrics.confusion_matrix(y, pred)

print(f'Accuracy: {Accuracy:.3f}%')
print(f'Precision: {Precision:.3f}')
print(f'Recall: {Recall:.3f}')
CM

Accuracy: 82.729%
Precision: 0.957
Recall: 0.956


array([[   974,      0,      0,      0,      0,      0,      0,      0],
       [     0,    867,      0,      0,      0,      0,      0,      0],
       [     0,      0, 167991,  24846,      0,      0,      0,      0],
       [     0,      0,  40180, 140840,      0,      0,      0,      0],
       [     0,      0,      0,      0,    144,      0,      0,      0],
       [     0,      0,      0,      0,      0,     14,      0,      0],
       [     0,      0,      0,      0,      0,      0,    408,      0],
       [     0,      0,      0,      0,      0,      0,      0,    236]],
      dtype=int64)

In [5]:
# Per-class precision / recall / F1 for the model-only predictions.
# `target_names` is intentionally omitted: the labels are already strings, so
# scikit-learn prints them as row names itself, and the report no longer raises
# a size-mismatch error if `pred` ever contains a class that is absent from `y`.
print( metrics.classification_report(y_true=y, y_pred=pred) )

              precision    recall  f1-score   support

   CCV:00002       1.00      1.00      1.00       974
   CCV:00003       1.00      1.00      1.00       867
   CCV:00005       0.81      0.87      0.84    192837
   CCV:00008       0.85      0.78      0.81    181020
   CCV:00011       1.00      1.00      1.00       144
   CCV:00030       1.00      1.00      1.00        14
   CCV:00052       1.00      1.00      1.00       408
   CCV:00065       1.00      1.00      1.00       236

    accuracy                           0.83    376500
   macro avg       0.96      0.96      0.96    376500
weighted avg       0.83      0.83      0.83    376500



### Evaluation

In [6]:
# Stack the hard-rule rows (from `df`) and the model rows (`pred` / `y`) into
# one predicted-label array and one true-label array, in the same order.
Predictions = np.concatenate(
    (df['Predictions'].values, pred)
)
Actuals = np.concatenate(
    (df['CCV'].values, y)
)

In [7]:
# Scores for the combined hard-rule + model predictions:
# accuracy as a percentage, precision and recall macro-averaged over classes.
Accuracy = metrics.accuracy_score(y_true=Actuals, y_pred=Predictions) * 100
Precision = metrics.precision_score(y_true=Actuals, y_pred=Predictions, average='macro')
Recall = metrics.recall_score(y_true=Actuals, y_pred=Predictions, average='macro')

print(
    f'Accuracy: {Accuracy:.3f}%',
    f'Precision: {Precision:.3f}',
    f'Recall: {Recall:.3f}',
    sep='\n',
)

Accuracy: 99.404%
Precision: 0.992
Recall: 0.991


In [8]:
# Confusion matrix over the combined true / predicted labels.
# The bare name on the last line makes the notebook display the array.
CM = metrics.confusion_matrix(y_true=Actuals, y_pred=Predictions)
CM

array([[   974,      0,      0, ...,      0,      0,      0],
       [     0,    867,      0, ...,      0,      0,      0],
       [     0,      0, 167991, ...,      0,      0,      0],
       ...,
       [     0,      0,      0, ..., 440209,      0,      0],
       [     0,      0,      0, ...,      0, 440209,      0],
       [     0,      0,      0, ...,      0,      0,    999]], dtype=int64)

In [10]:
# Hand the combined confusion matrix to the project's pretty-printing helper.
# NOTE(review): presumably renders a figure (it takes `cmap` and `figsize`);
# confirm against utils/pretty_confusion_matrix.
from utils.pretty_confusion_matrix import pp_matrix_from_data

pp_matrix_from_data(CM=CM, cmap="Oranges", figsize=(25, 25))
