# Model Evaluation 

In [1]:
import pandas as pd
import joblib
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

# Notebook-wide pandas display settings so the long affiliation strings and all
# result columns render in full.
pd.set_option('display.max_columns', None)         # never hide columns
pd.set_option('display.max_rows', 100)             # frames longer than this are elided with "..."
pd.set_option('display.expand_frame_repr', False)  # keep wide frames on one line instead of wrapping
# Use the fully-qualified option key (abbreviated keys are resolved by pattern
# matching) and the documented "unlimited" value, None, rather than 0.
pd.set_option('display.max_colwidth', None)

Load in clean training/test dataset

In [2]:
# Read the clean MapAffil table from its Parquet file using the fastparquet engine.
clean_spacy_mapaffil = pd.read_parquet(
    "data/clean_spacy_mapaffil.parquet",
    engine="fastparquet",
)

Load in saved NLP model and text vectorizer

In [2]:
# Restore the saved classifier and the text vectorizer that goes with it.
# NOTE: joblib.load unpickles its input, which can execute arbitrary code —
# only load artifacts that come from a trusted source.
model_path = 'geoinference_linearsvc_1mil.joblib.lzma'
vectorizer_path = 'geoinference_vectorizer_1mil.joblib.lzma'

loaded_model = joblib.load(model_path)
loaded_vectorizer = joblib.load(vectorizer_path)

Randomly select 10,000 affiliation texts and their respective city/state/country labels, assigning them to input and target variables

In [4]:
# Draw a fixed-seed sample of 10,000 rows so the evaluation is repeatable.
# NOTE(review): the sample comes from the table described as the training/test
# dataset — confirm these rows were not seen by the model during training.
test_sample = clean_spacy_mapaffil.sample(n=10000, random_state=42)

# Pull out the model input (affiliation text), the target (city label), and the
# ORG / GPE columns shown alongside the predictions later on.
X_test, y_test, orgs, gpes = (
    list(test_sample[column])
    for column in ('affiliation', 'city', 'org', 'gpe')
)

Transform the affiliation texts into numerical vectors

In [5]:
# Convert the sampled affiliation strings to feature vectors with the loaded
# vectorizer; only transform() is called here, so nothing is re-fitted.
features_test = loaded_vectorizer.transform(X_test)

Generate model predictions on the 10,000 randomly sampled affiliations

In [6]:
# Predict one label per sampled affiliation; compared against y_test below.
y_pred = loaded_model.predict(features_test)

Output evaluation metrics, including overall accuracy, F1, recall, and precision

In [7]:
# Report overall accuracy plus weighted-average F1, recall, and precision, each
# as a percentage with two decimals.
#
# The recorded run of this cell emitted scikit-learn `_warn_prf` warnings, which
# are raised when a per-label score is undefined (a zero denominator).
# zero_division=0 states explicitly what the default ("warn") already does
# numerically — score such labels as 0 — without the warning noise.
prf_kwargs = {'average': 'weighted', 'zero_division': 0}

print('Accuracy: ', "%.2f" % (accuracy_score(y_test, y_pred) * 100))
print('F1: ', "%.2f" % (f1_score(y_test, y_pred, **prf_kwargs) * 100))
print('Recall: ', "%.2f" % (recall_score(y_test, y_pred, **prf_kwargs) * 100))
print('Precision: ', "%.2f" % (precision_score(y_test, y_pred, **prf_kwargs) * 100))

Accuracy:  95.59
F1:  94.77
Recall:  95.59
Precision:  94.34


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Output model predictions 

In [8]:
# Line up each sampled affiliation with its predicted and true city, plus the
# ORG / GPE values taken from the same sampled rows, for side-by-side review.
results_dict = {
    'affiliation': X_test,
    'City Prediction': y_pred,
    'True City': y_test,
    'ORG': orgs,
    'GPE': gpes,
}
results_df = pd.DataFrame(results_dict)
display(results_df)

Unnamed: 0,affiliation,City Prediction,True City,ORG,GPE
0,"Newport Laboratories, 1520 Prairie Drive, Worthington, MN, 56187, USA.","Worthington, MN, USA","Worthington, MN, USA",Newport Laboratories,"MN, USA, Worthington"
1,"Norwich Medical School, University of East Anglia, Norwich Research Park, Norwich NR4 7TJ, United Kingdom.","Norwich, Norfolk, UK","Norwich, Norfolk, UK","Norwich Medical School, University of East Anglia","Norwich, United Kingdom"
2,"Institute of Inflammation and Aging, University of Birmingham, Birmingham, UK.","Birmingham, West Midlands, UK","Birmingham, West Midlands, UK","University of Birmingham, Institute of Inflammation and Aging","UK, Birmingham"
3,"Optivet Referrals, Southleigh Farm, Southleigh Road, Havant, Hampshire P09 2NX, UK.","Hampshire, UK","Havant, Hampshire, UK",Optivet Referrals,"Havant, UK, Hampshire"
4,"Department of Animal Science, University of Padova, 35020 Legnaro, Italy.","Legnaro, Padova, Veneto, Italy","Legnaro, Padova, Veneto, Italy",University of Padova,"Legnaro, Italy"
...,...,...,...,...,...
9995,"National Institutes of Health, Bethesda, Maryland 20852-1820, USA.","Bethesda, MD, USA","Bethesda, MD, USA",National Institutes of Health,"USA, Maryland, Bethesda"
9996,"Departement des maladies infectieuses, Hopital Cantonal Universitaire de Geneve, Geneve, Switzerland.","Geneve, Switzerland","Geneve, Switzerland",Hopital Cantonal Universitaire de Geneve,"Switzerland, Geneve"
9997,"Global Livestock Collaborative Research Support Program (GL-CRSP), Kenya Agricultural Research Institute, Nairobi.","Nairobi, Kenya","Nairobi, Kenya","Global Livestock Collaborative Research Support Program, Kenya Agricultural Research Institute, GL-CRSP",Nairobi
9998,"Program in Cellular and Molecular Medicine, Division of Hematology/Oncology, Boston Children's Hospital, Boston, MA.","Boston, MA, USA","Boston, MA, USA","Boston Children's Hospital, Program in Cellular and Molecular Medicine","Boston, MA"


Load in the validation set of ambiguous affiliations to which MapAffil was unable to assign a city

In [8]:
# Read the ambiguous-affiliation validation table from Parquet (fastparquet engine).
ambiguous_mapaffil_validation = pd.read_parquet(
    "data/ambiguous_mapaffil_validation.parquet",
    engine="fastparquet",
)

Randomly select 100 affiliation texts and transform them into numerical vectors

In [10]:
# Keep a fixed-seed sample of 100 rows.
# NOTE: this rebinds the loaded frame to its own sample, so the cell is not
# idempotent — re-running it samples from the 100 rows already kept rather than
# from the full file. Re-run the load cell first if the full table is needed.
ambiguous_mapaffil_validation = ambiguous_mapaffil_validation.sample(n=100, random_state=42)
# Affiliation strings of the sampled rows, as a plain list for the vectorizer.
ambiguous_mapaffil_validation_X_test = list(ambiguous_mapaffil_validation['affiliation'])
# Vectorize with the loaded vectorizer (transform only, no fitting).
ambiguous_mapaffil_validation_features_test = loaded_vectorizer.transform(ambiguous_mapaffil_validation_X_test)

Generate model predictions on the 100 randomly sampled affiliations

In [11]:
# Predict one label per sampled ambiguous affiliation.
ambiguous_mapaffil_validation_y_pred = loaded_model.predict(ambiguous_mapaffil_validation_features_test)

Output model predictions 

In [12]:
# Put the model's prediction next to the value stored in the 'city' column
# (labelled here as MapAffil's prediction), plus the ORG / GPE columns of the
# same sampled rows, so the two can be compared by eye.
ambiguous_mapaffil_validation_results_dict = {
    'affiliation': ambiguous_mapaffil_validation_X_test,
    'City Prediction': ambiguous_mapaffil_validation_y_pred,
    'MapAffil Prediction': list(ambiguous_mapaffil_validation['city']),
    'ORG': list(ambiguous_mapaffil_validation['org']),
    'GPE': list(ambiguous_mapaffil_validation['gpe']),
}
ambiguous_mapaffil_validation_results_df = pd.DataFrame(
    ambiguous_mapaffil_validation_results_dict
)
display(ambiguous_mapaffil_validation_results_df)

Unnamed: 0,affiliation,City Prediction,MapAffil Prediction,ORG,GPE
0,"SCMA. Young Physicians Section, USA.","Provo, UT, USA",USA,"SCMA, Young Physicians Section",USA
1,"3 282184 Heart and Stroke Foundation , Canada.","Toronto, ON, Canada",Canada,Heart and Stroke Foundation,Canada
2,"Applied Physiology Laboratory, University of North Texas, United States.","Denton, TX, USA","TX, USA","Applied Physiology Laboratory, University of North Texas",United States
3,"The Unit for Experimental Asthma and Allergy Research, Division of Physiology, The National Institute of EnvironmentalMedicine, Sweden.","London, UK",Sweden,"The Unit for Experimental Asthma and Allergy Research, The National Institute of EnvironmentalMedicine",Sweden
4,"Seconda Universita degli Studi di Napoli, Facolta di Medicina e Chirurgia, Dipartimento Universitario F. Magrassi, G. Lanzara, Ex Istituto di Scienze Radiologiche, Italy.","Napoli, Campania, Italy",Italy,"Facolta di Medicina e Chirurgia, Ex Istituto di Scienze Radiologiche, G. Lanzara, Seconda Universita degli Studi di Napoli, Dipartimento Universitario F. Magrassi",Italy
5,"Faculty of Education, The University of Hong Kong, Pokfulam Road, Hong Kong.","Pok Fu Lam, Hong Kong",Hong Kong,"Faculty of Education, The University of Hong Kong",Hong Kong
6,"Monitoring Evaluation Research Unit, Jhpiego, Johns Hopkins University, Kenya.","Baltimore, MD, USA",Kenya,"Monitoring Evaluation Research Unit, Johns Hopkins University",Kenya
7,"Division of Biotechnology, Commonwealth Scientific and Industrial Research Organisation, Victoria, Australia.","Melbourne, VIC, Australia","VIC, Australia",Commonwealth Scientific and Industrial Research Organisation,"Australia, Victoria"
8,University of the Basque Country (UPV/EHU).,"Bilbao, Vizcaya, Spain",Spain,"UPV/EHU, University of the Basque Country",
9,"Department of Cardiovascular Development and Repair, Centro Nacional de Investigaciones Cardiovasculares (CNIC), Instituto de Salud Carlos III, Spain.","Madrid, Spain",Spain,"Centro Nacional de Investigaciones Cardiovasculares, Instituto de Salud Carlos III, CNIC",Spain


Load in validation set of PubMed affiliations from papers published after December 2018

In [8]:
# Read the post-2018 PubMed affiliation validation table from Parquet (fastparquet engine).
post_2018_validation = pd.read_parquet(
    "data/post_2018_validation.parquet",
    engine="fastparquet",
)

Randomly select 100 affiliation texts and transform them into numerical vectors

In [9]:
# Keep a fixed-seed sample of 100 rows.
# NOTE: this rebinds the loaded frame to its own sample, so the cell is not
# idempotent — re-running it samples from the 100 rows already kept rather than
# from the full file. Re-run the load cell first if the full table is needed.
post_2018_validation = post_2018_validation.sample(n=100, random_state=42)
# This table's text column is 'Affiliation' (capitalized), unlike the
# lowercase 'affiliation' column used for the other two datasets.
post_2018_validation_X_test = list(post_2018_validation['Affiliation'])
# Vectorize with the loaded vectorizer (transform only, no fitting).
post_2018_validation_features_test = loaded_vectorizer.transform(post_2018_validation_X_test)

Generate model predictions on the 100 randomly sampled affiliations

In [10]:
# Predict one label per sampled post-2018 affiliation.
post_2018_validation_y_pred = loaded_model.predict(post_2018_validation_features_test)

Output model predictions

In [11]:
# Pair each sampled affiliation with its predicted city; this cell adds no
# reference-label column, so the table is for manual inspection only.
post_2018_validation_results_dict = {
    'Affiliation': post_2018_validation_X_test,
    'City Prediction': post_2018_validation_y_pred,
}
post_2018_validation_results_df = pd.DataFrame(post_2018_validation_results_dict)
display(post_2018_validation_results_df)

Unnamed: 0,Affiliation,City Prediction
0,"Department of Ophthalmology, Harvard Medical School and Mass Eye and Ear, Boston, Massachusetts, USA","Boston, MA, USA"
1,"Ukraine Presidential Hospital, Kiev, Ukraine","Kiev, Ukraine"
2,"Departments of Epidemiology and Statistics, Harvard T. H. Chan School of Public Health, Boston, MA, 02115, USA","Boston, MA, USA"
3,"Department of Pharmacology, University of North Carolina at Chapel Hill, Chapel Hill, NC 27599, USA","Chapel Hill, NC, USA"
4,"Department of Dermatology and Cutaneous Biology Research Center, Massachusetts General Hospital, Harvard Medical School, 44 Fruit Street, Boston, MA, 02114, USA","Boston, MA, USA"
5,"Genito Urinary Oncology, Prostate Brachytherapy Unit, Goustave Roussy, Paris, France","Villejuif, Paris, France"
6,"Division of Paediatric Surgery, The National Hospital, Abuja, Nigeria","Abuja, FCT, Nigeria"
7,"Division of Spine, Department of Orthopedics, Tongji Hospital Affiliated to Tongji University School of Medicine, Shanghai, China","Shanghai, China"
8,"John F. Kennedy School of Government, Harvard University, Cambridge, MA, USA","Cambridge, MA, USA"
9,"Division of Hematologic Neoplasia, Department of Medical Oncology, Dana-Farber Cancer Institute, Boston, MA","Boston, MA, USA"
