# Model Evaluation 

In [1]:
import pandas as pd
import joblib
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

# Notebook-wide pandas display settings, applied once before any table is shown.
display_options = {
    'display.max_columns': None,         # no cap on the number of columns shown
    'display.max_rows': 100,             # show up to 100 rows before truncating
    'display.expand_frame_repr': False,  # do not wrap wide frames across several lines
    'max_colwidth': 0,                   # NOTE(review): presumably meant to avoid truncating long affiliation strings; confirm
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

Load in clean training/test dataset

In [2]:
# Read the cleaned dataset from its Parquet file using the fastparquet backend.
clean_spacy_mapaffil = pd.read_parquet(
    "data/clean_spacy_mapaffil.parquet",
    engine="fastparquet",
)

Load in saved NLP model and text vectorizer

In [2]:
# Restore the saved classifier and the text vectorizer from their LZMA-compressed joblib files.
# NOTE(review): joblib.load unpickles arbitrary Python objects, so only load artifacts from a trusted source.
model_path = 'geoinference_linearsvc_1mil.joblib.lzma'
vectorizer_path = 'geoinference_vectorizer_1mil.joblib.lzma'

loaded_model = joblib.load(model_path)
loaded_vectorizer = joblib.load(vectorizer_path)

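Randomly select 100,000 rows (fixed random seed, so the sample is reproducible), assigning the affiliation texts to the input variable and their respective cities to the target variable; the `org` and `gpe` columns are kept alongside for inspecting the predictions later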

In [4]:
# Draw a reproducible 100,000-row sample (fixed seed) from the cleaned dataset.
# NOTE(review): the sample comes from the combined training/test frame; confirm these rows
# were not also used to fit the loaded model, otherwise the scores below are optimistic.
test_sample = clean_spacy_mapaffil.sample(n=100_000, random_state=42)

# Pull out the model input (affiliation text), the target (city), and the two
# columns that are only carried along for the results table.
X_test, y_test, orgs, gpes = (
    list(test_sample[column_name])
    for column_name in ('affiliation', 'city', 'org', 'gpe')
)

Transform the affiliation texts into numerical vectors

In [5]:
# Convert the sampled affiliation strings into feature vectors with the already-fitted
# vectorizer (transform only; the vectorizer is not re-fitted here).
features_test = loaded_vectorizer.transform(X_test)

Generate model predictions on the 100,000 randomly sampled affiliations

In [6]:
# Predict one label per sampled affiliation; the labels are compared against the
# true cities in y_test by the metrics below.
y_pred = loaded_model.predict(features_test)

Output evaluation metrics as percentages: overall accuracy, plus F1, recall, and precision, each averaged across city labels weighted by the number of true samples per label

In [7]:
# Score the predictions against the true cities and print each metric as a percentage.
# F1, recall and precision are support-weighted averages over all city labels.
#
# zero_division=0 states explicitly how a label with no predicted (or no true) samples
# is scored. It yields the same value as the default, but without scikit-learn emitting
# an undefined-metric warning on each call.
weighted_average_options = {'average': 'weighted', 'zero_division': 0}

print('Accuracy: ', "%.2f" % (accuracy_score(y_test, y_pred) * 100))
print('F1: ', "%.2f" % (f1_score(y_test, y_pred, **weighted_average_options) * 100))
print('Recall: ', "%.2f" % (recall_score(y_test, y_pred, **weighted_average_options) * 100))
print('Precision: ', "%.2f" % (precision_score(y_test, y_pred, **weighted_average_options) * 100))

Accuracy:  95.59
F1:  94.77
Recall:  95.59
Precision:  94.34


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Output the model predictions next to the true city, ORG, and GPE values for each sampled affiliation

In [None]:
# Collect inputs, predictions and reference values column by column, one row per
# sampled affiliation, then render the table.
results_dict = {
    'affiliation': X_test,
    'City Prediction': y_pred,
    'True City': y_test,
    'ORG': orgs,
    'GPE': gpes,
}
results_df = pd.DataFrame(results_dict)
display(results_df)

Load in validation set of ambiguous affiliations that MapAffil was unable to assign a city to

In [8]:
# Read the validation set of ambiguous affiliations from its Parquet file (fastparquet backend).
ambiguous_mapaffil_validation = pd.read_parquet(
    "data/ambiguous_mapaffil_validation.parquet",
    engine="fastparquet",
)

Randomly select 100 affiliation texts and transform them into numerical vectors

In [10]:
# Keep a reproducible 100-row sample (fixed seed) of the ambiguous validation set.
# The loaded frame is replaced by its sample here, so re-running this cell without
# reloading the data samples from the already-sampled frame.
ambiguous_mapaffil_validation = ambiguous_mapaffil_validation.sample(
    n=100,
    random_state=42,
)

# Extract the affiliation texts and vectorize them with the already-fitted vectorizer.
ambiguous_mapaffil_validation_X_test = list(
    ambiguous_mapaffil_validation['affiliation']
)
ambiguous_mapaffil_validation_features_test = loaded_vectorizer.transform(
    ambiguous_mapaffil_validation_X_test
)

Generate model predictions on the 100 randomly sampled affiliations

In [11]:
# Predict one label per sampled ambiguous affiliation using the loaded model.
ambiguous_mapaffil_validation_y_pred = loaded_model.predict(ambiguous_mapaffil_validation_features_test)

Output model predictions 

In [None]:
# Put the model's prediction next to the value stored in the dataset's `city` column
# (labelled as MapAffil's prediction), together with the `org` and `gpe` columns,
# one row per sampled affiliation, then render the table.
ambiguous_mapaffil_validation_results_dict = {
    'affiliation': ambiguous_mapaffil_validation_X_test,
    'City Prediction': ambiguous_mapaffil_validation_y_pred,
    'MapAffil Prediction': list(ambiguous_mapaffil_validation['city']),
    'ORG': list(ambiguous_mapaffil_validation['org']),
    'GPE': list(ambiguous_mapaffil_validation['gpe']),
}
ambiguous_mapaffil_validation_results_df = pd.DataFrame(
    ambiguous_mapaffil_validation_results_dict
)
display(ambiguous_mapaffil_validation_results_df)

Load in validation set of PubMed affiliations from papers published after December 2018

In [8]:
# Read the post-2018 validation set from its Parquet file (fastparquet backend).
post_2018_validation = pd.read_parquet(
    "data/post_2018_validation.parquet",
    engine="fastparquet",
)

Randomly select 100 affiliation texts and transform them into numerical vectors

In [9]:
# Keep a reproducible 100-row sample (fixed seed) of the post-2018 validation set.
# The loaded frame is replaced by its sample here, so re-running this cell without
# reloading the data samples from the already-sampled frame.
post_2018_validation = post_2018_validation.sample(
    n=100,
    random_state=42,
)

# Extract the affiliation texts and vectorize them with the already-fitted vectorizer.
# Note that this dataset's text column is capitalised ('Affiliation'), unlike the
# lower-case 'affiliation' column read from the other two datasets.
post_2018_validation_X_test = list(
    post_2018_validation['Affiliation']
)
post_2018_validation_features_test = loaded_vectorizer.transform(
    post_2018_validation_X_test
)

Generate model predictions on the 100 randomly sampled affiliations

In [10]:
# Predict one label per sampled post-2018 affiliation using the loaded model.
post_2018_validation_y_pred = loaded_model.predict(post_2018_validation_features_test)

Output model predictions

In [None]:
# Pair each sampled affiliation with the model's prediction and render the table.
# This table has no reference-city column, so the predictions are for manual review.
post_2018_validation_results_dict = {
    'Affiliation': post_2018_validation_X_test,
    'City Prediction': post_2018_validation_y_pred,
}
post_2018_validation_results_df = pd.DataFrame(
    post_2018_validation_results_dict
)
display(post_2018_validation_results_df)