## Baseline 2 - Grail QA

We have decided which subdomains of Grail QA will constitute our overall domains of `healthcare` and `technology`. Here, we'll train and evaluate another baseline on this data: a logistic regression classifier over TF-IDF features of the questions.

In [1]:
import pandas as pd

# Set the column-width display option to 0, as the notebook did originally.
# NOTE(review): presumably meant to stop long questions being truncated when
# frames are displayed — confirm 0 renders as intended on the pandas in use.
pd.set_option('display.max_colwidth', 0)

In [2]:
from src.data.utils import *

# Load the 'train' and 'dev' splits of grail_qa, wrapping each result of
# get_domains_and_questions in its own DataFrame.
train, dev = (
    pd.DataFrame(get_domains_and_questions(split_name, 'grail_qa'))
    for split_name in ('train', 'dev')
)

In [3]:
# Subdomains of interest for this baseline.
domains = [
    'medicine',
    'computer',
    'spaceflight',
    'biology',
    'automotive',
    'internet',
    'engineering',
]

# Apply set_domains to train first, then dev.
# NOTE(review): presumably restricts/assigns each row to one of `domains` —
# confirm against src.data.utils.set_domains.
train, dev = (set_domains(split_df, domains) for split_df in (train, dev))

In [4]:
# Subdomains that are merged into the 'healthcare' class.
healthcare_subdomains = [
    'medicine',
    'biology',
]

# Subdomains that are merged into the 'technology' class.
technology_subdomains = [
    'computer',
    'spaceflight',
    'automotive',
    'internet',
    'engineering',
]

def set_label(df, label, subdomains):
    """Relabel every row whose ``domains`` value is one of ``subdomains``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``domains`` column. It is modified in place.
    label : scalar
        Value written into ``domains`` for the matching rows.
    subdomains : list-like
        ``domains`` values that should be replaced by ``label``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, so ``df = set_label(df, ...)`` keeps working.
    """
    # Write through a single .loc call on the frame itself. The chained form
    # `df.domains.loc[mask] = label` assigns into an intermediate Series, which
    # triggers SettingWithCopyWarning and leaves `df` untouched when pandas
    # Copy-on-Write is active.
    df.loc[df['domains'].isin(subdomains), 'domains'] = label
    return df

# Collapse the fine-grained subdomains into the two target classes on both
# splits. The two label groups are disjoint, so the order of application does
# not affect the result.
for class_label, class_subdomains in (
    ('healthcare', healthcare_subdomains),
    ('technology', technology_subdomains),
):
    train = set_label(train, class_label, class_subdomains)
    dev = set_label(dev, class_label, class_subdomains)

In [5]:
# Class balance of each split after the subdomains were merged.
train_counts = train.domains.value_counts()
dev_counts = dev.domains.value_counts()

print(f'TRAIN DISTRIBUTION\n{train_counts}')
print(f'DEV DISTRIBUTION\n{dev_counts}')

TRAIN DISTRIBUTION
technology    4967
healthcare    3250
Name: domains, dtype: int64
DEV DISTRIBUTION
technology    408
healthcare    303
Name: domains, dtype: int64


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build TF-IDF features for the questions. The vectorizer is fitted on the
# training questions only; the dev questions are then transformed with that
# same fitted vectorizer, so both matrices share one feature space.
tfidf = TfidfVectorizer()
xt = tfidf.fit_transform(train.questions)  # fit on train, then vectorize it
xd = tfidf.transform(dev.questions)        # vectorize dev without refitting

In [7]:
import numpy as np

def transform_labels(labels):
    """Encode domain labels as floats: ``'healthcare'`` -> 0.0, ``'technology'`` -> 1.0.

    Parameters
    ----------
    labels : array-like
        One-dimensional sequence of domain labels. It is not modified.

    Returns
    -------
    numpy.ndarray
        New ``float64`` array with one entry per input label.

    Raises
    ------
    ValueError
        If an entry is a string other than the two known labels and cannot be
        parsed as a float.
    """
    # Encode on an object-dtype copy. Writing into the caller's array would
    # also overwrite the DataFrame column it was taken from (``.values`` can
    # share memory with the frame) and fails outright on read-only arrays.
    encoded = np.array(labels, dtype=object)
    encoded[encoded == 'healthcare'] = 0.
    encoded[encoded == 'technology'] = 1.
    return encoded.astype(np.float64)

# Numeric target vectors for the two splits (train first, then dev), encoded
# from each frame's `domains` column by transform_labels.
yt, yd = (
    transform_labels(split_df.domains.values)
    for split_df in (train, dev)
)

In [8]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier: logistic regression with scikit-learn's default
# settings, trained on the TF-IDF training features and encoded labels.
clf = LogisticRegression()
# Left as the cell's last expression, so the fitted estimator is displayed.
clf.fit(xt, yt)

LogisticRegression()

In [9]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the dev split.
dev_predictions = clf.predict(xd)
print(classification_report(yd, dev_predictions))

              precision    recall  f1-score   support

         0.0       1.00      0.99      1.00       303
         1.0       0.99      1.00      1.00       408

    accuracy                           1.00       711
   macro avg       1.00      1.00      1.00       711
weighted avg       1.00      1.00      1.00       711



In [10]:
from sklearn.metrics import confusion_matrix

# Confusion matrix on the dev split (first argument: true labels, second:
# predictions).
dev_confusion = confusion_matrix(yd, clf.predict(xd))
print(dev_confusion)

[[300   3]
 [  0 408]]


In [27]:
# Find the misclassified examples: dev rows where the prediction disagrees
# with the true label.
mistake_rows = np.where(yd != clf.predict(xd))[0]
mistakes = xd[mistake_rows]

# Pass the sparse rows straight to inverse_transform. It accepts sparse input,
# whereas `.todense()` yields an np.matrix, which recent scikit-learn releases
# reject during input validation. Each result holds the vocabulary terms with
# a non-zero weight in that question (word order is not preserved).
for terms in tfidf.inverse_transform(mistakes):
    print(terms.tolist())

['been', 'by', 'contacted', 'has', 'heese', 'oliver', 'who']
['contraindications', 'deracoxib', 'for', 'is', 'number', 'of', 'the', 'what']
['contraindications', 'for', 'is', 'number', 'of', 'teriparatide', 'the', 'what']
