In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, plot_confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import NuSVC

from mrec.data.dataset import load_data
from mrec.features.transform import clean_text

In [None]:
# Load the raw train / validation / test splits from CSV.
csv_fnames = {
    'train': '../dataset/raw/train.csv',
    'validation': '../dataset/raw/validation.csv',
    'test': '../dataset/raw/test.csv',
}
dataset = load_data(csv_fnames)
train, validation, test = dataset.train, dataset.validation, dataset.test

# Keep only rows labelled with one of the two target relations, restricted to
# the id / text / label columns, then drop exact duplicate rows.
relation_type = ['causes', 'treats']
features_list = ['_unit_id', 'sentence', 'relation']
train = train.loc[train['relation'].isin(relation_type), features_list].drop_duplicates()
validation = validation.loc[validation['relation'].isin(relation_type), features_list].drop_duplicates()
test = test.loc[test['relation'].isin(relation_type), features_list].drop_duplicates()

#TODO add feature engineering
# NOTE(review): scikit-learn documents `ngram_range` as applying only when
# `analyzer` is not callable, so it may have no effect here -- confirm that
# clean_text emits the intended tokens.
count_vect = CountVectorizer(ngram_range=(1, 3), analyzer=clean_text)

# The vocabulary is learned from the training sentences only; validation and
# test are transformed with that fitted vocabulary.
X_counts_train = count_vect.fit_transform(train['sentence'])
X_train_label = train['relation']

# Dense term-count table (one column per vocabulary entry) for inspection.
most_popular_word_df = pd.DataFrame(
    X_counts_train.toarray(),
    columns=count_vect.get_feature_names(),
)

X_counts_validation = count_vect.transform(validation['sentence'])
X_validation_label = validation['relation']

X_counts_test = count_vect.transform(test['sentence'])
X_test_label = test['relation']


In [None]:
# Total occurrences of every vocabulary term over the training sentences,
# sorted most frequent first; keep the labels of the first 200 terms.
temp_df = (
    pd.DataFrame(most_popular_word_df.sum(axis=0), columns=['count'])
    .sort_values(by='count', ascending=False)
)
count_vec_top_200 = temp_df.index[:200].tolist()
count_vec_top_200

In [None]:
def print_metric(gtruth, predictions, dset_name):
    """Print one row of scores: accuracy, ROC AUC, F1, precision and recall.

    Each score is rounded to 4 decimals and printed right-aligned after the
    dataset name.

    Args:
        gtruth (array): label (either 0 or 1)
        predictions (array): model prediction (either 0 or 1)
        dset_name: name of the dataset being evaluated, printed as the row label
    """
    # Order matters: it fixes the column order of the printed row.
    scorers = (accuracy_score, roc_auc_score, f1_score, precision_score, recall_score)
    scores = [round(scorer(gtruth, predictions), 4) for scorer in scorers]
    print('{:>10} {:>11} {:>12} {:>12} {:>11} {:>12}'.format(dset_name, *scores))

def evaluate_model(model, X, y, dset_name):
    """Predict with `model` on `X`, print the metric row, and return the predictions.

    Args:
        model: fitted classifier exposing `predict` (e.g. NuSVC)
        X: vectorized features to predict on
        y: ground-truth labels for `X`
        dset_name: name of the dataset being evaluated, printed as the row label

    Returns:
        The raw (un-encoded) predictions returned by `model.predict(X)`.
    """
    predictions = model.predict(X)

    # The encoder is fitted on the ground-truth labels and reused for the
    # predictions so both sides share the same integer codes.
    label_encoder = LabelEncoder().fit(y)
    print_metric(label_encoder.transform(y), label_encoder.transform(predictions), dset_name)
    return predictions

In [None]:
"""Train the best model"""
model = NuSVC()

print('Training model..')
model.fit(X_counts_train, X_train_label)

# Header row, followed by one metric row per split (train, validation, test).
print('{:>23} {:>12} {:>12} {:>12} {:>10}'.format('Accuracy', 'ROC_AUC', 'F1-score', 'Precision', 'Recall'))
train_predictions, val_predictions, test_predictions = [
    evaluate_model(model, split_features, split_labels, split_name)
    for split_features, split_labels, split_name in (
        (X_counts_train, X_train_label, 'Train'),
        (X_counts_validation, X_validation_label, 'Validation'),
        (X_counts_test, X_test_label, 'Test'),
    )
]

In [None]:
# The confusion matrix rows/columns follow the sorted label order, so the
# display labels must be sorted too. Series.unique() returns labels in order of
# first appearance, which can swap the tick labels.
class_names = np.sort(X_test_label.unique())
plot_confusion_matrix(model, X_counts_test, X_test_label,
                      display_labels=class_names,
                      cmap=plt.cm.Blues,
                      normalize='true')
plt.show()

In [None]:
enc = LabelEncoder()
gtruth = enc.fit_transform(X_test_label)
encoder_predictions = enc.transform(test_predictions)
# Name the report rows from the encoder itself: enc.classes_[i] is the label
# encoded as i. A list built with Series.unique() is in order of first
# appearance and can attach the wrong name to each row.
print(classification_report(gtruth, encoder_predictions, target_names=enc.classes_))

In [None]:
# Sanity check: number of predictions produced for the training split.
np.shape(train_predictions)

In [None]:
# Directory that receives the `<split>-predictions.csv` files. The default is
# the original author's local path; set MREC_BASELINE_MODEL_DIR to run the
# notebook on another machine without editing this cell.
base_dir = os.environ.get(
    'MREC_BASELINE_MODEL_DIR',
    '/Users/ktle2/personal_projects/mrec/models/baseline_model',
)
# add predictions back into the dataframe
# save the dataframe as the csv file `-predictions.csv`

# assign() returns a new frame, so re-running this cell is idempotent and no
# column is written into a frame derived by filtering.
train = train.assign(relation_pred=train_predictions)
train.head()

In [None]:
# Unit id -> predicted relation for the rows the model was evaluated on.
id_pred = train.set_index('_unit_id')['relation_pred'].to_dict()
# Work on a copy so the raw frame held by `dataset` is not modified in place.
train_with_pred = dataset.train.copy()
# Unit ids that are not keys of id_pred get NaN.
train_with_pred['relation_pred'] = train_with_pred['_unit_id'].map(id_pred)

In [None]:
import os
mode = 'train'
# `csv_file` stays a template (`{}` = split name); it is formatted once per split.
csv_file = os.path.join(base_dir, '{}-predictions.csv')
train_csv_path = csv_file.format(mode)
train_with_pred.to_csv(train_csv_path)
# Report the resolved path; formatting the message with the template itself
# would print a literal `{}` in the file name.
print('File saved {}'.format(train_csv_path))

In [None]:
def save_predictions(data, predictions, new_data, save=False, output_csv_file=None):
    """Attach per-unit predictions to a raw dataframe and optionally save it as CSV.

    Args:
        data (pd.DataFrame): rows the model was evaluated on; must contain a
            `_unit_id` column and have one row per entry of `predictions`.
        predictions (array-like): predicted relation for each row of `data`.
        new_data (pd.DataFrame): dataframe with a `_unit_id` column that
            receives the predictions.
        save (bool): when True, write the result to `output_csv_file`.
        output_csv_file (str): destination CSV path; required when `save` is True.

    Returns:
        pd.DataFrame: a copy of `new_data` with an added `relation_pred`
        column. Unit ids that do not occur in `data` get NaN. Neither input
        frame is modified.

    Raises:
        ValueError: if `save` is True but no `output_csv_file` is given.
    """
    pred_col = 'relation_pred'
    id_col = '_unit_id'
    if save and output_csv_file is None:
        # to_csv(None) would only return a string and write nothing.
        raise ValueError('output_csv_file is required when save=True')

    # assign() returns a new frame, so the caller's `data` is left untouched.
    id_pred = data.assign(**{pred_col: predictions}).set_index(id_col)[pred_col].to_dict()
    new_data_with_pred = new_data.copy()
    new_data_with_pred[pred_col] = new_data_with_pred[id_col].map(id_pred)
    if save:
        print(f'Saving file as {output_csv_file}')
        new_data_with_pred.to_csv(output_csv_file)
    return new_data_with_pred

# Write `validation-predictions.csv` and `test-predictions.csv` from the
# `csv_file` template and keep the resulting frames.
val_predd = save_predictions(
    validation,
    val_predictions,
    dataset.validation,
    save=True,
    output_csv_file=csv_file.format('validation'),
)
test_predd = save_predictions(
    test,
    test_predictions,
    dataset.test,
    save=True,
    output_csv_file=csv_file.format('test'),
)

In [None]:
# Reload the raw splits for the error analysis. The paths use the same
# `../dataset/raw/` prefix as the first load in this notebook; a different
# relative prefix cannot resolve from the same working directory.
csv_fnames = {
    'train': '../dataset/raw/train.csv',
    'validation': '../dataset/raw/validation.csv',
    'test': '../dataset/raw/test.csv',
}
relation_type = ['causes', 'treats']
dataset = load_data(csv_fnames)
train_df, validation_df, test_df = dataset.train, dataset.validation, dataset.test

# Keep only the sentence and its label for the two target relations, and drop
# duplicate (sentence, relation) pairs.
analysis_columns = ['sentence', 'relation']
train_df = train_df.loc[train_df['relation'].isin(relation_type), analysis_columns].drop_duplicates()
validation_df = validation_df.loc[validation_df['relation'].isin(relation_type), analysis_columns].drop_duplicates()
test_df = test_df.loc[test_df['relation'].isin(relation_type), analysis_columns].drop_duplicates()

## Reason why the model fails to classify relations in the test set

In [None]:
# Predict directly on test_df's own sentences with the baseline
# CountVectorizer model. The name `predictions` is only assigned by a later
# cell, so it does not exist yet on a fresh top-to-bottom run, and predicting
# here guarantees one prediction per test_df row.
test_df['pred'] = model.predict(count_vect.transform(test_df['sentence']))
# .copy() gives an independent frame, so columns can be added to
# false_pred_df without pandas' SettingWithCopyWarning.
false_pred_df = test_df[test_df['relation'] != test_df['pred']].copy()
false_pred_df

In [None]:
# Full text of the second misclassified test sentence.
false_pred_df['sentence'].iloc[1]

In [None]:
# Look the sentence up in the raw training split to see its id, label,
# direction and term pair.
raw = dataset.train.loc[:, ['_unit_id', 'relation', 'sentence', 'direction', 'term1', 'term2']]
raw.loc[raw['sentence'].eq("INSULIN PEPTIDE B9 23 is a major autoantigen in TYPE 1 DIABETES")]

## Reason why model fails to classify in validation set

In [None]:
# NOTE(review): val_predictions was computed on `validation`, which is
# de-duplicated on (_unit_id, sentence, relation), while validation_df is
# de-duplicated on (sentence, relation) only -- confirm both have the same
# rows in the same order.
validation_df['pred'] = val_predictions
validation_df_false_pred = validation_df.loc[validation_df['pred'].ne(validation_df['relation'])]
validation_df_false_pred.head()

In [None]:
# Full text of the first misclassified validation sentence.
validation_df_false_pred['sentence'].iloc[0]

In [None]:
# Raw validation rows (id, label, direction, term pair) for the misclassified sentence.
raw_df = dataset.validation.loc[:, ['_unit_id', 'relation', 'sentence', 'direction', 'term1', 'term2']]
raw_df.loc[raw_df['sentence'].eq('A possible role of LEU in sensomotor cortex is limitation of intensity and duration of SEIZURES and prevention of STATUS EPILEPTICUS')]

In [None]:
# Same sentence looked up in the raw training split.
raw_df = dataset.train.loc[:, ['_unit_id', 'relation', 'sentence', 'direction', 'term1', 'term2']]
raw_df.loc[raw_df['sentence'].eq('A possible role of LEU in sensomotor cortex is limitation of intensity and duration of SEIZURES and prevention of STATUS EPILEPTICUS')]

In [None]:
pd.set_option('display.max_columns', 7000)
# Densify the vectorized test matrix. `test` itself is a DataFrame and has no
# toarray(); the sparse counts live in X_counts_test.
count_vect_df = pd.DataFrame(X_counts_test.toarray(), columns=count_vect.get_feature_names())
count_vect_df.shape

In [None]:
# Per-sentence length features: number of characters excluding spaces, and
# number of whitespace-separated words.
train_df['chacracter count'] = train_df['sentence'].map(lambda sentence: len(sentence) - sentence.count(" "))
train_df['word count'] = train_df['sentence'].map(lambda sentence: len(sentence.split()))
train_df.head()

## Character Count

In [None]:
bins = np.linspace(0, 600, 60)

# Overlay the density-normalised character-count distribution of each class.
for relation_name in ('causes', 'treats'):
    class_rows = train_df[train_df['relation'] == relation_name]
    plt.hist(class_rows['chacracter count'], bins, alpha=0.5, label=relation_name, density=True)
plt.legend(loc='upper right')
plt.show()

In [None]:
# assign() builds a new frame instead of writing a column into false_pred_df.
# That frame comes from filtering another frame, and writing into such a
# result can trigger pandas' SettingWithCopyWarning.
false_pred_df = false_pred_df.assign(**{
    'chacracter count': false_pred_df['sentence'].apply(lambda x: len(x) - x.count(" ")),
})
bins = np.linspace(0, 600, 60)

# Overlay the density-normalised character-count distribution of the
# misclassified sentences, split by true relation.
plt.hist(false_pred_df[false_pred_df['relation']=='causes']['chacracter count'], bins, alpha=0.5, label='causes', density=True)
plt.hist(false_pred_df[false_pred_df['relation']=='treats']['chacracter count'], bins, alpha=0.5, label='treats', density=True)
plt.legend(loc='upper right')
plt.show()

## Word Count

In [None]:
bins = np.linspace(0, 100, 60)

# Overlay the density-normalised word-count distribution of each class.
for relation_name in ('causes', 'treats'):
    class_rows = train_df[train_df['relation'] == relation_name]
    plt.hist(class_rows['word count'], bins, alpha=0.5, label=relation_name, density=True)
plt.legend(loc='upper right')
plt.show()

In [None]:
# assign() builds a new frame instead of writing a column into false_pred_df.
# That frame comes from filtering another frame, and writing into such a
# result can trigger pandas' SettingWithCopyWarning.
false_pred_df = false_pred_df.assign(**{
    'word count': false_pred_df['sentence'].apply(lambda x: len(x.split())),
})
bins = np.linspace(0, 100, 60)

# Overlay the density-normalised word-count distribution of the misclassified
# sentences, split by true relation.
plt.hist(false_pred_df[false_pred_df['relation']=='causes']['word count'], bins, alpha=0.5, label='causes', density=True)
plt.hist(false_pred_df[false_pred_df['relation']=='treats']['word count'], bins, alpha=0.5, label='treats', density=True)
plt.legend(loc='upper right')
plt.show()

## Punctuation Count

In [None]:
import string 
def count_punct(text):
    """Return the percentage of non-space characters in `text` that are punctuation.

    Args:
        text (str): sentence to measure.

    Returns:
        float: share of `string.punctuation` characters among the non-space
        characters, rounded to 3 decimals as a fraction and then scaled to a
        percentage; 0.0 when `text` has no non-space characters.
    """
    non_space_chars = len(text) - text.count(" ")
    if non_space_chars == 0:
        # Empty or all-space text would otherwise raise ZeroDivisionError.
        return 0.0
    punct_chars = sum(1 for char in text if char in string.punctuation)
    return round(punct_chars / non_space_chars, 3) * 100

# Punctuation share (in percent) of every training sentence.
train_df['punct%'] = train_df['sentence'].apply(count_punct)
train_df.head()

In [None]:
bins = np.linspace(0, 20, 40)

# Overlay the density-normalised punctuation-share distribution of each class.
for relation_name in ('causes', 'treats'):
    class_rows = train_df[train_df['relation'] == relation_name]
    plt.hist(class_rows['punct%'], bins, alpha=0.5, label=relation_name, density=True)
plt.legend(loc='upper right')
plt.show()

## TF-IDF vs CountVectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# NOTE(review): use_idf=False turns off the inverse-document-frequency
# re-weighting, so these features are term frequencies without IDF -- confirm
# this is intended for a "TF-IDF" comparison. scikit-learn also documents
# `ngram_range` as applying only when `analyzer` is not callable.
tfidf_vect = TfidfVectorizer(ngram_range=(1, 3), analyzer=clean_text, use_idf=False)

# The vocabulary is learned from the training sentences only.
X_tfidf_train, X_tfidf_train_label = tfidf_vect.fit_transform(train_df['sentence']), train_df['relation']

# Disabled snippet kept as a string literal (never executed).
'''
indices = np.argsort(tfidf_vect.idf_)[::-1]
features = tfidf_vect.get_feature_names()
top_n = 200
top_features = [features[i] for i in indices[:top_n]]
print(top_features)
'''

# Validation and test are transformed with the vocabulary fitted above.
X_tfidf_eval, X_tfidf_eval_label = tfidf_vect.transform(validation_df['sentence']), validation_df['relation']

X_tfidf_test, X_tfidf_test_label = tfidf_vect.transform(test_df['sentence']), test_df['relation']

In [None]:
new_model = NuSVC()

print('Training model..')
new_model.fit(X_tfidf_train, X_tfidf_train_label)

# Header row, followed by one metric row per split. After the loop
# `predictions` holds the predictions of the last split evaluated (test).
print('{:>23} {:>12} {:>12} {:>12} {:>10}'.format('Accuracy', 'ROC_AUC', 'F1-score', 'Precision', 'Recall'))
for split_features, split_labels, split_name in (
    (X_tfidf_train, X_tfidf_train_label, 'Train'),
    (X_tfidf_eval, X_tfidf_eval_label, 'Validation'),
    (X_tfidf_test, X_tfidf_test_label, 'Test'),
):
    predictions = evaluate_model(new_model, split_features, split_labels, split_name)

In [None]:
enc = LabelEncoder()
gtruth = enc.fit_transform(X_tfidf_test_label)
encoder_predictions = enc.transform(predictions)
# Name the report rows from the encoder itself: enc.classes_[i] is the label
# encoded as i. This also removes the dependency on a `class_names` value left
# over from an earlier cell, which is in order of first appearance.
print(classification_report(gtruth, encoder_predictions, target_names=enc.classes_))

In [None]:
# The confusion matrix rows/columns follow the sorted label order, so the
# display labels must be sorted too. Series.unique() returns labels in order of
# first appearance, which can swap the tick labels.
class_names = np.sort(X_tfidf_test_label.unique())
plot_confusion_matrix(new_model, X_tfidf_test, X_tfidf_test_label,
                      display_labels=class_names,
                      cmap=plt.cm.Blues,
                      normalize='true')
plt.show()