In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, cross_val_predict
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix,roc_auc_score, roc_curve, precision_recall_curve
from sklearn.utils import class_weight
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.utils.fixes import signature
from scipy.sparse import hstack
from sklearn.naive_bayes import MultinomialNB
import seaborn as sns
from collections import defaultdict
import pickle
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from scipy.sparse import coo_matrix, hstack, vstack
from collections import Counter

#Read manual annotated data
old_train_data_file = "/efs/CONSORT/MLDataset/valid_data_withMetamap.csv"
old_test_data_file = "/efs/CONSORT/MLDataset/test_data_withMetamap.csv"

old_automatic_train_data_file = "/efs/CONSORT/MLDataset/train_data_withMetamap.csv"

old_train_data_df = pd.read_csv(old_train_data_file, encoding = "latin")
old_test_data_df = pd.read_csv(old_test_data_file, encoding = "latin")
old_automatic_train_data_df = pd.read_csv(old_automatic_train_data_file, encoding = "latin")

old_train_data_df = old_train_data_df[["PMCID", "sentence_id","CONSORT_Item","section","sentence_text","metamap_concepts","metamap_concepts_text","metamap_semantictypes"]]
old_test_data_df = old_test_data_df[["PMCID", "sentence_id","CONSORT_Item","section","sentence_text","metamap_concepts","metamap_concepts_text","metamap_semantictypes"]]
old_automatic_train_data_df = old_automatic_train_data_df[["PMCID", "sentence_id","CONSORT_Item","section","sentence_text","metamap_concepts","metamap_concepts_text","metamap_semantictypes"]]

all_old_data = pd.concat([old_train_data_df,old_test_data_df])

#NEW SPLIT FROM HALIL:
# new_train_data_file = "/efs/CONSORT/MLDataset/rare_data.csv"
new_train_data_file = "/efs/CONSORT/MLDataset/split_train.csv"
new_test_data_file = "/efs/CONSORT/MLDataset/split_test.csv"

new_train_data_df = pd.read_csv(new_train_data_file, encoding = "latin")
new_test_data_df = pd.read_csv(new_test_data_file, encoding = "latin")

new_train_data_df = new_train_data_df[["PMCID", "sentence_id","text","labels","CONSORT_Item","n_labels"]]
new_test_data_df = new_test_data_df[["PMCID", "sentence_id","text","labels","CONSORT_Item","n_labels"]]

train_data_df = pd.merge(new_train_data_df,all_old_data, on = ["PMCID","sentence_id"], how = "left")
# train_data_df = pd.merge(new_train_data_df,old_automatic_train_data_df, on = ["PMCID","sentence_id"], how = "left")
print (len(train_data_df))
train_data_df = train_data_df[["PMCID", "sentence_id","CONSORT_Item_y","section","sentence_text","metamap_concepts","metamap_concepts_text","metamap_semantictypes"]]
train_data_df.rename(columns = {'CONSORT_Item_y':'CONSORT_Item'}, inplace = True) 

test_data_df = pd.merge(new_test_data_df,all_old_data, on = ["PMCID","sentence_id"], how = "left")
# print (len(test_data_df))
test_data_df = test_data_df[["PMCID", "sentence_id","CONSORT_Item_y","section","sentence_text","metamap_concepts","metamap_concepts_text","metamap_semantictypes"]]
test_data_df.rename(columns = {'CONSORT_Item_y':'CONSORT_Item'}, inplace = True) 
print (len(test_data_df))

train_labels = train_data_df["CONSORT_Item"]
train_labels = train_labels.tolist()
train_value_counts = Counter(train_labels)
print ("Manual train labels distribution: ", train_value_counts)

test_labels = test_data_df["CONSORT_Item"]
test_labels = test_labels.tolist()
test_value_counts = Counter(test_labels)
print ("Manual test labels distribution: ", test_value_counts)

2002
550
Manual train labels distribution:  Counter({'6a': 523, '0': 493, '5': 217, '12a': 213, '4a': 123, '7a': 83, '12b': 55, '3a': 52, '10': 46, '11a': 45, '8b': 39, '8a': 33, '4b': 29, '9': 16, '11b': 15, '7b': 10, '3b': 7, '6b': 3})
Manual test labels distribution:  Counter({'0': 137, '6a': 132, '12a': 56, '5': 52, '4a': 37, '7a': 30, '12b': 17, '3a': 15, '11a': 12, '10': 11, '4b': 10, '8b': 10, '8a': 10, '9': 6, '7b': 6, '6b': 3, '11b': 3, '3b': 3})


In [2]:
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_selection import SelectKBest

# TRAINING DATA
#Extract features from training data
#Get ngram features
vectorizer = CountVectorizer(ngram_range = (2,2))
train_text_features = vectorizer.fit_transform(train_data_df['sentence_text'])
#select the best k bigram features
selector = SelectKBest(chi2, k = 'all').fit(train_text_features, train_data_df['CONSORT_Item'])
train_text_features_selected = selector.transform(train_text_features) 

#Get section header feature
lb_make = LabelBinarizer()
train_header_features = lb_make.fit_transform(train_data_df["section"].astype(str))
train_header_features = np.asmatrix(train_header_features)

#Get Metamap features
metamap_concepts_unique = []
train_metamap_concepts = list(train_data_df['metamap_concepts'])
for concepts in train_metamap_concepts:
    if pd.notna(concepts) :
        concept_items  = concepts.split("|")
        for concept_item in concept_items:
            if concept_item not in metamap_concepts_unique:
                metamap_concepts_unique.append(concept_item)
                
metamap_semantictypes_unique = []
train_metamap_semantictypes = list(train_data_df['metamap_semantictypes'])
for semantictypes in train_metamap_semantictypes:
    if pd.notna(semantictypes) :
        semantictypes_items  = semantictypes.split("|")
        for semantictypes_item in semantictypes_items:
            if semantictypes_item not in metamap_semantictypes_unique:
                metamap_semantictypes_unique.append(semantictypes_item)

mlb = MultiLabelBinarizer(classes=metamap_concepts_unique)
train_metamap_concepts_features = mlb.fit_transform(train_data_df["metamap_concepts"].astype(str))

mlb_semantictype = MultiLabelBinarizer(classes=metamap_semantictypes_unique)
train_semantictype_features = mlb_semantictype.fit_transform(train_data_df["metamap_semantictypes"].astype(str))

# DEFINE WHICH FEATURES DO YOU WANT TO USE 
X_train_dm = hstack([train_text_features_selected,train_header_features,train_metamap_concepts_features,train_semantictype_features])
# X_train_dm = hstack([train_text_features_selected,train_header_features])
# X_train_dm = train_text_features_selected

print ("Training data: " , X_train_dm.shape)
y_train_dm = train_data_df['CONSORT_Item']

#TESTING DATA 
test_text = test_data_df['sentence_text']
test_text_features = vectorizer.transform(test_text)
test_text_features_selected = selector.transform(test_text_features) 
test_header_features = lb_make.transform(test_data_df["section"])
test_header_features = np.asmatrix(test_header_features)
test_metamap_concepts_features = mlb.transform(test_data_df["metamap_concepts"].astype(str))
test_semantictype_features = mlb_semantictype.transform(test_data_df["metamap_semantictypes"].astype(str))

# DEFINE WHICH FEATURES DO YOU WANT TO USE 
X_test_dm = hstack([test_text_features_selected,test_header_features,test_metamap_concepts_features,test_semantictype_features])
# X_test_dm = hstack([test_text_features_selected,test_header_features])
# X_test_dm = test_text_features_selected

print ("Test data: ", X_test_dm.shape)
y_test_dm = test_data_df['CONSORT_Item']

Training data:  (2002, 31798)
Test data:  (550, 31798)


In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV, PredefinedSplit
import numpy

X_train = X_train_dm
y_train = y_train_dm

X_test = X_test_dm
y_test = y_test_dm

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
import pickle
from sklearn.pipeline import Pipeline
from sklearn import linear_model, datasets

# Choose some parameter combinations to try
param_grid = {'C':[1,10],'gamma':[1], 'kernel':['linear']}
clf = SVC(decision_function_shape = "ovr")

# Run the grid search
grid_obj = GridSearchCV(clf,param_grid,refit = True, verbose=2)
grid_obj = grid_obj.fit(X_train, y_train)

# Set the clf to the best combination of parameters
clf = grid_obj.best_estimator_

#this is the classifier used for feature selection
clf_pipe_multiclass = Pipeline([('model', clf)])

# Fit the best algorithm to the data
print ("Start training process...")
clf_multiclass = clf.fit(X_train, y_train)

Fitting 3 folds for each of 2 candidates, totalling 6 fits
[CV] C=1, gamma=1, kernel=linear .....................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ...................... C=1, gamma=1, kernel=linear, total=   2.9s
[CV] C=1, gamma=1, kernel=linear .....................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.6s remaining:    0.0s


[CV] ...................... C=1, gamma=1, kernel=linear, total=   3.2s
[CV] C=1, gamma=1, kernel=linear .....................................
[CV] ...................... C=1, gamma=1, kernel=linear, total=   3.1s
[CV] C=10, gamma=1, kernel=linear ....................................
[CV] ..................... C=10, gamma=1, kernel=linear, total=   3.0s
[CV] C=10, gamma=1, kernel=linear ....................................
[CV] ..................... C=10, gamma=1, kernel=linear, total=   3.1s
[CV] C=10, gamma=1, kernel=linear ....................................
[CV] ..................... C=10, gamma=1, kernel=linear, total=   2.8s


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   23.1s finished


Start training process...


In [5]:
# save the models to disk
filename_classifier = '/efs/lhoang2/Models/model_BOW_ManualDataOnly_NewSplit.sav'
pickle.dump(clf_multiclass, open(filename_classifier, 'wb'))

#Test SET
#Run the model on Test set
X_test = X_test_dm
y_test = y_test_dm
# Get validation results
predictions_multiclass = clf_multiclass.predict(X_test)
#Print Accurancy, ROC AUC, F1 Scores, Recall, Precision)
print ('Accuracy:', accuracy_score(y_test, predictions_multiclass))
print ('Precision:', precision_score(y_test, predictions_multiclass,average='weighted'))
print ('Recall:', recall_score(y_test, predictions_multiclass,average='weighted'))
print ('F1 score:', f1_score(y_test, predictions_multiclass,average='weighted'))
print (classification_report(y_test,predictions_multiclass))

Accuracy: 0.49454545454545457
Precision: 0.5597724760676217
Recall: 0.49454545454545457
F1 score: 0.4686567290965292
              precision    recall  f1-score   support

           0       0.36      0.80      0.49       137
          10       1.00      0.09      0.17        11
         11a       0.75      0.25      0.38        12
         11b       0.00      0.00      0.00         3
         12a       0.65      0.55      0.60        56
         12b       1.00      0.06      0.11        17
          3a       0.62      0.53      0.57        15
          3b       0.00      0.00      0.00         3
          4a       0.73      0.51      0.60        37
          4b       0.00      0.00      0.00        10
           5       0.48      0.23      0.31        52
          6a       0.68      0.53      0.60       132
          6b       0.00      0.00      0.00         3
          7a       1.00      0.47      0.64        30
          7b       0.00      0.00      0.00         6
          8a      