# Library & Data Loading

In [1]:
import os
import pandas as pd
import numpy as np
import string
import re
import math
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics

# CUDA_LAUNCH_BLOCKING=1 asks CUDA to run kernel launches synchronously
# (a debugging aid); no GPU work is visible in this part of the notebook.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

# Remote CSV with the medical transcription samples.
DATA = "https://github.com/socd06/private_nlp/raw/master/data/mtsamples.csv"

# Specialties that keep their own class; every other specialty becomes 'Other'.
filtered_labels = [
    "Surgery",
    "Consult - History and Phy.",
    "Cardiovascular / Pulmonary",
    "Orthopedic",
]

# Read only the two columns that are used and drop rows missing either one.
data = pd.read_csv(DATA, usecols=['medical_specialty', 'transcription'])
data = data.dropna()
data.columns = ['labels', 'text']

# Strip surrounding whitespace from each specialty, then collapse anything
# outside `filtered_labels` into the catch-all 'Other' class.
data['labels'] = data['labels'].str.strip().map(
    lambda label: label if label in filtered_labels else 'Other'
)

# 60/40 split, stratified on the collapsed label and seeded for repeatability;
# each part gets a fresh 0..n-1 index.
train, test = (
    part.reset_index(drop=True)
    for part in train_test_split(
        data, test_size=0.4, stratify=data.labels, random_state=0
    )
)

In [3]:
# Preview the held-out split; the bare last expression renders the DataFrame.
test

Unnamed: 0,labels,text
0,Orthopedic,"EXAM:,MRI LEFT SHOULDER,CLINICAL:,This is a 69..."
1,Other,"PREOPERATIVE DIAGNOSIS:, Right hallux abducto..."
2,Other,"CHIEF COMPLAINT:, Follicular non-Hodgkin's ly..."
3,Cardiovascular / Pulmonary,"PROCEDURE: ,Direct-current cardioversion.,BRI..."
4,Consult - History and Phy.,"REASON FOR CONSULTATION:, Coronary artery dis..."
...,...,...
1982,Other,"PREOPERATIVE DIAGNOSES:,1. Fullness in right ..."
1983,Orthopedic,"PREOPERATIVE DIAGNOSES: , Erythema of the righ..."
1984,Other,"REASON FOR EXAM: , Right-sided abdominal pain ..."
1985,Cardiovascular / Pulmonary,We discovered new T-wave abnormalities on her ...


In [4]:
# Number of distinct transcription strings in the training split.
np.unique(train['text']).size

1980

# Convert data format from multi-class to multi-label

In [5]:
# Output schema for the multi-label frames: one column per label, in order of
# first appearance in the training split, followed by the 'text' column.
column_names = list(train['labels'].unique()) + ['text']
print(column_names)

['Consult - History and Phy.', 'Surgery', 'Other', 'Orthopedic', 'Cardiovascular / Pulmonary', 'text']


In [7]:
def convert_to_multi_labels(df, columns=None):
    """Collapse a multi-class frame into a multi-label indicator frame.

    Rows of ``df`` that share the same ``text`` are merged into a single row
    whose label columns hold 1 for every label that text was seen with and 0
    otherwise. Output rows follow the order in which each text first appears.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a ``text`` column and a ``labels`` column. Rows are read
        positionally, so any index is accepted.
    columns : list of str, optional
        Output column order (label names plus ``'text'``). When omitted, the
        notebook-level ``column_names`` list is used, so separate calls share
        one schema. Labels found in ``df`` but missing from this list are
        appended as extra columns.

    Returns
    -------
    pandas.DataFrame
        One row per distinct text, with 0/1 label columns and the ``text``
        column.
    """
    if columns is None:
        # Fall back to the schema defined at notebook level.
        columns = column_names
    columns = list(columns)  # copy so the caller's list is never mutated
    if 'text' not in columns:
        columns.append('text')

    # Single pass: map each text to the set of labels seen for it. Dicts keep
    # insertion order, which gives first-appearance row order without the
    # quadratic rescan of already-collected rows.
    labels_by_text = {}
    for text, label in zip(df['text'].tolist(), df['labels'].tolist()):
        labels_by_text.setdefault(text, set()).add(label)
        if label not in columns:
            columns.append(label)

    label_columns = [col for col in columns if col != 'text']
    records = []
    for text, seen_labels in labels_by_text.items():
        record = {col: int(col in seen_labels) for col in label_columns}
        record['text'] = text
        records.append(record)

    # Build the frame once; growing it row by row relied on DataFrame.append,
    # which no longer exists in pandas 2.x.
    return pd.DataFrame(records, columns=columns)


In [8]:
# Merge duplicate transcriptions in the training split into one row each,
# with a 0/1 column for every entry of `column_names`, then display the result.
new_train = convert_to_multi_labels(train)
new_train

Unnamed: 0,Consult - History and Phy.,Surgery,Other,Orthopedic,Cardiovascular / Pulmonary,text
0,1,0,1,0,0,"REVIEW OF SYSTEMS,GENERAL: Negative weakness,..."
1,0,1,0,1,0,"PREOPERATIVE DIAGNOSIS: , Herniated nucleus pu..."
2,0,1,1,0,0,PREOPERATIVE DIAGNOSIS: Large juxtarenal abdo...
3,1,0,1,0,0,"GENERAL:, Negative weakness, negative fatigue..."
4,0,0,1,0,0,The patient underwent an overnight polysomnogr...
...,...,...,...,...,...,...
1975,0,1,0,0,0,"PREOPERATIVE DIAGNOSES:,1. Left carpal tunnel..."
1976,0,0,1,0,0,"HISTORY:, This is an initial visit for this 9..."
1977,0,0,1,0,0,"CHIEF COMPLAINT: , Swelling of lips causing di..."
1978,0,0,1,0,0,"PREOPERATIVE DIAGNOSIS: , Foreign body, right ..."


In [9]:
# Same conversion for the held-out split. The columns still come from
# `column_names`, which was built from the training labels.
new_test = convert_to_multi_labels(test)
new_test

Unnamed: 0,Consult - History and Phy.,Surgery,Other,Orthopedic,Cardiovascular / Pulmonary,text
0,0,0,1,1,0,"EXAM:,MRI LEFT SHOULDER,CLINICAL:,This is a 69..."
1,0,0,1,1,0,"PREOPERATIVE DIAGNOSIS:, Right hallux abducto..."
2,0,0,1,0,0,"CHIEF COMPLAINT:, Follicular non-Hodgkin's ly..."
3,0,1,0,0,1,"PROCEDURE: ,Direct-current cardioversion.,BRI..."
4,1,0,0,0,0,"REASON FOR CONSULTATION:, Coronary artery dis..."
...,...,...,...,...,...,...
1525,1,0,0,0,0,"GENERAL: , A well-developed infant in no acute..."
1526,1,0,0,0,0,"CHIEF COMPLAINT: , Swelling of lips causing di..."
1527,0,0,0,0,1,"INDICATIONS:, Peripheral vascular disease wit..."
1528,0,0,0,1,0,"PREOPERATIVE DIAGNOSES: , Erythema of the righ..."


In [13]:
from google.colab import drive
import os

# Mount Google Drive and move into the project folder so the relative file
# names below are created there.
drive.mount('/content/drive')
os.chdir("/content/drive/MyDrive/Colab/Talosix")

# Write both multi-label frames with to_csv defaults, which also store the
# DataFrame index as the first CSV column.
for frame, filename in (
    (new_train, 'train_multi_label.csv'),
    (new_test, 'test_multi_label.csv'),
):
    frame.to_csv(filename)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
