In [None]:
!pip install pycrf
!pip install sklearn-crfsuite

import spacy
import sklearn_crfsuite
from sklearn_crfsuite import metrics

# Load spaCy's small English pipeline; it is used further down to obtain POS tags.
model = spacy.load("en_core_web_sm")

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pycrf
  Downloading pycrf-0.0.1.tar.gz (1.1 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pycrf
  Building wheel for pycrf (setup.py) ... [?25l[?25hdone
  Created wheel for pycrf: filename=pycrf-0.0.1-py3-none-any.whl size=1896 sha256=ad7fd5a14ed835093f5c81bbf7012fd75f4af3742ce0e8a9472b1878faa8d340
  Stored in directory: /root/.cache/pip/wheels/03/bf/ca/6777c01db8f2183ae7c5fadfc62d6e88d3e6d600c6379fa3c9
Successfully built pycrf
Installing collected packages: pycrf
Successfully installed pycrf-0.0.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sklearn-crfsuite
  Downloading sklearn_crfsuite-0.3.6-py2.py3-none-any.whl (12 kB)
Collecting python-crfsuite>=0.8.3
  Downloading python_crfsuite-0.9.9-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K



In [None]:
import pandas as pd

In [None]:
# Mount Google Drive at /content/gdrive; the dataset files are read from under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
def preprocess_inputfile(input_file):
    """Rebuild whole sentences from a one-token-per-line text file.

    Every non-blank line contributes one whitespace-stripped token. A blank
    line terminates the current sentence; the tokens collected so far are
    joined with single spaces and appended to the result.

    Args:
        input_file: Path of the text file to read.

    Returns:
        list[str]: One string per sentence, in file order. A blank line that
        arrives while no tokens are pending appends an empty string. A final
        sentence that is not followed by a blank line is still returned.
    """
    output_list = []
    current_tokens = []

    # `with` closes the handle even if reading raises part-way through.
    with open(input_file, 'r') as i_file:
        for each_word in i_file:
            each_word = each_word.strip()
            if each_word == "":
                # Blank line marks the end of the sentence being accumulated.
                output_list.append(" ".join(current_tokens))
                current_tokens = []
            else:
                current_tokens.append(each_word)

    # Flush the last sentence when the file has no trailing blank line;
    # otherwise it would be silently dropped.
    if current_tokens:
        output_list.append(" ".join(current_tokens))

    return output_list

In [None]:
# Read the four corpus files (train/test sentences and their label lines),
# in the same order as the names on the left-hand side.
train_sentences, train_labels, test_sentences, test_labels = [
    preprocess_inputfile(dataset_path)
    for dataset_path in (
        '/content/gdrive/MyDrive/train_sent',
        '/content/gdrive/MyDrive/train_label',
        '/content/gdrive/MyDrive/test_sent',
        '/content/gdrive/MyDrive/test_label',
    )
]

In [None]:
# Preview up to the first five sentence/label pairs. Slicing plus zip simply
# stops early when fewer than five pairs were loaded, instead of raising
# IndexError the way fixed-range indexing does.
for each_item, (preview_sentence, preview_label) in enumerate(
    zip(train_sentences[:5], train_labels[:5])
):
    print(f"Sentence {each_item+1} is: {preview_sentence}")
    print(f"Label {each_item+1} is: {preview_label}")
    print("*"*100)

Sentence 1 is: All live births > or = 23 weeks at the University of Vermont in 1995 ( n = 2395 ) were retrospectively analyzed for delivery route , indication for cesarean , gestational age , parity , and practice group ( to reflect risk status )
Label 1 is: O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O
****************************************************************************************************
Sentence 2 is: The total cesarean rate was 14.4 % ( 344 of 2395 ) , and the primary rate was 11.4 % ( 244 of 2144 )
Label 2 is: O O O O O O O O O O O O O O O O O O O O O O O O O
****************************************************************************************************
Sentence 3 is: Abnormal presentation was the most common indication ( 25.6 % , 88 of 344 )
Label 3 is: O O O O O O O O O O O O O O O
****************************************************************************************************
Sentence 4 is: The `` corrected '' ce

In [None]:
# Sanity check: how many sentences were reconstructed for each split.
print(f"Number of sentences in processed train dataset is: {len(train_sentences)}")
print(f"Number of sentences in processed test dataset is: {len(test_sentences)}")

Number of sentences in processed train dataset is: 2599
Number of sentences in processed test dataset is: 1056


In [None]:
# Sanity check: number of label lines per split, to be compared by eye with
# the sentence counts of the same split.
print(f"Number of lines of labels in processed train dataset is: {len(train_labels)}")
print(f"Number of lines of labels in processed test dataset is: {len(test_labels)}")

Number of lines of labels in processed train dataset is: 2599
Number of lines of labels in processed test dataset is: 1056


In [None]:
# Holds the text of every token that spaCy tags as NOUN or PROPN in the corpus.
noun_propn_tokens_list = []

In [None]:
# Collect the text of every NOUN / PROPN token across the train and test
# sentences. The list is rebuilt from scratch here (rather than appended to)
# so that re-running this cell does not double-count tokens.
noun_propn_tokens_list = [
    each_token.text
    for sentences in (train_sentences, test_sentences)
    for sent in sentences
    for each_token in model(sent)
    if each_token.pos_ in ("NOUN", "PROPN")
]

In [None]:
# Wrap the collected tokens in a pandas Series (despite the df_ prefix) so
# their frequencies can be counted.
df_noun_propn = pd.Series(noun_propn_tokens_list)

In [None]:
# 25 most frequent NOUN/PROPN tokens. value_counts() already returns counts in
# descending order, so no additional sort is needed before head().
df_noun_propn.value_counts().head(25)

patients        492
treatment       281
%               247
cancer          200
therapy         175
study           154
disease         142
cell            140
lung            116
group            94
chemotherapy     88
gene             87
effects          85
women            77
results          77
use              75
surgery          71
risk             71
cases            71
analysis         70
rate             67
response         66
dose             66
survival         65
children         64
dtype: int64

In [None]:
def getFeaturesForOneWord(sentence, pos, pos_tags):
    """Return the list of feature strings for the token at index ``pos``.

    Args:
        sentence: Sequence of token strings for one sentence.
        pos: Index of the token to describe.
        pos_tags: Sequence of POS tag strings, indexed like ``sentence``.

    Returns:
        list[str]: Features of the current token (lower-cased form, last three
        and last two characters, upper-case / digit / capitalised flags, POS
        tag), followed either by the same kind of features for the previous
        token or by ``'BEG'`` when there is no previous token, and finally by
        ``'END'`` when the token is the last one of the sentence.
    """
    current = sentence[pos]

    feats = [
        'word.lower=' + current.lower(),
        'word[-3:]=' + current[-3:],
        'word[-2:]=' + current[-2:],
        'word.isupper=%s' % current.isupper(),
        'word.isdigit=%s' % current.isdigit(),
        'word.startsWithCapital=%s' % current[0].isupper(),
        'word.pos=' + pos_tags[pos],
    ]

    if pos <= 0:
        # No token to the left: mark the beginning of the sentence.
        feats.append('BEG')
    else:
        previous = sentence[pos - 1]
        feats += [
            'prev_word.lower=' + previous.lower(),
            'prev_word.isupper=%s' % previous.isupper(),
            'prev_word.isdigit=%s' % previous.isdigit(),
            'prev_word.startsWithCapital=%s' % previous[0].isupper(),
            'prev_word.pos=' + pos_tags[pos - 1],
        ]

    last_index = len(sentence) - 1
    if pos == last_index:
        feats.append('END')

    return feats

In [None]:
def getFeaturesForOneSentence(sentence):
    """Build the per-token feature lists for one whitespace-tokenised sentence.

    The sentence is split on whitespace and those exact tokens are handed to
    the spaCy pipeline as a pre-tokenised Doc, so that exactly one POS tag is
    produced per whitespace token. Tagging the raw string instead lets spaCy
    re-tokenise it (for example splitting punctuation or hyphenated words),
    which shifts the tag list relative to the split words.

    Args:
        sentence: One sentence as a single space-separated string.

    Returns:
        list[list[str]]: One feature list per whitespace token, as produced by
        ``getFeaturesForOneWord``; an empty list for an empty sentence.
    """
    # Function-scope import keeps this cell self-contained; spaCy itself is
    # already imported by the notebook.
    from spacy.tokens import Doc

    sentence_list = sentence.split()
    if not sentence_list:
        return []

    # Create the Doc from the already-split words (bypassing spaCy's
    # tokenizer), then run each pipeline component on it in order.
    processed_sent = Doc(model.vocab, words=sentence_list)
    for _, pipe_component in model.pipeline:
        processed_sent = pipe_component(processed_sent)

    postags = [each_token.pos_ for each_token in processed_sent]

    return [getFeaturesForOneWord(sentence_list, pos, postags) for pos in range(len(sentence_list))]

In [None]:
def getLabelsInListForOneSentence(labels):
    """Split a whitespace-separated label string into a list of labels."""
    label_list = labels.split()
    return label_list

In [None]:
# Feature sequences: one list of per-token feature lists for every sentence.
X_train = list(map(getFeaturesForOneSentence, train_sentences))
X_test = list(map(getFeaturesForOneSentence, test_sentences))

In [None]:
# Label sequences: one list of per-token labels for every label line.
Y_train = list(map(getLabelsInListForOneSentence, train_labels))
Y_test = list(map(getLabelsInListForOneSentence, test_labels))

In [None]:
import sklearn_crfsuite

from sklearn_crfsuite import metrics

In [None]:
# CRF sequence tagger from sklearn-crfsuite; training is limited to 100 iterations.
crf = sklearn_crfsuite.CRF(max_iterations=100)

In [None]:
import warnings

try:
    crf.fit(X_train, Y_train)
except AttributeError as fit_error:
    # Kept best-effort: an AttributeError from fit() does not stop the
    # notebook. NOTE(review): presumably this works around a library
    # incompatibility raised after training has finished -- confirm.
    # The error is surfaced as a warning so it is no longer swallowed silently.
    warnings.warn(
        f"crf.fit raised AttributeError and was ignored: {fit_error}",
        RuntimeWarning,
    )
predictions = crf.predict(X_test)

In [None]:
# Predicted label sequences for the test features; compared against Y_test when scoring.
Y_pred = crf.predict(X_test)

In [None]:
# F1 computed over the flattened (token-level) label sequences with 'weighted' averaging.
# NOTE(review): every label takes part in the average, presumably including the
# very frequent 'O' label -- confirm this is the intended headline metric.
f1_score = metrics.flat_f1_score(Y_test, Y_pred, average='weighted')
print(f"F1 score is: {round(f1_score,4)}")

F1 score is: 0.9058


In [None]:
# Map each predicted disease phrase to the treatment phrase(s) predicted in
# the same test sentence.
#
# Value shape: a single treatment is stored as a plain string; once a second
# treatment is seen for the same disease the value becomes a list of strings.
D_T_dict = dict()

for test_sentence, val in zip(test_sentences, Y_pred):
    # Split once per sentence instead of once per label.
    words = test_sentence.split()

    # Words labelled 'D' form the disease phrase, words labelled 'T' the
    # treatment phrase; each phrase is space-joined in sentence order.
    Diseases = " ".join(word for word, label in zip(words, val) if label == 'D')
    Treatments = " ".join(word for word, label in zip(words, val) if label == 'T')

    # Only sentences that contain both a disease and a treatment contribute.
    if Diseases == "" or Treatments == "":
        continue

    if Diseases not in D_T_dict:
        D_T_dict[Diseases] = Treatments
    elif isinstance(D_T_dict[Diseases], list):
        D_T_dict[Diseases].append(Treatments)
    else:
        # The stored value is a single treatment string. Wrap it in a list
        # explicitly: calling list() on the string would break it up into
        # individual characters.
        D_T_dict[Diseases] = [D_T_dict[Diseases], Treatments]

In [None]:
# Spot-check the treatment(s) recorded for one disease; raises KeyError if
# that disease phrase is not a key of the dictionary.
D_T_dict['hereditary retinoblastoma']

'radiotherapy'