In [1]:
import pandas as pd
# import nltk.data
import nltk
from collections import OrderedDict, Counter
from tqdm import tqdm
import regex as re
# import re, string
import itertools
import pickle

pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 1000)


# text-to-sentence

In [210]:
def clean_text(input_text):
    # preprocess text where sentences end with no space after the full stop
    # e.g., "This a sentence.This another one.""
    text = re.sub(r'([a-z])\.([A-Z])', r'\1. \2', input_text)
    # Remove new lines
    text = text.replace('\n', ' ').strip()
    # Remove URL links
    text = re.sub(r'http\S+', '', text)
    # Remove patterns like "Validity of ASD subtypes: \n\n</ref>""
    text = re.sub(r'\.[\s\S\][^.]*?<\/ref>', '. ', text)
    # Remove e.g., '| |- |colspan=2|', '|- |rowspan=12|', '| - |- |rowspan=4|'
    text = re.sub(r'(\|[\-\s]*)|(rowspan=\d+)|(colspan=\d+)', ' ', text)
    # Remove images in <gallery ...> ... </gallery>, <imagemap> ... </imagemap>
    text = re.sub(r'<([\w\s\W]*?)>(.*?)<(\/?)([\w\s\W]*?)>', '', text)
    # Remove html tags
    text = re.sub(r'<(\/?)([\w\s\W]*?)(\/?)>', '', text)
    
    return text.strip()

def articles_to_sentences(articles_in_json):
    df_med_conditions = pd.read_json(articles_in_json, lines=True)
    
    # Don't extract sentences from the following articles
    excl_articles = ["Wikipedia:Administrators' noticeboard/IncidentArchive1003",
                     "Wikipedia:Article",
                     "Wikipedia:Bot",
                     "Wikipedia:Deletion review/Log/2015 February 21",
                     "Wikipedia:Featured article candidates/Acne vulgaris/archive2",
                     "Wikipedia:Fringe theories/Noticeboard/Archive 57",
                     "Wikipedia:Help desk/Archives/2016 August 20",
                     "Wikipedia:Manual",
                     "Wikipedia:Teahouse/Questions/Archive 671",
                     "Wikipedia:Templates",
                     "Wikipedia:Village"
                    ]

    data = []
    for index, row in tqdm(df_med_conditions.iterrows()):                
        if any(x in row['title'] for x in excl_articles):
            continue
        
        title = row['title']
        text = clean_text(row['parsed_text'])

        for i, sentence in enumerate(nltk.tokenize.sent_tokenize(text)):
            # Remove sentences which only contain one word
            if len(sentence.split()) == 1:
                continue
            # Remove sentences which only contain digits and punctuations
            if not any(c.isalpha() for c in sentence):
                continue
                
            data.append(OrderedDict({"source_page": title, "sent_id":i+1 ,"sentence": sentence}))
    
    # Create dataframe objects
    data = pd.DataFrame(data)

    return data

In [211]:
articles_in_json = './extracted/articles/medical_conditions_V4.json'

In [212]:
articles = articles_to_sentences(articles_in_json)

8677it [01:27, 99.40it/s] 


In [217]:
articles

Unnamed: 0,source_page,sent_id,sentence
0,Autism,1,"Autism is a developmental disorder characterized by difficulties with social interaction and communication, and by restricted and repetitive behavior."
1,Autism,2,Autism's symptoms result from maturation-related changes in various systems of the brain.
2,Autism,3,"Some studies have reported diagnoses of autism in children due to a loss of language or social skills, as opposed to a failure to make progress, typically from 15 to 30 months of age."
3,Autism,4,No known medication relieves autism's core symptoms of social and communication impairments.
4,Autism,5,"Treatment approaches have little empirical support in quality-of-life contexts, and many programs focus on success measures that lack predictive validity and real-world relevance."
...,...,...,...
350757,Hepatosplenic T-cell lymphoma,25,Autologous bone marrow transplantation is currently being investigated in the treatment of hepatosplenic lymphoma.
350758,Hepatosplenic T-cell lymphoma,26,Allogeneic bone marrow transplant has been proven to attain remission for over five years and possibly cure hepatosplenic lymphoma with a number of patients surviving if they can make it to Allogenic transplant.
350759,Hepatosplenic T-cell lymphoma,27,"This lymphoma is rare, comprising less than 5% of all cases, and is most common in young adults and adolescents."
350760,Hepatosplenic T-cell lymphoma,28,A distinct male gender preference has been described.


In [218]:
articles.to_csv('./sentences_and_ngrams/medcon_sentences.csv', sep=';', index=False)

# Extract n-grams

## pipeline n-gram extraction

In [17]:
def remove_punctuation(sentence):
    sent = re.sub(r"[^\s\p{L}\p{N}'-]|(?<=(^|\s))'|'(?=($|\s))|(?<=(^|\s))-|-(?=($|\s))", '', sentence)
    return sent

def extract_ngrams(sentence, num):
    new_sentence = remove_punctuation(sentence.lower())
    tokens = nltk.word_tokenize(new_sentence)
    tokens_POS = nltk.pos_tag(tokens)
    n_grams = nltk.ngrams(tokens_POS, num)

    return list(n_grams)

def group_ngrams(filtered_ngram_dict):
    ngrams_grouped = OrderedDict()
    for item in filtered_ngram_dict.items():
        #print(item[0]) #keys, e.g.: (('disorder', 'NN'),)
        #print(item[1]) #values (freq), e.g.: 11386
        ngrams = []
        for gram in item[0]:
            #print(gram[0]) #token
            #print(gram[1]) #POS
            ngrams.append(gram[0])
        ngrams_grouped[" ".join(ngrams)] = item[1]
    return ngrams_grouped

In [223]:
medcon_sentences = pd.read_csv('./sentences_and_ngrams/medcon_sentences.csv', sep=';')
medcon_sentences

medcon_ngrams = []
for index, row in tqdm(medcon_sentences.iterrows()):
    one_grams = extract_ngrams(row['sentence'], 1)
    bi_grams = extract_ngrams(row['sentence'], 2)
    tri_grams = extract_ngrams(row['sentence'], 3)
    four_grams = extract_ngrams(row['sentence'], 4)
    if index <10:
        print(row['sentence'])
        print("uni_grams:\n", one_grams)
        print("bi_grams:\n", bi_grams)
        print("tri_grams:\n", tri_grams)
        print("quandri_grams:\n", four_grams)
        print(" ")
    
    medcon_ngrams.append(OrderedDict({"unigrams": one_grams,
                                      "bigrams": bi_grams,
                                      "trigrams": tri_grams,
                                      "quandrigrams": four_grams
                                     }))

medcon_ngrams = pd.DataFrame(medcon_ngrams)
medcon_df = pd.concat([medcon_sentences, medcon_ngrams], axis=1)

46it [00:00, 175.17it/s]

Autism is a developmental disorder characterized by difficulties with social interaction and communication, and by restricted and repetitive behavior.
uni_grams:
 [(('autism', 'NN'),), (('is', 'VBZ'),), (('a', 'DT'),), (('developmental', 'JJ'),), (('disorder', 'NN'),), (('characterized', 'VBN'),), (('by', 'IN'),), (('difficulties', 'NNS'),), (('with', 'IN'),), (('social', 'JJ'),), (('interaction', 'NN'),), (('and', 'CC'),), (('communication', 'NN'),), (('and', 'CC'),), (('by', 'IN'),), (('restricted', 'VBN'),), (('and', 'CC'),), (('repetitive', 'JJ'),), (('behavior', 'NN'),)]
bi_grams:
 [(('autism', 'NN'), ('is', 'VBZ')), (('is', 'VBZ'), ('a', 'DT')), (('a', 'DT'), ('developmental', 'JJ')), (('developmental', 'JJ'), ('disorder', 'NN')), (('disorder', 'NN'), ('characterized', 'VBN')), (('characterized', 'VBN'), ('by', 'IN')), (('by', 'IN'), ('difficulties', 'NNS')), (('difficulties', 'NNS'), ('with', 'IN')), (('with', 'IN'), ('social', 'JJ')), (('social', 'JJ'), ('interaction', 'NN')), 

350762it [22:00, 265.59it/s]


In [224]:
medcon_df.to_csv('./sentences_and_ngrams/medcon_sentences_ngrams.csv', sep=';', index=False)

In [228]:
medcon_df

Unnamed: 0,source_page,sent_id,sentence,unigrams,bigrams,trigrams,quandrigrams
0,Autism,1,"Autism is a developmental disorder characterized by difficulties with social interaction and communication, and by restricted and repetitive behavior.","[((autism, NN),), ((is, VBZ),), ((a, DT),), ((developmental, JJ),), ((disorder, NN),), ((characterized, VBN),), ((by, IN),), ((difficulties, NNS),), ((with, IN),), ((social, JJ),), ((interaction, NN),), ((and, CC),), ((communication, NN),), ((and, CC),), ((by, IN),), ((restricted, VBN),), ((and, CC),), ((repetitive, JJ),), ((behavior, NN),)]","[((autism, NN), (is, VBZ)), ((is, VBZ), (a, DT)), ((a, DT), (developmental, JJ)), ((developmental, JJ), (disorder, NN)), ((disorder, NN), (characterized, VBN)), ((characterized, VBN), (by, IN)), ((by, IN), (difficulties, NNS)), ((difficulties, NNS), (with, IN)), ((with, IN), (social, JJ)), ((social, JJ), (interaction, NN)), ((interaction, NN), (and, CC)), ((and, CC), (communication, NN)), ((communication, NN), (and, CC)), ((and, CC), (by, IN)), ((by, IN), (restricted, VBN)), ((restricted, VBN), (and, CC)), ((and, CC), (repetitive, JJ)), ((repetitive, JJ), (behavior, NN))]","[((autism, NN), (is, VBZ), (a, DT)), ((is, VBZ), (a, DT), (developmental, JJ)), ((a, DT), (developmental, JJ), (disorder, NN)), ((developmental, JJ), (disorder, NN), (characterized, VBN)), ((disorder, NN), (characterized, VBN), (by, IN)), ((characterized, VBN), (by, IN), (difficulties, NNS)), ((by, IN), (difficulties, NNS), (with, IN)), ((difficulties, NNS), (with, IN), (social, JJ)), ((with, IN), (social, JJ), (interaction, NN)), ((social, JJ), (interaction, NN), (and, CC)), ((interaction, NN), (and, CC), (communication, NN)), ((and, CC), (communication, NN), (and, CC)), ((communication, NN), (and, CC), (by, IN)), ((and, CC), (by, IN), (restricted, VBN)), ((by, IN), (restricted, VBN), (and, CC)), ((restricted, VBN), (and, CC), (repetitive, JJ)), ((and, CC), (repetitive, JJ), (behavior, NN))]","[((autism, NN), (is, VBZ), (a, DT), (developmental, JJ)), ((is, VBZ), (a, DT), (developmental, JJ), (disorder, NN)), ((a, DT), (developmental, JJ), (disorder, NN), (characterized, VBN)), ((developmental, JJ), (disorder, NN), (characterized, VBN), (by, IN)), ((disorder, NN), (characterized, VBN), (by, IN), (difficulties, NNS)), ((characterized, VBN), (by, IN), (difficulties, NNS), (with, IN)), ((by, IN), (difficulties, NNS), (with, IN), (social, JJ)), ((difficulties, NNS), (with, IN), (social, JJ), (interaction, NN)), ((with, IN), (social, JJ), (interaction, NN), (and, CC)), ((social, JJ), (interaction, NN), (and, CC), (communication, NN)), ((interaction, NN), (and, CC), (communication, NN), (and, CC)), ((and, CC), (communication, NN), (and, CC), (by, IN)), ((communication, NN), (and, CC), (by, IN), (restricted, VBN)), ((and, CC), (by, IN), (restricted, VBN), (and, CC)), ((by, IN), (restricted, VBN), (and, CC), (repetitive, JJ)), ((restricted, VBN), (and, CC), (repetitive, JJ), (behavior, NN))]"
1,Autism,2,Autism's symptoms result from maturation-related changes in various systems of the brain.,"[((autism, NN),), (('s, POS),), ((symptoms, NNS),), ((result, VBP),), ((from, IN),), ((maturation-related, JJ),), ((changes, NNS),), ((in, IN),), ((various, JJ),), ((systems, NNS),), ((of, IN),), ((the, DT),), ((brain, NN),)]","[((autism, NN), ('s, POS)), (('s, POS), (symptoms, NNS)), ((symptoms, NNS), (result, VBP)), ((result, VBP), (from, IN)), ((from, IN), (maturation-related, JJ)), ((maturation-related, JJ), (changes, NNS)), ((changes, NNS), (in, IN)), ((in, IN), (various, JJ)), ((various, JJ), (systems, NNS)), ((systems, NNS), (of, IN)), ((of, IN), (the, DT)), ((the, DT), (brain, NN))]","[((autism, NN), ('s, POS), (symptoms, NNS)), (('s, POS), (symptoms, NNS), (result, VBP)), ((symptoms, NNS), (result, VBP), (from, IN)), ((result, VBP), (from, IN), (maturation-related, JJ)), ((from, IN), (maturation-related, JJ), (changes, NNS)), ((maturation-related, JJ), (changes, NNS), (in, IN)), ((changes, NNS), (in, IN), (various, JJ)), ((in, IN), (various, JJ), (systems, NNS)), ((various, JJ), (systems, NNS), (of, IN)), ((systems, NNS), (of, IN), (the, DT)), ((of, IN), (the, DT), (brain, NN))]","[((autism, NN), ('s, POS), (symptoms, NNS), (result, VBP)), (('s, POS), (symptoms, NNS), (result, VBP), (from, IN)), ((symptoms, NNS), (result, VBP), (from, IN), (maturation-related, JJ)), ((result, VBP), (from, IN), (maturation-related, JJ), (changes, NNS)), ((from, IN), (maturation-related, JJ), (changes, NNS), (in, IN)), ((maturation-related, JJ), (changes, NNS), (in, IN), (various, JJ)), ((changes, NNS), (in, IN), (various, JJ), (systems, NNS)), ((in, IN), (various, JJ), (systems, NNS), (of, IN)), ((various, JJ), (systems, NNS), (of, IN), (the, DT)), ((systems, NNS), (of, IN), (the, DT), (brain, NN))]"
2,Autism,3,"Some studies have reported diagnoses of autism in children due to a loss of language or social skills, as opposed to a failure to make progress, typically from 15 to 30 months of age.","[((some, DT),), ((studies, NNS),), ((have, VBP),), ((reported, VBN),), ((diagnoses, NNS),), ((of, IN),), ((autism, NN),), ((in, IN),), ((children, NNS),), ((due, JJ),), ((to, TO),), ((a, DT),), ((loss, NN),), ((of, IN),), ((language, NN),), ((or, CC),), ((social, JJ),), ((skills, NNS),), ((as, IN),), ((opposed, VBN),), ((to, TO),), ((a, DT),), ((failure, NN),), ((to, TO),), ((make, VB),), ((progress, NN),), ((typically, RB),), ((from, IN),), ((15, CD),), ((to, TO),), ((30, CD),), ((months, NNS),), ((of, IN),), ((age, NN),)]","[((some, DT), (studies, NNS)), ((studies, NNS), (have, VBP)), ((have, VBP), (reported, VBN)), ((reported, VBN), (diagnoses, NNS)), ((diagnoses, NNS), (of, IN)), ((of, IN), (autism, NN)), ((autism, NN), (in, IN)), ((in, IN), (children, NNS)), ((children, NNS), (due, JJ)), ((due, JJ), (to, TO)), ((to, TO), (a, DT)), ((a, DT), (loss, NN)), ((loss, NN), (of, IN)), ((of, IN), (language, NN)), ((language, NN), (or, CC)), ((or, CC), (social, JJ)), ((social, JJ), (skills, NNS)), ((skills, NNS), (as, IN)), ((as, IN), (opposed, VBN)), ((opposed, VBN), (to, TO)), ((to, TO), (a, DT)), ((a, DT), (failure, NN)), ((failure, NN), (to, TO)), ((to, TO), (make, VB)), ((make, VB), (progress, NN)), ((progress, NN), (typically, RB)), ((typically, RB), (from, IN)), ((from, IN), (15, CD)), ((15, CD), (to, TO)), ((to, TO), (30, CD)), ((30, CD), (months, NNS)), ((months, NNS), (of, IN)), ((of, IN), (age, NN))]","[((some, DT), (studies, NNS), (have, VBP)), ((studies, NNS), (have, VBP), (reported, VBN)), ((have, VBP), (reported, VBN), (diagnoses, NNS)), ((reported, VBN), (diagnoses, NNS), (of, IN)), ((diagnoses, NNS), (of, IN), (autism, NN)), ((of, IN), (autism, NN), (in, IN)), ((autism, NN), (in, IN), (children, NNS)), ((in, IN), (children, NNS), (due, JJ)), ((children, NNS), (due, JJ), (to, TO)), ((due, JJ), (to, TO), (a, DT)), ((to, TO), (a, DT), (loss, NN)), ((a, DT), (loss, NN), (of, IN)), ((loss, NN), (of, IN), (language, NN)), ((of, IN), (language, NN), (or, CC)), ((language, NN), (or, CC), (social, JJ)), ((or, CC), (social, JJ), (skills, NNS)), ((social, JJ), (skills, NNS), (as, IN)), ((skills, NNS), (as, IN), (opposed, VBN)), ((as, IN), (opposed, VBN), (to, TO)), ((opposed, VBN), (to, TO), (a, DT)), ((to, TO), (a, DT), (failure, NN)), ((a, DT), (failure, NN), (to, TO)), ((failure, NN), (to, TO), (make, VB)), ((to, TO), (make, VB), (progress, NN)), ((make, VB), (progress, NN), (typically, RB)), ((progress, NN), (typically, RB), (from, IN)), ((typically, RB), (from, IN), (15, CD)), ((from, IN), (15, CD), (to, TO)), ((15, CD), (to, TO), (30, CD)), ((to, TO), (30, CD), (months, NNS)), ((30, CD), (months, NNS), (of, IN)), ((months, NNS), (of, IN), (age, NN))]","[((some, DT), (studies, NNS), (have, VBP), (reported, VBN)), ((studies, NNS), (have, VBP), (reported, VBN), (diagnoses, NNS)), ((have, VBP), (reported, VBN), (diagnoses, NNS), (of, IN)), ((reported, VBN), (diagnoses, NNS), (of, IN), (autism, NN)), ((diagnoses, NNS), (of, IN), (autism, NN), (in, IN)), ((of, IN), (autism, NN), (in, IN), (children, NNS)), ((autism, NN), (in, IN), (children, NNS), (due, JJ)), ((in, IN), (children, NNS), (due, JJ), (to, TO)), ((children, NNS), (due, JJ), (to, TO), (a, DT)), ((due, JJ), (to, TO), (a, DT), (loss, NN)), ((to, TO), (a, DT), (loss, NN), (of, IN)), ((a, DT), (loss, NN), (of, IN), (language, NN)), ((loss, NN), (of, IN), (language, NN), (or, CC)), ((of, IN), (language, NN), (or, CC), (social, JJ)), ((language, NN), (or, CC), (social, JJ), (skills, NNS)), ((or, CC), (social, JJ), (skills, NNS), (as, IN)), ((social, JJ), (skills, NNS), (as, IN), (opposed, VBN)), ((skills, NNS), (as, IN), (opposed, VBN), (to, TO)), ((as, IN), (opposed, VBN), (to, TO), (a, DT)), ((opposed, VBN), (to, TO), (a, DT), (failure, NN)), ((to, TO), (a, DT), (failure, NN), (to, TO)), ((a, DT), (failure, NN), (to, TO), (make, VB)), ((failure, NN), (to, TO), (make, VB), (progress, NN)), ((to, TO), (make, VB), (progress, NN), (typically, RB)), ((make, VB), (progress, NN), (typically, RB), (from, IN)), ((progress, NN), (typically, RB), (from, IN), (15, CD)), ((typically, RB), (from, IN), (15, CD), (to, TO)), ((from, IN), (15, CD), (to, TO), (30, CD)), ((15, CD), (to, TO), (30, CD), (months, NNS)), ((to, TO), (30, CD), (months, NNS), (of, IN)), ((30, CD), (months, NNS), (of, IN), (age, NN))]"
3,Autism,4,No known medication relieves autism's core symptoms of social and communication impairments.,"[((no, DT),), ((known, JJ),), ((medication, NN),), ((relieves, VBZ),), ((autism, NN),), (('s, POS),), ((core, NN),), ((symptoms, NNS),), ((of, IN),), ((social, JJ),), ((and, CC),), ((communication, NN),), ((impairments, NNS),)]","[((no, DT), (known, JJ)), ((known, JJ), (medication, NN)), ((medication, NN), (relieves, VBZ)), ((relieves, VBZ), (autism, NN)), ((autism, NN), ('s, POS)), (('s, POS), (core, NN)), ((core, NN), (symptoms, NNS)), ((symptoms, NNS), (of, IN)), ((of, IN), (social, JJ)), ((social, JJ), (and, CC)), ((and, CC), (communication, NN)), ((communication, NN), (impairments, NNS))]","[((no, DT), (known, JJ), (medication, NN)), ((known, JJ), (medication, NN), (relieves, VBZ)), ((medication, NN), (relieves, VBZ), (autism, NN)), ((relieves, VBZ), (autism, NN), ('s, POS)), ((autism, NN), ('s, POS), (core, NN)), (('s, POS), (core, NN), (symptoms, NNS)), ((core, NN), (symptoms, NNS), (of, IN)), ((symptoms, NNS), (of, IN), (social, JJ)), ((of, IN), (social, JJ), (and, CC)), ((social, JJ), (and, CC), (communication, NN)), ((and, CC), (communication, NN), (impairments, NNS))]","[((no, DT), (known, JJ), (medication, NN), (relieves, VBZ)), ((known, JJ), (medication, NN), (relieves, VBZ), (autism, NN)), ((medication, NN), (relieves, VBZ), (autism, NN), ('s, POS)), ((relieves, VBZ), (autism, NN), ('s, POS), (core, NN)), ((autism, NN), ('s, POS), (core, NN), (symptoms, NNS)), (('s, POS), (core, NN), (symptoms, NNS), (of, IN)), ((core, NN), (symptoms, NNS), (of, IN), (social, JJ)), ((symptoms, NNS), (of, IN), (social, JJ), (and, CC)), ((of, IN), (social, JJ), (and, CC), (communication, NN)), ((social, JJ), (and, CC), (communication, NN), (impairments, NNS))]"
4,Autism,5,"Treatment approaches have little empirical support in quality-of-life contexts, and many programs focus on success measures that lack predictive validity and real-world relevance.","[((treatment, NN),), ((approaches, NNS),), ((have, VBP),), ((little, JJ),), ((empirical, JJ),), ((support, NN),), ((in, IN),), ((quality-of-life, JJ),), ((contexts, NN),), ((and, CC),), ((many, JJ),), ((programs, NNS),), ((focus, VBP),), ((on, IN),), ((success, NN),), ((measures, NNS),), ((that, WDT),), ((lack, VBP),), ((predictive, JJ),), ((validity, NN),), ((and, CC),), ((real-world, NN),), ((relevance, NN),)]","[((treatment, NN), (approaches, NNS)), ((approaches, NNS), (have, VBP)), ((have, VBP), (little, JJ)), ((little, JJ), (empirical, JJ)), ((empirical, JJ), (support, NN)), ((support, NN), (in, IN)), ((in, IN), (quality-of-life, JJ)), ((quality-of-life, JJ), (contexts, NN)), ((contexts, NN), (and, CC)), ((and, CC), (many, JJ)), ((many, JJ), (programs, NNS)), ((programs, NNS), (focus, VBP)), ((focus, VBP), (on, IN)), ((on, IN), (success, NN)), ((success, NN), (measures, NNS)), ((measures, NNS), (that, WDT)), ((that, WDT), (lack, VBP)), ((lack, VBP), (predictive, JJ)), ((predictive, JJ), (validity, NN)), ((validity, NN), (and, CC)), ((and, CC), (real-world, NN)), ((real-world, NN), (relevance, NN))]","[((treatment, NN), (approaches, NNS), (have, VBP)), ((approaches, NNS), (have, VBP), (little, JJ)), ((have, VBP), (little, JJ), (empirical, JJ)), ((little, JJ), (empirical, JJ), (support, NN)), ((empirical, JJ), (support, NN), (in, IN)), ((support, NN), (in, IN), (quality-of-life, JJ)), ((in, IN), (quality-of-life, JJ), (contexts, NN)), ((quality-of-life, JJ), (contexts, NN), (and, CC)), ((contexts, NN), (and, CC), (many, JJ)), ((and, CC), (many, JJ), (programs, NNS)), ((many, JJ), (programs, NNS), (focus, VBP)), ((programs, NNS), (focus, VBP), (on, IN)), ((focus, VBP), (on, IN), (success, NN)), ((on, IN), (success, NN), (measures, NNS)), ((success, NN), (measures, NNS), (that, WDT)), ((measures, NNS), (that, WDT), (lack, VBP)), ((that, WDT), (lack, VBP), (predictive, JJ)), ((lack, VBP), (predictive, JJ), (validity, NN)), ((predictive, JJ), (validity, NN), (and, CC)), ((validity, NN), (and, CC), (real-world, NN)), ((and, CC), (real-world, NN), (relevance, NN))]","[((treatment, NN), (approaches, NNS), (have, VBP), (little, JJ)), ((approaches, NNS), (have, VBP), (little, JJ), (empirical, JJ)), ((have, VBP), (little, JJ), (empirical, JJ), (support, NN)), ((little, JJ), (empirical, JJ), (support, NN), (in, IN)), ((empirical, JJ), (support, NN), (in, IN), (quality-of-life, JJ)), ((support, NN), (in, IN), (quality-of-life, JJ), (contexts, NN)), ((in, IN), (quality-of-life, JJ), (contexts, NN), (and, CC)), ((quality-of-life, JJ), (contexts, NN), (and, CC), (many, JJ)), ((contexts, NN), (and, CC), (many, JJ), (programs, NNS)), ((and, CC), (many, JJ), (programs, NNS), (focus, VBP)), ((many, JJ), (programs, NNS), (focus, VBP), (on, IN)), ((programs, NNS), (focus, VBP), (on, IN), (success, NN)), ((focus, VBP), (on, IN), (success, NN), (measures, NNS)), ((on, IN), (success, NN), (measures, NNS), (that, WDT)), ((success, NN), (measures, NNS), (that, WDT), (lack, VBP)), ((measures, NNS), (that, WDT), (lack, VBP), (predictive, JJ)), ((that, WDT), (lack, VBP), (predictive, JJ), (validity, NN)), ((lack, VBP), (predictive, JJ), (validity, NN), (and, CC)), ((predictive, JJ), (validity, NN), (and, CC), (real-world, NN)), ((validity, NN), (and, CC), (real-world, NN), (relevance, NN))]"
...,...,...,...,...,...,...,...
350757,Hepatosplenic T-cell lymphoma,25,Autologous bone marrow transplantation is currently being investigated in the treatment of hepatosplenic lymphoma.,"[((autologous, JJ),), ((bone, NN),), ((marrow, NN),), ((transplantation, NN),), ((is, VBZ),), ((currently, RB),), ((being, VBG),), ((investigated, VBN),), ((in, IN),), ((the, DT),), ((treatment, NN),), ((of, IN),), ((hepatosplenic, JJ),), ((lymphoma, NN),)]","[((autologous, JJ), (bone, NN)), ((bone, NN), (marrow, NN)), ((marrow, NN), (transplantation, NN)), ((transplantation, NN), (is, VBZ)), ((is, VBZ), (currently, RB)), ((currently, RB), (being, VBG)), ((being, VBG), (investigated, VBN)), ((investigated, VBN), (in, IN)), ((in, IN), (the, DT)), ((the, DT), (treatment, NN)), ((treatment, NN), (of, IN)), ((of, IN), (hepatosplenic, JJ)), ((hepatosplenic, JJ), (lymphoma, NN))]","[((autologous, JJ), (bone, NN), (marrow, NN)), ((bone, NN), (marrow, NN), (transplantation, NN)), ((marrow, NN), (transplantation, NN), (is, VBZ)), ((transplantation, NN), (is, VBZ), (currently, RB)), ((is, VBZ), (currently, RB), (being, VBG)), ((currently, RB), (being, VBG), (investigated, VBN)), ((being, VBG), (investigated, VBN), (in, IN)), ((investigated, VBN), (in, IN), (the, DT)), ((in, IN), (the, DT), (treatment, NN)), ((the, DT), (treatment, NN), (of, IN)), ((treatment, NN), (of, IN), (hepatosplenic, JJ)), ((of, IN), (hepatosplenic, JJ), (lymphoma, NN))]","[((autologous, JJ), (bone, NN), (marrow, NN), (transplantation, NN)), ((bone, NN), (marrow, NN), (transplantation, NN), (is, VBZ)), ((marrow, NN), (transplantation, NN), (is, VBZ), (currently, RB)), ((transplantation, NN), (is, VBZ), (currently, RB), (being, VBG)), ((is, VBZ), (currently, RB), (being, VBG), (investigated, VBN)), ((currently, RB), (being, VBG), (investigated, VBN), (in, IN)), ((being, VBG), (investigated, VBN), (in, IN), (the, DT)), ((investigated, VBN), (in, IN), (the, DT), (treatment, NN)), ((in, IN), (the, DT), (treatment, NN), (of, IN)), ((the, DT), (treatment, NN), (of, IN), (hepatosplenic, JJ)), ((treatment, NN), (of, IN), (hepatosplenic, JJ), (lymphoma, NN))]"
350758,Hepatosplenic T-cell lymphoma,26,Allogeneic bone marrow transplant has been proven to attain remission for over five years and possibly cure hepatosplenic lymphoma with a number of patients surviving if they can make it to Allogenic transplant.,"[((allogeneic, RB),), ((bone, NN),), ((marrow, NN),), ((transplant, NN),), ((has, VBZ),), ((been, VBN),), ((proven, VBN),), ((to, TO),), ((attain, VB),), ((remission, NN),), ((for, IN),), ((over, IN),), ((five, CD),), ((years, NNS),), ((and, CC),), ((possibly, RB),), ((cure, VB),), ((hepatosplenic, JJ),), ((lymphoma, NN),), ((with, IN),), ((a, DT),), ((number, NN),), ((of, IN),), ((patients, NNS),), ((surviving, VBG),), ((if, IN),), ((they, PRP),), ((can, MD),), ((make, VB),), ((it, PRP),), ((to, TO),), ((allogenic, VB),), ((transplant, NN),)]","[((allogeneic, RB), (bone, NN)), ((bone, NN), (marrow, NN)), ((marrow, NN), (transplant, NN)), ((transplant, NN), (has, VBZ)), ((has, VBZ), (been, VBN)), ((been, VBN), (proven, VBN)), ((proven, VBN), (to, TO)), ((to, TO), (attain, VB)), ((attain, VB), (remission, NN)), ((remission, NN), (for, IN)), ((for, IN), (over, IN)), ((over, IN), (five, CD)), ((five, CD), (years, NNS)), ((years, NNS), (and, CC)), ((and, CC), (possibly, RB)), ((possibly, RB), (cure, VB)), ((cure, VB), (hepatosplenic, JJ)), ((hepatosplenic, JJ), (lymphoma, NN)), ((lymphoma, NN), (with, IN)), ((with, IN), (a, DT)), ((a, DT), (number, NN)), ((number, NN), (of, IN)), ((of, IN), (patients, NNS)), ((patients, NNS), (surviving, VBG)), ((surviving, VBG), (if, IN)), ((if, IN), (they, PRP)), ((they, PRP), (can, MD)), ((can, MD), (make, VB)), ((make, VB), (it, PRP)), ((it, PRP), (to, TO)), ((to, TO), (allogenic, VB)), ((allogenic, VB), (transplant, NN))]","[((allogeneic, RB), (bone, NN), (marrow, NN)), ((bone, NN), (marrow, NN), (transplant, NN)), ((marrow, NN), (transplant, NN), (has, VBZ)), ((transplant, NN), (has, VBZ), (been, VBN)), ((has, VBZ), (been, VBN), (proven, VBN)), ((been, VBN), (proven, VBN), (to, TO)), ((proven, VBN), (to, TO), (attain, VB)), ((to, TO), (attain, VB), (remission, NN)), ((attain, VB), (remission, NN), (for, IN)), ((remission, NN), (for, IN), (over, IN)), ((for, IN), (over, IN), (five, CD)), ((over, IN), (five, CD), (years, NNS)), ((five, CD), (years, NNS), (and, CC)), ((years, NNS), (and, CC), (possibly, RB)), ((and, CC), (possibly, RB), (cure, VB)), ((possibly, RB), (cure, VB), (hepatosplenic, JJ)), ((cure, VB), (hepatosplenic, JJ), (lymphoma, NN)), ((hepatosplenic, JJ), (lymphoma, NN), (with, IN)), ((lymphoma, NN), (with, IN), (a, DT)), ((with, IN), (a, DT), (number, NN)), ((a, DT), (number, NN), (of, IN)), ((number, NN), (of, IN), (patients, NNS)), ((of, IN), (patients, NNS), (surviving, VBG)), ((patients, NNS), (surviving, VBG), (if, IN)), ((surviving, VBG), (if, IN), (they, PRP)), ((if, IN), (they, PRP), (can, MD)), ((they, PRP), (can, MD), (make, VB)), ((can, MD), (make, VB), (it, PRP)), ((make, VB), (it, PRP), (to, TO)), ((it, PRP), (to, TO), (allogenic, VB)), ((to, TO), (allogenic, VB), (transplant, NN))]","[((allogeneic, RB), (bone, NN), (marrow, NN), (transplant, NN)), ((bone, NN), (marrow, NN), (transplant, NN), (has, VBZ)), ((marrow, NN), (transplant, NN), (has, VBZ), (been, VBN)), ((transplant, NN), (has, VBZ), (been, VBN), (proven, VBN)), ((has, VBZ), (been, VBN), (proven, VBN), (to, TO)), ((been, VBN), (proven, VBN), (to, TO), (attain, VB)), ((proven, VBN), (to, TO), (attain, VB), (remission, NN)), ((to, TO), (attain, VB), (remission, NN), (for, IN)), ((attain, VB), (remission, NN), (for, IN), (over, IN)), ((remission, NN), (for, IN), (over, IN), (five, CD)), ((for, IN), (over, IN), (five, CD), (years, NNS)), ((over, IN), (five, CD), (years, NNS), (and, CC)), ((five, CD), (years, NNS), (and, CC), (possibly, RB)), ((years, NNS), (and, CC), (possibly, RB), (cure, VB)), ((and, CC), (possibly, RB), (cure, VB), (hepatosplenic, JJ)), ((possibly, RB), (cure, VB), (hepatosplenic, JJ), (lymphoma, NN)), ((cure, VB), (hepatosplenic, JJ), (lymphoma, NN), (with, IN)), ((hepatosplenic, JJ), (lymphoma, NN), (with, IN), (a, DT)), ((lymphoma, NN), (with, IN), (a, DT), (number, NN)), ((with, IN), (a, DT), (number, NN), (of, IN)), ((a, DT), (number, NN), (of, IN), (patients, NNS)), ((number, NN), (of, IN), (patients, NNS), (surviving, VBG)), ((of, IN), (patients, NNS), (surviving, VBG), (if, IN)), ((patients, NNS), (surviving, VBG), (if, IN), (they, PRP)), ((surviving, VBG), (if, IN), (they, PRP), (can, MD)), ((if, IN), (they, PRP), (can, MD), (make, VB)), ((they, PRP), (can, MD), (make, VB), (it, PRP)), ((can, MD), (make, VB), (it, PRP), (to, TO)), ((make, VB), (it, PRP), (to, TO), (allogenic, VB)), ((it, PRP), (to, TO), (allogenic, VB), (transplant, NN))]"
350759,Hepatosplenic T-cell lymphoma,27,"This lymphoma is rare, comprising less than 5% of all cases, and is most common in young adults and adolescents.","[((this, DT),), ((lymphoma, NN),), ((is, VBZ),), ((rare, JJ),), ((comprising, VBG),), ((less, JJR),), ((than, IN),), ((5, CD),), ((of, IN),), ((all, DT),), ((cases, NNS),), ((and, CC),), ((is, VBZ),), ((most, RBS),), ((common, JJ),), ((in, IN),), ((young, JJ),), ((adults, NNS),), ((and, CC),), ((adolescents, NNS),)]","[((this, DT), (lymphoma, NN)), ((lymphoma, NN), (is, VBZ)), ((is, VBZ), (rare, JJ)), ((rare, JJ), (comprising, VBG)), ((comprising, VBG), (less, JJR)), ((less, JJR), (than, IN)), ((than, IN), (5, CD)), ((5, CD), (of, IN)), ((of, IN), (all, DT)), ((all, DT), (cases, NNS)), ((cases, NNS), (and, CC)), ((and, CC), (is, VBZ)), ((is, VBZ), (most, RBS)), ((most, RBS), (common, JJ)), ((common, JJ), (in, IN)), ((in, IN), (young, JJ)), ((young, JJ), (adults, NNS)), ((adults, NNS), (and, CC)), ((and, CC), (adolescents, NNS))]","[((this, DT), (lymphoma, NN), (is, VBZ)), ((lymphoma, NN), (is, VBZ), (rare, JJ)), ((is, VBZ), (rare, JJ), (comprising, VBG)), ((rare, JJ), (comprising, VBG), (less, JJR)), ((comprising, VBG), (less, JJR), (than, IN)), ((less, JJR), (than, IN), (5, CD)), ((than, IN), (5, CD), (of, IN)), ((5, CD), (of, IN), (all, DT)), ((of, IN), (all, DT), (cases, NNS)), ((all, DT), (cases, NNS), (and, CC)), ((cases, NNS), (and, CC), (is, VBZ)), ((and, CC), (is, VBZ), (most, RBS)), ((is, VBZ), (most, RBS), (common, JJ)), ((most, RBS), (common, JJ), (in, IN)), ((common, JJ), (in, IN), (young, JJ)), ((in, IN), (young, JJ), (adults, NNS)), ((young, JJ), (adults, NNS), (and, CC)), ((adults, NNS), (and, CC), (adolescents, NNS))]","[((this, DT), (lymphoma, NN), (is, VBZ), (rare, JJ)), ((lymphoma, NN), (is, VBZ), (rare, JJ), (comprising, VBG)), ((is, VBZ), (rare, JJ), (comprising, VBG), (less, JJR)), ((rare, JJ), (comprising, VBG), (less, JJR), (than, IN)), ((comprising, VBG), (less, JJR), (than, IN), (5, CD)), ((less, JJR), (than, IN), (5, CD), (of, IN)), ((than, IN), (5, CD), (of, IN), (all, DT)), ((5, CD), (of, IN), (all, DT), (cases, NNS)), ((of, IN), (all, DT), (cases, NNS), (and, CC)), ((all, DT), (cases, NNS), (and, CC), (is, VBZ)), ((cases, NNS), (and, CC), (is, VBZ), (most, RBS)), ((and, CC), (is, VBZ), (most, RBS), (common, JJ)), ((is, VBZ), (most, RBS), (common, JJ), (in, IN)), ((most, RBS), (common, JJ), (in, IN), (young, JJ)), ((common, JJ), (in, IN), (young, JJ), (adults, NNS)), ((in, IN), (young, JJ), (adults, NNS), (and, CC)), ((young, JJ), (adults, NNS), (and, CC), (adolescents, NNS))]"
350760,Hepatosplenic T-cell lymphoma,28,A distinct male gender preference has been described.,"[((a, DT),), ((distinct, JJ),), ((male, NN),), ((gender, NN),), ((preference, NN),), ((has, VBZ),), ((been, VBN),), ((described, VBN),)]","[((a, DT), (distinct, JJ)), ((distinct, JJ), (male, NN)), ((male, NN), (gender, NN)), ((gender, NN), (preference, NN)), ((preference, NN), (has, VBZ)), ((has, VBZ), (been, VBN)), ((been, VBN), (described, VBN))]","[((a, DT), (distinct, JJ), (male, NN)), ((distinct, JJ), (male, NN), (gender, NN)), ((male, NN), (gender, NN), (preference, NN)), ((gender, NN), (preference, NN), (has, VBZ)), ((preference, NN), (has, VBZ), (been, VBN)), ((has, VBZ), (been, VBN), (described, VBN))]","[((a, DT), (distinct, JJ), (male, NN), (gender, NN)), ((distinct, JJ), (male, NN), (gender, NN), (preference, NN)), ((male, NN), (gender, NN), (preference, NN), (has, VBZ)), ((gender, NN), (preference, NN), (has, VBZ), (been, VBN)), ((preference, NN), (has, VBZ), (been, VBN), (described, VBN))]"


In [18]:
medcon_sentences = pd.read_csv('./sentences_and_ngrams/medcon_sentences.csv', sep=';')

medcon_df = pd.read_csv('./sentences_and_ngrams/medcon_sentences_ngrams.csv', sep=';')
columns_to_convert = ['unigrams',
                      'bigrams',
                      'trigrams',
                      'quandrigrams'
                     ]
# Convert selected columns into lists
for i,column in enumerate(columns_to_convert):
    medcon_df[column] = medcon_df[column].apply(lambda x: eval(x))

medcon_ngrams_filtered = []

# Frequencies of all the n-grams
unigrams_freq = nltk.FreqDist(list(itertools.chain(*medcon_df['unigrams'].tolist())))
bigrams_freq = nltk.FreqDist(list(itertools.chain(*medcon_df['bigrams'].tolist())))
trigrams_freq = nltk.FreqDist(list(itertools.chain(*medcon_df['trigrams'].tolist())))
quandrigrams_freq = nltk.FreqDist(list(itertools.chain(*medcon_df['quandrigrams'].tolist())))

# Filter n-grams per sentence based on POS patterns
for index, row in tqdm(medcon_df.iterrows()):
    filtered_unigrams = OrderedDict()
    filtered_bigrams = OrderedDict()
    filtered_trigrams = OrderedDict()
    filtered_quandrigrams = OrderedDict()
    
    for unigram in row['unigrams']:
        # exclude n-grams with term frequency less than 3
        if unigrams_freq[unigram] >= 3:
            # Extracting “Noun"
            #'NN' is 'noun, singular or mass', 'NNS' is 'noun, plural'
            if unigram[0][1] == 'NN' or unigram[0][1] == 'NNS':
                #print(unigram, unigrams_freq[unigram])
                #print(" ")
                filtered_unigrams[unigram] = unigrams_freq[unigram]
    for bigram in row['bigrams']:
        if bigrams_freq[bigram] >= 3:
            # “Noun+Noun”
            if (bigram[0][1] == 'NN') and (bigram[1][1] == 'NN' or bigram[1][1] == 'NNS'):
                #print(bigram, bigrams_freq[bigram])
                filtered_bigrams[bigram] = bigrams_freq[bigram]
            # "Verb(past particple)+Noun"
            if (bigram[0][1] == 'VBN') and (bigram[1][1] == 'NN' or bigram[1][1] == 'NNS'):
                #print(bigram, bigrams_freq[bigram])
                filtered_bigrams[bigram] = bigrams_freq[bigram]
            # “Adjective+Noun”
            if (bigram[0][1] == 'JJ') and (bigram[1][1] == 'NN' or bigram[1][1] == 'NNS'):
                #print(bigram, bigrams_freq[bigram])
                filtered_bigrams[bigram] = bigrams_freq[bigram] 
    for trigram in row['trigrams']:
        if trigrams_freq[trigram] >= 3:
            # 'IN' is 'conjunction, subordinating or preposition', 'CC' is 'conjunction, coordinating', 'POS' is 'possessive ending'
            if (trigram[0][1] == 'NN' or trigram[0][1] == 'NNS' or trigram[0][1] == 'JJ' or trigram[0][1] == 'VB') and \
            (trigram[1][1] == 'NN' or trigram[1][1] == 'JJ' or trigram[1][1] == 'VB' or trigram[1][1] == 'IN' or trigram[1][1] == 'CC' or trigram[1][1] == 'POS') and \
            (trigram[2][1] == 'NN' or trigram[2][1] == 'NNS' or trigram[2][1] == 'JJ' or trigram[2][1] == 'VB'):
                #print(trigram, trigrams_freq[trigram])
                filtered_trigrams[trigram] = trigrams_freq[trigram]
    for quandrigram in row['quandrigrams']:
        if quandrigrams_freq[quandrigram] >= 3:
            # 'IN' is 'conjunction, subordinating or preposition', 'CC' is 'conjunction, coordinating', 'DT' is 'determiner'
            if (quandrigram[0][1] == 'NN' or quandrigram[0][1] == 'NNS' or quandrigram[0][1] == 'JJ' or quandrigram[0][1] == 'VB') and \
            (quandrigram[1][1] == 'NN' or quandrigram[1][1] == 'JJ' or quandrigram[1][1] == 'VB' or quandrigram[1][1] == 'IN' or quandrigram[1][1] == 'CC') and \
            (quandrigram[2][1] == 'NN' or quandrigram[2][1] == 'JJ' or quandrigram[2][1] == 'VB' or quandrigram[2][1] == 'IN' or quandrigram[2][1] == 'CC' or quandrigram[2][1] == 'DT') and \
            (quandrigram[3][1] == 'NN' or quandrigram[3][1] == 'NNS' or quandrigram[3][1] == 'JJ' or quandrigram[3][1] == 'VB'):
                #print(quandrigram, quandrigrams_freq[quandrigram])
                filtered_quandrigrams[quandrigram] = quandrigrams_freq[quandrigram]
    
    unigrams_grouped = group_ngrams(filtered_unigrams)
    bigrams_grouped = group_ngrams(filtered_bigrams)
    trigrams_grouped = group_ngrams(filtered_trigrams)
    quandrigrams_grouped = group_ngrams(filtered_quandrigrams)
    
    medcon_ngrams_filtered.append(OrderedDict({"filtered_unigrams": filtered_unigrams,
                                               "filtered_bigrams": filtered_bigrams,
                                               "filtered_trigrams": filtered_trigrams,
                                               "filtered_quandrigrams": filtered_quandrigrams,
                                               "unigrams_grouped": unigrams_grouped,
                                               "bigrams_grouped": bigrams_grouped,
                                               "trigrams_grouped": trigrams_grouped,
                                               "quandrigrams_grouped": quandrigrams_grouped
                                              }))

medcon_ngrams_filtered = pd.DataFrame(medcon_ngrams_filtered)
medcon_df_filtered = pd.concat([medcon_sentences, medcon_ngrams_filtered], axis=1)

      



0it [00:00, ?it/s][A[A

3it [00:00, 29.96it/s][A[A

72it [00:00, 40.36it/s][A[A

639it [00:00, 57.48it/s][A[A

1179it [00:00, 81.74it/s][A[A

1733it [00:00, 116.04it/s][A[A

2268it [00:00, 164.24it/s][A[A

2814it [00:00, 231.64it/s][A[A

3361it [00:01, 325.01it/s][A[A

3904it [00:01, 452.68it/s][A[A

4442it [00:01, 624.15it/s][A[A

4987it [00:01, 849.86it/s][A[A

5526it [00:01, 1137.16it/s][A[A

6062it [00:01, 1488.92it/s][A[A

6610it [00:01, 1905.16it/s][A[A

7162it [00:01, 2370.40it/s][A[A

7702it [00:01, 2840.56it/s][A[A

8245it [00:01, 3313.84it/s][A[A

8792it [00:02, 3757.69it/s][A[A

9333it [00:02, 4112.74it/s][A[A

9870it [00:02, 4419.16it/s][A[A

10407it [00:02, 4635.59it/s][A[A

10955it [00:02, 4859.50it/s][A[A

11499it [00:02, 5017.61it/s][A[A

12038it [00:02, 5109.79it/s][A[A

12576it [00:02, 5187.22it/s][A[A

13119it [00:02, 5256.33it/s][A[A

13663it [00:02, 5308.49it/s][A[A

14204it [00:03, 5326.16it/s][A[A

14744

233798it [00:52, 5173.35it/s][A[A

234318it [00:52, 5179.28it/s][A[A

234837it [00:52, 5134.68it/s][A[A

235351it [00:52, 5066.61it/s][A[A

235873it [00:52, 5108.93it/s][A[A

236385it [00:52, 5057.60it/s][A[A

236892it [00:52, 5056.56it/s][A[A

237401it [00:52, 5065.26it/s][A[A

237908it [00:53, 4971.63it/s][A[A

238433it [00:53, 5049.77it/s][A[A

238958it [00:53, 5106.66it/s][A[A

239493it [00:53, 5175.52it/s][A[A

240012it [00:53, 5151.31it/s][A[A

240556it [00:53, 5232.06it/s][A[A

241085it [00:53, 5247.82it/s][A[A

241611it [00:53, 5158.30it/s][A[A

242128it [00:53, 5158.58it/s][A[A

242645it [00:53, 5130.30it/s][A[A

243159it [00:54, 5118.97it/s][A[A

243672it [00:54, 5088.03it/s][A[A

244182it [00:54, 5078.10it/s][A[A

244695it [00:54, 5093.06it/s][A[A

245205it [00:54, 5037.33it/s][A[A

245709it [00:54, 5009.19it/s][A[A

246211it [00:54, 4992.40it/s][A[A

246714it [00:54, 5001.52it/s][A[A

247226it [00:54, 5034.69it/s][A[A

2

In [19]:
medcon_df_filtered

Unnamed: 0,source_page,sent_id,sentence,filtered_unigrams,filtered_bigrams,filtered_trigrams,filtered_quandrigrams,unigrams_grouped,bigrams_grouped,trigrams_grouped,quandrigrams_grouped
0,Autism,1,"Autism is a developmental disorder characterized by difficulties with social interaction and communication, and by restricted and repetitive behavior.","{(('autism', 'NN'),): 575, (('disorder', 'NN'),): 11386, (('difficulties', 'NNS'),): 883, (('interaction', 'NN'),): 301, (('communication', 'NN'),): 552, (('behavior', 'NN'),): 1179}","{(('developmental', 'JJ'), ('disorder', 'NN')): 47, (('social', 'JJ'), ('interaction', 'NN')): 57, (('repetitive', 'JJ'), ('behavior', 'NN')): 17}","{(('difficulties', 'NNS'), ('with', 'IN'), ('social', 'JJ')): 3, (('interaction', 'NN'), ('and', 'CC'), ('communication', 'NN')): 7}","{(('social', 'JJ'), ('interaction', 'NN'), ('and', 'CC'), ('communication', 'NN')): 7}","{'autism': 575, 'disorder': 11386, 'difficulties': 883, 'interaction': 301, 'communication': 552, 'behavior': 1179}","{'developmental disorder': 47, 'social interaction': 57, 'repetitive behavior': 17}","{'difficulties with social': 3, 'interaction and communication': 7}",{'social interaction and communication': 7}
1,Autism,2,Autism's symptoms result from maturation-related changes in various systems of the brain.,"{(('autism', 'NN'),): 575, (('symptoms', 'NNS'),): 19658, (('changes', 'NNS'),): 3171, (('systems', 'NNS'),): 919, (('brain', 'NN'),): 6854}","{(('various', 'JJ'), ('systems', 'NNS')): 5}",{},{},"{'autism': 575, 'symptoms': 19658, 'changes': 3171, 'systems': 919, 'brain': 6854}",{'various systems': 5},{},{}
2,Autism,3,"Some studies have reported diagnoses of autism in children due to a loss of language or social skills, as opposed to a failure to make progress, typically from 15 to 30 months of age.","{(('studies', 'NNS'),): 5061, (('diagnoses', 'NNS'),): 472, (('autism', 'NN'),): 575, (('children', 'NNS'),): 7935, (('loss', 'NN'),): 6738, (('language', 'NN'),): 1124, (('skills', 'NNS'),): 583, (('failure', 'NN'),): 3369, (('progress', 'NN'),): 346, (('months', 'NNS'),): 3177, (('age', 'NN'),): 6394}","{(('social', 'JJ'), ('skills', 'NNS')): 80}","{(('diagnoses', 'NNS'), ('of', 'IN'), ('autism', 'NN')): 3, (('loss', 'NN'), ('of', 'IN'), ('language', 'NN')): 7, (('months', 'NNS'), ('of', 'IN'), ('age', 'NN')): 154}",{},"{'studies': 5061, 'diagnoses': 472, 'autism': 575, 'children': 7935, 'loss': 6738, 'language': 1124, 'skills': 583, 'failure': 3369, 'progress': 346, 'months': 3177, 'age': 6394}",{'social skills': 80},"{'diagnoses of autism': 3, 'loss of language': 7, 'months of age': 154}",{}
3,Autism,4,No known medication relieves autism's core symptoms of social and communication impairments.,"{(('medication', 'NN'),): 1962, (('autism', 'NN'),): 575, (('core', 'NN'),): 282, (('symptoms', 'NNS'),): 19658, (('communication', 'NN'),): 552, (('impairments', 'NNS'),): 365}","{(('core', 'NN'), ('symptoms', 'NNS')): 11, (('communication', 'NN'), ('impairments', 'NNS')): 3}","{(('symptoms', 'NNS'), ('of', 'IN'), ('social', 'JJ')): 6, (('social', 'JJ'), ('and', 'CC'), ('communication', 'NN')): 5}",{},"{'medication': 1962, 'autism': 575, 'core': 282, 'symptoms': 19658, 'communication': 552, 'impairments': 365}","{'core symptoms': 11, 'communication impairments': 3}","{'symptoms of social': 6, 'social and communication': 5}",{}
4,Autism,5,"Treatment approaches have little empirical support in quality-of-life contexts, and many programs focus on success measures that lack predictive validity and real-world relevance.","{(('treatment', 'NN'),): 17805, (('approaches', 'NNS'),): 421, (('support', 'NN'),): 1085, (('contexts', 'NN'),): 55, (('programs', 'NNS'),): 453, (('success', 'NN'),): 597, (('measures', 'NNS'),): 860, (('validity', 'NN'),): 84, (('relevance', 'NN'),): 48}","{(('treatment', 'NN'), ('approaches', 'NNS')): 27, (('empirical', 'JJ'), ('support', 'NN')): 11, (('predictive', 'JJ'), ('validity', 'NN')): 3}",{},{},"{'treatment': 17805, 'approaches': 421, 'support': 1085, 'contexts': 55, 'programs': 453, 'success': 597, 'measures': 860, 'validity': 84, 'relevance': 48}","{'treatment approaches': 27, 'empirical support': 11, 'predictive validity': 3}",{},{}
...,...,...,...,...,...,...,...,...,...,...,...
350757,Hepatosplenic T-cell lymphoma,25,Autologous bone marrow transplantation is currently being investigated in the treatment of hepatosplenic lymphoma.,"{(('bone', 'NN'),): 4711, (('marrow', 'NN'),): 1045, (('transplantation', 'NN'),): 686, (('treatment', 'NN'),): 17805, (('lymphoma', 'NN'),): 1099}","{(('autologous', 'JJ'), ('bone', 'NN')): 9, (('bone', 'NN'), ('marrow', 'NN')): 903, (('marrow', 'NN'), ('transplantation', 'NN')): 71}","{(('autologous', 'JJ'), ('bone', 'NN'), ('marrow', 'NN')): 7, (('bone', 'NN'), ('marrow', 'NN'), ('transplantation', 'NN')): 67}","{(('autologous', 'JJ'), ('bone', 'NN'), ('marrow', 'NN'), ('transplantation', 'NN')): 3}","{'bone': 4711, 'marrow': 1045, 'transplantation': 686, 'treatment': 17805, 'lymphoma': 1099}","{'autologous bone': 9, 'bone marrow': 903, 'marrow transplantation': 71}","{'autologous bone marrow': 7, 'bone marrow transplantation': 67}",{'autologous bone marrow transplantation': 3}
350758,Hepatosplenic T-cell lymphoma,26,Allogeneic bone marrow transplant has been proven to attain remission for over five years and possibly cure hepatosplenic lymphoma with a number of patients surviving if they can make it to Allogenic transplant.,"{(('bone', 'NN'),): 4711, (('marrow', 'NN'),): 1045, (('transplant', 'NN'),): 558, (('remission', 'NN'),): 382, (('years', 'NNS'),): 6127, (('lymphoma', 'NN'),): 1099, (('number', 'NN'),): 4026, (('patients', 'NNS'),): 16462}","{(('bone', 'NN'), ('marrow', 'NN')): 903, (('marrow', 'NN'), ('transplant', 'NN')): 67}","{(('bone', 'NN'), ('marrow', 'NN'), ('transplant', 'NN')): 58, (('number', 'NN'), ('of', 'IN'), ('patients', 'NNS')): 71}",{},"{'bone': 4711, 'marrow': 1045, 'transplant': 558, 'remission': 382, 'years': 6127, 'lymphoma': 1099, 'number': 4026, 'patients': 16462}","{'bone marrow': 903, 'marrow transplant': 67}","{'bone marrow transplant': 58, 'number of patients': 71}",{}
350759,Hepatosplenic T-cell lymphoma,27,"This lymphoma is rare, comprising less than 5% of all cases, and is most common in young adults and adolescents.","{(('lymphoma', 'NN'),): 1099, (('cases', 'NNS'),): 14393, (('adults', 'NNS'),): 2646, (('adolescents', 'NNS'),): 467}","{(('young', 'JJ'), ('adults', 'NNS')): 265}","{(('common', 'JJ'), ('in', 'IN'), ('young', 'JJ')): 28, (('adults', 'NNS'), ('and', 'CC'), ('adolescents', 'NNS')): 5}","{(('common', 'JJ'), ('in', 'IN'), ('young', 'JJ'), ('adults', 'NNS')): 6}","{'lymphoma': 1099, 'cases': 14393, 'adults': 2646, 'adolescents': 467}",{'young adults': 265},"{'common in young': 28, 'adults and adolescents': 5}",{'common in young adults': 6}
350760,Hepatosplenic T-cell lymphoma,28,A distinct male gender preference has been described.,"{(('male', 'NN'),): 727, (('gender', 'NN'),): 548, (('preference', 'NN'),): 110}","{(('male', 'NN'), ('gender', 'NN')): 6, (('gender', 'NN'), ('preference', 'NN')): 4}",{},{},"{'male': 727, 'gender': 548, 'preference': 110}","{'male gender': 6, 'gender preference': 4}",{},{}


In [20]:
medcon_df_filtered.to_csv('./sentences_and_ngrams/medcon_sentences_ngrams_filtered.csv', sep=';', index=False)