In [5]:
# testing text processing and possible rules to implement

import pandas as pd
import nltk 
from nltk import tokenize #, word_tokenize
from nltk.corpus import stopwords
# from spacy.morphology import Morphology
# import spacy
# nlp = spacy.load("en_core_web_sm")

# nlp = spacy.load("en_core_web_sm")
# from nltk.parse.stanford import StanfordParser
# from pycorenlp import StanfordCoreNLP


In [2]:
# data will be a list of strings from dataframe
smoker_data_df = pd.read_csv('smoker_status.csv')

identifiers = smoker_data_df['row_id'].to_list()
statuses = smoker_data_df['status'].to_list()
med_notes = smoker_data_df['text'].to_list()

assert len(identifiers) == len(statuses) == len(med_notes), 'Three cols not the same lengths.'

In [6]:
# split into sentences test
test_row = smoker_data_df.iloc[[30]].values.tolist()
test_sent = test_row[0][2]
print(test_row)

# split into sentences
split_sent = tokenize.sent_tokenize(test_sent)
print('--------------')
print('sentence split')
print(split_sent)
# not effective for most of medical notes, returns full sentences

[[34464, 'Smoker', "Past Medical History: current smoker  Social History: married current smoker >10cigrettes/day current ETOH use,heavy per patient's family  Family History: + father and brother died of myocardial infraction  Physical Exam: Vitals:97.9-63-19"]]
--------------
sentence split
["Past Medical History: current smoker  Social History: married current smoker >10cigrettes/day current ETOH use,heavy per patient's family  Family History: + father and brother died of myocardial infraction  Physical Exam: Vitals:97.9-63-19"]


In [7]:
# split into words test (most effective would have been sentences then words)
wordtok_sent = nltk.word_tokenize(test_sent)
print(wordtok_sent)

['Past', 'Medical', 'History', ':', 'current', 'smoker', 'Social', 'History', ':', 'married', 'current', 'smoker', '>', '10cigrettes/day', 'current', 'ETOH', 'use', ',', 'heavy', 'per', 'patient', "'s", 'family', 'Family', 'History', ':', '+', 'father', 'and', 'brother', 'died', 'of', 'myocardial', 'infraction', 'Physical', 'Exam', ':', 'Vitals:97.9-63-19']


In [8]:
# split all sentences into words
split_words = [nltk.word_tokenize(every_text)for every_text in med_notes]

In [9]:
# create baseline
    
# if words not related to smoking does not appear, label as unknown ==> F1
smoke_words = ['smoke', 
               'smoker', 
               'smoking', 'Smoking',
               'smokes',
               'smoked', 'Smoked', 
               'ppd', # maybe not needed
               'nonsmoker', 'Nonsmoker',]
                # useful words to add: 'tob', 'tobacco', 'cigs', 'cigarettes'


# all sentences should be okay for this dataset
def baseline(all_sents):
    
    to_label_unknown = []
    
    for every_sentence_idx in range(len(all_sents)):
        
        word_list = all_sents[every_sentence_idx]
        
        # check if any of the smoke_words appear in each sentence
        if any([every_smoke_word in word_list for every_smoke_word in smoke_words]) == False:
            
            to_label_unknown.append(every_sentence_idx)
    
    return to_label_unknown
    
#     return to_unknown

baseline(split_words)
    
    

[22]

In [101]:
# next state, nonsmoker
    # labels have 'nonsmoker' or 'Nonsmoker'
    # or negation, POS?

# all_sents = after baseline
def find_nonsmokers(all_sents):
    
    nonsmoker_words = ['nonsmoker', 'Nonsmoker']
    
    to_label_nonsmoker = []
    
    for every_sentence_idx in range(len(all_sents)):
        
        word_list = all_sents[every_sentence_idx]
        
        # check if any of nonsmoker_words appear in each sentence
        if any([every_word in word_list for every_word in nonsmoker_words]) == True:
            
            to_label_nonsmoker.append(every_sentence_idx)
        
        
        # check if text includes 'does not smoke', 'no h/o smoking', 'does not drink, smoke, or do any drugs'
            # relies on text only having 'smoker' appear once
            
        # 'no h/o smoking'
        if word_list.count('smoking') == 1:
            
            #if 'smoking' in word_list:

            smoking_idx = word_list.index('smoking')
            ho_idx = smoking_idx - 1
            no_idx = smoking_idx - 2

            # 'no', 'h/o', 'smoking' must appear in exact order in word list
            if word_list[ho_idx] == 'h/o' and word_list[no_idx] == 'no':

                to_label_nonsmoker.append(every_sentence_idx)
                #print(word_list)
        
        # 'does not smoke'
        if word_list.count('smoke') == 1:
            
            #if 'smoke' in word_list:
            smoke_idx = word_list.index('smoke')
            not_idx = smoke_idx - 1 
            does_idx = smoke_idx - 2

            if word_list[not_idx] == 'not' and word_list[does_idx] == 'does':

                to_label_nonsmoker.append(every_sentence_idx)
        
        
#     print('done all sents')
#     print(to_label_nonsmoker)
    # make sure there are no double indices   
    unique_labels_nonsmoker = list(set(to_label_nonsmoker))

    return unique_labels_nonsmoker


find_nonsmokers(split_words)     
# [15, 16, 27, 37, 38, 55, 65]
# [65, 37, 38, 15, 16, 55, 27]

[65, 37, 38, 15, 16, 55, 27]

In [102]:
# next state, unknown labels in data all have 'with a smoker in the household'
    # too specific, should have another check in case it has other?

def smoker_in_household(all_sents):
    
    to_label_unknown = []
    
    for every_sentence_idx in range(len(all_sents)):
        
        word_list = all_sents[every_sentence_idx]
        
        # check if 'smoker in the household' is in sentence
            # relies on text only having 'smoker' appear once
            # no other chances of 'smoke-related' info to be mentioned
        if word_list.count('smoker') == 1:
            
            #if 'smoker' in word_list:

            smoker_idx = word_list.index('smoker')
            in_idx = smoker_idx + 1
            the_idx = smoker_idx + 2
            household_idx = smoker_idx + 3

            # 'smoker', 'in', 'the', 'household' must appear in exact order in word list
            if word_list[in_idx] == 'in' and word_list[the_idx] == 'the' and word_list[household_idx] == 'household':
                to_label_unknown.append(every_sentence_idx)

            
#     print('done sents')
#     print(to_label_unknown)
    return to_label_unknown
        
smoker_in_household(split_words)
# [0, 4, 17, 28, 32, 34, 59, 61, 67]

[0, 4, 17, 28, 32, 34, 59, 61, 67]

In [12]:
# after smoker words inside word_list

# check suffixes of words in sentences that have not been labelled as unknown
    # --> should diverge to Smoker and Former Smoker labels



In [103]:
# find former smokers

# med_notes are not split into words

act_of_smoking = ['smoking', 'cigarettes', 'tobacco', 'tob']
smoked_NN = ['cigarette', 'cigarettes', 'cigs', 'tobacco', 'tob', 'packs', 'pack'] # unsure if packs should be included
temporal_words = ['years', 'yrs', 'months', 'weeks', 'wks', 'days'] # maybe change to time units (there are other temporal words)
approximate_words = ['under', 'over', 'around', 'approx', 'approximately','about']


def find_former_smokers(all_sents):
    
    to_label_formersmoker = []
    
    for every_sentence_idx in range(len(all_sents)):
        
        word_list = all_sents[every_sentence_idx]

        pos_tagged = nltk.pos_tag(word_list)
#         print('word list')
#         print(word_list)
        
#         print('pos-tagged')
#         print(pos_tagged)
        
        # should i be using elif?
        
        if word_list.count('quit') == 1 or word_list.count('Quit') == 1:
            
            # lowercase 'quit', easier to index both since they would follow the same patterns, just appear differently sometimes
            lowercase_quit = [every_word.lower() if every_word == 'Quit' else every_word for every_word in word_list]
#             print(lowercase_quit)
            
            quit_idx = lowercase_quit.index('quit')
            after_quit_idx = quit_idx + 1


            # check if 'quit ACT_OF_SMOKING_WORD A_NUMBER TEMPORAL_WORD ago'
            if word_list[after_quit_idx] in act_of_smoking and pos_tagged[after_quit_idx + 1][1] == 'CD' and word_list[after_quit_idx + 2] in temporal_words and word_list[after_quit_idx + 3] == 'ago':

#                 print('quit smoking # time ago')
#                 print(word_list)
                to_label_formersmoker.append(every_sentence_idx)
            
            # takes into account 'quit smoking X A_NUMBER TEMPORAL_WORD ago' X = over, around, under
            if word_list[after_quit_idx] == 'smoking' and word_list[after_quit_idx + 1] in approximate_words and pos_tagged[after_quit_idx + 2][1] == 'CD' and word_list[after_quit_idx + 3] in temporal_words and word_list[after_quit_idx + 4] == 'ago':
#                 print('quit smoking X A_NUMBER TEMPORAL_WORD ago')
#                 print(word_list)
                to_label_formersmoker.append(every_sentence_idx)
                
                
            # might indicate that quit something other than smoking
            if pos_tagged[after_quit_idx][1] == 'CD' and word_list[after_quit_idx + 1] in temporal_words and word_list[after_quit_idx + 2] == 'ago':
#                 print('quit # time ago')
#                 print(word_list)
                to_label_formersmoker.append(every_sentence_idx)
        
        # search 'former smoker'
        if word_list.count('smoker') == 1:
            smoker_idx = word_list.index('smoker')
            before_smoker_idx = smoker_idx - 1
            
            # allows 'former' and 'Former'
            if word_list[before_smoker_idx].lower() == 'former':
                
                # some texts have both 'quit smoking X years ago' AND 'former smoker'
                if every_sentence_idx not in to_label_formersmoker:
                    to_label_formersmoker.append(every_sentence_idx)

#     print('done sentences')
#     print(to_label_formersmoker)
    # dont forget return statement
    
    # make sure there are no double indices   
    unique_labels_former_smoker = list(set(to_label_formersmoker))

    return unique_labels_former_smoker

find_former_smokers(split_words)
# recent 
# [1, 2, 3, 12, 18, 19, 23, 35, 47, 49, 60]
# [2, 11, 36, 43, 58]

[1, 2, 3, 35, 12, 47, 49, 18, 19, 23, 60]

In [104]:
# find former smoker
def smoked_past(all_sents):
    
    to_label_formersmoker = []
    
    for every_sentence_idx in range(len(all_sents)):
        
        word_list = all_sents[every_sentence_idx]

        pos_tagged = nltk.pos_tag(word_list)
    
            # 'smoked X '
        if word_list.count('smoked') == 1 or word_list.count('Smoked') == 1:
            
            # lowercase 'smoked', easier to index both since they would follow the same patterns, just appear differently sometimes
            lowercase_smoked = [every_word.lower() if every_word == 'Smoked' else every_word for every_word in word_list]
            
            smoked_idx = lowercase_smoked.index('smoked')
            after_smoked_idx = smoked_idx + 1
            
#             print('has smoked')
#             print(word_list)

            # smoked cigarettes for 60 years 
            # (added for 60 years because 'smoked' can appear in (current) Smoker as well, establishes time has passed)
            if word_list[after_smoked_idx] in smoked_NN and word_list[after_smoked_idx + 1] == 'for' and pos_tagged[after_smoked_idx + 2][1] == 'CD'and word_list[after_smoked_idx + 3] in temporal_words:
                
#                 print('smoked NN for CD TEMP WORD')
#                 print(word_list)
                to_label_formersmoker.append(every_sentence_idx)
            
            # both smoke and quit have to be in text (can lead to false positives if multiple drugs are mentioned)
            if word_list.count('quit') == 1:
            
                print('smoked quit')
                print(word_list)
                to_label_formersmoker.append(every_sentence_idx)
            
            # smoked two packs per day
            # smoked about 0.5-1 packs per week
            # smoked 2.5 ppd
                # smoked 2ppd
            # smoked a pack a day
            
            
#     print(to_label_formersmoker)

    # make sure there are no double indices   
    unique_labels_former_smoker = list(set(to_label_formersmoker))

    return unique_labels_former_smoker 
smoked_past(split_words)
# [2, 11, 36, 43, 58]


smoked quit
['She', 'smoked', 'two', 'packs', 'per', 'day', 'for', '40', 'years', 'and', 'quit', 'four', 'years', 'ago', '.']
smoked quit
['hernia', 'repair', 'Social', 'History', ':', 'retired', 'window', 'maker', 'quit', 'smoking', '*', '*', '2106', '*', '*', ';', 'smoked', '2.5', 'ppd', 'for', '10', 'years', 'no', 'ETOH', 'lives', 'with', 'wife', 'Family', 'History', ':', 'no', 'premature', 'CAD', 'Physical', 'Exam', ':', 'PERRLA', ',', 'EOMI', ',', 'right', 'eye', 'with', 'baseline', 'congenital', 'drift', 'alert', 'and', 'oriented', 'x3', ',', 'MAE', '*', '*', '3-28', '*', '*', 'strengths', ',', 'steady', 'gait', 'RRR', ',', 'crisp', 'valve', 'click', ',', '?', 'murmur', 'CTAB', 'soft', ',', 'NT', ',', 'ND', ',', 'obese', 'abd', ',', 'no', 'palpable', 'masses', ',', '+', 'BS', 'extrems', 'warm', 'with', 'bil', 'LE', 'varicosities', 'no', 'carotid', 'bruits', '2+', 'bil', '.']
smoked quit
['He', 'smoked', 'a', 'pack', 'a', 'day', 'for', '40', 'years', 'and', 'quit', 'in', '*', '*',

[2, 36, 43, 11, 58]

In [42]:
# # might be better for Smoker
#             # smoked for approx. seven years
#             if word_list[after_smoked_idx] == 'for' : #and word_list[after_smoked_idx + 1] == 'for' and pos_tagged[after_smoked_idx + 2][1] == 'CD'and word_list[after_smoked_idx + 3] in temporal_words:
#                 print('smoked for')
#                 print(word_list)

In [105]:
# find (current) smokers
current_words = ['current', 'Current', 'currently', 'Currently', 'active', 'Active', 'actively', 'Actively', 'still', 'Still', 'reports', 'Reports']
smoking_present = ['smoking', 'smokes', 'smoke']

def find_current_smokers(all_sents):
    
    to_label_smoker = []
    
    for every_sentence_idx in range(len(all_sents)):
        
        word_list = all_sents[every_sentence_idx]

        pos_tagged = nltk.pos_tag(word_list)
        
        # 'CURRENT_WORD smoker'
        if 'smoker' in word_list or 'Smoker' in word_list:

            lowercase_smoker = [every_word.lower() if every_word == 'Smoker' else every_word for every_word in word_list]

            smoker_idx = lowercase_smoker.index('smoker')
                
            smoker_idx = word_list.index('smoker')
            before_smoker_idx = smoker_idx - 1
            
            if word_list[before_smoker_idx] in current_words:
                to_label_smoker.append(every_sentence_idx)
        
        # 'CURRENT_WORD smoking'
        if 'smoking' in word_list:
            smoking_idx = word_list.index('smoking')
            
            before_smoking_idx = smoking_idx - 1
            
            if word_list[before_smoking_idx] in current_words:
                to_label_smoker.append(every_sentence_idx)
                
            # 'due to smoking'
            if word_list[before_smoking_idx] == 'to' and word_list[before_smoking_idx - 1] == 'due':
                to_label_smoker.append(every_sentence_idx)
            
        # 'smokes/smoking X ppd'
            # would capture 14238 and 19217 if no asterisks wrapped around numbers
        if 'ppd' in word_list:
            ppd_idx = word_list.index('ppd')
            before_ppd_idx = ppd_idx - 1 
            
            if pos_tagged[before_ppd_idx][1] == 'CD' and word_list[before_ppd_idx - 1] in smoking_present:
                
                to_label_smoker.append(every_sentence_idx)
                
        # find XPPD
            # will not find any cases in dataset, but thought would be useful
        if [True for every_word in word_list if every_word.endswith("PPD")] == [True]:
#             print(word_list)
            
            # check if split its CD and NN
            word_PPD = [every_word for every_word in word_list if every_word.endswith('PPD')]
#             print(word_PPD)
            ppd_idx = word_list.index(word_PPD[0])
            ppd_original = word_list[ppd_idx]
            
            # slicing string to make sure it's not just a word ending in PPD 
            ppd = ppd_original[-3:] # PPD in string
            before_ppd = ppd_original[:-3] # whatever comes before PPD
            
            new_ppd = [before_ppd, ppd] # list of strings to pos-tag
            postagged_new_ppd = nltk.pos_tag(new_ppd) # list of tuples [(word, POS), (word, POS)]
            
            # check if POS-tags are correct
            if postagged_new_ppd[0][1] == 'CD' and postagged_new_ppd[1][1] == 'NN':
                
                if word_list[ppd_idx - 1] in smoking_present:
                    to_label_smoker.append(every_sentence_idx)
            
            
            

#     print(to_label_smoker)
    # use set before return 
    # return statement
    
    # make sure there are no double indices   
    unique_labels_smoker = list(set(to_label_smoker))

    return unique_labels_smoker
    
find_current_smokers(split_words)
# [8, 8, 14, 20, 30, 41, 52, 62]
# [8, 8, 14, 20, 21, 30, 41, 41, 52, 62] recent
# with set [8, 41, 14, 20, 21, 52, 62, 30]
        

[8, 41, 14, 20, 21, 52, 62, 30]

In [70]:
nltk.pos_tag(split_words[20])

[('Past', 'NNP'),
 ('Medical', 'NNP'),
 ('History', 'NN'),
 (':', ':'),
 ('hypertension', 'NN'),
 ('hypercholesterolemia', 'NN'),
 ('smoking', 'VBG'),
 ('basal', 'NN'),
 ('cell', 'NN'),
 ('skin', 'NN'),
 ('cancer', 'NN'),
 ('(', '('),
 ('s/p', 'JJ'),
 ('excision', 'NN'),
 (')', ')'),
 ('Social', 'NNP'),
 ('History', 'NNP'),
 (':', ':'),
 ('smokes', 'NNS'),
 ('1', 'CD'),
 ('ppd', 'NN'),
 ('x', '$'),
 ('30', 'CD'),
 ('years', 'NNS'),
 ('drinks', 'VBZ'),
 ('18', 'CD'),
 ('beers/wk', 'NN'),
 ('over', 'IN'),
 ('the', 'DT'),
 ('weekend', 'NN'),
 ('Denies', 'VBZ'),
 ('drug', 'NN'),
 ('use', 'NN'),
 ('works', 'VBZ'),
 ('as', 'IN'),
 ('probation', 'NN'),
 ('officer', 'NN'),
 ('Family', 'NNP'),
 ('History', 'NNP'),
 (':', ':'),
 ('Father', 'NN'),
 ('had', 'VBD'),
 ('MI', 'NNP'),
 ('and', 'CC'),
 ('quadruple', 'JJ'),
 ('bypass', 'NN'),
 ('surgery', 'NN'),
 ('at', 'IN'),
 ('age', 'NN'),
 ('of', 'IN'),
 ('63', 'CD'),
 ('Physical', 'JJ'),
 ('Exam', 'NN'),
 (':', ':'),
 ('VS', 'NN'),
 (':', ':'),
 ('

In [66]:
split_words[41]

['She', 'reports', 'smoking', '2PPD', 'x', '60', 'years', '.']

In [68]:
[True for every_word in split_words[41] if every_word.endswith("PPD")]

[True]

In [73]:
nltk.pos_tag(['He', 'quit', 'smoking', 'around', '15-29', 'years', 'ago', '.'])

[('He', 'PRP'),
 ('quit', 'VBD'),
 ('smoking', 'VBG'),
 ('around', 'IN'),
 ('15-29', 'CD'),
 ('years', 'NNS'),
 ('ago', 'RB'),
 ('.', '.')]

In [141]:
# stopwords

en_stop_words = stopwords.words('english')
en_stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [97]:
# will have to compare if indices appear twice in the "to_label_x" lists
a_list = [3,4, 5,6,7,7,89]
unique = set(a_list)
print(list(unique))
print(type(unique))



# indices that have no labels are unknown (initialise all unknown then replace with statuses?)

# if index has two labels, choose not the smoker

[3, 4, 5, 6, 7, 89]
<class 'set'>


In [106]:
unknown_labels_from_baseline = baseline(split_words)
unknown_labels_smoker_in_household = smoker_in_household(split_words)
nonsmoker_labels = find_nonsmokers(split_words) 
former_smoker_labels = find_former_smokers(split_words)
former_smoker_labels_past = smoked_past(split_words)
smoker_labels = find_current_smokers(split_words)

smoked quit
['She', 'smoked', 'two', 'packs', 'per', 'day', 'for', '40', 'years', 'and', 'quit', 'four', 'years', 'ago', '.']
smoked quit
['hernia', 'repair', 'Social', 'History', ':', 'retired', 'window', 'maker', 'quit', 'smoking', '*', '*', '2106', '*', '*', ';', 'smoked', '2.5', 'ppd', 'for', '10', 'years', 'no', 'ETOH', 'lives', 'with', 'wife', 'Family', 'History', ':', 'no', 'premature', 'CAD', 'Physical', 'Exam', ':', 'PERRLA', ',', 'EOMI', ',', 'right', 'eye', 'with', 'baseline', 'congenital', 'drift', 'alert', 'and', 'oriented', 'x3', ',', 'MAE', '*', '*', '3-28', '*', '*', 'strengths', ',', 'steady', 'gait', 'RRR', ',', 'crisp', 'valve', 'click', ',', '?', 'murmur', 'CTAB', 'soft', ',', 'NT', ',', 'ND', ',', 'obese', 'abd', ',', 'no', 'palpable', 'masses', ',', '+', 'BS', 'extrems', 'warm', 'with', 'bil', 'LE', 'varicosities', 'no', 'carotid', 'bruits', '2+', 'bil', '.']
smoked quit
['He', 'smoked', 'a', 'pack', 'a', 'day', 'for', '40', 'years', 'and', 'quit', 'in', '*', '*',

In [108]:
unknown_labels = unknown_labels_from_baseline + unknown_labels_smoker_in_household
nonsmoker_labels = find_nonsmokers(split_words) 

formersmoker_labels = former_smoker_labels + former_smoker_labels_past 

smoker_labels = find_current_smokers(split_words)

print('unknown', unknown_labels)
print('nonsmoker', nonsmoker_labels)
print('former smoker', formersmoker_labels)
print('smoker', smoker_labels)

unknown [22, 0, 4, 17, 28, 32, 34, 59, 61, 67]
nonsmoker [65, 37, 38, 15, 16, 55, 27]
former smoker [1, 2, 3, 35, 12, 47, 49, 18, 19, 23, 60, 2, 36, 43, 11, 58]
smoker [8, 41, 14, 20, 21, 52, 62, 30]


In [109]:
def no_doubles(first_list, second_list): 
    
    """returns tuple with two lists"""
    
    intersection = [every_index for every_index in first_list if every_index in second_list]
    
    # if there are doubles
    if len(intersection) > 0:
        
        # accounts for if there are more than one
        for every_double in intersection: 
            
            # remove from second list, keep in the first
            second_list.remove(every_double)
            
    return first_list, second_list
        
    
# give two lists, first always takes precendence

In [115]:
first = [3, 4, 6, 9, 7]
second = [5, 7, 2, 31, 1]
third = [4, 2, 8, 65]

In [116]:
no_doubles(first, second)

([3, 4, 6, 9, 7], [5, 2, 31, 1])

In [117]:
no_doubles(second, third)

([5, 2, 31, 1], [4, 8, 65])

In [119]:
first_unique, second_unique = no_doubles(first, second)
print('f', first_unique)
print('s', second_unique)

f [3, 4, 6, 9, 7]
s [5, 2, 31, 1]


In [120]:
second_unique, third_unique = no_doubles(second_unique, third)
print('s', second_unique)
print('t', third_unique)

s [5, 2, 31, 1]
t [4, 8, 65]


In [127]:
data_length = len(identifiers)

final_labels = ['Unknown'] * data_length

In [129]:
def label(init_list, index_list, str_label):
    
    for every_index in index_list:
        init_list[every_index] = str_label
    
    return init_list

In [131]:
fake_final = label(final_labels, nonsmoker_labels, 'Nonsmoker')

In [135]:
identifier_preds = list(zip(identifiers, fake_final))

In [139]:
# print(identifier_preds)
preds_df = pd.DataFrame(identifier_preds, columns=['row_id','smoking_status'])
preds_df

Unnamed: 0,row_id,smoking_status
0,11911,Unknown
1,5853,Unknown
2,5366,Unknown
3,36155,Unknown
4,19896,Unknown
...,...,...
65,3117,Nonsmoker
66,19217,Unknown
67,12516,Unknown
68,37035,Unknown


In [137]:
X, Y = map(list, zip(*identifier_preds))

In [138]:
print(X)
print(Y)

[11911, 5853, 5366, 36155, 19896, 5377, 46361, 12798, 47211, 10377, 34014, 29676, 42884, 14238, 4053, 32565, 30698, 31431, 6838, 36898, 23341, 6644, 17708, 53596, 20403, 7256, 46701, 1030, 21910, 49089, 34464, 24790, 32978, 33349, 25411, 51000, 18687, 29869, 51692, 50867, 52244, 2629, 15564, 29718, 12663, 26743, 50129, 49710, 35305, 2900, 20173, 45881, 35495, 23216, 20796, 41523, 58482, 29654, 51831, 32750, 6698, 30870, 4547, 47290, 43336, 3117, 19217, 12516, 37035, 13777]
['Unknown', 'Unknown', 'Unknown', 'Unknown', 'Unknown', 'Unknown', 'Unknown', 'Unknown', 'Unknown', 'Unknown', 'Unknown', 'Unknown', 'Unknown', 'Unknown', 'Unknown', 'Nonsmoker', 'Nonsmoker', 'Unknown', 'Unknown', 'Unknown', 'Unknown', 'Unknown', 'Unknown', 'Unknown', 'Unknown', 'Unknown', 'Unknown', 'Nonsmoker', 'Unknown', 'Unknown', 'Unknown', 'Unknown', 'Unknown', 'Unknown', 'Unknown', 'Unknown', 'Unknown', 'Nonsmoker', 'Nonsmoker', 'Unknown', 'Unknown', 'Unknown', 'Unknown', 'Unknown', 'Unknown', 'Unknown', 'Unkn

In [140]:
data_name = 'lalalala'
csv_name = data_name + 'preds.csv'
csv_name

'lalalalapreds.csv'

In [143]:
wrong = [('Smoker', 'Unknown', 5377), ('Smoker', 'Unknown', 46361), ('Smoker', 'Unknown', 12798),
         ('Smoker', 'Unknown', 10377),('Smoker', 'Unknown', 34014),('Smoker', 'Unknown', 14238),
         ('Former Smoker', 'Unknown', 17708),('Smoker', 'Unknown', 20403),('Former Smoker', 'Unknown', 7256),
        ('Former Smoker', 'Unknown', 46701),('Smoker', 'Unknown', 49089),('Smoker', 'Unknown', 24790),
        ('Former Smoker', 'Unknown', 33349),('Former Smoker', 'Unknown', 50867),('Former Smoker', 'Unknown', 52244),
        ('Former Smoker', 'Unknown', 15564),('Smoker', 'Unknown', 12663),('Non Smoker', 'Unknown', 26743),
        ('Smoker', 'Unknown', 50129),('Former Smoker', 'Unknown', 35305),('Smoker', 'Unknown', 20173),
        ('Smoker', 'Unknown', 45881),('Smoker', 'Unknown', 23216),('Former Smoker', 'Unknown', 20796),
        ('Former Smoker', 'Unknown', 58482),('Smoker', 'Unknown', 29654),('Former Smoker', 'Unknown', 47290),
        ('Smoker', 'Unknown', 43336),('Smoker', 'Unknown', 19217),('Smoker', 'Unknown', 37035),
        ('Former Smoker', 'Unknown', 13777)]

In [144]:
for every_wrong in wrong:
    row_id = every_wrong[2]
    
    row_idx = identifiers.index(row_id)
    
    sentence = med_notes[row_idx]
    print(row_id)
    print(sentence)

5377
We recommend that you work closely with your doctor to quit smoking to help preserve your lung function and reduce your risk for further COPD exacerbation.
46361
He restarted smoke 3 weeks ago.
12798
30pk-yr smoking, current. Drinks 2-3 per night.
10377
Admission Date:  **2116-2-6**              Discharge Date:   **2116-2-12**  Date of Birth:  **2076-11-17**             Sex:   M  Service: MEDICINE  Allergies: Penicillins  Attending:**First Name3 (LF) 1253** Chief Complaint: shortness of Breath  Major Surgical or Invasive Procedure: none  History of Present Illness: 39 M smoker, no PMH p/w several days of fever, cough, sore throat, found to have bilateral pneumonia and hypoxia.
34014
She smoked up until current admission.
14238
Past Medical History: post-traumatic vertigo depression End stage liver disease secondary to alcohol cirrhosis w/ ascites onset **4-/2166** elevated ferritin level umbilical hernia hepatic encephalopathy hepatic coma DT GI bleeding lung mass followed on lung