In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import os
import re
from collections import OrderedDict

import pandas as pd
# show full cell contents and every row when frames are displayed.
# None means "no limit"; the legacy -1 sentinel for max_colwidth is deprecated
# and rejected by current pandas releases.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

from tqdm.notebook import tqdm_notebook
tqdm_notebook().pandas(leave=False)


import spacy
import en_core_web_md
from spacy.tokens import Doc, Span, Token
from spacy_lookup import Entity

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [3]:
# load the en_core_web_md spaCy model; `nlp` is the shared pipeline that the
# drug-entity pipe is added to and that nlpify() calls further down
nlp = en_core_web_md.load()

In [4]:
# single-letter codes for the four tweet classes
ABUSE, CONSUMPTION, MENTION, UNRELATED = 'a', 'c', 'm', 'u'

# lookup from the upper-case class name to its single-letter code
classmap = dict(
    ABUSE=ABUSE,
    CONSUMPTION=CONSUMPTION,
    MENTION=MENTION,
    UNRELATED=UNRELATED,
)

---
<h2>Load tweet data, along with docs with drugnames and phrases </h2>
Uncomment instances of fid_eval and df_eval if you have the final task evaluation data<br>
Also fid_pred and df_pred are predictions on the eval data from an upstream model, uncomment if you have such data and want to see which samples are overwritten

In [5]:
# file paths
# NOTE(review): every path is relative, so the files are presumably expected in
# the notebook's working directory -- confirm before running.

# tweet data: train / validation splits, the final task evaluation data, and
# predictions on the evaluation data from an upstream model
fid_train = 'train.csv'
fid_val = 'validation.csv'
fid_eval = 'task4_test_participant.csv'
fid_pred = 'prediction_task4.csv'

# hand-curated sheets used by the rules: drug terms, class keywords (verbs),
# and class-specific regular expressions
fid_drugs = 'drugs_list - Sheet1.csv'
fid_keywords =  'word_list - Sheet1.csv'
fid_expressions = 'expression_list - Sheet1.csv'

In [6]:
# helper funcs

def load_tweets(fid, rename_cols=None):
    """Read a tweet CSV and normalise its ``class`` label column.

    Parameters
    ----------
    fid : str, path or file-like
        Anything accepted by ``pd.read_csv``.
    rename_cols : dict, optional
        ``{old_name: new_name}`` mapping applied to the columns after loading.
        ``None`` (the default) renames nothing.

    Returns
    -------
    pandas.DataFrame
        The loaded frame. It always has a ``class`` column: string labels are
        lower-cased and stripped, missing labels are left untouched, and when
        the file has no ``class`` column one filled with ``None`` is added.
    """
    df = pd.read_csv(fid).rename(columns=rename_cols or {})
    if 'class' in df.columns:
        # only touch real strings so a missing label (NaN) does not raise
        df['class'] = df['class'].map(
            lambda x: x.lower().strip() if isinstance(x, str) else x)
    else:
        # downstream code drops / reads 'class' unconditionally, so always provide it
        df['class'] = None
    return df

In [7]:
# load tweet data

# df_train = load_tweets(fid_train, {'unprocessed_text':'text'})
# df_val   = load_tweets(fid_val  , {'unprocessed_text':'text'})
# df_eval  = load_tweets(fid_eval , {'Tweet':'text'})
# df_pred  = load_tweets(fid_pred , {'Class':'P'}).merge(df_eval.drop(columns=['class']), how='outer', on='tweetid')

In [8]:
# load other docs used in our rules

df_drugs = pd.read_csv(fid_drugs)
# make sure we fill NaN values for indicator columns with False
df_drugs = df_drugs.applymap(lambda x: x if not pd.isnull(x) else False)
# we add extra rows for plural versions of each drug term
df_drugs = pd.concat([df_drugs,
                      df_drugs.assign(term = lambda x: x['term'] + 's')],
                     axis=0)
# a generated plural can collide with a term that is already listed (e.g. if the
# sheet lists both "pill" and "pills"); keep the first row so the index built
# below is unique, which to_dict(orient='index') requires
df_drugs = df_drugs.drop_duplicates(subset='term', keep='first')


# create an nlp pipe that will search for each drug term in our list
# we also create a LUT we can use to get more info about an identified drug in a tweet
drug_dict = df_drugs.set_index('term').to_dict(orient='index')
drug_entity_pipe = Entity(keywords_list=list(drug_dict.keys()), label='DRUG')
# in case we rerun the cell, we need to first remove the pipe before re-adding it
if 'entity' in nlp.pipe_names:
    nlp.remove_pipe('entity')
nlp.add_pipe(drug_entity_pipe, before='ner')

In [9]:
# build the per-class verb lists from the keyword sheet
# NOTE: we only have ABUSE and CONSUMPTION verbs in our document right now
df_words = pd.read_csv(fid_keywords)
_is_verb = df_words['term_type'].str.upper() == 'VERB'
_word_class = df_words['class'].str.upper()
verbs_slang       = df_words.loc[_is_verb & (_word_class == 'ABUSE'), 'term'].to_list()
verbs_consumption = df_words.loc[_is_verb & (_word_class == 'CONSUMPTION'), 'term'].to_list()


# build the per-class regular-expression lists from the expression sheet
df_expr = pd.read_csv(fid_expressions)
_expr_class = df_expr['class'].str.upper()
abuse_expressions       = df_expr.loc[_expr_class == 'ABUSE', 'regex'].to_list()
mention_expressions     = df_expr.loc[_expr_class == 'MENTION', 'regex'].to_list()
consumption_expressions = df_expr.loc[_expr_class == 'CONSUMPTION', 'regex'].to_list()
unrelated_expressions   = df_expr.loc[_expr_class == 'UNRELATED', 'regex'].to_list()

---
<h2>We can now create a preprocessing pipe that tags tweets with info about found drugs</h2>

In [10]:
# create functions for tagging docs with info on found drugs
def has_drug_term(doc):
    """True when the doc contains at least one 'DRUG' entity."""
    return any(ent.label_ == 'DRUG' for ent in doc.ents)


def has_slang_drug_term(doc):
    """True when any 'DRUG' entity in the doc is flagged ``is_slang``."""
    return any(ent._.get('is_slang') for ent in doc.ents if ent.label_ == 'DRUG')


def has_commonly_abused_drug_term(doc):
    """True when any 'DRUG' entity in the doc is flagged ``is_commonly_abused``."""
    return any(ent._.get('is_commonly_abused') for ent in doc.ents if ent.label_ == 'DRUG')


# register the custom attributes once, when this cell runs, instead of on every
# nlpify() call; force=True keeps the cell safe to re-run
Span.set_extension('is_commonly_abused', default=False, force=True)
Span.set_extension('is_slang', default=False, force=True)
Doc.set_extension('has_drug_term', getter=has_drug_term, force=True)
Doc.set_extension('has_commonly_abused_drug_term', getter=has_commonly_abused_drug_term, force=True)
Doc.set_extension('has_slang_drug_term', getter=has_slang_drug_term, force=True)


def nlpify(text, lower=False):
    """Parse ``text`` with ``nlp`` and flag each 'DRUG' entity using ``drug_dict``.

    Parameters
    ----------
    text : str
        Raw tweet text.
    lower : bool, default False
        Lower-case the text before parsing.

    Returns
    -------
    Doc
        The parsed doc. Every 'DRUG' entity has its ``_.is_commonly_abused`` and
        ``_.is_slang`` attributes copied from ``drug_dict``; the doc-level
        ``_.has_*`` getters summarise them.

    Raises
    ------
    KeyError
        If the lower-cased text of a 'DRUG' entity is not a key of ``drug_dict``.
    """
    text = text.lower() if lower else text
    doc = nlp(text)

    for ent in doc.ents:
        if ent.label_ == 'DRUG':
            drug_info = drug_dict[ent.text.lower()]
            ent._.set('is_commonly_abused', drug_info['is_commonly_abused'])
            ent._.set('is_slang', drug_info['is_slang'])
    return doc


# make a helper func to apply the above method to our dataframes
def _nlpify(row):
    """Row-wise wrapper for ``DataFrame.apply``: lower-cases and parses ``row['text']``."""
    return nlpify(row['text'], lower=True)

In [11]:
# now let's apply the preprocessing to our data
# df_train['doc'] = df_train.progress_apply(_nlpify,  axis=1)
# df_val['doc'] = df_val.progress_apply(_nlpify,  axis=1)
# df_eval['doc'] = df_eval.progress_apply(_nlpify,  axis=1) #NOTE: dont really need eval since we have pred (eval+predictions)
# df_pred['doc'] = df_pred.progress_apply(_nlpify,  axis=1)

---
<h2>Define Override Rules</h2>

In [12]:
def retweet(doc, P, score=None, threshold=1,  pass_p=True):
    '''
    We check if this is a retweet

    NOTE(review): this is an unimplemented stub -- the body consists of this
    docstring only, so every call returns None regardless of the arguments.
    '''

In [13]:
def common_songs(doc, P, score=None, threshold=1, pass_p=True):
    '''
    Detect artist names or lyric fragments from commonly referenced songs
    about drugs.

    Returns MENTION when doc.text matches one of the patterns; otherwise
    returns P (or None when pass_p is False).
    '''
    song_rex = '(camila[ ]?cabello|camilzer|kcamexico|(nicotine, heroin, morphine)|(nicotine, valium, vicodin)|qotsa)'
    if re.search(song_rex, doc.text) is not None:
        return MENTION
    # no song reference found: leave the upstream prediction alone
    return P if pass_p else None

In [14]:
def pet_meds(doc, P, score=None, threshold=1, pass_p=True):
    '''
    looks for a phrase to indicate the drug being mentioned is used by a pet
    e.g. "doggie valium", or "cat xanax"

    The pet word must be a whole word directly in front of a 'DRUG' entity, so
    words that merely end in a pet word ("copycat", "hotdog") do not count.

    Returns MENTION when such a phrase is found; otherwise returns P
    (or None when pass_p is False).
    '''
    PASS = P if pass_p else None

    # \b stops the pattern from matching the tail of a longer word
    pet_rex = r'\b(pup(py|per)?|dog(go|gie)?|cat|kit(ten|ty)) $'
    for ent in doc.ents:
        if ent.label_ == 'DRUG' and re.search(pet_rex, doc.text[:ent.start_char]):
            return MENTION
    return PASS

In [15]:
def extra_dose(doc, P, score=None, threshold=1, pass_p=True):
    '''
    Detect first-person phrases such as "i took a double dose" or
    "i had an extra pill".

    Returns ABUSE when doc.text matches; otherwise returns P
    (or None when pass_p is False).
    '''
    dose_rex = 'i (took|had)( (a|an))? (double|extra)( of (the|my))?[ -](dose|dosage|pill)'
    matched = re.search(dose_rex, doc.text)
    return ABUSE if matched else (P if pass_p else None)

In [16]:
def lyrica_anderson(doc, P, score=None, threshold=1, pass_p=True):
    '''
    search for terms that are likely to be associated with Lyrica the artist

    LOGIC:
    - if "lyrica" is not one of the 'DRUG' entities
        - return P
    - if more than one distinct drug term is present
        - if P is UNRELATED, override and return MENTION (a second drug term
          is unlikely to refer to a person)
        - else return P
    - otherwise (lyrica is the only drug term)
        - if at least one artist-related term is present AND the text does not
          match any medical / dose pattern, override and return UNRELATED
        - else return P

    Single-word artist terms are matched against the doc's tokens; multi-word
    terms (e.g. "hip hop") are matched as whole phrases in doc.text.

    "return P" means None when pass_p is False.
    '''
    PASS = P if pass_p else None

    person_terms = {'anderson', 'lhhh', 'lgbt', 'lgbtq',
                    'song', 'album', 'sing', 'sang', 'singing', 'record',
                    'hip hop', 'hiphop', 'teairra', 'moniece', 'masika',
                    'safaree', 'omarion',}

    # medical context that indicates the drug rather than the artist;
    # f[iy]bromyalgia accepts the correct spelling and the common misspelling
    drug_terms = ['prescription(s)?', 'f[iy]bromyalgia', 'diabet(es|ic)',
                  'shingles', 'medicat(ion(s)?|ed)', 'seizure(s)?',
                  'pharmac(y|ist)',
                  r'\d+[ ]?(mg|mcg|ml|mcl|pills|tablet(s)?)']
    drug_rex = '(' + '|'.join(drug_terms) + ')'

    # pass if we dont see lyrica in the tweet
    found_drugs = {ent.text for ent in doc.ents if ent.label_ == 'DRUG'}
    if 'lyrica' not in found_drugs:
        return PASS

    # lyrica plus another possible drug: only upgrade an UNRELATED prediction
    if len(found_drugs) > 1:
        return MENTION if P == UNRELATED else PASS

    # a term containing a space can never equal a single token, so phrase terms
    # are searched in the raw text instead
    tokens = {t.text for t in doc}
    word_terms = {term for term in person_terms if ' ' not in term}
    phrase_terms = person_terms - word_terms
    has_person_term = bool(tokens & word_terms) or any(
        re.search(r'\b' + re.escape(term) + r'\b', doc.text) for term in phrase_terms)

    if has_person_term and not re.search(drug_rex, doc.text):
        return UNRELATED
    return PASS

In [17]:
def drug_hadme(doc_raw, P, score=None, threshold=1, pass_p=True):
    '''
    We check for a phrase similar to "<DRUG> had/got/made me/us ..."

    Two detectors are combined; either one sets the match flag.

    DEPENDENCY PARSE:
    - for each sentence, take the root tokens whose lemma is get/have/make
    - merge every noun-subject subtree of the root into a single token
    - the root matches when
        - one of its noun subjects contains a drug entity whose parent_term
          (looked up in drug_dict) is in the accepted drug list, and
        - one of its VERB children has a noun subject in {i, me, us}, or the
          root itself has such a direct object
    - the match is flagged as abuse when that verb is an abuse verb
      (trip/roll/rollin/hallucinate) or a subject held more than one drug

    BACKUP REGEX:
    - an accepted drug entity directly followed by
      "have|having|has|had|got|made|making" and the whole word me/i/us

    RESULT:
    - no match                          -> P (or None when pass_p is False)
    - match, items inside a quoted "_U:..." retweet span -> MENTION
    - match flagged as abuse            -> ABUSE
    - any other match                   -> CONSUMPTION
    '''
    PASS = P if pass_p else None

    # retokenizing below mutates the doc, so work on a private copy
    doc = Doc(doc_raw.vocab).from_bytes(doc_raw.to_bytes())

    ok_drugs = {'adderall', 'lisdexamfetamine', 'oxycodone', 'hydrocodone', 'alprazolam', 'percocet'}
    ok_roots = {'get', 'have', 'make'}
    abuse_verbs = {'trip', 'roll', 'rollin', 'hallucinate'}
    ok_nouns = {'i', 'me', 'us'}

    found_match = False
    abuse_match = False
    found_items = []

    # if there are multiple sentences we re-parse each one on its own, but only
    # then (re-parsing is slow); the count is computed once, before any merging
    multi_sentence = len(list(doc.sents)) > 1
    for sent in doc.sents:
        sentdoc = nlpify(sent.text) if multi_sentence else doc
        roots = [token for token in sentdoc if token.head == token and token.lemma_ in ok_roots]
        for root in roots:
            # merge each noun-subject subtree so the subject is a single token
            nsubjs = [t for t in root.children if t.dep_=='nsubj']
            mergespans = [sentdoc[t.left_edge.i : t.right_edge.i+1] for t in nsubjs]

            with sentdoc.retokenize() as retokenizer:
                for span in mergespans:
                    retokenizer.merge(span)

            # re-read the roots after merging and check each one.
            # NOTE(review): this scan runs once per outer root, so a sentence doc
            # with several matching roots is checked repeatedly; the result is
            # unaffected because the flags only ever go from False to True.
            merged_roots = [token for token in sentdoc if token.head == token and token.lemma_ in ok_roots]
            for mroot in merged_roots:
                # check that we find at least one nsubj with an accepted drug term
                nsubjs = [t for t in mroot.children if t.dep_=='nsubj']
                drug_found = False
                multidrug_found = False
                for n in nsubjs:
                    _d = nlpify(n.text)
                    numdrugs = len([e for e in _d.ents if e.label_=='DRUG' and drug_dict.get(e.text, {}).get('parent_term') in ok_drugs])
                    if numdrugs>0:
                        drug_found = True
                        if numdrugs>1:
                            multidrug_found = True
                if drug_found:
                    verbs = [t for t in mroot.children if t.pos_=='VERB']
                    for verb in verbs:
                        v_children = [t for t in verb.children if t.dep_=='nsubj' and t.text in ok_nouns]
                        if len(v_children)==0:
                            # the subject (me/us) is sometimes parsed as dobj of the root
                            v_children = [t for t in mroot.children if t.dep_=='dobj' and t.text in ok_nouns]
                        if len(v_children)>0:
                            found_items += [mroot, verb]+v_children
                            found_match = True
                            if not abuse_match and (verb.lemma_ in abuse_verbs or multidrug_found):
                                abuse_match = True

    # backup regex just to check for the simple surface pattern.
    # end_char+1 skips the separator after the entity; the trailing \b keeps
    # "has issues" / "got used" / "had medicine" from matching i / us / me
    for ent in [e for e in doc_raw.ents if e.label_=='DRUG' and drug_dict.get(e.text, {}).get('parent_term') in ok_drugs]:
        if re.search(r'^[ ]*(have|having|has|had|got|made|making) (me|i|us)\b',
                     doc_raw.text[ent.end_char+1:]):
            found_match = True

    # hacky way to check if in rt. normally would get token start/end idx but these were changed when retokenizing
    # NOTE(review): when only the backup regex matched, found_items is empty and
    # an empty set is a subset of anything, so any quoted "_U:" span then yields
    # MENTION -- confirm this is intended.
    is_in_rt = False
    has_rt = re.search(r'("|“|”)_U:.*?("|“|”)', doc.text)
    if has_rt:
        start, end = has_rt.span()
        if set([i.text for i in found_items]).issubset(set(doc.text[start:end].split())):
            is_in_rt = True

    if found_match:
        if is_in_rt:
            return MENTION
        elif abuse_match:
            return ABUSE
        else:
            return CONSUMPTION
    else:
        return PASS

In [18]:
def lil_xan(doc, P, score=None, threshold=1, pass_p=True):
    '''
    We check for any mentions of lil xan or lil peep.

    in some cases if we find a reference to a non-rapper term
    we will override the model if it predicts unrelated,
    since we assume there is at least one real drug mention

    LOGIC:
    - if the text has no "lil/lil'/little xan|xanax|xanex|peep"
        - return P
    - walk the 'DRUG' entities in order; the first one that is a real drug
      mention decides the result:
        - a drug term other than xan/xanex/xanax, or
        - a xan-type term NOT directly preceded by "lil "/"lil' "/"little "
        -> if P is UNRELATED (or score is given and below threshold)
               override and return MENTION
           else return P
    - if every 'DRUG' entity is a xan-type term referring to the rapper
      (or there are none), override and return UNRELATED

    "return P" means None when pass_p is False.
    '''
    PASS = P if pass_p else None

    # if we don't see any possible mention to lil xan/peep, return original pred
    if not re.search(r"li(l|l'|ttle)[ -](xan|xanax|xanex|peep)", doc.text):
        return PASS

    def real_drug_verdict():
        # we only override if the model says unrelated, or if a score was
        # supplied and it is below the confidence threshold
        if P == UNRELATED or (score is not None and score<threshold):
            return MENTION
        return PASS

    for ent in doc.ents:
        if ent.label_ != 'DRUG':
            continue
        if ent.text not in ('xan', 'xanex', 'xanax'):
            # a drug term that cannot be the rapper
            return real_drug_verdict()
        # the "$" anchor requires the lil/little prefix to sit immediately
        # before this entity
        if not re.search(r"li(l|l'|ttle)[ -]$", doc.text[:ent.start_char]):
            # xan-type mention that doesn't seem to be the rapper
            return real_drug_verdict()
        # otherwise this xan mention is the rapper; keep checking the rest

    # every possible DRUG entity is likely the rapper
    return UNRELATED

In [19]:
def airline_names(doc, P, score=None, threshold=1, pass_p=True):
    '''
    check for airlines with drug-like names
    we only override if we feel confident all mentions are related to airline

    LOGIC:
    - collect the 'DRUG' entities
        - if there is at least one and every one is "dolophine"/"doliphine"
            - each mention must either be preceded by "flew/fly/flying/flown"
              (optionally followed by "via"/"on") or be followed by
              " air", " airway(s)" or " airline(s)"
                - if any mention matches neither pattern
                    - return P
                  else
                    - return UNRELATED
          else
            - return P

    "return P" means None when pass_p is False.
    '''
    PASS = P if pass_p else None

    airline_terms = {'dolophine', 'doliphine'}
    drug_ents = [ent for ent in doc.ents if ent.label_ == 'DRUG']

    # compare the entity *texts* with the airline terms
    if not drug_ents or not {ent.text for ent in drug_ents} <= airline_terms:
        return PASS

    for ent in drug_ents:
        text_before = doc.text[:ent.start_char]
        # start right at the end of the entity so the leading space that the
        # pattern expects is still part of the slice
        text_after = doc.text[ent.end_char:]
        looks_like_airline = (
            re.search(r'fl(ew|y|ying|own)( (via|on))? $', text_before)
            or re.search(r'^ air(way|line)?(s)?', text_after)
        )
        if not looks_like_airline:
            # mention of dolophine not likely related to airline
            return PASS

    return UNRELATED

---
<h2>Now we can apply the rules to the data and original prediction</h2>

In [20]:
# priority-ordered registry of override rules:
# the final prediction for each tweet comes from the first rule in this list
# that overrides the original prediction.
# If no rule overrides, the original prediction label is used.
rules_odict = OrderedDict([
    ('rule_airline', airline_names),
    ('rule_lilxan', lil_xan),
    ('rule_lyrica', lyrica_anderson),
    ('rule_hadme', drug_hadme),
    ('rule_extradose', extra_dose),
    ('rule_petmeds', pet_meds),
    ('rule_commonsongs', common_songs),
])


def apply_rules(df, rules_odict):
    """Run every override rule on each row and derive the final label.

    ``df`` must have a 'doc' column; 'P' and 'score' columns are created
    (filled with None) when missing. One column per rule is added, holding the
    rule's override or None, followed by 'P_final': the first non-null value
    across the rule columns in priority order, falling back to 'P'.

    The frame is modified in place and also returned.
    """
    # rules take P and score positionally, so make sure both columns exist
    for missing_col in {'P', 'score'} - set(df.columns):
        df[missing_col] = None

    rule_progress = tqdm_notebook(rules_odict.items(), desc='applying rules', leave=True)
    for rule_name, rule_fn in rule_progress:
        def run_rule(row):
            # pass_p=False makes a non-firing rule return None, which lets the
            # bfill below skip it
            return rule_fn(row['doc'], row['P'], row['score'], pass_p=False)
        df[rule_name] = df.progress_apply(run_rule, axis=1)

    # back-fill across columns so the left-most column ends up holding the
    # highest-priority override, or the original prediction when none fired
    cascade = df[list(rules_odict.keys()) + ['P']]
    df['P_final'] = cascade.bfill(axis=1).iloc[:, 0]
    return df


# df_train = apply_rules(df_train, rules_odict)
# df_val   = apply_rules(df_val, rules_odict)
# df_eval  = apply_rules(df_eval, rules_odict)
# df_pred  = apply_rules(df_pred, rules_odict)

---

---
<h2>We can review the samples whose prediction labels were modified by our override rules</h2>

In [21]:
def get_pchange(df):
    """Return the rows whose final label differs from the original prediction.

    Rows with a null 'P_final' are excluded. The number of returned rows is
    printed as a side effect.
    """
    was_overridden = df['P'] != df['P_final']
    has_final = df['P_final'].notna()
    changed = df.loc[was_overridden & has_final]
    print(len(changed))
    return changed

In [22]:
# get_pchange(df_pred)
# df_pred.to_excel('pred_results.xlsx', index=False)

In [23]:
# from sklearn.metrics import f1_score
# print(f1_score(df_valvotes['class'], df_valvotes['P_final'], average='macro'))
# df_valvotes.groupby('correct')['class'].count()

<h2>Run rules on prediction files from Izzy</h2>

In [24]:
# run eval data through preprocessor:
# load the evaluation tweets (renaming 'Tweet' -> 'text'), then parse each tweet
# into a doc with _nlpify, which lower-cases the text before parsing
df_eval  = load_tweets(fid_eval , {'Tweet':'text'})
df_eval['doc'] = df_eval.progress_apply(_nlpify,  axis=1)

HBox(children=(FloatProgress(value=0.0, max=3271.0), HTML(value='')))

In [25]:
# load and prep pred data to have rules applied

# fid_pred_bert = 'FINAL_PREDICTION_BERT.csv'
# fid_pred_503 = '503split-2020-06-04 01_10_49.626597-predictions.csv'
fid_pred_final = 'task4.csv'

def clean_pred_df(fid):
    """Load a prediction CSV and shape it for apply_rules.

    Adds 'score' (the largest of the four per-class columns) and 'P' (the
    single-letter code for the 'prediction' column, looked up in classmap),
    then left-joins the text and parsed doc from df_eval on 'tweetid'.
    """
    class_cols = ['MENTION', 'CONSUMPTION', 'ABUSE', 'UNRELATED']
    preds = pd.read_csv(fid)
    preds['score'] = preds[class_cols].max(axis=1)
    # an unknown label raises KeyError rather than silently becoming NaN
    preds['P'] = preds['prediction'].map(classmap.__getitem__)
    eval_cols = df_eval[['tweetid', 'text', 'doc']]
    return preds.merge(eval_cols, how='left', on='tweetid')

# df_pred_bert = clean_pred_df(fid_pred_bert)
# df_pred_503 = clean_pred_df(fid_pred_503)
df_pred_final = clean_pred_df(fid_pred_final)

In [26]:
# run data through rules
# (apply_rules adds one column per rule plus 'P_final' to the frame)

# df_pred_bert = apply_rules(df_pred_bert, rules_odict)
# df_pred_503 = apply_rules(df_pred_503, rules_odict)
df_pred_final = apply_rules(df_pred_final, rules_odict)

# save a full output and also submittable version for each model:
# the full file keeps every column, the submittable file only has
# 'tweetid' and the final label renamed to 'Class'
# df_pred_bert.to_csv('BERT_pred_override_fullresults.csv', index=False)
# df_pred_bert[['tweetid','P_final']].rename(columns={'P_final':'Class'}).to_csv('BERT_pred_override.csv', index=False)

# df_pred_503.to_csv('503split_pred_ful_results.csv', index=False)
# df_pred_503[['tweetid','P_final']].rename(columns={'P_final':'Class'}).to_csv('503split_pred_override.csv', index=False)

df_pred_final.to_csv('task4_override_full_results.csv', index=False)
df_pred_final[['tweetid','P_final']].rename(columns={'P_final':'Class'}).to_csv('task4_override.csv', index=False)

HBox(children=(FloatProgress(value=0.0, description='applying rules', max=7.0, style=ProgressStyle(description…

HBox(children=(FloatProgress(value=0.0, max=3271.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=3271.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=3271.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=3271.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=3271.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=3271.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=3271.0), HTML(value='')))


