In this notebook, I do a quick exploration on a hand-labeled dataset of 500 reviews, which have information on whether or not any side effect was mentioned, to see if standard classifiers perform better than my homebrewed algorithm at finding side effects. I did not compare on the multilabel problem of identifying specific side effects, because the burden of labeling that dataset was too high for the ROI. I did, however, spend some time looking at the feature importances for the logistic regression model, and found that many of the features that positively correlated were what you might expect (the words "weight" and "diarrhea", negative sentiment, etc.) and the same was true for what was negatively correlated (positive sentiment, etc.). This would have been an interesting avenue to chase down, but having verified that my algorithm worked well, I decided to spend more time tuning the parameters on that, rather than exploring the classification problem.

It should be noted that this notebook is more of a scratch pad, and most of the code here that is undocumented and minimally commented is commented and documented in the .py files, which are what I used in my final analysis (i.e., what I consider to be the "production" code)

In [1]:
import pandas as pd
import numpy as np
import glob as glob

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, KFold, GridSearchCV

In [2]:
import nltk
from nltk.corpus import wordnet
import spacy
from spacy.tokenizer import Tokenizer
import en_core_web_lg
nlp = en_core_web_lg.load()

tokenizer = Tokenizer(nlp.vocab)

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from autocorrect import Speller
spell = Speller(lang='en')

# Functions for my homebrew method (implementation and grid search)

In [3]:
from nltk.sentiment import vader
VADER_SIA = vader.SentimentIntensityAnalyzer()

def find_polarity_scores(reviews):
    VADERscorePos = []
    VADERscoreNeg = []
    for rev in reviews:
        VADERscorePos.append(VADER_SIA.polarity_scores(rev)['pos'])    
        VADERscoreNeg.append(VADER_SIA.polarity_scores(rev)['neg'])            
        
    return VADERscorePos, VADERscoreNeg

In [4]:
# Magic tokenizer thing
def spacyTokenizer(s: str)-> list:
    doc = tokenizer(s.lower().strip())
    tokens = []
    for token in doc:
        if not token.is_stop and token.is_alpha and token.lemma_ != '-PRON-':
            tokens.append(token.lemma_)
        
    return tokens

# Doing the tf-idf bit
def findSEFeatures(strList):
    tfidf_vectr = TfidfVectorizer()
    corpus = [' '.join(SE) for SE in strList]
    tfidf_score = tfidf_vectr.fit_transform(corpus)
    features = np.array(tfidf_vectr.get_feature_names())
    
    return features

# Using the side effect features list as the vocabulary for the reviews, 
# as a backhanded way of pulling out features
def findTFIDFReviews(reviews, SEvocab):
    tfidf_vectr = TfidfVectorizer(vocabulary=SEvocab)
    corpus = [' '.join(rev) for rev in reviews]
   
    tfidf_score = tfidf_vectr.fit_transform(corpus).toarray()

    return tfidf_score

In [5]:
def processLabeledReviews(SEvocab, ReviewFile='LabeledReviews/randomlySelectedReviews.csv'):
    df = pd.read_csv(ReviewFile, sep='$', index_col=0)
    
    # Counting medication mentions as a feature
    medications = np.unique(df['Medication'])
    medications = np.array([med.lower().replace('-',' ') for med in medications])
    med_counts = []
    for rev in df['Full Review']:
        split_rev = rev.split(' ')
        matches = []
        for med in medications:
            medSplit = list(med)
            lenMed = len(medSplit)
            for word in split_rev:
                wordSplit = list(word.lower())
                ind = min([lenMed, len(wordSplit)])
                test = ''.join([w for i,w in enumerate(wordSplit[:ind]) if w == medSplit[i] ])
                if len(test) >= max([ind,len(medSplit)-2]):
                    matches.append(med)
        med_counts.append(np.unique(matches).size)

    df['Medication mentions'] = med_counts
    
    reviews = np.array(df['Full Review'])
    pos, neg = find_polarity_scores(reviews)
    reviews = [[spell(word) for word in spacyTokenizer(rev.replace('/', ' '))] for rev in reviews]
    
    features = findTFIDFReviews(reviews, SEvocab)
    stack = np.array([df['Medication mentions'], pos, neg]+[feat for feat in features.T]+[df['Presence of side effect']]).T
    
    newDF = pd.DataFrame(stack, columns = ['Medication mentions', 'Positive', 'Negative']+list(SEvocab)+['Side effect label'])

    return newDF

# Need to collect FAERs results and basic Drugs.com info
def processSideEffects(medFile='MedicationsAndSideEffects.csv',
                       conditions=['ADHD', 'Anxiety', 'Bipolar-Disorder', 'Depression', 'Schizophrenia']):
    
    # Processing the Drugs.com side effects
    df = pd.read_csv(medFile, sep='$', index_col=0)
    readLine = lambda s: s[2:-2].split('; ')[:-1]
    allSEs = []
    for key in ['More common', 'Less common', 'Incidence not known']:
        for item in [readLine(SE) for SE in df[key]]: allSEs += item
    allSEs = list(np.unique(allSEs))
    
    allSEs = [[spell(word) for word in spacyTokenizer(SE)] for SE in allSEs]
    
    
    # Handling the FAERs side effects
    moreSEs = []
    for condition in conditions:
        mfile = pd.read_csv('faers_results/{:s}/SideEffectsExtracted.csv'.format(condition),
                           sep='$', index_col=0)
        mfile = mfile.fillna('')
        moreSEs += [mfile.loc[ind]['Definition']+mfile.loc[ind]['Synonyms'] for ind in mfile.index]
    moreSEs = list(np.unique(moreSEs))
    moreSEs = [[spell(word) for word in spacyTokenizer(SE)] for SE in moreSEs]

    return allSEs+moreSEs

In [6]:
# Not super useful to take the mean across columns, instead look at top 10 scoring words in each side effect
def findTop(strList, keeptop=10, extracut=5, topcutoff=0.0099):
    tfidf_vectr = TfidfVectorizer()
    corpus = [' '.join(SE) for SE in strList]
    tfidf_score = tfidf_vectr.fit_transform(corpus).toarray()
    features = np.array(tfidf_vectr.get_feature_names())
    
    words = []
    for row in tfidf_score:
        topcut = min([keeptop,len(row)])
        inds = row.argsort()[::-1][:topcut]
        row_words = []
        for ind in inds:
            if row[ind] >= topcutoff:
                row_words.append(features[ind])
                #print(features[ind],' '*(50-len(features[ind])), row[ind].round(2))
        #print('\n')
        if not row_words:
            row_words = list(features[inds][:extracut])
        words.append(row_words)
    return words

def parseRevAndSEs(SEvocab, top_cutoff=0.0099, topRev=50, topRevextra=20,topSE=5, topSEextra=5):
    df = pd.read_csv('LabeledReviews/randomlySelectedReviews.csv', sep='$', index_col=0)
    reviews = df['Full Review']

    # Counting medication mentions as a feature
    medications = np.unique(df['Medication'])
    medications = np.array([med.lower().replace('-',' ') for med in medications])
    med_counts = []
    for rev in df['Full Review']:
        split_rev = rev.split(' ')
        matches = []
        for med in medications:
            medSplit = list(med)
            lenMed = len(medSplit)
            for word in split_rev:
                wordSplit = list(word.lower())
                ind = min([lenMed, len(wordSplit)])
                test = ''.join([w for i,w in enumerate(wordSplit[:ind]) if w == medSplit[i] ])
                if len(test) >= max([ind,len(medSplit)-2]):
                    matches.append(med)
        med_counts.append(np.unique(matches).size)
    
    pos, neg = find_polarity_scores(reviews)
    reviews = [[spell(word) for word in spacyTokenizer(rev.replace('/', ' '))] for rev in reviews]
    clean_reviews = findTop(reviews, keeptop=topRev, extracut=topRevextra)
    
    clean_SEs = findTop(SEvocab, keeptop=topSE, extracut=topSEextra)
    
    labels = df['Presence of side effect']
    
    return list(df['Full Review']), clean_reviews, clean_SEs, med_counts, labels
    
import time

def find_sideEffects_inReviews_FAERsinformed(SEvocab, top_cutoff=0.0099, topRev=50, 
                                             topRevextra=20,topSE=5, topSEextra=5):

    fullrevs, reviews, listSEs, med_counts, labels = parseRevAndSEs(SEvocab,top_cutoff=top_cutoff,
                                                                   topRev=topRev, topRevextra=topRevextra,
                                                                   topSE=topSE, topSEextra=topSEextra)
    
    #print(time.time())
    BagOSE = ' '.join([' '.join(SE) for SE in listSEs])

    # Finding review words that exist in the list of side effects
    # Only requiring space at the beginning because of words like nausea-nauseated, etc.
    found = [[word for word in rev if BagOSE.lower().find(' '+word.lower())] for rev in reviews]
    found = []
    for ind, rev in enumerate(reviews):
        item = {}
        for SE in listSEs:
            # Match words in reviews to side effects and then add them to found, build dataframe with this info
            item[', '.join(SE)] = len([word for word in rev if word.lower() in SE])
        found.append(item)
    
    SE_match = pd.DataFrame(found)
    SE_match['Full Review'] = fullrevs
    SE_match['Medication mentions'] = med_counts
    SE_match['Side effect label'] = labels
    pos, neg = find_polarity_scores(fullrevs)
    SE_match['Positive polarity'] = pos
    SE_match['Negative polarity'] = neg
    
    # Return the master product
    return SE_match

def screen_for_hits(df, posnegRat=2.5):
    newdf = df.drop(columns=['Full Review', 'Positive polarity', 'Negative polarity'])
    review_inds = []
    
    # Allowing for two item side effects UNLESS they contain very generic words
    colLens = np.array([len(col.split(', ')) + 2*((col.find('skin')!=-1)|
                                                  (col.find('feel') != -1)|
                                                  (col.find('pain')!= -1)|
                                                  (col.find('abnormal')!=-1)|
                                                  (col.find('change')!=-1)|
                                                  (col.find('disorder')!=-1)|
                                                  (col.find('problem')!=-1)|
                                                  (col.find('decrease')!=-1)|
                                                  (col.find('increase')!=-1)|
                                                  (col.find('loss')!=-1)) for col in newdf.columns])
    
    # If the column is not generic and has two or fewer words, count one word as a match, otherwise require 2
    for ind in newdf.index:
        if (((colLens < 3) & newdf.loc[ind].gt(0)) | newdf.loc[ind].gt(1)).sum(): 
            review_inds.append(ind)
            
    # Screening based on polarity
    cond = (df['Negative polarity'] != 0)
    diff_inds = df.index[cond][(df['Positive polarity'][cond]/df['Negative polarity'][cond] > posnegRat)]
    
    # Marking hits versus not
    found_reviews = []
    for ind in review_inds:
        if ind not in diff_inds and df.loc[ind]['Medication mentions'] < 2:
            conditions = np.logical_or(np.logical_and((colLens < 3), newdf.loc[ind].gt(0)), newdf.loc[ind].gt(1))
            hit = conditions.sum()
            if hit:
                found_reviews.append(ind)
    
    tp = df.loc[np.array(found_reviews)]['Side effect label'].sum()
    tn = len(found_reviews) - tp
    fp = df.drop(index=np.array(found_reviews))['Side effect label'].sum()
    fn = len(df) - len(found_reviews) - fp
    
    accuracy = round((tp+tn)/len(df),2)
    print("Accuracy: ", accuracy)
    print("tn, fp, fn, tp")
    print(tn, fp, fn, tp)
    #print(time.time())
    
    return accuracy
    
def mymethod(SEvocab, top_cutoff=0.005, topRev=50, topRevextra=20,topSE=5, topSEextra=5, posnegRat=2.5):
    df = find_sideEffects_inReviews_FAERsinformed(SEvocab, top_cutoff=top_cutoff, 
                                                  topRev=topRev, topRevextra=topRevextra,
                                                  topSE=topSE, topSEextra=topSEextra)
    acc = screen_for_hits(df, posnegRat=posnegRat)
    return df, acc
    
def gridSearch_mymethod(SEvocab):
    params = {'top_cutoff': [5e-2],
             'topRev': [25,50,75],
             'topRevextra':[10,15,20],
             'topSE': [3,5],
             'topSEextra':[3,5],
             'posnegRat':[6]}
#     params = {'top_cutoff': [5e-3],
#              'topRev': [50],#[25,50,75],
#              'topRevextra':[20],#[10,15,20],
#              'topSE': [5],#[3,5,10],
#              'topSEextra':[5],#[1,3,5],
#              'posnegRat':np.arange(3.5,10.2,0.5)}
    
    counts = 1
    #total = 3*3*3*3*3*4
    total = 3*4
    score = []
    for tc in params['top_cutoff']:
        for tR in params['topRev']:
            for tRe in params['topRevextra']:
                for tSE in params['topSE']:
                    for tSEe in params['topSEextra']:
                        for i,pnR in enumerate(params['posnegRat']):
                            if i == 0:
                                df, acc = mymethod(SEvocab, top_cutoff=tc, topRev=tR, topRevextra=tRe,
                                                  topSE=tSE, topSEextra=tSEe, posnegRat=pnR)
                            else:
                                acc = screen_for_hits(df, posnegRat=pnR)
                            score.append([tc,tR,tRe,tSE,tSEe,pnR, acc])
                            counts+=1
                            print('progress:\t{:g}\n\n'.format(round(counts/total,2)))
                            
    return score

# Functions for the classifiers

In [None]:
def fitXGBoost(feat_train, labels_train, feat_test, labels_test, seed=616):
    # Fit the model
    clf = XGBClassifier(seed=seed)
    clf.fit(feat_train, labels_train)
    labels_predict = clf.predict(feat_test)
    
    # Getting ranked feature importance
    feature_weights = clf.feature_importances_
    print("Accuracy: ", accuracy_score(labels_test, labels_predict))
    
    return feature_weights, confusion_matrix(labels_test, labels_predict).ravel()

def fitRF(feat_train, labels_train, feat_test, labels_test, 
          n_est=100, max_depth=10, seed=616):
    
    clf = RandomForestClassifier(n_estimators=n_est,
                                 max_depth=max_depth, 
                                 random_state=616,
                                 max_samples=0.8)
    clf.fit(feat_train, labels_train)
    
    feature_weights = clf.feature_importances_
    labels_predict = clf.predict(feat_test)
    
    # Metrics
    print("Accuracy: ", accuracy_score(labels_test, labels_predict))
    
    return feature_weights, confusion_matrix(labels_test, labels_predict).ravel()
    
def fitLR(feat_train, labels_train, feat_test, labels_test, seed=616):
    clf = LogisticRegression(random_state=seed)
    clf.fit(feat_train, labels_train)
    feature_weights = clf.coef_
    labels_predict = clf.predict(feat_test)
    
    print("Accuracy: ", accuracy_score(labels_test, labels_predict))
    
    return feature_weights, confusion_matrix(labels_test, labels_predict).ravel()
    
def fitKFolds(features, labels, fitter=None):
    kf = KFold(n_splits=5, shuffle=True, random_state=4)
    for train_index, test_index in kf.split(features):
        x_train, x_test = features.loc[train_index], features.loc[test_index]
        y_train, y_test = labels.loc[train_index], labels.loc[test_index]
        dummy, confusion = fitter(x_train, y_train, x_test, y_test)
        print("tn, fp, fn, tp")
        print(confusion)
        
def grid_search(features, labels, fitmtd, parameters):
    mtd = fitmtd()
    clf = GridSearchCV(mtd, parameters, verbose=0)
    clf.fit(features, labels)
    
    best_fitter = clf.best_estimator_
    best_params = pd.DataFrame(clf.cv_results_).loc[clf.best_index_]
    
    return best_fitter, best_params

# Working on the data and comparing

In [None]:
# Reading in the labeled data
df = pd.read_csv('LabeledReviews/randomlySelectedReviews.csv', sep='$', index_col=0)

# Getting the side effect vocab
SEvocab = processSideEffects()

# Getting the features
feature_df = processLabeledReviews(SEvocabParsed)

# Writing a dataframe with that info
feature_df.to_csv('LabeledAndTFIDFed.csv', sep='$')

In [None]:
# Setting up the parameters to explore, and the test/train split
parametersLR = {'solver': ['newton-cg', 'lbfgs', 'liblinear'],
               'C': [10,1,0.1,0.01,0.01],
               'random_state':[616]}

parametersRF = {'n_estimators': [50,100,500],
               'max_depth':[5,10,25],
               'min_samples_split':[2,3,4,5],
               'min_samples_leaf':[2,4,8,16,32],
               'bootstrap': ['False'],
               'random_state': [616]}

parametersXGB = {'eta': [0.3],#[0.1,0.3,0.5],
                'gamma': [0,1,10],#,100],
                'max_depth': [25],#[5,10,25],
                'lambda': [0.1,1,10],
                'seed':[616]}

features = np.array(feature_df.drop(columns=['Side effect label']))
inds = [ind for ind, col in enumerate(features.T) if np.unique(col).size > 1]
features = features[:,np.array(inds)]
features = (features-features.min(axis=0))/features.ptp(axis=0)

x_train, x_valid, y_train, y_valid = train_test_split(features, 
                                                      feature_df['Side effect label'], 
                                                      test_size=0.2, random_state=4)

In [None]:
# Fit all the models and tune them. Not tuning my method to begin, because that's a 
# beast and I just want to know if it's worth doing so
fitter, params = grid_search(x_train, y_train, LogisticRegression, parametersLR)
y_pred = fitter.predict(x_valid)
print("Accuracy: ", accuracy_score(y_valid, y_pred))
print("tn, fp, fn, tp")
print(confusion_matrix(y_valid, y_pred).ravel())

fitter, params = grid_search(x_train, y_train, RandomForestClassifier, parametersRF)
y_pred = fitter.predict(x_valid)
print("Accuracy: ", accuracy_score(y_valid, y_pred))
print("tn, fp, fn, tp")
print(confusion_matrix(y_valid, y_pred).ravel())

fitter, params = grid_search(x_train, y_train, XGBClassifier, parametersXGB)
y_pred = fitter.predict(x_valid)
print("Accuracy: ", accuracy_score(y_valid, y_pred))
print("tn, fp, fn, tp")
print(confusion_matrix(y_valid, y_pred).ravel())

# Use my model
dummy = mymethod(SEvocab)

# The final accuracies are: 
# LR = 0.6, RF = 0.68, XGB = 0.57, homebrew = 0.75

In [None]:
# Finished by grid searching my method, and found that I could improve the accuracy 0.06
# without dramatically increasing the computational cost by changing a few params
score = gridSearch_mymethod(SEvocab)