In [1]:
import requests
from bs4 import BeautifulSoup, SoupStrainer
import time
import pandas as pd
import numpy as np
import glob
import csv

In [252]:
# NLP tooling: NLTK provides tokenising, lemmatising, WordNet lookups and the stop-word list
import nltk
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import wordnet, stopwords
# English stop words; process_condition_string uses this list to filter tokens
stops = stopwords.words('english')
import spacy
import en_core_web_lg
# Load the en_core_web_lg spaCy pipeline.
# NOTE(review): `nlp` is not referenced by any cell visible in this notebook -- confirm it is still needed.
nlp = en_core_web_lg.load()

# TODO: process the medsDF file so it includes brand-name medication info; in the NLP step, track the number of medications tried via medication mentions

# Process the names into numbers...hashing of some kind?

In [2]:
# Grabbing original medications/SE dataframe
def makeListofStr(s: str) -> list:
    """Turn the text form of a list of strings (as stored in the CSV) back into a list.

    The text is parsed as a Python literal, so elements that contain an
    apostrophe (whose repr is double-quoted, e.g. ``["St. John's Wort"]``)
    come back intact.  Text that is not a valid list literal falls back to
    splitting on single quotes.

    Args:
        s: Cell text such as ``"['Adderall', 'Adderall XR']"``.

    Returns:
        The non-empty elements as strings, in their original order.
        Whitespace inside an element is preserved.
    """
    import ast  # local import: only this converter needs it

    try:
        parsed = ast.literal_eval(s)
    except (ValueError, SyntaxError, TypeError):
        parsed = None

    if isinstance(parsed, (list, tuple)):
        # Empty entries are dropped, exactly as the quote-splitting fallback does.
        return [str(item) for item in parsed if str(item)]

    # Fallback: strip the surrounding brackets, split on quotes, and discard
    # the empty pieces and the ', ' separators left between elements.
    pieces = s[1:-1].split("'")
    return [piece for piece in pieces if piece and piece != ', ']

# Read the medication table; the two list-valued columns are stored as text,
# so each is converted back into a Python list while the CSV is parsed.
medsDF = pd.read_csv(
    'Medications_SideEffects_brandAndlinks.csv',
    index_col=0,
    converters={column: makeListofStr for column in ('Parent links', 'Exact medications')},
)

In [3]:
# Making it possible to grab the files in Reviews/
def formatExactMed(s: str)->str:
    """Convert an exact-medication name into the stem used for its review file name.

    Surrounding whitespace is stripped, then commas and spaces each become
    '-' and every '/' becomes 'per' so the result cannot look like a path.
    """
    cleaned = s.strip()
    # Order matters only in that all three substitutions are applied in turn.
    for old, new in ((',', '-'), (' ', '-'), ('/', 'per')):
        cleaned = cleaned.replace(old, new)
    return cleaned

In [48]:
# Gathering all the review files into one dictionary keyed by medication.
# Each exact-medication name is looked up twice in Reviews/: once as formatted
# and once lowercased.  Every path is globbed a single time and the results keep
# the order of the 'Exact medications' list, so the output is the same on every run.
# NOTE(review): a name that is already lowercase is found by both lookups and so
# appears twice in the list.  That is kept as-is because later cells edit these
# lists with list.remove(); dedupe here only together with those cells.
allfiles = {}
for medication, exactMeds in zip(medsDF['Medication'], medsDF['Exact medications']):
    asWritten, lowered = [], []
    for exactMed in exactMeds:
        stem = formatExactMed(exactMed)
        for bucket, candidate in ((asWritten, stem), (lowered, stem.lower())):
            hits = glob.glob('Reviews/' + candidate + '_reviews.csv')
            # Only the first match is used, and each path is listed once per lookup pass.
            if hits and hits[0] not in bucket:
                bucket.append(hits[0])
    # A medication that occurs on several rows keeps the files of its last row.
    allfiles[medication] = asWritten + lowered

In [49]:
# Removing all the hormone patches from Daytrana's results: the list found by the
# file search is replaced with the single Daytrana patch review file.
allfiles['Daytrana transdermal'] = ['Reviews/Daytrana-Patch--Transdermal-24-Hours_reviews.csv']

In [50]:
# Grouping together medications by all of their alternate names (allowing for access by brandnames or generics).
# For every row, the review files of the medication and of each alternate name that is
# itself a key of allfiles are pooled, and each of those keys receives the pooled list.
for medication, altnames in zip(medsDF['Medication'], medsDF['Alternate names']):
    # Rows without alternate names hold NaN rather than a string.
    if not isinstance(altnames, str):
        continue

    # `name in allfiles` is an exact key lookup (it never matches part of a key).
    groupKeys = [name for name in altnames.split(', ') + [medication] if name in allfiles]

    # Pool the files of the whole group, listing each path once in first-seen order.
    pooled = []
    for name in groupKeys:
        for path in allfiles[name]:
            if path not in pooled:
                pooled.append(path)

    # Every key gets its own list, so editing one medication's files later
    # does not silently change the other names in the group.
    for name in groupKeys:
        allfiles[name] = list(pooled)

In [51]:
# Manual clean-up of the amphetamine / dextroamphetamine groups: pooling by alternate
# names leaves amphetamine and Adderall files in the 'Dextroamphetamine ' list.
# The cell can be re-run safely: it filters instead of calling list.remove().

# Keep only pure dextroamphetamine files: drop anything that mentions Adderall or
# amphetamine without containing 'oamphetamine', and drop the mixed
# dextroamphetamine-amphetamine files as well.
allfiles['Dextroamphetamine '] = [
    item for item in allfiles['Dextroamphetamine ']
    if not (
        (('adderall' in item.lower() or 'amphet' in item.lower()) and 'oamphetamine' not in item)
        or 'dextroamphetamine-amphet' in item.lower()
    )
]

# Take the mixed-salts file out of the amphetamine lists.  The lists are edited in
# place (slice assignment) so any other key sharing the same list object sees the change.
mixedSaltsFile = 'Reviews/dextroamphetamine-amphetamine_reviews.csv'
for key in ('Amphetamine ', 'Evekeo'):
    allfiles[key][:] = [item for item in allfiles[key] if item != mixedSaltsFile]

# NOTE(review): 'Adderall' ends up as a copy of the 'Evekeo' list.  A separately
# filtered 'Adderall' list (without dextroamphetamine / dexedrine files) used to be
# computed here but was always overwritten by this assignment -- confirm that the
# Evekeo copy is the intended final value.
allfiles['Adderall'] = allfiles['Evekeo'].copy()

In [52]:
# Dealing with cases where there are keys that are capitalized or lowercase:
# every all-lowercase key is folded into its capitalized form.
lowkeys = [key for key in allfiles.keys() if key == key.lower()]

for lowkey in lowkeys:
    capkey = lowkey.capitalize()
    if capkey == lowkey:
        # No cased first letter (e.g. the name starts with a digit): there is no
        # capitalized twin, and popping the key here would throw its files away.
        continue

    lowfiles = allfiles.pop(lowkey)
    if capkey in allfiles:
        # Merge both lists, keeping each path once in first-seen order.
        merged = []
        for path in allfiles[capkey] + lowfiles:
            if path not in merged:
                merged.append(path)
        allfiles[capkey] = merged
    else:
        # No capitalized twin yet: simply rename the key.
        allfiles[capkey] = lowfiles.copy()

In [85]:
# Taking all the relevant files and stacking them into a single dataframe, then saving before further processing.
# Each medication's files are read once, concatenated in one step, deduplicated and
# written once.  pd.concat is used because DataFrame.append was removed in pandas 2.0.
for medication in allfiles:
    # Read every distinct path once, in list order.
    uniqueFiles = []
    for file in allfiles[medication]:
        if file and file not in uniqueFiles:
            uniqueFiles.append(file)
    if not uniqueFiles:
        # Nothing scraped for this medication: no output file is written.
        continue

    frames = [pd.read_csv(file, sep='$', index_col=0, skip_blank_lines=False) for file in uniqueFiles]

    # The row index is always renumbered, so single-file and multi-file
    # medications produce output of the same shape; duplicate rows are dropped.
    stackDF = pd.concat(frames, ignore_index=True, sort=False).drop_duplicates()

    # Writing to a CSV file
    stackDF.to_csv('ProcessedReviews/{:s}_allconditions_raw_reviews.csv'.format(medication.strip().replace(' ','-')), sep='$')

In [106]:
# Grabbing all the information on conditions: the distinct 'conditionInfo' values
# across every stacked review file, with the 'Condition: ' label removed.
rawConditions = set()
for medication in allfiles:
    if allfiles[medication]:
        file = 'ProcessedReviews/{:s}_allconditions_raw_reviews.csv'.format(medication.strip().replace(' ','-'))
        column = pd.read_csv(file, sep='$', index_col=0, skip_blank_lines=False)['conditionInfo']
        # Missing values (NaN) cannot be compared or cleaned as text, so they are skipped.
        rawConditions.update(cond for cond in column.dropna() if isinstance(cond, str))

# The label is removed the same way splitDFbyCondition removes it, so the names
# collected here compare equal to the cleaned column values there.  Entries that
# consist of the bare label only are discarded.
conditions = sorted({cond.replace('Condition: ', '') for cond in rawConditions} - {''})

origConditions = [cond.replace('-',' ') for cond in np.unique(medsDF['Condition'])]

In [285]:
# Okay, writing actual code to process the conditions
# NLTK treats capital nouns as proper nouns! That's why it's not working!

def process_condition_string(s: str) -> list:
    """Reduce a review condition label to its lowercase, lemmatised content words.

    The label is tokenised with NLTK, every token is split further on hyphens
    and spaces, each piece is lowercased and lemmatised as a noun, and pieces
    that are stop words or not purely alphabetic are dropped.
    """
    # Tokenise, then break hyphenated tokens (e.g. 'Manic-Depression') apart
    pieces = []
    for token in nltk.word_tokenize(s):
        pieces.extend(token.replace('-', ' ').split(' '))

    # Lemmatise in lowercase: capitalised nouns would be treated as proper nouns
    lemmatizer = WordNetLemmatizer()
    goWords = []
    for piece in pieces:
        lemma = lemmatizer.lemmatize(piece.lower(), 'n')
        if lemma in stops or not lemma.isalpha():
            continue
        goWords.append(lemma.lower())

    return goWords

def process_original_conditions(s: str) -> list:
    """Build the list of match words for one of the studied conditions.

    The name is tokenised, lowercased and lemmatised as nouns; then, depending
    on the first of the following words found in it, related terms are added:

    * 'bipolar'       -> adds 'manic' and 'mania', drops one 'disorder'
    * 'adhd'          -> adds every word of the WordNet synonyms of 'ADHD',
                         drops one 'disorder'
    * 'depression'    -> adds 'depressive' and 'depressed'
    * 'anxiety'       -> adds the lemmas of 'anxious' and 'anxiousness'
    * 'schizophrenia' -> adds 'schizoaffective'

    Args:
        s: Condition name, e.g. 'Bipolar Disorder'.

    Returns:
        Lowercase words; a review condition is matched against this list.
    """
    lemmatizer = WordNetLemmatizer()

    # Breaking into words and reducing each to its (lowercase) noun lemma
    lemmaed_words = [lemmatizer.lemmatize(word.lower(), 'n') for word in nltk.word_tokenize(s)]

    # 'disorder' is too generic to match on for these two conditions
    dropDisorder = False

    # Adding in some terms relevant to particular conditions
    if 'bipolar' in lemmaed_words:
        lemmaed_words += ['manic', 'mania']
        dropDisorder = True
    elif 'adhd' in lemmaed_words:
        # Every distinct word of every WordNet synonym; an empty lookup simply adds nothing.
        synonymWords = {
            lemmatizer.lemmatize(word.lower(), 'n')
            for syn in wordnet.synsets('ADHD')
            for lemma in syn.lemmas()
            for word in lemma.name().replace('_', ' ').split(' ')
        }
        lemmaed_words += sorted(synonymWords)
        dropDisorder = True
    elif 'depression' in lemmaed_words:
        lemmaed_words += ['depressive', 'depressed']
    elif 'anxiety' in lemmaed_words:
        lemmaed_words += [lemmatizer.lemmatize(word) for word in ['anxious', 'anxiousness']]
    elif 'schizophrenia' in lemmaed_words:
        lemmaed_words += ['schizoaffective']

    # Guarded so a name without the word (e.g. plain 'Bipolar') does not raise
    if dropDisorder and 'disorder' in lemmaed_words:
        lemmaed_words.remove('disorder')

    # Dropping everything into lowercase
    return [word.lower() for word in lemmaed_words]

In [286]:
# Match words for every studied condition and for every review condition,
# each stored as a dictionary keyed by the original label.
processedOrig = {orig: process_original_conditions(orig) for orig in origConditions}
processedConds = {cond: process_condition_string(cond) for cond in conditions}

In [288]:
# Split review conditions into buckets of "keep" and "drop", based on relevance.
# A review condition is kept under every studied condition with which it shares at
# least one match word; a condition that shares none with any of them is dropped.
keep = {orig: [] for orig in processedOrig}
drop = []
for cond, condWords in processedConds.items():
    matches = [
        orig for orig, origWords in processedOrig.items()
        if orig and any(word in origWords for word in condWords)
    ]
    if matches:
        for orig in matches:
            keep[orig].append(cond)
    else:
        drop.append(cond)

In [292]:
# Hand corrections: ADHD keeps a single review label, and the Depression bucket
# loses every label that mentions bipolar or manic.
keep['ADHD'] = ['Attention Deficit Disorder with Hyperactivity']
depKeeps = [
    item for item in keep['Depression']
    if 'bipolar' not in item.lower() and 'manic' not in item.lower()
]
keep['Depression'] = depKeeps

{'ADHD': ['Attention Deficit Disorder with Hyperactivity'],
 'Anxiety': ['Anxiety associated with an Operation',
  'Anxious',
  'Anxiousness associated with Depression',
  'Repeated Episodes of Anxiety',
  'Sleep Disturbance with Extreme Anxiety'],
 'Bipolar Disorder': ['Agitation associated with Bipolar Mania',
  'Bipolar Depression',
  'Bipolar Disorder in Remission',
  'Bipolar I Disorder with Most Recent Episode Mixed',
  'Depression associated with Bipolar Disorder, Adjunct Treatment',
  'Mania associated with Bipolar Disorder',
  'Mania associated with Bipolar Disorder, Adjunct Treatment',
  'Manic-Depression',
  'Rapid Cycle Manic-Depression'],
 'Depression': ['Additional Medications to Treat Depression',
  'Anxiousness associated with Depression',
  'Depressed Mood Disorder Occurring Every Year at the Same Time',
  'Depression',
  'Depression following Delivery of Baby',
  'Major Depressive Disorder'],
 'Schizophrenia': ['Agitation associated with Schizophrenia',
  'Chronic Typ

In [335]:
# Display the mapping from each studied condition to the review condition labels kept for it
keep

{'ADHD': ['Attention Deficit Disorder with Hyperactivity'],
 'Anxiety': ['Anxiety associated with an Operation',
  'Anxious',
  'Anxiousness associated with Depression',
  'Repeated Episodes of Anxiety',
  'Sleep Disturbance with Extreme Anxiety'],
 'Bipolar Disorder': ['Agitation associated with Bipolar Mania',
  'Bipolar Depression',
  'Bipolar Disorder in Remission',
  'Bipolar I Disorder with Most Recent Episode Mixed',
  'Depression associated with Bipolar Disorder, Adjunct Treatment',
  'Mania associated with Bipolar Disorder',
  'Mania associated with Bipolar Disorder, Adjunct Treatment',
  'Manic-Depression',
  'Rapid Cycle Manic-Depression'],
 'Depression': ['Additional Medications to Treat Depression',
  'Anxiousness associated with Depression',
  'Depressed Mood Disorder Occurring Every Year at the Same Time',
  'Depression',
  'Depression following Delivery of Baby',
  'Major Depressive Disorder'],
 'Schizophrenia': ['Agitation associated with Schizophrenia',
  'Chronic Typ

In [153]:
# Need to split by conditions
# Need to remove all reviews with no comment info
# Need to process the comments
# Need to process the review info

# Need to strategize on NLP stuff T.T

In [332]:
# Writing code to sort dataframe into different conditions
def splitDFbyCondition(file, condDict, condColumn='conditionInfo'):
    """Split one stacked review file into one CSV per studied condition.

    Args:
        file: Path of a '$'-separated review CSV whose first column is the index.
        condDict: Maps each studied condition to the list of review condition
            labels (without the 'Condition: ' prefix) that belong to it.
        condColumn: Name of the column holding the review's condition label.

    For every key of ``condDict`` that has at least one matching row, the rows
    are written to a new CSV next to ``file``.  The output name is ``file``
    with '<key>_' inserted before the last 'raw' and 'allconditions_' removed;
    spaces in the key become '-'.  Keys without matching rows write nothing.
    """
    # Reading in file and cleaning up the condition column if it contains the label prefix.
    # Missing (non-text) values are left untouched; they never equal a wanted label.
    df = pd.read_csv(file, sep='$', index_col=0)
    df[condColumn] = df[condColumn].map(
        lambda cond: cond.replace('Condition: ', '') if isinstance(cond, str) else cond
    )

    # Only the last 'raw' is the file-name marker; earlier ones (e.g. in a folder name) stay as they are.
    head, marker, tail = file.rpartition('raw')

    for key, wanted in condDict.items():
        if not wanted:
            continue
        # One slice per wanted label, so rows stay grouped in the order of the label list.
        subset = pd.concat([df[df[condColumn] == condition] for condition in wanted])
        if subset.empty:  # Making sure there's actually data
            continue

        name = key.replace(' ', '-')
        target = head + name + '_' + marker + tail if marker else file
        subset.to_csv(target.replace('allconditions_', ''), sep='$')

In [334]:
# Split the stacked file of every medication that has review files into per-condition files
for medication in allfiles:
    if not allfiles[medication]:
        continue
    stem = '-'.join(medication.strip().split(' '))
    file = 'ProcessedReviews/{:s}_allconditions_raw_reviews.csv'.format(stem)
    splitDFbyCondition(file, keep)

# Final files saved in last step have no duplicate rows and are sorted by condition (and have no conditions I'm not considering in this work)