# In this notebook I will:
* Go through and remove reviews that only have advertisements? (NOT AT THIS TIME)
* Tokenize, lemmatize, remove stop words, and remove words that appear only once unless they are special (words that indicate a condition, medication, side effect, or caregiver role)
* Rejoin processed review into a string for BOW analysis

In [1]:
import pandas as pd
import numpy as np
import glob

# Haven't decided whether I like nltk or spacy better yet
import nltk
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import wordnet, stopwords
stops = stopwords.words('english')
import spacy
from spacy.tokenizer import Tokenizer
import en_core_web_lg
nlp = en_core_web_lg.load()

# A method to process text in nltk:
# https://pythonhealthcare.org/2018/12/14/101-pre-processing-data-tokenization-stemming-and-removal-of-stop-words/

# same process in spacy
# https://spacy.io/usage/linguistic-features

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Just don't have time to learn these right now
#from sklearn.base import TransformerMixin
#from sklearn.pipeline import Pipeline

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Un-flag these words as stop words so they survive tokenization and stay
# available to the sentiment analysis; 'thing' is flagged as a stop word instead.
for word in 'nor none not alone no never cannot always'.split():
    nlp.vocab[word].is_stop = False
nlp.vocab['thing'].is_stop = True

# Tokenizer built on the adjusted vocabulary; used by spacyTokenizer.
tokenizer = Tokenizer(nlp.vocab)

# Working on processing text data

In [5]:
def spacyTokenizer(s: str)-> list:
    """Return the lemmas of the content words in ``s``.

    The text is lower-cased and stripped, then split with the module-level
    ``tokenizer``. A token's lemma is kept only when the token is not a stop
    word, is purely alphabetic, and its lemma is not the '-PRON-' placeholder.

    Parameters
    ----------
    s : str
        Raw text to process.

    Returns
    -------
    list
        Lemma strings in their original order (duplicates are kept).
    """
    normalized = s.lower().strip()
    return [
        token.lemma_
        for token in tokenizer(normalized)
        if not token.is_stop and token.is_alpha and token.lemma_ != '-PRON-'
    ]

def getSynonyms(word):
    """Collect the distinct, tokenized WordNet lemma names for ``word``.

    Every lemma of every synset returned by ``wordnet.synsets(word)`` has its
    underscores replaced by spaces, is lower-cased and is passed through
    ``spacyTokenizer``; the resulting tokens are de-duplicated.

    Returns
    -------
    list
        Unique tokens in arbitrary (set) order; empty when WordNet returns no
        synsets for ``word``.
    """
    #https://www.geeksforgeeks.org/get-synonymsantonyms-nltk-wordnet-python/
    collected = [
        token
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
        for token in spacyTokenizer(lemma.name().replace('_', ' ').lower())
    ]
    return list(set(collected))

# Setting a very basic metric for uniqueness to test a new way to quantify side effect presence
def quantify_uniqueness(word):
    """Score ``word`` as the reciprocal of (number of its synonyms + 1).

    The ``+ 1`` accounts for the word itself and is added unconditionally,
    even when ``getSynonyms`` already returned the word, so the denominator
    is never zero.

    Returns
    -------
    float
        A value in (0, 1]; 1.0 when the word has no synonyms.
    """
    term_count = len(getSynonyms(word)) + 1
    return term_count ** -1

In [6]:
def get_wordnet_pos(treebank_tag):
    """Translate a treebank POS tag into the matching WordNet POS constant.

    Tags starting with 'J', 'V', 'N' or 'R' map to ``wordnet.ADJ``,
    ``wordnet.VERB``, ``wordnet.NOUN`` and ``wordnet.ADV`` respectively.
    Any other tag yields the empty string.
    """
    # https://stackoverflow.com/questions/15586721/wordnet-lemmatization-and-pos-tagging-in-python
    # Prefixes are tested in this order; the WordNet attribute is only looked
    # up once a prefix has matched.
    prefix_to_attr = (('J', 'ADJ'), ('V', 'VERB'), ('N', 'NOUN'), ('R', 'ADV'))
    for prefix, attr in prefix_to_attr:
        if treebank_tag.startswith(prefix):
            return getattr(wordnet, attr)
    return ''

In [7]:
def check_PoS(word, pos=wordnet.NOUN):
    """Tell whether ``word``, tagged on its own by NLTK, has WordNet POS ``pos``.

    The word is tagged in isolation (a one-element token list), the resulting
    tag is converted with ``get_wordnet_pos`` and compared against ``pos``
    (noun by default).
    """
    tagged = nltk.pos_tag([word])
    treebank_tag = tagged[0][1]
    return get_wordnet_pos(treebank_tag) == pos


# Working off of:
# https://nlpforhackers.io/convert-words-between-forms/

def convert_word_plus_synonyms(word, to_pos=wordnet.NOUN):
    """List synonyms and derived forms of ``word`` that are tagged as ``to_pos``.

    Only a leading slice of the WordNet synsets is used. For each lemma in
    those synsets, the lemma name and the names of its derivationally related
    forms are gathered. The unique candidates are POS-tagged together with
    ``nltk.pos_tag`` and only those whose tag maps to ``to_pos`` are kept.

    Parameters
    ----------
    word : str
        Word to expand.
    to_pos : str
        WordNet part-of-speech constant to keep (noun by default).

    Returns
    -------
    list
        Matching words; ``word`` itself is appended when it is tagged as
        ``to_pos`` on its own but is missing from the list.
    """
    synsets = wordnet.synsets(word)
    n_synsets = len(synsets)

    # Chopping down to the most common senses: an odd count other than 1 keeps
    # n // 2 synsets, every other count keeps n // 2 + 1.
    if n_synsets % 2 == 1 and n_synsets != 1:
        keep = n_synsets // 2
    else:
        keep = n_synsets // 2 + 1

    # Lemma names plus every derivationally related form of each lemma
    candidates = []
    for synset in synsets[:keep]:
        for lemma in synset.lemmas():
            candidates.append(lemma.name())
            candidates.extend(related.name() for related in lemma.derivationally_related_forms())

    # Unique (sorted) candidates, tagged together, filtered by part of speech
    tagged = nltk.pos_tag(list(np.unique(candidates)))
    return_words = [candidate for candidate, tag in tagged if get_wordnet_pos(tag) == to_pos]

    # The original word can lose its PoS when tagged within the candidate list
    # (e.g., 'weight'), so re-check it on its own and add it back if needed.
    if check_PoS(word, to_pos) and word not in return_words:
        return_words.append(word)

    return return_words

In [17]:
# Initializing the count vectorizer.
# This single module-level instance is shared: findSideEffects_advanced and
# findSideEffects_basic overwrite its `vocabulary` attribute before each
# fit_transform call.
vectorizer = CountVectorizer()

# Parsing side effects
def parseSideEffects_basic(file):
    """Load the curated side-effect list and expand/clean every entry.

    Single-word entries are replaced by their tokenized WordNet synonyms
    (``getSynonyms``); multi-word entries are reduced to their unique
    ``spacyTokenizer`` tokens. Each cleaned entry is a space-joined string.

    Parameters
    ----------
    file : str
        Path to the '$'-delimited side-effect file.

    Returns
    -------
    cleanedSEs : list of str
        One space-joined token string per side effect.
    uniqueness : list of list of float
        For each cleaned entry, ``quantify_uniqueness`` of every token obtained
        by re-tokenizing that entry.
    """
    # genfromtxt squeezes a file holding a single entry into a 0-d array, which
    # cannot be iterated; atleast_1d keeps the one-entry case working.
    sideEff = np.atleast_1d(np.genfromtxt(file, delimiter='$', dtype=str))

    cleanedSEs = []
    uniqueness = []
    for SE in sideEff:
        term = SE.strip()
        if len(term.split(' ')) == 1:
            tokens = set(getSynonyms(term.lower()))
            if not tokens:
                # WordNet has no synsets for this term. Fall back to the term's
                # own tokens so the side effect is not silently reduced to an
                # empty string (which also yields an empty uniqueness list).
                tokens = set(spacyTokenizer(SE))
        else:
            tokens = set(spacyTokenizer(SE))

        cleaned = ' '.join(tokens)
        cleanedSEs.append(cleaned)
        uniqueness.append([quantify_uniqueness(word) for word in spacyTokenizer(cleaned)])

    return cleanedSEs, uniqueness

def parseSideEffects_advanced(file):
    """Map every side effect to per-word lists of synonyms and derived forms.

    Parameters
    ----------
    file : str
        Path to the '$'-delimited side-effect file.

    Returns
    -------
    dict
        Keys are the side-effect strings as read from the file. Each value is
        a list with one entry per ``spacyTokenizer`` token of that side effect,
        the entry being the ``convert_word_plus_synonyms`` result for the token
        (possibly an empty list).
    """
    # genfromtxt squeezes a file holding a single entry into a 0-d array, which
    # cannot be iterated; atleast_1d keeps the one-entry case working.
    sideEff = np.atleast_1d(np.genfromtxt(file, delimiter='$', dtype=str))

    sideEffectBlocks = {}
    for SE in sideEff:
        sideEffectBlocks[SE] = [convert_word_plus_synonyms(word) for word in spacyTokenizer(SE)]
    return sideEffectBlocks

def parseReviewSentences_basic(file):
    """Read the 'Comment' column of a review file and clean it sentence by sentence.

    Each review is split on '.', and every piece is tokenized with
    ``spacyTokenizer`` and re-joined with single spaces.

    Parameters
    ----------
    file : str
        Path to a '$'-separated CSV file with a 'Comment' column.

    Returns
    -------
    list of list of str
        One inner list per review (row order preserved), one cleaned string
        per '.'-delimited piece. A missing comment yields [''].
    """
    # read_csv parses empty cells (and strings such as "N/A" or "null") as NaN,
    # a float without .split(); treat them as empty reviews so row positions
    # stay aligned with the file.
    reviews = pd.read_csv(file, sep='$')['Comment'].fillna('')

    cleanedRevs = [[' '.join(spacyTokenizer(sent)) for sent in rev.split('.')] for rev in reviews]
    return cleanedRevs

def findSideEffects_advanced(seFile, revFile):
    """Flag, per review, which words of each side effect are mentioned.

    For every review and every side effect, each word of the side effect is
    checked through its synonym list: the shared ``vectorizer`` is given that
    list as vocabulary and fitted on the review's sentences. The word's slot
    is set to 1 when any sentence contains any of the synonyms.

    Parameters
    ----------
    seFile : str
        Side-effect file, parsed by ``parseSideEffects_advanced``.
    revFile : str
        Review file, parsed by ``parseReviewSentences_basic``.

    Returns
    -------
    tracer : dict
        ``tracer[review_index][side_effect]`` is a float array with one 0/1
        entry per word of the side effect. Words with an empty synonym list
        stay at 0.
    side_effects : dict_keys
        The side-effect names.
    cleanedRevs : list of list of str
        The cleaned review sentences.
    """
    # Parsing side effects and reviews
    synonym_blocks = parseSideEffects_advanced(seFile)
    cleanedRevs = parseReviewSentences_basic(revFile)

    tracer = {}
    for rev_idx, sentences in enumerate(cleanedRevs):
        per_review = {}
        for side_effect, word_blocks in synonym_blocks.items():
            hits = np.zeros(len(word_blocks))
            for slot, synonyms in enumerate(word_blocks):
                # Nothing to search for when the word has no synonyms
                if not synonyms:
                    continue
                # Overwrites the vocabulary of the module-level vectorizer
                vectorizer.vocabulary = synonyms
                counts = vectorizer.fit_transform(sentences).toarray()
                hits[slot] += (counts > 0).sum() > 0
            per_review[side_effect] = hits
        tracer[rev_idx] = per_review

    return tracer, synonym_blocks.keys(), cleanedRevs
    
def findSideEffects_basic(seFile, revFile):
    """Score every sentence of every review against each cleaned side effect.

    The tokens of a cleaned side effect become the vocabulary of the shared
    ``vectorizer``, which is then fitted on the review's sentences. The
    per-sentence token counts are scaled by a uniqueness weight and summed.

    Parameters
    ----------
    seFile : str
        Side-effect file, parsed by ``parseSideEffects_basic``.
    revFile : str
        Review file, parsed by ``parseReviewSentences_basic``.

    Returns
    -------
    tracer : dict
        ``tracer[review_index][cleaned_side_effect]`` is an array with one
        score per sentence of the review.
    cleanedSEs : list of str
        The cleaned side-effect strings.
    cleanedRevs : list of list of str
        The cleaned review sentences.
    """
    # Parsing side effects and reviews
    cleanedSEs, uniqSEs = parseSideEffects_basic(seFile)
    cleanedRevs = parseReviewSentences_basic(revFile)

    # For each sentence in each review
    tracer = {}
    for rev_idx, sentences in enumerate(cleanedRevs):
        per_review = {}
        for se_idx, cleaned_se in enumerate(cleanedSEs):
            # Overwrites the vocabulary of the module-level vectorizer
            vectorizer.vocabulary = cleaned_se.strip().split(' ')
            counts = vectorizer.fit_transform(sentences).toarray()
            # NOTE(review): [-1, None] selects only the LAST uniqueness value,
            # so every vocabulary column is scaled by that one weight. If
            # per-word weights were intended, this indexing needs revisiting.
            weight = np.array(uniqSEs[se_idx])[-1, None]
            per_review[cleaned_se] = (counts * weight).sum(axis=1)
        tracer[rev_idx] = per_review

    return tracer, cleanedSEs, cleanedRevs

In [21]:
# Run the synonym-based side-effect search on the Bupropion ADHD reviews
findings, SEs, Revs = findSideEffects_advanced(
    'SideEffects/ADHD_SideEffects_denormed.csv',
    'ProcessedReviews/ADHD/Bupropion_ADHD_parsed_reviews.csv',
)

In [23]:
# Show each review together with every side effect for which more than half
# (integer division) of the side effect's words were matched.
for rev_idx, per_effect in findings.items():
    for side_effect, hits in per_effect.items():
        if hits.sum() > (len(hits) // 2):
            print(Revs[rev_idx], '\n', side_effect, '\n\n')

['drug screw sleep take difficult', 'want fair shake keep', 'develope odd neurological tingle twitch finger', 'associate drug', 'horrible neurological condition rush hospital', 'hypokalemia low potassium', 'turn associate', 'business boyfriend month late take smoke drive hospital reason', ''] 
 Twitching 


['med day feel like massive pain knee muscle ear sore swell dry increase heart feel like chemical taste mouth like swallow bleach feel invincible focus little well not worth feel like verge death', 'design drug company spend billion dollar garbage come plz'] 
 dry mouth 


['med day feel like massive pain knee muscle ear sore swell dry increase heart feel like chemical taste mouth like swallow bleach feel invincible focus little well not worth feel like verge death', 'design drug company spend billion dollar garbage come plz'] 
 sore throat pain 


['med day feel like massive pain knee muscle ear sore swell dry increase heart feel like chemical taste mouth like swallow bleach feel i

In [None]:
# Score every (review, side effect) pair: number of matched words divided by
# the squared number of space-separated words in the side-effect name.
scores = []
revs = {}
for i, per_effect in findings.items():
    # Side effects scoring above the threshold, keyed by the re-joined review text
    matched = revs['. '.join(Revs[i])] = []
    for j, hits in per_effect.items():
        score = hits.sum() / len(j.split(' ')) ** 2
        scores.append(score)
        if score > 0.03:
            matched.append(j)

# Distribution of all scores, with counts on a log scale
plt.hist(scores, bins=100)
plt.yscale('log')

In [None]:
# Print only the reviews that have at least one side effect above the threshold
for review_text, matched_effects in revs.items():
    if matched_effects:
        print(review_text, '\n', matched_effects, '\n\n')

In [None]:
# Spot check: display the synonym list produced for a single side-effect term
getSynonyms('sleepiness')

# Example of classical sentiment analysis

In [None]:
# Working on sentiment analysis
# Starting source: https://www.datacamp.com/community/tutorials/simplifying-sentiment-analysis-python
# Other source: https://www.dataquest.io/blog/tutorial-text-classification-in-python-using-spacy/

def comment_features(comment):
    """Build a bag-of-words presence feature dict for one comment.

    Parameters
    ----------
    comment : str
        Raw review text; tokenized with ``spacyTokenizer``.

    Returns
    -------
    dict
        One ``'contains(<token>)'`` key per distinct token, in order of first
        appearance. Every value is True, because only tokens present in the
        comment get a key.
    """
    # dict.fromkeys de-duplicates while preserving first-seen order
    distinct_tokens = dict.fromkeys(spacyTokenizer(comment))
    return {'contains({})'.format(token): True for token in distinct_tokens}

def dichotomize_satisfaction(score):
    """Collapse a numeric rating into a binary sentiment label.

    Returns 'positive' when ``score`` is strictly greater than 3 and
    'negative' otherwise (including for exactly 3).
    """
    return 'positive' if score > 3 else 'negative'

# NOTE(review): `parsedDF` is not defined in any cell visible here; it must be
# loaded before this cell can run on a fresh kernel. The label is derived from
# the 'Effectiveness' column even though the helper is named for satisfaction;
# confirm that this is the intended column.
featuresets = [
    (comment_features(comment), dichotomize_satisfaction(sat))
    for comment, sat in zip(parsedDF['Comment'], parsedDF['Effectiveness'])
]

# Train a Naive Bayes classifier on the labelled feature sets and list its
# 20 most informative features.
classifier = nltk.NaiveBayesClassifier.train(featuresets)
classifier.show_most_informative_features(20)