# In this notebook I will:
* Possibly remove reviews that contain only advertisements (NOT AT THIS TIME)
* Tokenize, lemmatize, remove stop words, and remove words that appear only once unless they are special (words that indicate a condition, medication, side effect, or caregiver role)
* Rejoin each processed review into a single string for bag-of-words (BOW) analysis

In [1]:
import pandas as pd
import numpy as np
import glob

# Haven't decided whether I like nltk or spacy better yet
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet#, stopwords
#stops = stopwords.words('english')
import spacy
from spacy.tokenizer import Tokenizer
import en_core_web_lg
# Load the en_core_web_lg spaCy model; its vocab is what the tokenizer further down is built from.
nlp = en_core_web_lg.load()

# Magical gensim module
from gensim import corpora
from gensim.models import LsiModel, LdaModel
from gensim.models.coherencemodel import CoherenceModel

# A method to process text in nltk:
# https://pythonhealthcare.org/2018/12/14/101-pre-processing-data-tokenization-stemming-and-removal-of-stop-words/

# same process in spacy
# https://spacy.io/usage/linguistic-features

In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# https://stackoverflow.com/questions/13928155/spell-checker-for-python/48280566
from autocorrect import Speller
# English spell-corrector instance; parseSEnew calls it on every side-effect token.
spell = Speller(lang='en')

In [5]:
# Adjusting stop words in spacy to not lose a bunch of negatives for the sentiment analysis
# (currently disabled: the adjustment below is commented out, so the vocab's stop flags are left as loaded)
# for word in [u'nor',u'none',u'not',u'alone',u'no',u'never',u'cannot',u'always']:
#     nlp.vocab[word].is_stop = False
# nlp.vocab[u'thing'].is_stop = True
# Stand-alone tokenizer built from the model's vocab only (no extra tokenization rules are passed here);
# spacyTokenizer uses it for all text.
tokenizer = Tokenizer(nlp.vocab)

# Working on processing text data

In [None]:
def get_wordnet_pos(treebank_tag):
    """Translate a Penn Treebank POS tag into the matching WordNet POS constant.

    Adapted from
    https://stackoverflow.com/questions/15586721/wordnet-lemmatization-and-pos-tagging-in-python

    Parameters
    ----------
    treebank_tag : str
        Tag as produced by ``nltk.pos_tag`` (this is how ``check_PoS`` calls it).

    Returns
    -------
    ``wordnet.ADJ``, ``wordnet.VERB``, ``wordnet.NOUN`` or ``wordnet.ADV`` when the tag
    starts with ``J``, ``V``, ``N`` or ``R`` respectively; the empty string otherwise.
    """
    # Leading letter of the Treebank tag -> WordNet part of speech, checked in this order.
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos

    # No WordNet equivalent for this tag.
    return ''

def check_PoS(word):
    """Return the WordNet part of speech of a single word tagged on its own.

    The word is passed to ``nltk.pos_tag`` as a one-element list (so there is no sentence
    context) and the resulting tag is mapped through ``get_wordnet_pos``.
    """
    tagged = nltk.pos_tag([word])
    treebank_tag = tagged[0][1]
    return get_wordnet_pos(treebank_tag)

def useful_synonyms(word):
    """Collect WordNet synonyms and derived forms of ``word`` that share its part of speech.

    Only the first two synsets returned by WordNet are used; this works better for
    side-effect terms than for everyday words like 'cat'.

    Parameters
    ----------
    word : str
        Word to expand.

    Returns
    -------
    list of str
        Sorted, de-duplicated lemma names and derivationally related forms (underscores
        replaced by spaces) whose tag maps to the same WordNet POS as ``word``. ``word``
        itself is appended at the end when it is not already in the list.
    """
    # Part of speech of the query word, tagged in isolation.
    to_pos = check_PoS(word)

    # Slicing is safe for any number of synsets, so no explicit length check is needed.
    candidates = set()
    for syn in wordnet.synsets(word)[:2]:
        for lemma in syn.lemmas():
            candidates.add(lemma.name())
            candidates.update(form.name() for form in lemma.derivationally_related_forms())

    # Tag all candidates in a single call and keep the ones matching the target POS.
    tagged = nltk.pos_tag(sorted(candidates))
    return_words = [
        candidate.replace('_', ' ')
        for candidate, treebank_tag in tagged
        if get_wordnet_pos(treebank_tag) == to_pos
    ]

    # The query word is sometimes dropped by the filter above even though it matches
    # to_pos by definition (seen with 'weight'; presumably the tag it gets inside the
    # candidate list differs from the one it gets alone), so always include it.
    if word not in return_words:
        return_words.append(word)

    return return_words

In [None]:
# Magic tokenizer thing
def spacyTokenizer(s: str)-> list:
    """Lower-case, tokenize and filter a piece of text into a list of lemmas.

    The text is lower-cased and stripped before being passed to the module-level
    ``tokenizer``. A token's lemma is kept only if the token is not a stop word, is purely
    alphabetic, and its lemma is not the literal '-PRON-'.
    """
    return [
        tok.lemma_
        for tok in tokenizer(s.lower().strip())
        if tok.is_alpha and not tok.is_stop and tok.lemma_ != '-PRON-'
    ]

In [None]:
def parseRevnew(file):
    """Read the reviews stored in ``file`` and tokenize each one.

    Parameters
    ----------
    file : str or path-like
        '$'-separated file readable by ``pd.read_csv`` that has a 'Comment' column.

    Returns
    -------
    list of list of str
        One token list (from ``spacyTokenizer``) per row, in file order. A row whose
        comment is missing yields an empty list, so positions in the result still line up
        with the rows of the file.
    """
    comments = pd.read_csv(file, sep='$')['Comment']

    # Empty cells are read as NaN (a float), which has no .lower(); treat them as empty text.
    comments = comments.fillna('').astype(str)

    return [spacyTokenizer(comment) for comment in comments]

def parseSEnew(file):
    """Read the list of side effects in ``file``, tokenize and spell-correct each entry.

    Parameters
    ----------
    file : str or path-like
        Text file of side-effect descriptions separated by '$' (read with ``np.genfromtxt``).

    Returns
    -------
    list of list of str
        For every entry, the tokens from ``spacyTokenizer`` after each one has been passed
        through ``spell``. Entries that tokenize to nothing give an empty list.
    """
    # genfromtxt returns a 0-d array for a single entry and a 2-d array for several rows of
    # equal length; flatten so every field is visited exactly once as a string.
    sideEff = np.ravel(np.genfromtxt(file, delimiter='$', dtype=str))

    return [[spell(token) for token in spacyTokenizer(entry)] for entry in sideEff]

In [None]:
# https://www.datacamp.com/community/tutorials/discovering-hidden-topics-python
def genDictandDocMatrix(cleaned_text):
    """Build a gensim dictionary and the bag-of-words form of tokenized documents.

    Parameters
    ----------
    cleaned_text : list of list of str
        Tokenized documents. The collection is traversed twice, so it must be re-iterable.

    Returns
    -------
    tuple
        ``(dictionary, matrix)`` where ``dictionary`` is the ``corpora.Dictionary`` built
        from the documents and ``matrix`` holds ``dictionary.doc2bow(doc)`` for every
        document, in order.
    """
    vocab = corpora.Dictionary(cleaned_text)
    bow_corpus = list(map(vocab.doc2bow, cleaned_text))
    return vocab, bow_corpus

def formatLSAresult(topics: list) -> None:
    """Pretty-print topics given as ``(topic_id, 'w1*"term1" + w2*"term2" + ...')`` pairs.

    This is the shape produced by ``print_topics`` in ``produceLSA`` / ``produceLDA``.
    For every topic, a title line is printed, followed by the comma-separated terms and
    then the comma-separated weights.

    Parameters
    ----------
    topics : list
        Sequence of ``(topic_id, description)`` items; ``description`` is a string of
        ``weight*"term"`` pieces joined by '+'.

    Returns
    -------
    None
        The function only prints.
    """
    for topic in topics:
        topic_id, description = topic[0], topic[1]

        # Split every 'weight*"term"' piece once and reuse it for both lists.
        pairs = [piece.strip().split('*') for piece in description.split('+')]
        term_cluster = [pair[1][1:-1] for pair in pairs]  # [1:-1] drops the surrounding quotes
        term_weight = [pair[0] for pair in pairs]

        title = "Topic {:g}: \n".format(topic_id)
        print(title, ', '.join(term_cluster), '\n', ', '.join(term_weight))
        
def produceLSA(n_topics, cleanText, n_word_report=10):
    """Fit a gensim ``LsiModel`` on tokenized documents and report its topics.

    Parameters
    ----------
    n_topics : int
        Number of topics to fit and to report.
    cleanText : list of list of str
        Tokenized documents, passed to ``genDictandDocMatrix``.
    n_word_report : int, optional
        Number of words listed per topic in the report (default 10).

    Returns
    -------
    tuple
        ``(result, lsamodel)``: the output of ``lsamodel.print_topics`` and the fitted model.
    """
    id2word, bow_corpus = genDictandDocMatrix(cleanText)
    lsamodel = LsiModel(corpus=bow_corpus, id2word=id2word, num_topics=n_topics)
    topic_report = lsamodel.print_topics(num_words=n_word_report, num_topics=n_topics)
    return topic_report, lsamodel

def produceLDA(n_topics, cleanText, n_word_report=10, random_state=None):
    """Fit a gensim ``LdaModel`` on tokenized documents and report its topics.

    Parameters
    ----------
    n_topics : int
        Number of topics to fit and to report.
    cleanText : list of list of str
        Tokenized documents, passed to ``genDictandDocMatrix``.
    n_word_report : int, optional
        Number of words listed per topic in the report (default 10).
    random_state : int or numpy.random.RandomState, optional
        Forwarded to ``LdaModel``. LDA training is stochastic; pass a fixed value to get
        the same topics on every run. The default ``None`` keeps gensim's own seeding.

    Returns
    -------
    tuple
        ``(result, ldamodel)``: the output of ``ldamodel.print_topics`` and the fitted model.
    """
    dictionary, matrix = genDictandDocMatrix(cleanText)
    ldamodel = LdaModel(matrix, num_topics=n_topics, id2word=dictionary,
                        random_state=random_state)
    result = ldamodel.print_topics(num_topics=n_topics, num_words=n_word_report)

    return result, ldamodel

#result, model = produceLSA(10, reviews)
#formatLSAresult(result)

In [None]:
def find_sideEffects_inReviews(revFile, SEfile):
    """Score how strongly every review mentions every known side effect.

    Reviews are tokenized with ``parseRevnew`` and weighted with TF-IDF. Each side effect
    from ``parseSEnew`` is a list of tokens; the score of a review for a side effect is the
    sum of the TF-IDF weights, in that review, of the side effect's distinct tokens. Tokens
    that do not occur in the review (or in no review at all) contribute 0.

    Parameters
    ----------
    revFile : str or path-like
        Review file accepted by ``parseRevnew``.
    SEfile : str or path-like
        Side-effect file accepted by ``parseSEnew``.

    Returns
    -------
    pandas.DataFrame
        One row per review (in file order) and one column per non-empty side effect, named
        by the side effect's tokens joined with spaces. Side effects that produce the same
        name share a single column.
    """
    # Parsing reviews and rebuilding one whitespace-joined string per review
    reviews = parseRevnew(revFile)
    corpus = [' '.join(rev) for rev in reviews]
    n_reviews = len(corpus)

    # Finding TFIDF values for every review word; CSC layout makes column selection cheap
    # and avoids materialising the full dense matrix.
    tfidf_vectr = TfidfVectorizer()
    X = tfidf_vectr.fit_transform(corpus).tocsc()
    vocabulary = tfidf_vectr.vocabulary_  # word -> column index in X

    # Parsing side effects (one token list per side effect)
    listSEs = parseSEnew(SEfile)

    # Scoring each review against each side effect
    scoreDict = {}
    for SE in listSEs:
        if not SE:
            continue

        # Exact token lookup in the vectorizer vocabulary (rather than a substring search in
        # a joined string), so words at either end of the side-effect list match like any
        # other. A set is used so a token repeated inside one side effect counts once.
        columns = sorted({vocabulary[word] for word in SE if word in vocabulary})

        if columns:
            # A weight is non-zero only where the word occurs in the review, so summing the
            # selected columns adds up exactly the side-effect words present in each review.
            scores = np.asarray(X[:, columns].sum(axis=1)).ravel()
        else:
            scores = np.zeros(n_reviews)

        scoreDict[' '.join(SE)] = scores

    # Explicit index keeps one row per review even when there are no side-effect columns.
    return pd.DataFrame(scoreDict, index=pd.RangeIndex(n_reviews))

In [8]:
# IPython help syntax: displays the docstring of np.percentile (not valid in a plain Python script).
np.percentile?