# In this notebook I will:
* Go through and remove reviews that contain only advertisements (deferred: NOT AT THIS TIME)
* Tokenize, lemmatize, remove stop words, and remove words that appear only once unless they are special (i.e., they indicate a condition, medication, side effect, or caregiver role)
* Rejoin each processed review into a single string for bag-of-words (BOW) analysis

In [1]:
import pandas as pd
import numpy as np
import glob

# Haven't decided whether I like nltk or spacy better yet
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet#, stopwords
from nltk import sentiment
# Single VADER sentiment analyzer instance, created once at import time.
# NOTE(review): presumably requires the NLTK `vader_lexicon` resource to be downloaded first; confirm.
VADER_SIA = sentiment.vader.SentimentIntensityAnalyzer()
#stops = stopwords.words('english')
import spacy
from spacy.tokenizer import Tokenizer
import en_core_web_lg
# Load the `en_core_web_lg` spaCy model package; `nlp` is the resulting pipeline object.
nlp = en_core_web_lg.load()

# Magical gensim module
from gensim import corpora
from gensim.models import LsiModel, LdaModel
from gensim.models.coherencemodel import CoherenceModel

# A method to process text in nltk:
# https://pythonhealthcare.org/2018/12/14/101-pre-processing-data-tokenization-stemming-and-removal-of-stop-words/

# same process in spacy
# https://spacy.io/usage/linguistic-features

In [3]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from scipy.spatial import distance
# Shorthand alias for scipy.spatial.distance.cdist.
cdist = distance.cdist

In [4]:
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
# https://stackoverflow.com/questions/13928155/spell-checker-for-python/48280566
from autocorrect import Speller
# Module-level autocorrect Speller configured for English.
spell = Speller(lang='en')

In [6]:
# (Currently disabled) Adjusting stop words in spacy to not lose a bunch of negatives for the sentiment analysis
# for word in [u'nor',u'none',u'not',u'alone',u'no',u'never',u'cannot',u'always']:
#     nlp.vocab[word].is_stop = False
# nlp.vocab[u'thing'].is_stop = True

# Reuse the tokenizer that ships with the loaded pipeline instead of a bare
# `Tokenizer(nlp.vocab)`. A tokenizer built from the vocab alone has no
# prefix/suffix/infix rules and splits on whitespace only, so words touching
# punctuation (e.g. "great." or "(nausea)") stay fused to it, fail `is_alpha`
# in `spacyTokenizer`, and are silently dropped from the processed review.
tokenizer = nlp.tokenizer

# Working on processing text data

In [7]:
def get_wordnet_pos(treebank_tag):
    """Translate a Penn Treebank POS tag into the matching WordNet POS constant.

    The tag is classified by its leading letter: ``J`` -> ``wordnet.ADJ``,
    ``V`` -> ``wordnet.VERB``, ``N`` -> ``wordnet.NOUN``, ``R`` -> ``wordnet.ADV``.
    Any other tag (including an empty one) yields the empty string ``''``.

    Adapted from
    https://stackoverflow.com/questions/15586721/wordnet-lemmatization-and-pos-tagging-in-python
    """
    # (tag prefix, name of the WordNet constant); checked in this order.
    prefix_table = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, constant_name in prefix_table:
        if treebank_tag.startswith(prefix):
            # Look the constant up only on a match, as the if/elif chain did.
            return getattr(wordnet, constant_name)
    return ''

def check_PoS(word):
    """Return the WordNet POS constant for ``word`` when tagged on its own.

    The word is passed to ``nltk.pos_tag`` as a one-element sequence and the
    resulting Treebank tag is converted with ``get_wordnet_pos`` (so the
    result is ``''`` for tags that function does not map).
    """
    tagged = nltk.pos_tag([word])
    _, treebank_tag = tagged[0]
    return get_wordnet_pos(treebank_tag)

def useful_synonyms(word, max_synsets=2):
    """Collect WordNet synonyms of ``word`` that share its part of speech.

    Parameters
    ----------
    word : str
        The word to expand.
    max_synsets : int or None, default 2
        How many of the leading synsets returned by ``wordnet.synsets(word)``
        to draw lemmas from. Restricting to the first couple keeps the result
        close to the most common senses (this works better for side-effect
        terms than for broad words like 'cat'). ``None`` uses every synset.

    Returns
    -------
    list of str
        Lemma names and their derivationally related forms (underscores
        replaced by spaces) whose tag maps to the same WordNet POS as
        ``word``. ``word`` itself is always included.
    """
    # Part of speech of the input word when tagged in isolation.
    to_pos = check_PoS(word)

    # Gather every lemma name plus its derivationally related forms from the
    # leading synsets; slicing already copes with fewer than `max_synsets`.
    candidates = set()
    for syn in wordnet.synsets(word)[:max_synsets]:
        for lemma in syn.lemmas():
            candidates.add(lemma.name())
            candidates.update(
                related.name() for related in lemma.derivationally_related_forms()
            )

    # Tag the unique candidates as one sorted sequence and keep those whose
    # tag maps to the desired part of speech.
    tagged = nltk.pos_tag(sorted(candidates))
    return_words = [
        candidate.replace('_', ' ')
        for candidate, treebank_tag in tagged
        if get_wordnet_pos(treebank_tag) == to_pos
    ]

    # The original word can be filtered out above (observed e.g. with
    # 'weight'), presumably because its tag inside the candidate sequence
    # differs from its isolated tag. Its isolated POS equals `to_pos` by
    # construction, so no re-tagging is needed: just make sure it is present.
    if word not in return_words:
        return_words.append(word)

    return return_words

In [8]:
# Magic tokenizer thing
def spacyTokenizer(s: str)-> list:
    """Turn a raw string into a list of filtered lemmas.

    The text is lower-cased and stripped of surrounding whitespace, then run
    through the module-level ``tokenizer``. A token's lemma is kept only when
    the token is not a stop word, is purely alphabetic, and its lemma is not
    the ``'-PRON-'`` placeholder.

    NOTE(review): ``'-PRON-'`` looks like the spaCy 2.x pronoun lemma; confirm
    against the installed spaCy version.
    """
    doc = tokenizer(s.lower().strip())
    return [
        token.lemma_
        for token in doc
        if not token.is_stop and token.is_alpha and token.lemma_ != '-PRON-'
    ]

In [10]:
# Spot check of the synonym expansion on a single word.
useful_synonyms('balding')

['bald', 'balding']