In [1]:
# Train data with Naive Bayes Classifier

In [59]:
import pandas as pd
import numpy as np
import re


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline


In [7]:
from sklearn.metrics import classification_report, confusion_matrix

# Cleaning pipeline

In [8]:
pipeline = Pipeline([
    ('bow', CountVectorizer()),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', MultinomialNB()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])

In [21]:
data = pd.read_csv('../util/reviews_token.csv')

In [22]:
data.drop(['Unnamed: 0', 'Unnamed: 0.1'],axis=1,inplace=True)

In [29]:
data.dropna(inplace=True)

In [34]:
msg_train, msg_test, label_train, label_test = train_test_split(data.clean_text, data.Question, test_size=0.4)

In [35]:
pipeline.fit(msg_train,label_train)

Pipeline(steps=[('bow', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('classifier', MultinomialNB())])

In [36]:
predictions = pipeline.predict(msg_test)

In [37]:
print(confusion_matrix(label_test,predictions))
print('\n')
print(classification_report(label_test,predictions))

[[ 183    1    0   65  482    1    0    0]
 [   0  518    0   25   39    0    7    0]
 [   1    2  140   12  164    0   14    0]
 [   2    2    0 1074  240    0    2    0]
 [   0    0    0   79 1479    1    9    0]
 [   0    0    0  177  232    9    0    0]
 [   0    5    1    2  161    0  471    0]
 [   0    0    0    2    0    0    0  244]]


                                   precision    recall  f1-score   support

  advice give prospective patient       0.98      0.25      0.40       732
                             cost       0.98      0.88      0.93       589
eSET vs. multiple embryo transfer       0.99      0.42      0.59       333
           experience with clinic       0.75      0.81      0.78      1320
           experience with doctor       0.53      0.94      0.68      1568
            experience with nurse       0.82      0.02      0.04       418
            protocols and success       0.94      0.74      0.82       640
       specific things went wrong       1.00      0.

# SVM Model

In [93]:
from sklearn import svm

In [101]:
SVM = svm.SVC(C=1.0, degree=3) # default kernel = ’rbf’

In [102]:
pipeline1 = Pipeline([
    ('bow', CountVectorizer()),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', SVM),  # train on TF-IDF vectors w/ SVM
])

In [103]:
pipeline1.fit(msg_train,label_train)

Pipeline(steps=[('bow', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('classifier', SVC())])

In [104]:
predictions1 = pipeline1.predict(msg_test)

In [105]:
print(confusion_matrix(label_test,predictions1))
print('\n')
print(classification_report(label_test,predictions1))

[[ 483    2    6   66  165    5    5    0]
 [   9  538    4   12    8    1   17    0]
 [  16    1  290    3   17    0    6    0]
 [  28    6    4 1112  108   56    6    0]
 [  32    2    2  116 1373   17   26    0]
 [  13    0    0  148   71  185    1    0]
 [   9    6   12    4   59    0  550    0]
 [   0    0    0    0    0    0    0  246]]


                                   precision    recall  f1-score   support

  advice give prospective patient       0.82      0.66      0.73       732
                             cost       0.97      0.91      0.94       589
eSET vs. multiple embryo transfer       0.91      0.87      0.89       333
           experience with clinic       0.76      0.84      0.80      1320
           experience with doctor       0.76      0.88      0.82      1568
            experience with nurse       0.70      0.44      0.54       418
            protocols and success       0.90      0.86      0.88       640
       specific things went wrong       1.00      1.

# Model with word embedding

In [108]:
from UtilWordEmbedding import DocPreprocess

ModuleNotFoundError: No module named 'UtilWordEmbedding'

In [109]:
word_model = Word2Vec()

NameError: name 'Word2Vec' is not defined

In [110]:
pip install word2vec

Note: you may need to restart the kernel to use updated packages.


# LDA Model

In [39]:
from sklearn.decomposition import LatentDirichletAllocation as LDA

  return f(*args, **kwds)
  return f(*args, **kwds)


In [50]:
from gensim import corpora



In [60]:
def preprocessor(text):
    if type(text) == str:
        text = re.sub('<[^>]*>', '', text)
        text = re.sub('[\W]+', '', text.lower())
    return text

  text = re.sub('[\W]+', '', text.lower())


In [64]:
import spacy
spacy.load('en')
from spacy.lang.en import English
parser = English()
def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        #elif token.like_url:
        #    lda_tokens.append('URL')
        #elif token.orth_.startswith('@'):
        #    lda_tokens.append('SCREEN_NAME')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

In [48]:
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jacobberger/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [49]:
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jacobberger/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [51]:
def prepare_text_for_lda(text):
    tokens = tokenize(text)
    #tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens

In [66]:
data['tokens']= data.clean_text.apply(prepare_text_for_lda)

In [67]:
dictionary = corpora.Dictionary(data.tokens)

In [68]:
corpus = [dictionary.doc2bow(text) for text in data.tokens]

In [69]:
import gensim
NUM_TOPICS = 8
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)

In [70]:
import pyLDAvis.gensim
lda_display = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

Clinic Specific LDA

In [72]:
columbia = data[data.clinic_name=='Columbia University']['clean_text'].apply(prepare_text_for_lda)

In [73]:
rmanj = data[data.clinic_name=='RMA of New Jersey']['clean_text'].apply(prepare_text_for_lda)

In [75]:
columbia_dic = corpora.Dictionary(columbia)

In [76]:
rmanj_dic = corpora.Dictionary(rmanj)

In [79]:
columbia_corpus = [columbia_dic.doc2bow(text) for text in columbia]

In [78]:
rmanj_corpus = [rmanj_dic.doc2bow(text) for text in rmanj]

In [80]:
NUM_TOPICS = 8
columbia_ldamodel = gensim.models.ldamodel.LdaModel(columbia_corpus, num_topics = NUM_TOPICS, id2word=columbia_dic, passes=15)

In [81]:
NUM_TOPICS = 8
rmanj_ldamodel = gensim.models.ldamodel.LdaModel(rmanj_corpus, num_topics = NUM_TOPICS, id2word=rmanj_dic, passes=15)

In [84]:
lda_display = pyLDAvis.gensim.prepare(columbia_ldamodel, columbia_corpus, columbia_dic, sort_topics=False)
pyLDAvis.display(lda_display)

In [85]:
lda_display = pyLDAvis.gensim.prepare(rmanj_ldamodel, rmanj_corpus, rmanj_dic, sort_topics=False)
pyLDAvis.display(lda_display)

In [90]:
columbia_data = data[data.clinic_name=="Columbia University"]
columbia_data.shape

(1061, 11)

In [89]:
columbia_data[columbia_data.Question=="specific things went wrong"].shape

(44, 11)

In [91]:
rmanj_data = data[data.clinic_name=="RMA of New Jersey"]
rmanj_data.shape

(2451, 11)

In [92]:
rmanj_data[rmanj_data.Question=="specific things went wrong"].shape

(59, 11)