# Imports

In [1]:
import pandas as pd # DataFrame Manipulation Package
import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer # Convert a collection of raw documents to a matrix of TF-IDF features
from sklearn.decomposition import LatentDirichletAllocation # Latent Dirichlet Allocation is a topic model that is used for discovering abstract topics from a collection of documents (variational Bayes algorithm)
from sklearn.feature_extraction.text import CountVectorizer # Convert a collection of text documents to a matrix of token counts

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB # The multinomial Naive Bayes classifier is suitable for classification with discrete features (e.g., word counts for text classification)

import string # Collection of string operations
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer #Lemmatize using WordNet's built-in morphy function. Returns the input word unchanged if it cannot be found in WordNet.
from nltk import word_tokenize

from nltk.sentiment.util import mark_negation



In [2]:
!pip install corextopic



In [3]:
import nltk

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Functions

In [4]:
stop_words = set(stopwords.words('english')) 

for negation in ["no", "not", "shouldn't", "aren't", "couldn't", "didn't", "doesn't", "don't", "wasn't", "weren't", "wouldn't"]:
    stop_words.remove(negation)

#stop_words

In [5]:
def to_list(x):
    list_words = x.split(' ')
    return list_words

def to_string(x):
    string = " ".join(x)
    return string

#===============================================================

def punctuation(x):
    for punctuation in string.punctuation:
        x =  x.replace(punctuation, '')
    return x.lower()

def remove_numbers (x):
    words_only = ''.join([i for i in x if not i.isdigit()])
    return words_only

def m_negation(x):
    tokenized = word_tokenize(x)
    without_stopwords = [word for word in tokenized if not word in stop_words]
    tokenized_neg = mark_negation(without_stopwords)
    return tokenized_neg

def remove_stopwords(x):
    tokenized = word_tokenize(x)
    without_stopwords = [word for word in tokenized if not word in stop_words]
    return without_stopwords

def lemmatize_review(x):
    lemma = WordNetLemmatizer()
    lista = []
    for w in x:
       lista.append(lemma.lemmatize(w))
    return lista

#===============================================================

def count_words(x):
    wordfreq = []
    for w in x:
        wordfreq.append(x.count(w))
    return dict(zip(x, wordfreq))

def total_count(x):
    total_count = {}
    for row in x:
        for key in row.keys():
          if key in total_count:
              total_count[key] += 1
          else:
              total_count[key] = 1
    return pd.DataFrame(sorted(total_count.items(), key=lambda x: x[1], reverse=True)).head(30).T

#===============================================================

def print_topics(model, vectorizer):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], round(topic[i], 2))
                        for i in topic.argsort()[:-10 - 1:-1]])

# Manual

In [6]:
manual = pd.read_csv('manually_labelled_data.csv')

manual = manual.drop(["uniqueID", "drugName", "condition", "date", "rating", "usefulCount"], axis = 1)

manual["clean_review"] = manual["review"].apply(punctuation)
manual['clean_review'] = manual.clean_review.apply(remove_numbers)
manual['clean_review_lst'] = manual.clean_review.apply(to_list)

manual["NonStopwords_review_lst"] = manual.clean_review.apply(remove_stopwords)
manual["NonStopwords_review_str"] = manual.NonStopwords_review_lst.apply(to_string)

manual["NonStopwords_review_lst_MN"] = manual.clean_review.apply(m_negation)
manual["NonStopwords_review_str_MN"] = manual.NonStopwords_review_lst_MN.apply(to_string)

manual["Lemmatized_review_lst"] = manual.NonStopwords_review_lst_MN.apply(lemmatize_review)
manual["Lemmatized_review_str"] = manual.Lemmatized_review_lst.apply(to_string)

manual["Lemmatized_review"] = manual.NonStopwords_review_lst.apply(lemmatize_review)
manual["Lemmatized_review"] = manual.Lemmatized_review.apply(to_string)

manual["words_count"] = manual.Lemmatized_review_lst.apply(count_words)

X = manual["Lemmatized_review_str"]

y = manual["sideEffect"]

manual.head(3)

Unnamed: 0,review,sideEffect,clean_review,clean_review_lst,NonStopwords_review_lst,NonStopwords_review_str,NonStopwords_review_lst_MN,NonStopwords_review_str_MN,Lemmatized_review_lst,Lemmatized_review_str,Lemmatized_review,words_count
0,"""It has no side effect, I take it in combinati...",0,it has no side effect i take it in combination...,"[it, has, no, side, effect, i, take, it, in, c...","[no, side, effect, take, combination, bystolic...",no side effect take combination bystolic mg fi...,"[no, side_NEG, effect_NEG, take_NEG, combinati...",no side_NEG effect_NEG take_NEG combination_NE...,"[no, side_NEG, effect_NEG, take_NEG, combinati...",no side_NEG effect_NEG take_NEG combination_NE...,no side effect take combination bystolic mg fi...,"{'no': 1, 'side_NEG': 1, 'effect_NEG': 1, 'tak..."
1,"""My son is halfway through his fourth week of ...",1,my son is halfway through his fourth week of i...,"[my, son, is, halfway, through, his, fourth, w...","[son, halfway, fourth, week, intuniv, became, ...",son halfway fourth week intuniv became concern...,"[son, halfway, fourth, week, intuniv, became, ...",son halfway fourth week intuniv became concern...,"[son, halfway, fourth, week, intuniv, became, ...",son halfway fourth week intuniv became concern...,son halfway fourth week intuniv became concern...,"{'son': 1, 'halfway': 1, 'fourth': 1, 'week': ..."
2,"""I used to take another oral contraceptive, wh...",1,i used to take another oral contraceptive whic...,"[i, used, to, take, another, oral, contracepti...","[used, take, another, oral, contraceptive, pil...",used take another oral contraceptive pill cycl...,"[used, take, another, oral, contraceptive, pil...",used take another oral contraceptive pill cycl...,"[used, take, another, oral, contraceptive, pil...",used take another oral contraceptive pill cycl...,used take another oral contraceptive pill cycl...,"{'used': 1, 'take': 1, 'another': 1, 'oral': 1..."


In [7]:
X.shape

(100,)

In [8]:
y.shape

(100,)

In [9]:
total_count(manual["words_count"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29
0,day,side_NEG,not_NEG,not,year,effects_NEG,taking,started,mg,no,week,im,take_NEG,month,first,like_NEG,time,took,one,would_NEG,started_NEG,day_NEG,taking_NEG,pain,ive,take,time_NEG,work_NEG,ago,two
1,28,26,24,24,24,23,22,20,19,18,18,18,17,17,15,15,14,14,14,14,13,13,13,13,13,12,12,12,12,11


# Data

In [10]:
data = pd.read_csv('drugsComTrain_raw.csv')

print(f"The shape of the data is {data.shape[0]} rows and {data.shape[1]} columns")
print("\n")
print(f"The amount of unique ID is {len(data['uniqueID'].unique())}") # check if any of the uniqueID repeat
print("\n")
print(f"The number of unique drugs reviewed is {len(data['drugName'].unique())}")
print("\n")
print(f"The number of unique conditions is {len(data['condition'].unique())}")
print("\n")

The shape of the data is 161297 rows and 7 columns


The amount of unique ID is 161297


The number of unique drugs reviewed is 3436


The number of unique conditions is 885




In [11]:
pd.DataFrame(data["drugName"].value_counts()).head(14).T

Unnamed: 0,Levonorgestrel,Etonogestrel,Ethinyl estradiol / norethindrone,Nexplanon,Ethinyl estradiol / norgestimate,Ethinyl estradiol / levonorgestrel,Phentermine,Sertraline,Escitalopram,Mirena,Implanon,Gabapentin,Bupropion,Venlafaxine
drugName,3657,3336,2850,2156,2117,1888,1543,1360,1292,1242,1102,1047,1022,1016


In [12]:
pd.DataFrame(data["condition"].value_counts()).head(14).T

Unnamed: 0,Birth Control,Depression,Pain,Anxiety,Acne,Bipolar Disorde,Insomnia,Weight Loss,Obesity,ADHD,"Diabetes, Type 2",Emergency Contraception,High Blood Pressure,Vaginal Yeast Infection
condition,28788,9069,6145,5904,5588,4224,3673,3609,3568,3383,2554,2463,2321,2274


In [13]:
NUMERO_SAMPLES = 150

data = data.drop(["uniqueID", "drugName", "condition", "date", "rating", "usefulCount"], axis = 1).head(NUMERO_SAMPLES) # SACAR ESTOOOOOO!!

In [14]:
data["clean_review"] = data["review"].apply(punctuation)
data['clean_review'] = data.clean_review.apply(remove_numbers)
data['clean_review_lst'] = data.clean_review.apply(to_list)

data["NonStopwords_review_lst"] = data.clean_review.apply(remove_stopwords)
data["NonStopwords_review_str"] = data.NonStopwords_review_lst.apply(to_string)

data["NonStopwords_review_lst_MN"] = data.clean_review.apply(m_negation)
data["NonStopwords_review_str_MN"] = data.NonStopwords_review_lst_MN.apply(to_string)

data["Lemmatized_review_lst"] = data.NonStopwords_review_lst_MN.apply(lemmatize_review)
data["Lemmatized_review_str"] = data.Lemmatized_review_lst.apply(to_string)

data["Lemmatized_review"] = data.NonStopwords_review_lst.apply(lemmatize_review)
data["Lemmatized_review"] = data.Lemmatized_review.apply(to_string)

data["words_count"] = data.Lemmatized_review_lst.apply(count_words)

data.head(3)

Unnamed: 0,review,clean_review,clean_review_lst,NonStopwords_review_lst,NonStopwords_review_str,NonStopwords_review_lst_MN,NonStopwords_review_str_MN,Lemmatized_review_lst,Lemmatized_review_str,Lemmatized_review,words_count
0,"""It has no side effect, I take it in combinati...",it has no side effect i take it in combination...,"[it, has, no, side, effect, i, take, it, in, c...","[no, side, effect, take, combination, bystolic...",no side effect take combination bystolic mg fi...,"[no, side_NEG, effect_NEG, take_NEG, combinati...",no side_NEG effect_NEG take_NEG combination_NE...,"[no, side_NEG, effect_NEG, take_NEG, combinati...",no side_NEG effect_NEG take_NEG combination_NE...,no side effect take combination bystolic mg fi...,"{'no': 1, 'side_NEG': 1, 'effect_NEG': 1, 'tak..."
1,"""My son is halfway through his fourth week of ...",my son is halfway through his fourth week of i...,"[my, son, is, halfway, through, his, fourth, w...","[son, halfway, fourth, week, intuniv, became, ...",son halfway fourth week intuniv became concern...,"[son, halfway, fourth, week, intuniv, became, ...",son halfway fourth week intuniv became concern...,"[son, halfway, fourth, week, intuniv, became, ...",son halfway fourth week intuniv became concern...,son halfway fourth week intuniv became concern...,"{'son': 1, 'halfway': 1, 'fourth': 1, 'week': ..."
2,"""I used to take another oral contraceptive, wh...",i used to take another oral contraceptive whic...,"[i, used, to, take, another, oral, contracepti...","[used, take, another, oral, contraceptive, pil...",used take another oral contraceptive pill cycl...,"[used, take, another, oral, contraceptive, pil...",used take another oral contraceptive pill cycl...,"[used, take, another, oral, contraceptive, pil...",used take another oral contraceptive pill cycl...,used take another oral contraceptive pill cycl...,"{'used': 1, 'take': 1, 'another': 1, 'oral': 1..."


In [15]:
total_count(data["words_count"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29
0,day,side_NEG,not_NEG,year,effects_NEG,not,no,started,month,mg,week,taking,time,im,pain,take_NEG,first,ive,get_NEG,like_NEG,would_NEG,im_NEG,no_NEG,doctor,medication,day_NEG,taking_NEG,one,get,take
1,41,39,36,34,32,32,31,29,29,27,26,25,23,23,23,22,22,21,20,20,20,19,19,18,18,18,18,18,17,17


In [16]:
# print(type(data["clean_review"][0]))

# print(type(data["review_words"][0]))

In [17]:
# correlation = data["rating"].corr(data["usefulCount"], method='pearson')

# correlation

In [18]:
# plt.matshow(data[["rating", "usefulCount"]].corr())
# plt.show()

# Semi

In [19]:
# seed_topics = {'NASA': 0,
#                'SpaceX': 0,
#                'Apple': 1,
#                'Google': 1,
#                'Physics': 2,
#                'Chemistry': 2,}model.fit(X, seed_topics=seed_topics, seed_confidence=0.15).

# TF-IDF features | Latent Dirichlet allocation

In [20]:
vectorizer = TfidfVectorizer(min_df = 0.01, 
                             max_df = 0.50, 
                             max_features = None,
                             vocabulary = None,
                             ngram_range = (1, 2)).fit(manual["Lemmatized_review"]) #

# MINDF Ignore terms that have a document frequency strictly higher than the given threshold
# MAXDF When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold

data_vectorized = vectorizer.transform(manual["Lemmatized_review"]) #

lda_model = LatentDirichletAllocation(n_components = 2,
                                      learning_method = 'online',   
                                      random_state = 29,
                                      #batch_size = 128,
                                      learning_decay = 0.5,
                                      #learning_offset = 10.0,
                                      #evaluate_every = -1,
                                      verbose = 0,
                                      max_iter = 50).fit(data_vectorized)

vocab = vectorizer.get_feature_names()

In [21]:
vectorizer = vectorizer.fit(manual["Lemmatized_review"])
tfidf = vectorizer.transform(manual["Lemmatized_review"])
vocab = vectorizer.get_feature_names()
print(len(vocab))

5663


In [22]:
from corextopic import corextopic as ct

anchors = []
model = ct.Corex(n_hidden = 7, seed=42)
model = model.fit(
    tfidf,
    words = vocab
)

In [23]:
for i, topic_ngrams in enumerate(model.get_topics(n_words=10)):
    topic_ngrams = [ngram[0] for ngram in topic_ngrams if ngram[1] > 0]
    print("Topic #{}: {}".format(i+1, ", ".join(topic_ngrams)))

Topic #1: cat, cost, suboxone, spent, starting, bitten, started using, medical, breast went, went small
Topic #2: abilify, caused, caused gain, gain pound, thought, say, overall, cough, get sleep, lady
Topic #3: water, irsquom, period, want, sometimes, nexplanon, trulicity, available, drinking
Topic #4: vl
Topic #5: good response, useful, response, response useful, evening, ringing, itchy, manageable, seemed, lithium
Topic #6: birth, birth control, form
Topic #7: nuvaring, insomnia


In [24]:
# Anchors designed to nudge the model towards measuring specific genres
anchors = [
    ["headache", "side effect"],
    ["diarrhea", "side effect"],
    ["acne", "side effect"],
    ["problems", "side effect"],
    ["insomnia", "side effect"],
    ["gain pound", "caused gain", "side effect"],
    ["good", "better", "fine", "well", "no", "useful", "good", "response", "no side"]
]
anchors = [
    [a for a in topic if a in vocab]
    for topic in anchors
]

model = ct.Corex(n_hidden=7, seed=42)
model = model.fit(
    tfidf,
    words=vocab,
    anchors=anchors, # Pass the anchors in here
    anchor_strength=3 # Tell the model how much it should rely on the anchors
)

In [25]:
for i, topic_ngrams in enumerate(model.get_topics(n_words=10)):
    topic_ngrams = [ngram[0] for ngram in topic_ngrams if ngram[1] > 0]
    print("Topic #{}: {}".format(i+1, ", ".join(topic_ngrams)))

Topic #1: birth control, birth
Topic #2: 
Topic #3: sometimes, wouldnt, zepatier, say, ringing
Topic #4: side, vl, side effect
Topic #5: cat, nuvaring, needed, evening
Topic #6: caused gain, gain pound, period, pound, irsquom
Topic #7: response, useful, abilify, response useful, good response, amitiza


In [26]:
topic_df = pd.DataFrame(
    model.transform(tfidf), 
    columns=["topic_{}".format(i+1) for i in range(7)]).astype(float)

topic_df.index = manual.index
df = pd.concat([manual, topic_df], axis=1)

In [27]:
df.drop(["clean_review", "clean_review_lst", "NonStopwords_review_lst", "NonStopwords_review_str", "NonStopwords_review_lst_MN",
         "NonStopwords_review_str_MN", "Lemmatized_review_lst", "Lemmatized_review_str", "words_count", "Lemmatized_review"], axis = 1).head(3)

Unnamed: 0,review,sideEffect,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7
0,"""It has no side effect, I take it in combinati...",0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
1,"""My son is halfway through his fourth week of ...",1,1.0,0.0,1.0,1.0,1.0,0.0,1.0
2,"""I used to take another oral contraceptive, wh...",1,0.0,1.0,0.0,1.0,0.0,1.0,0.0


In [28]:
"""A model with higher log-likelihood and lower perplexity
(exp(-1. * log-likelihood per word)) is considered to be good. Let’s check for our model."""

# Log Likelyhood: Higher the better
print("Log Likelihood: ", lda_model.score(data_vectorized))

# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", lda_model.perplexity(data_vectorized))

# See model parameters
print(lda_model.get_params())

Log Likelihood:  -8172.475858222873
Perplexity:  17199.05238299002
{'batch_size': 128, 'doc_topic_prior': None, 'evaluate_every': -1, 'learning_decay': 0.5, 'learning_method': 'online', 'learning_offset': 10.0, 'max_doc_update_iter': 100, 'max_iter': 50, 'mean_change_tol': 0.001, 'n_components': 2, 'n_jobs': None, 'perp_tol': 0.1, 'random_state': 29, 'topic_word_prior': None, 'total_samples': 1000000.0, 'verbose': 0}


In [29]:
# # Define Search Param
# search_params = {'n_components' : [2, 3, 4], 'learning_decay' : [.2, .3, .4], "max_iter" : [50, 100], "learning_offset" : [5, 10]}

# # Init the Model
# lda = LatentDirichletAllocation()

# # Init Grid Search Class
# model = GridSearchCV(lda, param_grid=search_params)

# # Do the Grid Search
# model.fit(data_vectorized)

# # Best Model
# best_lda_model = model.best_estimator_

# # Model Parameters
# print("Best Model's Params: ", model.best_params_)

# # Log Likelihood Score
# print("Best Log Likelihood Score: ", model.best_score_)

# # Perplexity
# print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))

In [30]:
# vectorizer.vocabulary_

In [31]:
# lda_model.components_ # Vectors

In [32]:
print_topics(lda_model, vectorizer)

Topic 0:
[('day', 1.95), ('not', 1.84), ('week', 1.8), ('im', 1.63), ('month', 1.55), ('pain', 1.52), ('side', 1.51), ('no', 1.46), ('side effect', 1.45), ('effect', 1.42)]
Topic 1:
[('mg', 1.81), ('not', 1.81), ('year', 1.78), ('taking', 1.78), ('work', 1.75), ('pain', 1.73), ('effect', 1.64), ('side', 1.63), ('take', 1.62), ('day', 1.61)]


In [33]:
example = ["side effect"]

example_vectorized = vectorizer.transform(example)

lda_vectors = lda_model.transform(example_vectorized)

print("topic 0 :", lda_vectors[0][0])
print("topic 1 :", lda_vectors[0][1])

topic 0 : 0.35010083056727265
topic 1 : 0.6498991694327274


In [34]:
for review in manual["Lemmatized_review_lst"]:
    predictions = []
    vectorized = vectorizer.transform(review)
    lda_vectors = predictions.append(lda_model.transform(vectorized))

predictions

[array([[0.59604762, 0.40395238],
        [0.65435304, 0.34564696],
        [0.35424455, 0.64575545],
        [0.51166804, 0.48833196],
        [0.45770213, 0.54229787],
        [0.35118319, 0.64881681],
        [0.39819386, 0.60180614],
        [0.34402768, 0.65597232],
        [0.32786605, 0.67213395],
        [0.35118319, 0.64881681],
        [0.34401793, 0.65598207],
        [0.34402945, 0.65597055],
        [0.34402811, 0.65597189],
        [0.28173506, 0.71826494],
        [0.65473986, 0.34526014],
        [0.34404523, 0.65595477],
        [0.30969019, 0.69030981],
        [0.39753381, 0.60246619],
        [0.30009476, 0.69990524],
        [0.30446004, 0.69553996],
        [0.563252  , 0.436748  ],
        [0.30588105, 0.69411895],
        [0.2841019 , 0.7158981 ],
        [0.56996919, 0.43003081],
        [0.3044677 , 0.6955323 ],
        [0.34403005, 0.65596995],
        [0.30588105, 0.69411895],
        [0.54693097, 0.45306903],
        [0.53377802, 0.46622198],
        [0.351

In [35]:
# # Get Log Likelyhoods from Grid Search Output
# n_components = [2, 3, 4]

# log_likelyhoods_5 = [round(gscore.mean_validation_score) for gscore in model.cv_results_ if gscore.parameters['learning_decay']==0.2]
# log_likelyhoods_7 = [round(gscore.mean_validation_score) for gscore in model.cv_results_ if gscore.parameters['learning_decay']==0.3]
# log_likelyhoods_9 = [round(gscore.mean_validation_score) for gscore in model.cv_results_ if gscore.parameters['learning_decay']==0.4]

# # Show graph
# plt.figure(figsize=(12, 8))
# plt.plot(n_components, log_likelyhoods_5, label='0.5')
# plt.plot(n_components, log_likelyhoods_7, label='0.7')
# plt.plot(n_components, log_likelyhoods_9, label='0.9')

# plt.title("Choosing Optimal LDA Model")
# plt.xlabel("Num Topics")
# plt.ylabel("Log Likelyhood Scores")
# plt.legend(title='Learning decay', loc='best')

# plt.show()

# CountVectorizer | Latent Dirichlet allocation

In [36]:
vectorizer = CountVectorizer(min_df = 0.01, 
                             max_df = 0.50, 
                             max_features = None,
                             vocabulary = None,
                             ngram_range = (1, 2)).fit(data["Lemmatized_review_str"])

# MINDF Ignore terms that have a document frequency strictly higher than the given threshold
# MAXDF When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold

data_vectorized = vectorizer.transform(data["Lemmatized_review_str"])

lda_model = LatentDirichletAllocation(n_components = 2,
                                      verbose = 0,
                                      max_iter = 50).fit(data_vectorized)

In [37]:
"""A model with higher log-likelihood and lower perplexity
(exp(-1. * log-likelihood per word)) is considered to be good. Let’s check for our model."""

# Log Likelyhood: Higher the better
print("Log Likelihood: ", lda_model.score(data_vectorized))

# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", lda_model.perplexity(data_vectorized))

# See model parameters
print(lda_model.get_params())

Log Likelihood:  -38127.6917832083
Perplexity:  923.3653941434659
{'batch_size': 128, 'doc_topic_prior': None, 'evaluate_every': -1, 'learning_decay': 0.7, 'learning_method': 'batch', 'learning_offset': 10.0, 'max_doc_update_iter': 100, 'max_iter': 50, 'mean_change_tol': 0.001, 'n_components': 2, 'n_jobs': None, 'perp_tol': 0.1, 'random_state': None, 'topic_word_prior': None, 'total_samples': 1000000.0, 'verbose': 0}


In [38]:
# Define Search Param
search_params = {'n_components' : [2, 3, 4], 'learning_decay' : [.2, .3, .4], "max_iter" : [50, 100], "learning_offset" : [5, 10]}

# Init the Model
lda = LatentDirichletAllocation()

# Init Grid Search Class
model = GridSearchCV(lda, param_grid=search_params)

# Do the Grid Search
model.fit(data_vectorized)

# Best Model
best_lda_model = model.best_estimator_

# Model Parameters
print("Best Model's Params: ", model.best_params_)

# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)

# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))

Best Model's Params:  {'learning_decay': 0.4, 'learning_offset': 10, 'max_iter': 100, 'n_components': 2}
Best Log Likelihood Score:  -9211.94173617145
Model Perplexity:  923.0570433369943


In [39]:
# vectorizer.vocabulary_

In [40]:
# lda_model.components_ # Vectors

In [41]:
print_topics(lda_model, vectorizer)

Topic 0:
[('not_neg', 52.4), ('side_neg', 45.34), ('effects_neg', 36.4), ('side_neg effects_neg', 31.43), ('take_neg', 27.46), ('not', 26.75), ('im_neg', 25.45), ('get_neg', 24.47), ('like_neg', 23.42), ('no', 23.35)]
Topic 1:
[('day', 55.65), ('week', 40.82), ('mg', 31.16), ('pain', 30.75), ('year', 27.6), ('month', 25.28), ('im', 24.5), ('started', 22.33), ('time', 20.88), ('get', 19.3)]


In [42]:
example = ["side effect"]

example_vectorized = vectorizer.transform(example)

lda_vectors = lda_model.transform(example_vectorized)

print("topic 0 :", lda_vectors[0][0])
print("topic 1 :", lda_vectors[0][1])

topic 0 : 0.12527176489455374
topic 1 : 0.8747282351054463


# TF-IDF features | Pipeline

In [43]:
import itertools

laplace = lidstone = range(1, 4)

lap_lid = list(itertools.product(laplace, lidstone))

lap_lid

[(1, 1), (1, 2), (1, 3), (2, 1), (2, 2), (2, 3), (3, 1), (3, 2), (3, 3)]

In [44]:
# # Create Pipeline
# pipe = Pipeline([('tfidf', TfidfVectorizer()),
#                  ('nb', MultinomialNB())
#                 ])

# # Set parameters to search (model and vectorizer)
# parameters = {
#     'tfidf__ngram_range': (lap_lid), # The lower and upper boundary of the range of n-values for different word n-grams or char n-grams to be extracted
#     'tfidf__min_df': (np.linspace(0.01, 0.49, num = 10)),
#     'tfidf__max_df': (np.linspace(0.50, 0.99, num = 10)),
#     'nb__alpha': (np.linspace(0.01, 0.99, num = 10)) # Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing)
#     }

# # Perform grid search
# grid_search = GridSearchCV(pipe, parameters,
#                            n_jobs=-1, 
#                            verbose=1,
#                            scoring = "accuracy", 
#                            refit=True,
#                            cv=5)

# grid_search.fit(X, y)

In [45]:
grid_search.best_params_

NameError: ignored

In [None]:
best_model = grid_search.best_estimator_

# CountVectorizer | Pipeline

In [None]:
# # Create Pipeline
# pipe = Pipeline([('Count', CountVectorizer()),
#                  ('nb', MultinomialNB())
#                 ])

# # Set parameters to search (model and vectorizer)
# parameters = {
#     'Count__ngram_range': (lap_lid), # The lower and upper boundary of the range of n-values for different word n-grams or char n-grams to be extracted
#     'Count__min_df': (np.linspace(0.01, 0.49, num = 10)),
#     'Count__max_df': (np.linspace(0.50, 0.99, num = 10)),
#     'Count__max_features' : ([1 , 2, 3, 4, 5]),
#     'nb__alpha': (np.linspace(0.01, 0.99, num = 10)), # Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing)
#     }

# # Perform grid search
# grid_search = GridSearchCV(pipe, parameters, n_jobs=-1, 
#                            verbose=1, scoring = "accuracy", 
#                            refit=True, cv=5)

# grid_search.fit(X, y)

In [None]:
grid_search.best_params_

In [None]:
best_model = grid_search.best_estimator_