# Imports

In [2]:
import pandas as pd # DataFrame Manipulation Package
import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer # Convert a collection of raw documents to a matrix of TF-IDF features
from sklearn.decomposition import LatentDirichletAllocation # Latent Dirichlet Allocation is a topic model that is used for discovering abstract topics from a collection of documents (variational Bayes algorithm)
from sklearn.feature_extraction.text import CountVectorizer # Convert a collection of text documents to a matrix of token counts

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB # The multinomial Naive Bayes classifier is suitable for classification with discrete features (e.g., word counts for text classification)

import string # Collection of string operations
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer #Lemmatize using WordNet's built-in morphy function. Returns the input word unchanged if it cannot be found in WordNet.
from nltk import word_tokenize 

In [3]:
import nltk

# Fetch the NLTK resources used by the preprocessing cells:
# the stop-word lists and the Punkt tokenizer models behind word_tokenize.
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases make word_tokenize look up 'punkt_tab' instead of the
# pickled 'punkt' models; fetch it too so tokenisation works on both old and new versions.
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Data

In [4]:
# Load the raw training CSV. The path is relative to the kernel's working directory;
# the recorded run of this cell ended in FileNotFoundError, so the file must be placed there first.
data = pd.read_csv('drugsComTrain_raw.csv')

# Print a short size / cardinality summary. Each print("\n") only adds blank lines between entries.
print(f"The shape of the data is {data.shape[0]} rows and {data.shape[1]} columns")
print("\n")
print(f"The amount of unique ID is {len(data['uniqueID'].unique())}") # compare with the row count to see whether any uniqueID repeats
print("\n")
print(f"The number of unique drugs reviewed is {len(data['drugName'].unique())}")
print("\n")
print(f"The number of unique conditions is {len(data['condition'].unique())}")
print("\n")

#data["drugName"].unique()

FileNotFoundError: ignored

In [None]:
# Display the freshly loaded DataFrame.
data

In [None]:
# Pearson correlation between the "rating" and "usefulCount" columns.
correlation = data["rating"].corr(data["usefulCount"], method='pearson')

# Last expression of the cell: shows the coefficient as the cell output.
correlation

In [None]:
# Draw the 2x2 correlation matrix of "rating" and "usefulCount" as an image.
plt.matshow(data[["rating", "usefulCount"]].corr())
plt.show()

In [None]:
# Ten most frequently reviewed drugs, laid out as a single wide row.
data["drugName"].value_counts().head(10).to_frame().T

In [None]:
# Fourteen most frequent conditions, laid out as a single wide row.
data["condition"].value_counts().head(14).to_frame().T

In [None]:
NUMERO_SAMPLES = 500  # number of leading rows kept from the full dataset

# TODO: REMOVE THIS!! -- the target is truncated to the first NUMERO_SAMPLES ratings.
y = data["rating"].head(NUMERO_SAMPLES)

# TODO: REMOVE THIS!! -- the features are truncated to the same leading rows.
# NOTE: this rebinds `data`, so the cell cannot be re-run without reloading the CSV.
dropped_columns = ["uniqueID", "date", "rating", "usefulCount"]
data = data.drop(columns = dropped_columns).head(NUMERO_SAMPLES)

In [None]:
# Display the reduced DataFrame (identifier, date, rating and usefulCount columns removed).
data

# Functions

In [None]:
def punctuation(x):
    """Return ``x`` lower-cased with every ``string.punctuation`` character removed.

    Parameters
    ----------
    x : str
        Text to clean.

    Returns
    -------
    str
        The cleaned, lower-case text.
    """
    # A deletion-only translation table removes all punctuation in one pass.
    strip_table = str.maketrans('', '', string.punctuation)
    return x.translate(strip_table).lower()

# Create the clean_review column: every review lower-cased with punctuation removed.
data["clean_review"] = data["review"].apply(punctuation)

In [None]:
def remove_numbers (x):
    """Return ``x`` with every character for which ``str.isdigit`` is true removed.

    Parameters
    ----------
    x : str
        Text to filter.

    Returns
    -------
    str
        The text without digit characters; all other characters keep their order.
    """
    return ''.join(char for char in x if not char.isdigit())

# Overwrite clean_review with its digit-free version.
data['clean_review'] = data.clean_review.apply(remove_numbers)

In [None]:
# English stop words held in a set so the membership test below is a hash lookup.
stop_words = set(stopwords.words('english'))

def remove_stopwords (x):
    """Tokenise ``x`` and return its tokens that are not English stop words.

    Parameters
    ----------
    x : str
        Text to tokenise with ``word_tokenize``.

    Returns
    -------
    list of str
        Tokens in their original order, with members of ``stop_words`` dropped.
    """
    return [token for token in word_tokenize(x) if token not in stop_words]

# Create the review_words column: the list of non-stop-word tokens of each cleaned review.
data["review_words"] = data.clean_review.apply(remove_stopwords)

In [None]:
# data["review_words"]

In [None]:
# data["clean_review"]

In [None]:
# Inspect the first 15 rows, now including the clean_review and review_words columns.
data.head(15)

In [None]:
# Sanity check on the derived columns, using the row labelled 0:
# clean_review is produced by the string helpers, so it holds text.
print(type(data["clean_review"][0]))

# review_words is produced by remove_stopwords, which returns a list of tokens.
print(type(data["review_words"][0]))

# TF-IDF features | Latent Dirichlet allocation

In [None]:
import itertools

# Candidate lower / upper n-gram sizes (1..3) for the vectorizer grid searches below.
# NOTE: despite their names, these hold n-gram bounds and are unrelated to the
# Laplace/Lidstone smoothing parameter of MultinomialNB; the names are kept because
# later cells refer to `lap_lid`.
laplace = lidstone = range(1, 4)

# All (min_n, max_n) pairs with min_n <= max_n. The plain Cartesian product also
# yields (2, 1), (3, 1) and (3, 2), which the scikit-learn vectorizers reject as an
# invalid ngram_range, so every grid-search fit using them would fail.
lap_lid = [(min_n, max_n)
           for min_n, max_n in itertools.product(laplace, lidstone)
           if min_n <= max_n]

lap_lid

In [None]:
# Learn a bigram TF-IDF vocabulary from the cleaned reviews.
# min_df: when building the vocabulary, ignore terms whose document frequency is strictly LOWER than the threshold
# max_df: when building the vocabulary, ignore terms whose document frequency is strictly HIGHER than the threshold
# (floats are read as a proportion of documents, so bigrams must occur in 5%..40% of the reviews)
vectorizer = TfidfVectorizer(min_df = 0.05,
                             max_df = 0.40,
                             max_features = None,
                             vocabulary = None,
                             ngram_range = (2, 2)).fit(data["clean_review"])

# Document-term matrix of the same reviews under the fitted vocabulary.
data_vectorized = vectorizer.transform(data["clean_review"])

# Two-topic LDA. random_state pins the random initialisation so that
# Restart & Run All reproduces the same topics.
lda_model = LatentDirichletAllocation(n_components = 2,
                                      verbose = 0,
                                      max_iter = 50,
                                      random_state = 0).fit(data_vectorized)

def print_topics(model, vectorizer, n_top = 10):
    """Print the highest-weighted terms of every topic in a fitted topic model.

    Parameters
    ----------
    model : object
        Fitted model exposing ``components_``, one row of term weights per topic.
    vectorizer : object
        Fitted vectorizer whose feature order matches the columns of ``components_``.
        ``get_feature_names_out`` is used when available; the older
        ``get_feature_names`` (removed in scikit-learn 1.2) is the fallback.
    n_top : int, default 10
        Number of terms printed per topic.

    Returns
    -------
    None
        For each topic, prints a ``Topic <idx>:`` line followed by a list of
        ``(term, weight)`` tuples sorted by decreasing weight, weights rounded to 2 decimals.
    """
    get_names = getattr(vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    # Resolve the vocabulary once instead of once per printed term.
    feature_names = get_names()

    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        # argsort is ascending, so walk it backwards to take the n_top largest weights.
        top_indices = topic.argsort()[:-n_top - 1:-1]
        # Cast to built-in str/float so the printed tuples do not depend on NumPy's scalar repr.
        print([(str(feature_names[i]), round(float(topic[i]), 2))
                        for i in top_indices])

In [None]:
# vectorizer.vocabulary_

In [None]:
lda_model.components_ # Term weights of the fitted model: one row per topic, columns in the vectorizer's feature order (this is what print_topics reads)

In [None]:
# Show the highest-weighted bigrams of each TF-IDF LDA topic.
print_topics(lda_model, vectorizer)

In [None]:
# Score one hand-written document against the fitted topic model.
example = ["side effect"]

# Re-use the already fitted vectorizer so the features line up with the LDA training matrix.
example_vectorized = vectorizer.transform(example)

# Indexed below as [document][topic]; there is a single document, hence row 0.
lda_vectors = lda_model.transform(example_vectorized)

print("topic 0 :", lda_vectors[0][0])
print("topic 1 :", lda_vectors[0][1])

# TF-IDF features | Pipeline

In [None]:
# Create Pipeline
pipe = Pipeline([('tfidf', TfidfVectorizer()),
                 ('nb', MultinomialNB())
                ])

# Set parameters to search (model and vectorizer)
parameters = {
    'tfidf__ngram_range': (lap_lid), # The lower and upper boundary of the range of n-values for different word n-grams or char n-grams to be extracted
    'tfidf__min_df': (np.linspace(0.01, 0.49, num = 10)),
    'tfidf__max_df': (np.linspace(0.50, 0.99, num = 10)),
    'nb__alpha': (np.linspace(0.01, 0.99, num = 10)) # Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing)
    }

# Perform grid search
grid_search = GridSearchCV(pipe, parameters,
                           n_jobs=-1, 
                           verbose=1,
                           scoring = "accuracy", 
                           refit=True,
                           cv=5)

grid_search.fit(data["clean_review"], y)

In [None]:
# Parameter combination selected by the TF-IDF grid search (scored by cross-validated accuracy).
grid_search.best_params_

In [None]:
# Keep the winning pipeline; the search was created with refit=True, so it is already fitted.
best_model = grid_search.best_estimator_

# best_model.transform(data["review"]) # Predict?!?!

In [None]:
# best_model.LatentDirichletAllocation(n_components = 2,
#                                       verbose = 0,
#                                       max_iter = 50).fit(data_vectorized)

# def print_topics(model, vectorizer):
#     for idx, topic in enumerate(model.components_):
#         print("Topic %d:" % (idx))
#         print([(vectorizer.get_feature_names()[i], round(topic[i], 2))
#                         for i in topic.argsort()[:-10 - 1:-1]])

In [None]:
# best_model.components_

In [None]:
# print_topics(best_model, vectorizer)

# CountVectorizer | Latent Dirichlet allocation

In [None]:
# Learn a bigram count vocabulary. Fitted on clean_review (not the raw review text) so this
# section uses the same preprocessed input as the TF-IDF section and both grid searches.
# min_df: when building the vocabulary, ignore terms whose document frequency is strictly LOWER than the threshold
# max_df: when building the vocabulary, ignore terms whose document frequency is strictly HIGHER than the threshold
# (floats are read as a proportion of documents, so bigrams must occur in 5%..40% of the reviews)
vectorizer = CountVectorizer(min_df = 0.05,
                             max_df = 0.40,
                             max_features = None,
                             vocabulary = None,
                             ngram_range = (2, 2)).fit(data["clean_review"])

# Document-term count matrix of the same reviews under the fitted vocabulary.
data_vectorized = vectorizer.transform(data["clean_review"])

# Two-topic LDA. random_state pins the random initialisation so that
# Restart & Run All reproduces the same topics.
lda_model = LatentDirichletAllocation(n_components = 2,
                                      verbose = 0,
                                      max_iter = 50,
                                      random_state = 0).fit(data_vectorized)

def print_topics(model, vectorizer, n_top = 10):
    """Print the highest-weighted terms of every topic in a fitted topic model.

    Parameters
    ----------
    model : object
        Fitted model exposing ``components_``, one row of term weights per topic.
    vectorizer : object
        Fitted vectorizer whose feature order matches the columns of ``components_``.
        ``get_feature_names_out`` is used when available; the older
        ``get_feature_names`` (removed in scikit-learn 1.2) is the fallback.
    n_top : int, default 10
        Number of terms printed per topic.

    Returns
    -------
    None
        For each topic, prints a ``Topic <idx>:`` line followed by a list of
        ``(term, weight)`` tuples sorted by decreasing weight, weights rounded to 2 decimals.
    """
    get_names = getattr(vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    # Resolve the vocabulary once instead of once per printed term.
    feature_names = get_names()

    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        # argsort is ascending, so walk it backwards to take the n_top largest weights.
        top_indices = topic.argsort()[:-n_top - 1:-1]
        # Cast to built-in str/float so the printed tuples do not depend on NumPy's scalar repr.
        print([(str(feature_names[i]), round(float(topic[i]), 2))
                        for i in top_indices])

In [None]:
# vectorizer.vocabulary_

In [None]:
lda_model.components_ # Term weights of the fitted model: one row per topic, columns in the vectorizer's feature order (this is what print_topics reads)

In [None]:
# Show the highest-weighted bigrams of each count-based LDA topic.
print_topics(lda_model, vectorizer)

In [None]:
# Score one hand-written document against the count-based topic model.
example = ["side effect"]

# Re-use the already fitted vectorizer so the features line up with the LDA training matrix.
example_vectorized = vectorizer.transform(example)

# Indexed below as [document][topic]; there is a single document, hence row 0.
lda_vectors = lda_model.transform(example_vectorized)

print("topic 0 :", lda_vectors[0][0])
print("topic 1 :", lda_vectors[0][1])

# CountVectorizer | Pipeline

In [None]:
# Create Pipeline
pipe = Pipeline([('Count', CountVectorizer()),
                 ('nb', MultinomialNB())
                ])

# Set parameters to search (model and vectorizer)
parameters = {
    'Count__ngram_range': (lap_lid), # The lower and upper boundary of the range of n-values for different word n-grams or char n-grams to be extracted
    'Count__min_df': (np.linspace(0.01, 0.49, num = 10)),
    'Count__max_df': (np.linspace(0.50, 0.99, num = 10)),
    'Count__max_features' : ([1 , 2, 3, 4, 5]),
    'nb__alpha': (np.linspace(0.01, 0.99, num = 10)), # Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing)
    }

# Perform grid search
grid_search = GridSearchCV(pipe, parameters, n_jobs=-1, 
                           verbose=1, scoring = "accuracy", 
                           refit=True, cv=5)

grid_search.fit(data["clean_review"], y)

In [None]:
# Parameter combination selected by the CountVectorizer grid search (scored by cross-validated accuracy).
grid_search.best_params_

In [None]:
# Keep the winning pipeline; the search was created with refit=True, so it is already fitted.
# NOTE: this rebinds best_model, replacing the TF-IDF pipeline stored under the same name earlier.
best_model = grid_search.best_estimator_