<a href="https://colab.research.google.com/github/jbloewencolon/Psychedelic-Trip-Generator/blob/main/LDA_and_Word2Vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import gensim
import re
import xgboost as xgb
import pickle
import tensorflow as tf
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
from xgboost import XGBClassifier
from gensim.models import Word2Vec, LdaModel, CoherenceModel
from gensim.utils import simple_preprocess
from gensim import corpora
from gensim.corpora import Dictionary
from sklearn.base import BaseEstimator, TransformerMixin
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
tf.config.list_physical_devices('GPU')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [11]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Data/processed.csv')

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76450 entries, 0 to 76449
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   title             76448 non-null  object 
 1   drug              76447 non-null  object 
 2   dosage            73699 non-null  object 
 3   delivery          74213 non-null  object 
 4   weight            76449 non-null  float64
 5   year              76449 non-null  float64
 6   gender            76446 non-null  object 
 7   report            76439 non-null  object 
 8   processed_report  76438 non-null  object 
 9   mixed             76448 non-null  float64
 10  drug_category     76448 non-null  object 
dtypes: float64(3), object(8)
memory usage: 6.4+ MB


In [12]:
# Fill missing values with 'unknown' in 'drug_category' column
df[['drug', 'dosage', 'delivery']] = df[['drug', 'dosage', 'delivery']].fillna('unknown')

# Replace non-finite values in 'year' and 'mixed' columns with 0
df[['year', 'mixed']] = df[['year', 'mixed']].fillna(0)

# Convert 'year' and 'mixed' columns to integers
df[['year', 'mixed']] = df[['year', 'mixed']].astype(int)

# Convert the 'processed_report' column to string type
df['report'] = df['report'].astype(str)

df = df.dropna()

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 76438 entries, 0 to 76449
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   title             76438 non-null  object 
 1   drug              76438 non-null  object 
 2   dosage            76438 non-null  object 
 3   delivery          76438 non-null  object 
 4   weight            76438 non-null  float64
 5   year              76438 non-null  int64  
 6   gender            76438 non-null  object 
 7   report            76438 non-null  object 
 8   processed_report  76438 non-null  object 
 9   mixed             76438 non-null  int64  
 10  drug_category     76438 non-null  object 
dtypes: float64(1), int64(2), object(8)
memory usage: 7.0+ MB


In [None]:
df.head()

Unnamed: 0,title,drug,dosage,delivery,weight,year,gender,report,processed_report,mixed,drug_category
0,Ode to Joy,mdma,1.5 tablets,oral,185.0,2000,male,My friend had some experience with X and had t...,friend experi x told one day said come across ...,0,mdma
1,Make Sure the Music's Not Too Complex,cannabis,unknown,smoked,152.0,1999,not specified,This was the first experience that either my f...,first experi either friend salvia housem check...,0,cannabis
2,After Hours,mdma,160 mg,oral,150.0,2001,male,Preparation: I have heard some conflicting opi...,prepar heard conflict opinion 5htp ie load day...,1,mdma
3,After Hours,mdma,100 mg,oral,150.0,2001,male,Preparation: I have heard some conflicting opi...,prepar heard conflict opinion 5htp ie load day...,1,mdma
4,After Hours,mdma,50 mg,insufflated,150.0,2001,male,Preparation: I have heard some conflicting opi...,prepar heard conflict opinion 5htp ie load day...,1,mdma


In [None]:
df['drug_category'].value_counts()

pharmaceutical           10125
cannabis                  9536
stimulant                 5780
mushrooms                 4055
botanical                 3814
opioid                    3795
mdma                      3560
alcohol                   3368
hallucinogen              3128
lsd                       3064
salvia                    2831
2c                        2812
other                     2746
dissociative              2624
entheogen                 2546
DMT                       2362
entactogen                1754
nootropic                 1653
5-meo                     1290
ketamine                  1233
unknown                    923
mescaline                  726
depressant                 686
synthetic cannabinoid      451
anxiolytic                 448
oneirogen                  307
ayahuasca                  229
phencyclidine              216
antidepressant             167
3-MeO                      131
ibogaine                    78
Name: drug_category, dtype: int64

In [13]:
# Define the mapping from old categories to new ones
category_mapping = {
    'pharmaceutical': 'Pharmaceutical',
    'cannabis': 'Cannabinoid',
    'stimulant': 'Stimulant',
    'mushrooms': 'Psychedelic',
    'botanical': 'Other',
    'opioid': 'Opioid',
    'mdma': 'Entactogen/Empathogen',
    'alcohol': 'Depressant',
    'hallucinogen': 'Psychedelic',
    'lsd': 'Psychedelic',
    'salvia': 'Psychedelic',
    '2c': 'Psychedelic',
    'other': 'Other',
    'dissociative': 'Dissociative',
    'entheogen': 'Entheogen',
    'DMT': 'Psychedelic',
    'entactogen': 'Entactogen/Empathogen',
    'nootropic': 'Other',
    '5-meo': 'Psychedelic',
    'ketamine': 'Dissociative',
    'unknown': 'Other',
    'mescaline': 'Psychedelic',
    'depressant': 'Depressant',
    'synthetic cannabinoid': 'Cannabinoid',
    'anxiolytic': 'Pharmaceutical',
    'oneirogen': 'Other',
    'ayahuasca': 'Psychedelic',
    'phencyclidine': 'Dissociative',
    'antidepressant': 'Pharmaceutical',
    '3-MeO': 'Other',
    'ibogaine': 'Entheogen'
}

# Apply the mapping to the 'drug_category' column and create a new column 'grouped_drug_category'
df['drug_category'] = df['drug_category'].map(category_mapping)

df['drug_category'].value_counts()

Psychedelic              20497
Pharmaceutical           10740
Cannabinoid               9987
Other                     9574
Stimulant                 5780
Entactogen/Empathogen     5314
Dissociative              4073
Depressant                4054
Opioid                    3795
Entheogen                 2624
Name: drug_category, dtype: int64

In [5]:
df.head()

Unnamed: 0,title,drug,dosage,delivery,weight,year,gender,report,processed_report,mixed,drug_category
0,Ode to Joy,mdma,1.5 tablets,oral,185.0,2000,male,My friend had some experience with X and had t...,friend experi x told one day said come across ...,0,Entactogen/Empathogen
1,Make Sure the Music's Not Too Complex,cannabis,unknown,smoked,152.0,1999,not specified,This was the first experience that either my f...,first experi either friend salvia housem check...,0,Cannabinoid
2,After Hours,mdma,160 mg,oral,150.0,2001,male,Preparation: I have heard some conflicting opi...,prepar heard conflict opinion 5htp ie load day...,1,Entactogen/Empathogen
3,After Hours,mdma,100 mg,oral,150.0,2001,male,Preparation: I have heard some conflicting opi...,prepar heard conflict opinion 5htp ie load day...,1,Entactogen/Empathogen
4,After Hours,mdma,50 mg,insufflated,150.0,2001,male,Preparation: I have heard some conflicting opi...,prepar heard conflict opinion 5htp ie load day...,1,Entactogen/Empathogen


In [20]:
# Tokenize the reports
tokenized_reports = df['report'].apply(simple_preprocess)

# Train a Word2Vec model
word2vec = Word2Vec(sentences=tokenized_reports, vector_size=100, window=5, min_count=5, workers=4)

# Function to convert a report into a vector
def report_to_vector(report):
    report_vec = []
    numw = 0
    for word in report:
        try:
            if numw == 0:
                report_vec = word2vec.wv[word] # Use word2vec.wv[word] instead of model[word]
            else:
                report_vec = np.add(report_vec, word2vec.wv[word]) # Use word2vec.wv[word] instead of model[word]
            numw+=1
        except:
            pass

    return np.asarray(report_vec) / numw if numw != 0 else np.zeros(word2vec.vector_size)

# Convert reports into vectors
X = df['report'].apply(report_to_vector).tolist()

In [7]:
class Word2VecVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, model):
        self.model = model
        self.size = model.vector_size

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return np.array([report_to_vector(report) for report in X])

In [None]:
# Instantiate Word2Vec model (replace with your model parameters and training data)
model = gensim.models.Word2Vec(sentences=tokenized_reports, vector_size=100, window=5, min_count=1, workers=4)

# Define feature extractor
vectorizer = Word2VecVectorizer(model)

In [22]:
print(tokenized_reports.head())

0    [my, friend, had, some, experience, with, and,...
1    [this, was, the, first, experience, that, eith...
2    [preparation, have, heard, some, conflicting, ...
3    [preparation, have, heard, some, conflicting, ...
4    [preparation, have, heard, some, conflicting, ...
Name: report, dtype: object


In [9]:
# Define the file paths in your Google Drive where you want to save the pickle files
word2vec_model_file_path = '/content/drive/MyDrive/Colab Notebooks/Data/word2vec_model.pkl'
word2vec_vectorizer_file_path = '/content/drive/MyDrive/Colab Notebooks/Data/word2vec_vectorizer.pkl'

In [24]:
# Pickle the Word2Vec model
with open(word2vec_model_file_path, 'wb') as f:
    pickle.dump(model, f)

# Pickle the Word2VecVectorizer
with open(word2vec_vectorizer_file_path, 'wb') as f:
    pickle.dump(vectorizer, f)

In [10]:
model = gensim.models.Word2Vec()
vectorizer = Word2VecVectorizer(model)

# Load the Word2Vec model
with open(word2vec_model_file_path, 'rb') as f:
    model = pickle.load(f)

# Load the Word2VecVectorizer
with open(word2vec_vectorizer_file_path, 'rb') as f:
    vectorizer = pickle.load(f)

In [11]:
# Retrieve the original text
original_text = df['report'][0]

# Retrieve the tokens generated from Word2Vec
word2vec_tokens = tokenized_reports[0]

# Compare
print("Original Text:", original_text)
print("Word2Vec Tokens:", word2vec_tokens)

NameError: ignored

In [None]:
# Tokenize the texts
texts = df['processed_report'].apply(word_tokenize).tolist()

# Create a Gensim dictionary from the texts
dictionary = Dictionary(texts)

# Remove extremes
dictionary.filter_extremes(no_below=3, no_above=0.5)

# Convert the dictionary into a bag-of-words corpus
corpus = [dictionary.doc2bow(text) for text in texts]

In [27]:
# Initialize the LDA model with hyperparameters
num_topics = 10
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, passes=10)

In [28]:
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda, texts=texts, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()

print(f'\nCoherence Score: {coherence_lda}')


Coherence Score: 0.4189523267818478


In [29]:
# Pickle the LDA model to the specified location
with open('/content/drive/MyDrive/Colab Notebooks/Data/lda_model.pkl', 'wb') as f:
    pickle.dump(lda, f)

# Pickle the Dictionary to the specified location
with open('/content/drive/MyDrive/Colab Notebooks/Data/dictionary.pkl', 'wb') as f:
    pickle.dump(dictionary, f)

# Pickle the CoherenceModel to the specified location
with open('/content/drive/MyDrive/Colab Notebooks/Data/coherence_model.pkl', 'wb') as f:
    pickle.dump(coherence_model_lda, f)

In [33]:
# Load the LDA model
with open('/content/drive/MyDrive/Colab Notebooks/Data/lda_model.pkl', 'rb') as f:
    lda_model = pickle.load(f)

# Load the Dictionary
with open('/content/drive/MyDrive/Colab Notebooks/Data/dictionary.pkl', 'rb') as f:
    dictionary = pickle.load(f)

# Load the CoherenceModel
with open('/content/drive/MyDrive/Colab Notebooks/Data/coherence_model.pkl', 'rb') as f:
    coherence_model_lda = pickle.load(f)

In [30]:
def print_top_words(lda_model, dictionary, n_top_words):
    for i, topic in enumerate(lda_model.get_topics()):
        top_feature_ids = topic.argsort()[-n_top_words:][::-1]
        features = [dictionary.id2token[token_id] for token_id in top_feature_ids]
        print(f"Topic {i + 1}: {', '.join(features)}")

print_top_words(lda, dictionary, n_top_words=10)

Topic 1: high, year, addict, opiat, snort, line, month, cocain, pill, everi
Topic 2: walk, us, light, hous, beauti, tree, began, mushroom, visual, color
Topic 3: tea, seed, water, tast, gram, drink, kratom, extract, plant, cup
Topic 4: life, mind, world, sens, love, state, experienc, person, part, peopl
Topic 5: dmt, nitrou, balloon, dpt, die, oxid, inhal, 5meodmt, ich, breakthrough
Topic 6: rememb, said, told, ask, couldnt, room, say, happen, talk, tell
Topic 7: eye, room, visual, hit, light, move, close, everyth, color, salvia
Topic 8: peopl, talk, bit, pretti, weed, lot, roll, mdma, high, music
Topic 9: sleep, dose, mg, pill, work, pain, week, anxieti, help, high
Topic 10: dose, visual, mdma, mg, substanc, bit, stimul, psychedel, notic, report


In [34]:
def generate_corpus(df_subset):
    # If the reports are already tokenized
    texts = df_subset['processed_report'].tolist()

    # Create a Gensim dictionary from the texts
    dictionary = Dictionary(texts)

    # Remove extremes
    dictionary.filter_extremes(no_below=3, no_above=0.5)

    # Convert the dictionary into a bag-of-words corpus
    corpus = [dictionary.doc2bow(text) for text in texts]

    return corpus, dictionary

# Get unique values in drug_category
unique_categories = df['drug_category'].unique()

# Iterate over unique categories
for category in unique_categories:
    # Filter the dataset based on the category
    subset = df[df['drug_category'] == category]

    # Generate the corpus and dictionary for the subset
    corpus_subset, dictionary_subset = generate_corpus(subset)

    # Apply LDA on the corpus
    lda_subset = LdaModel(corpus=corpus_subset, id2word=dictionary_subset, num_topics=10, passes=10)

    # Print the topics for the category
    print(f"Topics for category: {category}")
    for i in range(10):
        print(f"Topic {i+1}: {lda_subset.print_topic(i, 5)}")  # prints top 5 words for each topic
    print()

TypeError: ignored

In [None]:
# Get unique values in drug_category
unique_categories = df['drug_category'].unique()

# Iterate over unique categories
for category in unique_categories:
    # Filter the dataset based on the category
    subset = df[df['drug_category'] == category]

    # Generate the corpus and dictionary for the subset
    corpus_subset, dictionary_subset = generate_corpus(subset)

    # Apply LDA on the corpus
    lda_subset = LdaModel(corpus=corpus_subset, id2word=dictionary_subset, num_topics=10, passes=10)

    # Print the topics for the category
    print(f"Topics for category: {category}")
    for i in range(10):
        print(f"Topic {i+1}: {lda_subset.print_topic(i, 5)}")  # prints top 5 words for each topic
    print()

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Filter the dataset for the "Psychedelic" drug category
subset = df[df['drug_category'] == 'Psychedelic']

# Tokenize the texts
texts = subset['report'].apply(word_tokenize).tolist()

# Create a Gensim dictionary from the texts
dictionary = Dictionary(texts)

# Convert the dictionary into a bag-of-words corpus
corpus = [dictionary.doc2bow(text) for text in texts]

# Apply LDA on the corpus
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=10, passes=10)

# Assuming we want to create a wordcloud for the first topic
topic_num = 0
topic_terms = lda.get_topic_terms(topic_num, topn=len(dictionary))

# Create a dictionary with word frequencies
word_freq = {dictionary.get(id): freq for id, freq in topic_terms}

# Generate a word cloud
wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_freq)

# Display the word cloud
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Define oversampler
oversampler = SMOTE()

# Encode the labels in y
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Set up the XGBoost parameters
params = {
    'objective': 'multi:softmax',
    'num_class': 3
}

# Define classifiers
classifiers = {
    'Logistic Regression': LogisticRegression(max_iter=1000, class_weight=class_weights),
    'Random Forest': RandomForestClassifier(class_weight=class_weights, n_estimators=100, random_state=42),
    'XGB': XGBClassifier(eval_metric='mlogloss')
}

# Define pipelines
pipelines = {
    name: make_pipeline(oversampler, model)
    for name, model in classifiers.items()
}

# Split your data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Fit and evaluate Logistic Regression
print('\nTraining Logistic Regression...')
pipeline.fit(X_train, y_train)
y_pred_lr = pipeline.predict(X_test)
print('\nLogistic Regression:')
print(classification_report(y_test, y_pred_lr, target_names=encoder.classes_, zero_division=1))


Training Logistic Regression...


NameError: ignored

In [None]:
# Fit and evaluate Random Forest
print('\nTraining Random Forest...')
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print('\nRandom Forest:')
print(classification_report(y_test, y_pred_rf, target_names=encoder.classes_, zero_division=1))

In [None]:
# Fit and evaluate XGBoost with the pipeline
print('\nTraining XGB through pipeline...')
pipeline_xgb.fit(X_train, y_train, xgbclassifier__sample_weight=weights_train)
y_pred_xgb = pipeline_xgb.predict(X_test)
print('\nXGBoost (through pipeline):')
print(classification_report(y_test, y_pred_xgb, target_names=encoder.classes_, zero_division=1))

In [None]:
# Define the parameter grid
param_grid = {'C': [0.1, 1, 10, 100], 'penalty': ['l1', 'l2']}

# Create a GridSearchCV object
grid_search = GridSearchCV(lr, param_grid, cv=5)

# Perform grid search
grid_search.fit(X_train, y_train)

# Print the best parameters and the best score
print(grid_search.best_params_)
print(grid_search.best_score_)


In [None]:
# Define the parameter grid
param_grid = {
    'n_components': [5, 10, 15],  # Number of topics
    'learning_method': ['batch', 'online'],  # Learning method
    'doc_topic_prior': [0.1, 0.5, 1.0],  # Alpha parameter
    'topic_word_prior': [0.01, 0.1, 1.0]  # Beta parameter
}

# Create the GridSearchCV object
grid_search = GridSearchCV(lda, param_grid, cv=3)

# Fit the grid search to the data
grid_search.fit(X)

# Print the best parameters and score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)