<a href="https://colab.research.google.com/github/jbloewencolon/Psychedelic-Trip-Generator/blob/main/NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [24]:
import pandas as pd
import numpy as np
import gensim
import re
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from xgboost import XGBClassifier
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from sklearn.base import BaseEstimator, TransformerMixin


In [10]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Data/processed.csv')

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77088 entries, 0 to 77087
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   title             76974 non-null  object
 1   drug              76969 non-null  object
 2   dosage            74319 non-null  object
 3   delivery          74718 non-null  object
 4   weight            77088 non-null  int64 
 5   year              77088 non-null  int64 
 6   gender            76967 non-null  object
 7   report            76959 non-null  object
 8   processed_report  76959 non-null  object
 9   mixed             77088 non-null  int64 
 10  drug_category     77088 non-null  object
dtypes: int64(3), object(8)
memory usage: 6.5+ MB


In [12]:
# Drop rows with missing 'drug_category' or 'processed_report'
df = df.dropna(subset=['drug_category', 'processed_report'])
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 76959 entries, 0 to 77087
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   title             76959 non-null  object
 1   drug              76958 non-null  object
 2   dosage            74190 non-null  object
 3   delivery          74709 non-null  object
 4   weight            76959 non-null  int64 
 5   year              76959 non-null  int64 
 6   gender            76959 non-null  object
 7   report            76959 non-null  object
 8   processed_report  76959 non-null  object
 9   mixed             76959 non-null  int64 
 10  drug_category     76959 non-null  object
dtypes: int64(3), object(8)
memory usage: 7.0+ MB


In [13]:
df.head()

Unnamed: 0,title,drug,dosage,delivery,weight,year,gender,report,processed_report,mixed,drug_category
0,Ode to Joy,mdma,1.5 tablets,oral,185,2000,male,My friend had some experience with X and had t...,friend experi x told one day said come across ...,0,mdma
1,Make Sure the Music's Not Too Complex,cannabis,unknown,smoked,152,1999,not specified,This was the first experience that either my f...,first experi either friend salvia housem check...,0,cannabis
2,After Hours,mdma,160 mg,oral,150,2001,male,Preparation: I have heard some conflicting opi...,prepar heard conflict opinion 5htp ie load day...,1,mdma
3,After Hours,mdma,100 mg,oral,150,2001,male,Preparation: I have heard some conflicting opi...,prepar heard conflict opinion 5htp ie load day...,1,mdma
4,After Hours,mdma,50 mg,insufflated,150,2001,male,Preparation: I have heard some conflicting opi...,prepar heard conflict opinion 5htp ie load day...,1,mdma


In [14]:
df['drug_category'].value_counts()

pharmaceutical           10142
cannabis                  9621
stimulant                 5799
mushrooms                 4090
botanical                 3832
opioid                    3810
mdma                      3587
alcohol                   3425
hallucinogen              3143
lsd                       3105
salvia                    2847
2c                        2825
other                     2778
dissociative              2648
entheogen                 2556
DMT                       2373
entactogen                1757
nootropic                 1664
5-meo                     1297
ketamine                  1253
unknown                    931
mescaline                  734
depressant                 686
anxiolytic                 457
synthetic cannabinoid      452
oneirogen                  308
ayahuasca                  232
phencyclidine              219
antidepressant             167
3-MeO                      137
ibogaine                    84
Name: drug_category, dtype: int64

In [28]:
# Tokenize the reports
tokenized_reports = df['processed_report'].apply(simple_preprocess)

# Train a Word2Vec model
word2vec = Word2Vec(sentences=tokenized_reports, vector_size=100, window=5, min_count=5, workers=4)

# Function to convert a report into a vector
def report_to_vector(report):
    report_vec = []
    numw = 0
    for word in report:
        try:
            if numw == 0:
                report_vec = word2vec.wv[word] # Use word2vec.wv[word] instead of model[word]
            else:
                report_vec = np.add(report_vec, word2vec.wv[word]) # Use word2vec.wv[word] instead of model[word]
            numw+=1
        except:
            pass

    return np.asarray(report_vec) / numw if numw != 0 else np.zeros(word2vec.vector_size)

# Convert reports into vectors
X = df['processed_report'].apply(report_to_vector).tolist()

In [29]:
class Word2VecVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, model):
        self.model = model
        self.size = model.vector_size

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return np.array([report_to_vector(report) for report in X])

# Instantiate Word2Vec model (replace with your model parameters and training data)
model = gensim.models.Word2Vec(sentences=tokenized_reports, vector_size=100, window=5, min_count=1, workers=4)

# Define feature extractor
vectorizer = Word2VecVectorizer(model)

In [30]:
# Define LDA model
lda = LDA(n_components=10)

# Define label encoder
encoder = LabelEncoder()

# Fit and transform the processed reports
X = vectorizer.fit_transform(df['processed_report'])
y = encoder.fit_transform(df['drug_category'])

# Fit LDA model
lda.fit(X)

# Function to print the top words for each topic
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))

  perword_bound = bound / word_cnt


In [None]:
# Print the top 10 words per topic
print_top_words(lda, vectorizer.get_feature_names_out(), 20)

In [31]:
# Define oversampler
oversampler = SMOTE()

# Define classifiers
classifiers = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'XGB': XGBClassifier(eval_metric='mlogloss')
}

# Define pipelines
pipelines = {
    name: make_pipeline_imb(oversampler, model)
    for name, model in classifiers.items()
}

# Split your data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [32]:
# Fit each pipeline to the training data and print performance
for name, pipeline in pipelines.items():
    print(f'\nTraining {name}...')
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    print(f'\n{name}:')
    print(classification_report(y_test, y_pred, target_names=encoder.classes_))


Training Logistic Regression...

Logistic Regression:
                       precision    recall  f1-score   support

                   2c       0.04      1.00      0.07       560
                3-MeO       0.00      0.00      0.00        31
                5-meo       0.00      0.00      0.00       243
                  DMT       0.00      0.00      0.00       480
              alcohol       0.00      0.00      0.00       693
       antidepressant       0.00      0.00      0.00        36
           anxiolytic       0.00      0.00      0.00        77
            ayahuasca       0.00      0.00      0.00        49
            botanical       0.00      0.00      0.00       761
             cannabis       0.00      0.00      0.00      1933
           depressant       0.00      0.00      0.00       148
         dissociative       0.00      0.00      0.00       509
           entactogen       0.00      0.00      0.00       376
            entheogen       0.00      0.00      0.00       523

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Random Forest:
                       precision    recall  f1-score   support

                   2c       0.00      0.00      0.00       560
                3-MeO       0.00      0.00      0.00        31
                5-meo       0.02      1.00      0.03       243
                  DMT       0.00      0.00      0.00       480
              alcohol       0.00      0.00      0.00       693
       antidepressant       0.00      0.00      0.00        36
           anxiolytic       0.00      0.00      0.00        77
            ayahuasca       0.00      0.00      0.00        49
            botanical       0.00      0.00      0.00       761
             cannabis       0.00      0.00      0.00      1933
           depressant       0.00      0.00      0.00       148
         dissociative       0.00      0.00      0.00       509
           entactogen       0.00      0.00      0.00       376
            entheogen       0.00      0.00      0.00       523
         hallucinogen       0.00      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



XGB:
                       precision    recall  f1-score   support

                   2c       0.04      1.00      0.07       560
                3-MeO       0.00      0.00      0.00        31
                5-meo       0.00      0.00      0.00       243
                  DMT       0.00      0.00      0.00       480
              alcohol       0.00      0.00      0.00       693
       antidepressant       0.00      0.00      0.00        36
           anxiolytic       0.00      0.00      0.00        77
            ayahuasca       0.00      0.00      0.00        49
            botanical       0.00      0.00      0.00       761
             cannabis       0.00      0.00      0.00      1933
           depressant       0.00      0.00      0.00       148
         dissociative       0.00      0.00      0.00       509
           entactogen       0.00      0.00      0.00       376
            entheogen       0.00      0.00      0.00       523
         hallucinogen       0.00      0.00      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Logistic Regression model
lr = LogisticRegression()

# Train the model
lr.fit(X_train, y_train)

# Make predictions on the test set
y_pred = lr.predict(X_test)

# Print a classification report
print(classification_report(y_test, y_pred))


In [None]:
# Use a RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Predict the test set results
y_pred = clf.predict(X_test)

# Evaluate the performance of the model
print(classification_report(y_test, y_pred, target_names=encoder.classes_))

In [None]:
# Instantiate the XGBClassifier
xgb = XGBClassifier(use_label_encoder=False)

# Fit the model to the training data
xgb.fit(X_train, y_train, eval_metric='mlogloss')

# Predict the labels of the test set
y_pred = xgb.predict(X_test)

# Print the classification report
print(classification_report(y_test, y_pred))

In [None]:
# Define the parameter grid
param_grid = {'C': [0.1, 1, 10, 100], 'penalty': ['l1', 'l2']}

# Create a GridSearchCV object
grid_search = GridSearchCV(lr, param_grid, cv=5)

# Perform grid search
grid_search.fit(X_train, y_train)

# Print the best parameters and the best score
print(grid_search.best_params_)
print(grid_search.best_score_)
