<a href="https://colab.research.google.com/github/jbloewencolon/Psychedelic-Trip-Generator/blob/main/BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import xgboost as xgb
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
from joblib import dump, load
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from torch import device
from transformers import BertTokenizer, BertModel
from transformers import AutoTokenizer, AutoModel
from xgboost import XGBClassifier

In [None]:
# Define function to load pre-trained model and tokenizer
def load_pretrained_model_and_tokenizer(model_file, tokenizer_file, device=None):
    """Return a ``(model, tokenizer)`` pair for ``bert-base-uncased``.

    If both cache files exist they are loaded with joblib; otherwise the
    pre-trained weights and vocabulary are fetched with ``from_pretrained``
    and written to the two cache files.

    Parameters
    ----------
    model_file : str
        Path of the joblib file holding the cached model.
    tokenizer_file : str
        Path of the joblib file holding the cached tokenizer.
    device : torch.device or str, optional
        Where the returned model should live. When omitted, CUDA is used if
        ``torch.cuda.is_available()`` and the CPU otherwise. Resolving the
        device here keeps the function from depending on a module-level
        ``device`` variable that may not have been assigned yet.

    Returns
    -------
    tuple
        ``(model, tokenizer)``; the model is on ``device`` and in evaluation mode.
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Check if the model and tokenizer are already saved
    if os.path.exists(model_file) and os.path.exists(tokenizer_file):
        # NOTE: joblib files are pickles - only load cache files you wrote yourself.
        model = load(model_file)
        tokenizer = load(tokenizer_file)
    else:
        # Load pre-trained model tokenizer
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        # Load pre-trained model (weights)
        model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)
        model.eval()  # Put the model in "evaluation" mode

        # Save the model and tokenizer files *before* moving the model to the
        # target device, so the cache is not pickled from an accelerator.
        dump(model, model_file)
        dump(tokenizer, tokenizer_file)

    # Apply device placement and eval mode on both paths (cached and fresh).
    model = model.to(device)
    model.eval()

    return model, tokenizer

In [24]:
# Read the processed dataset from its CSV file into a DataFrame
dataset_path = 'D:/Cloud/Google Drive/Colab Notebooks/Data/processed.csv'
df = pd.read_csv(dataset_path)

In [None]:
# Folder that holds the cached model/tokenizer files; create it when missing
output_dir = 'D:/Cloud/Google Drive/Colab Notebooks/Data'
os.makedirs(output_dir, exist_ok=True)

# Build both cache paths inside the output folder
model_file, tokenizer_file = (
    os.path.join(output_dir, cache_name)
    for cache_name in ("bert_model.joblib", "bert_tokenizer.joblib")
)

# Obtain the BERT model and tokenizer (read from the cache files when both exist)
model, tokenizer = load_pretrained_model_and_tokenizer(model_file, tokenizer_file)

In [25]:
# Summarise column dtypes and non-null counts to see which columns have missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76450 entries, 0 to 76449
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   title             76448 non-null  object 
 1   drug              76447 non-null  object 
 2   dosage            73699 non-null  object 
 3   delivery          74213 non-null  object 
 4   weight            76449 non-null  float64
 5   year              76449 non-null  float64
 6   gender            76446 non-null  object 
 7   report            76439 non-null  object 
 8   processed_report  76438 non-null  object 
 9   mixed             76448 non-null  float64
 10  drug_category     76448 non-null  object 
dtypes: float64(3), object(8)
memory usage: 6.4+ MB


In [26]:
# Replace missing values in the 'drug', 'dosage' and 'delivery' columns with 'unknown'
fill_cols = ['drug', 'dosage', 'delivery']
df[fill_cols] = df[fill_cols].fillna('unknown')

# Discard every row that still contains a missing value in any other column
df = df.dropna()

# Re-check the non-null counts after cleaning
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 76438 entries, 0 to 76449
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   title             76438 non-null  object 
 1   drug              76438 non-null  object 
 2   dosage            76438 non-null  object 
 3   delivery          76438 non-null  object 
 4   weight            76438 non-null  float64
 5   year              76438 non-null  float64
 6   gender            76438 non-null  object 
 7   report            76438 non-null  object 
 8   processed_report  76438 non-null  object 
 9   mixed             76438 non-null  float64
 10  drug_category     76438 non-null  object 
dtypes: float64(3), object(8)
memory usage: 7.0+ MB


In [27]:
# Preview the first rows of the cleaned frame
df.head()

Unnamed: 0,title,drug,dosage,delivery,weight,year,gender,report,processed_report,mixed,drug_category
0,Ode to Joy,mdma,1.5 tablets,oral,185.0,2000.0,male,My friend had some experience with X and had t...,friend experi x told one day said come across ...,0.0,mdma
1,Make Sure the Music's Not Too Complex,cannabis,unknown,smoked,152.0,1999.0,not specified,This was the first experience that either my f...,first experi either friend salvia housem check...,0.0,cannabis
2,After Hours,mdma,160 mg,oral,150.0,2001.0,male,Preparation: I have heard some conflicting opi...,prepar heard conflict opinion 5htp ie load day...,1.0,mdma
3,After Hours,mdma,100 mg,oral,150.0,2001.0,male,Preparation: I have heard some conflicting opi...,prepar heard conflict opinion 5htp ie load day...,1.0,mdma
4,After Hours,mdma,50 mg,insufflated,150.0,2001.0,male,Preparation: I have heard some conflicting opi...,prepar heard conflict opinion 5htp ie load day...,1.0,mdma


In [28]:
# Count reports per original drug category (value_counts lists the most frequent first)
df['drug_category'].value_counts()

pharmaceutical           10125
cannabis                  9536
stimulant                 5780
mushrooms                 4055
botanical                 3814
opioid                    3795
mdma                      3560
alcohol                   3368
hallucinogen              3128
lsd                       3064
salvia                    2831
2c                        2812
other                     2746
dissociative              2624
entheogen                 2546
DMT                       2362
entactogen                1754
nootropic                 1653
5-meo                     1290
ketamine                  1233
unknown                    923
mescaline                  726
depressant                 686
synthetic cannabinoid      451
anxiolytic                 448
oneirogen                  307
ayahuasca                  229
phencyclidine              216
antidepressant             167
3-MeO                      131
ibogaine                    78
Name: drug_category, dtype: int64

In [29]:
# Define the mapping from old categories to new ones
category_mapping = {
    'pharmaceutical': 'Pharmaceutical',
    'cannabis': 'Cannabinoid',
    'stimulant': 'Stimulant',
    'mushrooms': 'Psychedelic',
    'botanical': 'Other',
    'opioid': 'Opioid',
    'mdma': 'Entactogen/Empathogen',
    'alcohol': 'Depressant',
    'hallucinogen': 'Psychedelic',
    'lsd': 'Psychedelic',
    'salvia': 'Psychedelic',
    '2c': 'Psychedelic',
    'other': 'Other',
    'dissociative': 'Dissociative',
    'entheogen': 'Entheogen',
    'DMT': 'Psychedelic',
    'entactogen': 'Entactogen/Empathogen',
    'nootropic': 'Other',
    '5-meo': 'Psychedelic',
    'ketamine': 'Dissociative',
    'unknown': 'Other',
    'mescaline': 'Psychedelic',
    'depressant': 'Depressant',
    'synthetic cannabinoid': 'Cannabinoid',
    'anxiolytic': 'Pharmaceutical',
    'oneirogen': 'Other',
    'ayahuasca': 'Psychedelic',
    'phencyclidine': 'Dissociative',
    'antidepressant': 'Pharmaceutical',
    '3-MeO': 'Other',
    'ibogaine': 'Entheogen'
}

# Regroup the 'drug_category' column in place (no new column is created).
# Values that are not keys of the mapping are kept as they are, so running this
# cell a second time leaves the already-grouped labels untouched instead of
# turning the whole column into NaN (none of the grouped labels is itself a key).
df['drug_category'] = df['drug_category'].map(lambda category: category_mapping.get(category, category))

# Show how many reports fall into each grouped category
df['drug_category'].value_counts()

Psychedelic              20497
Pharmaceutical           10740
Cannabinoid               9987
Other                     9574
Stimulant                 5780
Entactogen/Empathogen     5314
Dissociative              4073
Depressant                4054
Opioid                    3795
Entheogen                 2624
Name: drug_category, dtype: int64

In [30]:
# Preview the first rows again to check the regrouped 'drug_category' values
df.head()

Unnamed: 0,title,drug,dosage,delivery,weight,year,gender,report,processed_report,mixed,drug_category
0,Ode to Joy,mdma,1.5 tablets,oral,185.0,2000.0,male,My friend had some experience with X and had t...,friend experi x told one day said come across ...,0.0,Entactogen/Empathogen
1,Make Sure the Music's Not Too Complex,cannabis,unknown,smoked,152.0,1999.0,not specified,This was the first experience that either my f...,first experi either friend salvia housem check...,0.0,Cannabinoid
2,After Hours,mdma,160 mg,oral,150.0,2001.0,male,Preparation: I have heard some conflicting opi...,prepar heard conflict opinion 5htp ie load day...,1.0,Entactogen/Empathogen
3,After Hours,mdma,100 mg,oral,150.0,2001.0,male,Preparation: I have heard some conflicting opi...,prepar heard conflict opinion 5htp ie load day...,1.0,Entactogen/Empathogen
4,After Hours,mdma,50 mg,insufflated,150.0,2001.0,male,Preparation: I have heard some conflicting opi...,prepar heard conflict opinion 5htp ie load day...,1.0,Entactogen/Empathogen


In [None]:
# Pick the compute device: CUDA when torch reports it as available, CPU otherwise
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")

In [None]:
def get_bert_embeddings(text, tokenizer, model, max_length=512):
    """Return one embedding vector per token of ``text``.

    Each token vector is the sum of that token's hidden states from the last
    four entries of ``outputs.hidden_states``.

    Parameters
    ----------
    text : str
        Raw text to embed (without special tokens).
    tokenizer
        Tokenizer exposing ``tokenize``, ``convert_tokens_to_ids``,
        ``cls_token`` and ``sep_token`` (e.g. ``BertTokenizer``).
    model
        Model that returns an object with ``hidden_states`` when called with
        ``input_ids`` / ``attention_mask`` (e.g. ``BertModel`` created with
        ``output_hidden_states=True``).
    max_length : int, optional
        Maximum sequence length *including* the two special tokens.

    Returns
    -------
    torch.Tensor
        Tensor of shape ``(num_tokens, hidden_size)`` on the model's device.
        Previously the slice was taken along the token axis, which summed the
        last four *tokens* and returned one row per layer.
    """
    # Truncate the text tokens first and add the markers afterwards, so a long
    # input keeps its closing [SEP] instead of having it cut off.
    body_tokens = tokenizer.tokenize(text)[:max(max_length - 2, 0)]
    tokenized_text = [tokenizer.cls_token] + body_tokens + [tokenizer.sep_token]

    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    # Build the inputs on whatever device the model's parameters live on.
    model_device = next(model.parameters()).device
    tokens_tensor = torch.tensor([indexed_tokens], device=model_device)
    # The old call passed an all-ones tensor as the second positional argument,
    # which BertModel.forward interprets as the attention mask; name it as such.
    attention_mask = torch.ones_like(tokens_tensor)

    # Predict hidden states features for each layer
    with torch.no_grad():
        outputs = model(input_ids=tokens_tensor, attention_mask=attention_mask)
        hidden_states = outputs.hidden_states

    token_embeddings = torch.stack(hidden_states, dim=0)        # (layers, 1, tokens, hidden)
    token_embeddings = torch.squeeze(token_embeddings, dim=1)   # (layers, tokens, hidden)
    token_embeddings = token_embeddings.permute(1, 0, 2)        # (tokens, layers, hidden)

    # Sum the vectors from the last four layers, separately for every token.
    sum_vec = token_embeddings[:, -4:, :].sum(dim=1)

    return sum_vec

In [None]:
# Compute the BERT embeddings for each report
df['processed_report'] = df['processed_report'].astype(str)
processed_reports = df['processed_report'].tolist()

# Average each result over its first axis (keeping that axis, so the result stays
# 2-D) and move it to host memory straight away. Holding every raw result tensor
# on the compute device for the whole dataset exhausts memory, and the later
# pooling step only ever averages over this same axis.
bert_embeddings = [
    get_bert_embeddings(report, tokenizer, model).mean(dim=0, keepdim=True).cpu()
    for report in processed_reports
]

In [None]:
# Convert tensors to numpy arrays for pickling.
# Entries that are already arrays are passed through unchanged, so re-running
# this cell does not fail on the overwritten `bert_embeddings` list.
bert_embeddings = [
    embedding.detach().cpu().numpy() if torch.is_tensor(embedding) else np.asarray(embedding)
    for embedding in bert_embeddings
]

# Assign the embeddings back to the dataframe
df['report_embeddings'] = bert_embeddings

In [None]:
# Destination of the serialized embeddings, inside the output folder
output_file = os.path.join(output_dir, 'bert_embeddings.joblib')

# Write the embeddings with joblib through an explicitly opened binary file handle
with open(output_file, mode='wb') as embeddings_fh:
    dump(bert_embeddings, embeddings_fh)

In [33]:
# Collapse each report's embedding to one fixed-length vector by averaging over
# its first axis. The stored values are NumPy arrays (they have no `.numpy()`),
# and `np.atleast_2d` lets an already-pooled 1-D vector pass through unchanged,
# so the cell can be re-run safely.
df['report_embeddings'] = df['report_embeddings'].apply(
    lambda embedding: np.atleast_2d(embedding).mean(axis=0)
)

# Split data into train and test
X_train, X_test, y_train, y_test = train_test_split(
    df['report_embeddings'].to_list(),
    df['drug_category'],
    test_size=0.2,
    random_state=42,
)

KeyError: ignored

In [None]:
# Candidate classifiers, each wrapped in a Pipeline and keyed by a short name.
# The logistic-regression and XGBoost pipelines standardize their inputs first;
# the random-forest pipeline consists of the classifier only.
pipelines = dict(
    lr=Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('clf', LogisticRegression(solver='liblinear', random_state=42)),
    ]),
    rfc=Pipeline(steps=[
        ('clf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ]),
    xgb=Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('clf', XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')),
    ]),
)

In [None]:
# Fit each model, print classification reports for both training and test data.
# The targets are string category names, but XGBClassifier expects integer class
# ids (0..n_classes-1), so encode them once and use the same ids for every model.
category_encoder = LabelEncoder().fit(list(y_train) + list(y_test))
y_train_ids = category_encoder.transform(y_train)
y_test_ids = category_encoder.transform(y_test)

# Human-readable names (and their ids) for the report rows
class_names = [str(name) for name in category_encoder.classes_]
class_ids = list(range(len(class_names)))

evaluation_splits = (
    ("Training", X_train, y_train_ids),
    ("Test", X_test, y_test_ids),
)

for model_name, pipeline in pipelines.items():
    # Fit the model
    pipeline.fit(X_train, y_train_ids)

    # Predict on each split and print its classification report
    for split_name, features, targets in evaluation_splits:
        predictions = pipeline.predict(features)
        print(f"\n{split_name} classification report for {model_name}:")
        print(classification_report(targets, predictions, labels=class_ids, target_names=class_names))

In [None]:
# NOTE(review): this cell looks like a leftover from a Word2Vec experiment and its
# recorded output is a TypeError.
#  - `model` was last assigned from load_pretrained_model_and_tokenizer(...) in an
#    earlier cell, i.e. it is presumably the BERT model rather than a scikit-learn
#    estimator — confirm what was meant to be searched.
#  - `param_grid` is only assigned in a later cell of this notebook.
#  - `simple_preprocess` is not imported in the import cell — TODO confirm its source.
# Create the GridSearchCV instance
grid_search = GridSearchCV(model, param_grid, cv=3)

# Tokenize the reports
tokenized_reports = df['processed_report'].apply(simple_preprocess)

# Train the Word2Vec model and perform grid search
# (fit is called with the tokenized reports only; no target is passed)
grid_search.fit(tokenized_reports)

# Get the best model and its hyperparameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Print the best hyperparameters
print("Best Hyperparameters:")
for param, value in best_params.items():
    print(f"{param}: {value}")

TypeError: ignored

In [None]:
# Define oversampler
oversampler = SMOTE()

# Compute class frequencies in 'mixed' column
class_freq = df['mixed'].value_counts()

# Define class weights based on frequencies
class_weights = {value: 10 if value == '0' else 1 for value in class_freq.index}

# Encode the labels in y
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Set up the XGBoost parameters
params = {
    'objective': 'multi:softmax',
    'num_class': 3
}

# Define classifiers
classifiers = {
    'Logistic Regression': LogisticRegression(max_iter=1000, class_weight=class_weights),
    'Random Forest': RandomForestClassifier(class_weight=class_weights, n_estimators=100, random_state=42),
    'XGB': XGBClassifier(eval_metric='mlogloss')
}

# Define pipelines
pipelines = {
    name: make_pipeline(oversampler, model)
    for name, model in classifiers.items()
}

# Split your data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Fit and evaluate Logistic Regression
# Use the pipeline registered under 'Logistic Regression' instead of the leaked
# loop variable `pipeline`, which does not refer to this model.
print('\nTraining Logistic Regression...')
lr_pipeline = pipelines['Logistic Regression']
lr_pipeline.fit(X_train, y_train)
y_pred_lr = lr_pipeline.predict(X_test)
print('\nLogistic Regression:')
# Row names come from `label_encoder`, the encoder fitted on `y`; no variable
# called `encoder` is defined in this notebook.
print(classification_report(y_test, y_pred_lr, target_names=[str(c) for c in label_encoder.classes_], zero_division=1))

In [None]:
# Fit and evaluate Random Forest
# Use the pipeline registered under 'Random Forest' instead of the leaked loop
# variable `pipeline`, which does not refer to this model.
print('\nTraining Random Forest...')
rf_pipeline = pipelines['Random Forest']
rf_pipeline.fit(X_train, y_train)
# Store the predictions under the name that the report below reads
# (they were previously assigned to `y_pred`, leaving `y_pred_rf` undefined).
y_pred_rf = rf_pipeline.predict(X_test)
print('\nRandom Forest:')
# Row names come from `label_encoder`, the encoder fitted on `y`; no variable
# called `encoder` is defined in this notebook.
print(classification_report(y_test, y_pred_rf, target_names=[str(c) for c in label_encoder.classes_], zero_division=1))

In [None]:
# Fit and evaluate XGBoost with the pipeline
# NOTE(review): `pipeline_xgb`, `weights_train` and `encoder` are not assigned in
# any cell of this notebook, so this cell presumably raises NameError on a fresh
# kernel — TODO confirm where they were meant to come from (an 'XGB' entry exists
# in the `pipelines` dict and a `label_encoder` is fitted in an earlier cell).
print('\nTraining XGB through pipeline...')
# The `xgbclassifier__` prefix routes `sample_weight` to the pipeline step of that name.
pipeline_xgb.fit(X_train, y_train, xgbclassifier__sample_weight=weights_train)
y_pred_xgb = pipeline_xgb.predict(X_test)
print('\nXGBoost (through pipeline):')
print(classification_report(y_test, y_pred_xgb, target_names=encoder.classes_, zero_division=1))

In [None]:
# Define the parameter grid
param_grid = {'C': [0.1, 1, 10, 100], 'penalty': ['l1', 'l2']}

# Estimator to tune. `lr` was not defined anywhere in the notebook; 'liblinear'
# is used because the grid includes the 'l1' penalty, which the default solver
# does not support (same solver/seed as the 'lr' pipeline defined earlier).
lr = LogisticRegression(solver='liblinear', random_state=42)

# Create a GridSearchCV object
grid_search = GridSearchCV(lr, param_grid, cv=5)

# Perform grid search
grid_search.fit(X_train, y_train)

# Print the best parameters and the best score
print(grid_search.best_params_)
print(grid_search.best_score_)
