<a href="https://colab.research.google.com/github/jbloewencolon/Psychedelic-Trip-Generator/blob/main/Modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb
import pickle
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBClassifier
from sklearn.feature_selection import SelectKBest, chi2
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.preprocessing import FunctionTransformer


In [3]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Data/processed.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67516 entries, 0 to 67515
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        67516 non-null  int64  
 1   title             67516 non-null  object 
 2   drug              67516 non-null  object 
 3   dosage            67516 non-null  object 
 4   delivery          67516 non-null  object 
 5   weight            67516 non-null  float64
 6   year              67516 non-null  int64  
 7   gender            67516 non-null  object 
 8   report            67516 non-null  object 
 9   processed_report  67516 non-null  object 
 10  mixed             67516 non-null  int64  
 11  drug_category     67516 non-null  object 
dtypes: float64(1), int64(3), object(8)
memory usage: 6.2+ MB


In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,drug,dosage,delivery,weight,year,gender,report,processed_report,mixed,drug_category
0,0,The Happiest I Remember Feeling,mdma,1 tablet,oral,140.0,2013,female,"I had taken Ecstasy once before, but it was to...","[['i', 'had', 'taken', 'ecstasy', 'once', 'bef...",0,Entactogen/Empathogen
1,1,A Short Trip to Extravagance,cannabis,1 cig.,smoked,175.0,2005,male,It all started at about 6:00AM one morning aft...,"[['it', 'all', 'started', 'at', 'about', '600a...",1,Cannabinoid
2,2,The Yins and Yangs of the Speedball,heroin,0.1 g,IV,170.0,2007,male,INTRODUCTION:I am not just writing a ``I got f...,"[['introduction', '', 'i', 'am', 'not', 'just'...",1,Opioid
3,3,Well What Do You Know About That?,cannabis,1 bowl,smoked,145.0,2011,male,8:00 eat 160 mg of butylone on a mostly empty ...,"[['800', 'eat', '160', 'mg', 'of', 'butylone',...",1,Cannabinoid
4,4,Quick and Satisfying,tea,,oral,130.0,2012,female,Due to the recent lack of availability of mimo...,"[['due', 'to', 'the', 'recent', 'lack', 'of', ...",1,Other


In [7]:
# Define y
y = df['drug_category']

# Encode the labels in y
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

X = df['processed_report']

In [None]:
# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Define pipeline for Logistic Regression with TF-IDF
pipeline_lr = ImbPipeline([
    ('to_string', FunctionTransformer(lambda x: [' '.join(i) if isinstance(i, list) else i for i in x])),
    ('tfidf', TfidfVectorizer()),
    ('smote', SMOTE()),
    ('clf', LogisticRegression(max_iter=500, C=1.0, solver='saga', n_jobs=-1))
])


In [None]:
print("\nTraining Logistic Regression...")
# Fit the pipeline
pipeline_lr.fit(X_train, y_train)

# Predict on the train and test set
y_train_pred = pipeline_lr.predict(X_train)
y_test_pred = pipeline_lr.predict(X_test)

# Generate and print the classification report for both train and test
print("Train Classification Report:")
print(classification_report(y_train, y_train_pred, zero_division=1))
print("Test Classification Report:")
print(classification_report(y_test, y_test_pred, zero_division=1))

# Pickle the results
#with open("/content/drive/MyDrive/Colab Notebooks/Data/logistic_regression_train.pkl", "wb") as f:
#    pickle.dump((y_train, y_train_pred), f)
#with open("/content/drive/MyDrive/Colab Notebooks/Data/logistic_regression_test.pkl", "wb") as f:
#    pickle.dump((y_test, y_test_pred), f)


Training Logistic Regression...
Train Classification Report:
              precision    recall  f1-score   support

           0       0.60      0.60      0.60      4595
           1       0.54      0.78      0.64      1771
           2       0.70      0.90      0.79      2113
           3       0.81      0.85      0.83      5161
           4       0.75      0.93      0.83      2831
           5       0.77      0.88      0.82      1927
           6       0.82      0.79      0.81      4936
           7       0.83      0.79      0.81      5204
           8       0.92      0.83      0.87     22613
           9       0.74      0.76      0.75      2861

    accuracy                           0.81     54012
   macro avg       0.75      0.81      0.77     54012
weighted avg       0.82      0.81      0.81     54012

Test Classification Report:
              precision    recall  f1-score   support

           0       0.39      0.41      0.40      1135
           1       0.31      0.41      0.3

In [8]:
# Define the file paths in your Google Drive where you want to save the pickle files
word2vec_model_file_path = '/content/drive/MyDrive/Colab Notebooks/Data/word2vec_model.pkl'
transformed_reports_file_path = '/content/drive/MyDrive/Colab Notebooks/Data/transformed_reports.pkl'

# Load Word2Vec Model
with open(word2vec_model_file_path, "rb") as f:
    word2vec_model = pickle.load(f)

# Load transformed reports
with open(transformed_reports_file_path, "rb") as f:
    X = pickle.load(f)

print("X shape:", X.shape)

X shape: (67516, 100)


In [4]:
# Print the first 5 elements of the list X
print("First 5 elements of X:")
print(X[:5])

# Check the length of the list X
print("\nLength of X:")
print(len(X))

# Check the shape of individual arrays inside the list X
print("\nShapes of the first 5 arrays in X:")
print([x.shape for x in X[:5]])


First 5 elements of X:
[[-4.06293273e-01 -6.84446216e-01  2.48858660e-01 -2.46311843e-01
  -2.99457729e-01 -6.65272713e-01  2.54382789e-01  2.00265929e-01
   4.17558700e-01 -5.42980075e-01  4.52948920e-02  8.75082374e-01
   1.03775191e+00 -5.87108016e-01  3.09272017e-02  1.18148685e+00
  -1.20084262e+00  3.93117517e-01 -1.14910856e-01  1.16010107e-01
  -8.09521377e-01 -7.19257712e-01 -1.57608420e-01  1.35609758e+00
  -6.91913605e-01  2.80939251e-01  7.55737424e-01  1.99904695e-01
  -4.84348714e-01 -1.39134631e-01 -4.18761194e-01  2.88326710e-01
  -1.25573620e-01 -2.27493167e-01 -1.62226558e-01  9.96523321e-01
   9.01370402e-03  5.21282375e-01 -3.22541505e-01  8.21831584e-01
   6.64582670e-01  3.44376385e-01 -7.64632285e-01  8.05959553e-02
   7.88948417e-01 -1.16871846e+00  2.80606568e-01  4.12085891e-01
   4.57317889e-01 -1.17682970e+00 -7.63193071e-01  2.07477972e-01
   4.52991277e-01  3.13612133e-01  3.73220235e-01  9.60330427e-01
  -1.20376539e+00 -3.31902325e-01  1.85902983e-01 -4.

In [None]:
vocabulary = word2vec_model.wv.index_to_key
print("Vocabulary size: ", len(vocabulary))
print('/')
print("Some words in vocabulary: ", vocabulary[:50])
print('/')

similar_words = word2vec_model.wv.most_similar('psychedelic', topn=10)
print("Some words in to 'psychedelic: ", similar_words)
print('/')

word_vector = word2vec_model.wv['psychedelic']
print(word_vector)

Vocabulary size:  90323
/
Some words in vocabulary:  ['', 'i', 'the', 'and', 'a', 'to', 'of', 'wa', 'my', 'it', 'in', 'that', 'this', 'with', 'me', 'had', 'but', 'on', 'for', 'is', 'at', 'nt', 'we', 'not', 'like', 'have', 'so', 'time', 'about', 'up', 'all', 'were', 'be', 'out', 'some', 'just', 'from', 's', 'or', 'felt', 'would', 'very', 'more', 'could', 'an', 'what', 'feel', 'do', 'did', 'experience']
/
Some words in to 'psychedelic:  [('psychadelic', 0.8326389193534851), ('hallucinogenic', 0.8289550542831421), ('dissociative', 0.7624799609184265), ('entheogenic', 0.7561480402946472), ('visionary', 0.7225837707519531), ('tryptamine', 0.7080605626106262), ('mindexpanding', 0.6862577795982361), ('phenethylamine', 0.6861829161643982), ('mindopening', 0.6857782602310181), ('mindbending', 0.6824239492416382)]
/
[ 1.7105453  -0.28656834 -1.9225749  -0.763428   -0.01348514 -0.9188493
  1.1516739   1.4653172  -1.5001339  -1.4014486  -1.4708631   2.2129803
 -1.7954363   0.5857399   1.0475065  -

In [9]:
# Convert list of arrays into a 2D array
X = np.vstack(X)

# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Initialize StratifiedShuffleSplit
sss = StratifiedShuffleSplit(n_splits=2, test_size=0.2, random_state=42)

# Perform stratified sampling on the 80% data
for train_index, test_index in sss.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

In [10]:
# Define pipeline for RFC and XGB
pipeline_rf = ImbPipeline([
    ('smote', SMOTE()),
    ('clf', RandomForestClassifier(n_estimators=100, max_depth=None, n_jobs=-1))
])

pipeline_xgb = ImbPipeline([
    ('smote', SMOTE()),
    ('clf', XGBClassifier(n_estimators=100, max_depth=3, n_jobs=-1, eval_metric='mlogloss'))
])

In [11]:
print("\nTraining Random Forest...")
# Fit the pipeline
pipeline_rf.fit(X_train, y_train)

# Predict on the train and test set
y_train_pred = pipeline_rf.predict(X_train)
y_test_pred = pipeline_rf.predict(X_test)

# Generate and print the classification report for both train and test
print("Train Classification Report:")
print(classification_report(y_train, y_train_pred, zero_division=1))
print("Test Classification Report:")
print(classification_report(y_test, y_test_pred, zero_division=1))

# Pickle the results
with open("/content/drive/MyDrive/Colab Notebooks/Data/rf_train.pkl", "wb") as f:
    pickle.dump((y_train, y_train_pred), f)
with open("/content/drive/MyDrive/Colab Notebooks/Data/rf_test.pkl", "wb") as f:
    pickle.dump((y_test, y_test_pred), f)



Training Random Forest...
Train Classification Report:
                       precision    recall  f1-score   support

          Cannabinoid       0.71      0.72      0.71      4584
           Depressant       0.67      0.75      0.71      1809
         Dissociative       0.82      0.88      0.85      2113
Entactogen/Empathogen       0.84      0.88      0.86      5152
            Entheogen       0.83      0.91      0.87      2797
               Opioid       0.85      0.89      0.87      1953
                Other       0.87      0.87      0.87      4929
       Pharmaceutical       0.86      0.84      0.85      5232
          Psychedelic       0.93      0.89      0.91     22571
            Stimulant       0.82      0.82      0.82      2872

             accuracy                           0.86     54012
            macro avg       0.82      0.85      0.83     54012
         weighted avg       0.86      0.86      0.86     54012

Test Classification Report:
                       precisio

In [None]:
print("\nTraining XGBoost...")
# Fit the pipeline
pipeline_xgb.fit(X_train, y_train)

# Predict on the train and test set
y_train_pred = pipeline_xgb.predict(X_train)
y_test_pred = pipeline_xgb.predict(X_test)

# Generate and print the classification report for both train and test
print("Train Classification Report:")
print(classification_report(y_train, y_train_pred, zero_division=1))
print("Test Classification Report:")
print(classification_report(y_test, y_test_pred, zero_division=1))

# Pickle the results
#with open("/content/drive/MyDrive/Colab Notebooks/Data/xgboost_train.pkl", "wb") as f:
#    pickle.dump((y_train, y_train_pred), f)
#with open("/content/drive/MyDrive/Colab Notebooks/Data/xgboost_test.pkl", "wb") as f:
#    pickle.dump((y_test, y_test_pred), f)


In [None]:
# Define the parameter grid
param_grid = {
    'clf__n_estimators': [100, 200, 500],
    'clf__max_depth': [None, 10, 20, 30],
    'clf__min_samples_split': [2, 5, 10],
    'clf__min_samples_leaf': [1, 2, 4],
    'clf__bootstrap': [True, False]
}

# Create a GridSearchCV object
grid_search = GridSearchCV(pipeline_rf, param_grid, cv=5)

print("\nTraining Random Forest...")
# Perform grid search
grid_search.fit(X_train, y_train)

# Print the best parameters and the best score
print(grid_search.best_params_)
print(grid_search.best_score_)

# Predict on the train and test set using the best model
y_train_pred = grid_search.predict(X_train)
y_test_pred = grid_search.predict(X_test)

# Generate and print the classification report for both train and test
print("Train Classification Report:")
print(classification_report(y_train, y_train_pred, zero_division=1))
print("Test Classification Report:")
print(classification_report(y_test, y_test_pred, zero_division=1))

# Pickle the results
with open("/content/drive/MyDrive/Colab Notebooks/Data/random_forest_train.pkl", "wb") as f:
    pickle.dump((y_train, y_train_pred), f)
with open("/content/drive/MyDrive/Colab Notebooks/Data/random_forest_test.pkl", "wb") as f:
    pickle.dump((y_test, y_test_pred), f)


Training Random Forest...


In [5]:
# Load the training data
with open("/content/drive/MyDrive/Colab Notebooks/Data/random_forest_train.pkl", "rb") as f:
    y_train, y_train_pred = pickle.load(f)

# Load the testing data
with open("/content/drive/MyDrive/Colab Notebooks/Data/random_forest_test.pkl", "rb") as f:
    y_test, y_test_pred = pickle.load(f)

In [6]:
# Generate and print the classification report for both train and test
print("Train Classification Report:")
print(classification_report(y_train, y_train_pred, zero_division=1))
print("Test Classification Report:")
print(classification_report(y_test, y_test_pred, zero_division=1))

Train Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.74      0.74      3667
           1       0.71      0.79      0.75      1447
           2       0.82      0.89      0.85      1690
           3       0.85      0.90      0.87      4122
           4       0.86      0.93      0.89      2238
           5       0.87      0.89      0.88      1562
           6       0.88      0.89      0.89      3943
           7       0.88      0.87      0.87      4185
           8       0.94      0.90      0.92     18057
           9       0.82      0.85      0.84      2298

    accuracy                           0.88     43209
   macro avg       0.84      0.87      0.85     43209
weighted avg       0.88      0.88      0.88     43209

Test Classification Report:
              precision    recall  f1-score   support

           0       0.32      0.29      0.31       917
           1       0.26      0.23      0.24       362
           2       0.