<a href="https://colab.research.google.com/github/jbloewencolon/Psychedelic-Trip-Generator/blob/main/Modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [54]:
import pandas as pd
import numpy as np
import xgboost as xgb
import pickle
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBClassifier
from sklearn.feature_selection import SelectKBest, chi2
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.preprocessing import FunctionTransformer


In [55]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Data/processed.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67516 entries, 0 to 67515
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   title             67516 non-null  object 
 1   drug              67516 non-null  object 
 2   dosage            67516 non-null  object 
 3   delivery          67516 non-null  object 
 4   weight            67516 non-null  float64
 5   year              67516 non-null  int64  
 6   gender            67516 non-null  object 
 7   report            67516 non-null  object 
 8   processed_report  67516 non-null  object 
 9   mixed             67516 non-null  int64  
 10  drug_category     67516 non-null  object 
 11  language          67516 non-null  object 
dtypes: float64(1), int64(2), object(9)
memory usage: 6.2+ MB


In [56]:
df.head()

Unnamed: 0,title,drug,dosage,delivery,weight,year,gender,report,processed_report,mixed,drug_category,language
0,The Happiest I Remember Feeling,mdma,1 tablet,oral,140.0,2013,female,"I had taken Ecstasy once before, but it was to...",taken ecstasi low dose experi fulli felt happi...,0,Entactogen/Empathogen,en
1,A Short Trip to Extravagance,cannabis,1 cig.,smoked,175.0,2005,male,It all started at about 6:00AM one morning aft...,start 600am one morn gotten readi work pull fa...,1,Cannabinoid,en
2,The Yins and Yangs of the Speedball,heroin,0.1 g,IV,170.0,2007,male,INTRODUCTION:I am not just writing a ``I got f...,introductioni write got fuck report guid tour ...,1,Opioid,en
3,Well What Do You Know About That?,cannabis,1 bowl,smoked,145.0,2011,male,8:00 eat 160 mg of butylone on a mostly empty ...,800 eat 160 mg butylon mostli empti stomach al...,1,Cannabinoid,en
4,Quick and Satisfying,tea,,oral,130.0,2012,female,Due to the recent lack of availability of mimo...,due recent lack avail mimosa hostili bark jure...,1,Other,en


In [58]:
# Define y
y = df['drug_category']

# Encode the labels in y
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

X = df['processed_report']

In [61]:
# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Define pipeline for Logistic Regression with TF-IDF
pipeline_lr = ImbPipeline([
    ('to_string', FunctionTransformer(lambda x: [' '.join(i) if isinstance(i, list) else i for i in x])),
    ('tfidf', TfidfVectorizer()),
    ('smote', SMOTE()),
    ('clf', LogisticRegression(max_iter=500, C=1.0, solver='saga', n_jobs=-1))
])


In [62]:
print("\nTraining Logistic Regression...")
# Fit the pipeline
pipeline_lr.fit(X_train, y_train)

# Predict on the train and test set
y_train_pred = pipeline_lr.predict(X_train)
y_test_pred = pipeline_lr.predict(X_test)

# Generate and print the classification report for both train and test
print("Train Classification Report:")
print(classification_report(y_train, y_train_pred, zero_division=1))
print("Test Classification Report:")
print(classification_report(y_test, y_test_pred, zero_division=1))

# Pickle the results
with open("/content/drive/MyDrive/Colab Notebooks/Data/logistic_regression_train.pkl", "wb") as f:
    pickle.dump((y_train, y_train_pred), f)
with open("/content/drive/MyDrive/Colab Notebooks/Data/logistic_regression_test.pkl", "wb") as f:
    pickle.dump((y_test, y_test_pred), f)


Training Logistic Regression...
Train Classification Report:
              precision    recall  f1-score   support

           0       0.60      0.60      0.60      4595
           1       0.54      0.78      0.64      1771
           2       0.70      0.90      0.79      2113
           3       0.81      0.85      0.83      5161
           4       0.75      0.93      0.83      2831
           5       0.77      0.88      0.82      1927
           6       0.82      0.79      0.81      4936
           7       0.83      0.79      0.81      5204
           8       0.92      0.83      0.87     22613
           9       0.74      0.76      0.75      2861

    accuracy                           0.81     54012
   macro avg       0.75      0.81      0.77     54012
weighted avg       0.82      0.81      0.81     54012

Test Classification Report:
              precision    recall  f1-score   support

           0       0.39      0.41      0.40      1135
           1       0.31      0.41      0.3

In [83]:
# Define the file paths in your Google Drive where you want to save the pickle files
word2vec_model_file_path = '/content/drive/MyDrive/Colab Notebooks/Data/word2vec_model.pkl'
transformed_reports_file_path = '/content/drive/MyDrive/Colab Notebooks/Data/transformed_reports.pkl'

# Load Word2Vec Model
with open(word2vec_model_file_path, "rb") as f:
    word2vec_model = pickle.load(f)

# Load transformed reports
with open(transformed_reports_file_path, "rb") as f:
    X = pickle.load(f)

print("X shape:", X.shape)

AttributeError: ignored

In [84]:
# Print the first 5 elements of the list X
print("First 5 elements of X:")
print(X[:5])

# Check the length of the list X
print("\nLength of X:")
print(len(X))

# Check the shape of individual arrays inside the list X
print("\nShapes of the first 5 arrays in X:")
print([x.shape for x in X[:5]])


First 5 elements of X:
[array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.]]), array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.]]), array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.

In [79]:
vocabulary = word2vec_model.wv.index_to_key
print("Vocabulary size: ", len(vocabulary))
print('/')
print("Some words in vocabulary: ", vocabulary[:50])
print('/')

similar_words = word2vec_model.wv.most_similar('psychedelic', topn=10)
print("Some words in to 'psychedelic: ", similar_words)
print('/')

word_vector = word2vec_model.wv['psychedelic']
print(word_vector)

Vocabulary size:  70419
/
Some words in vocabulary:  ['feel', 'like', 'time', 'felt', 'would', 'experi', 'get', 'go', 'trip', 'one', 'start', 'friend', 'could', 'back', 'take', 'tri', 'look', 'around', 'still', 'hour', 'effect', 'im', 'thing', 'realli', 'much', 'day', 'mg', 'thought', 'seem', 'think', 'first', 'see', 'smoke', 'got', 'even', 'took', 'drug', 'want', 'come', 'decid', 'didnt', 'good', 'bodi', 'way', 'went', 'littl', 'minut', 'know', 'point', 'also']
/
Some words in to 'psychedelic:  [('clearheaded', 0.6142198443412781), ('alcohollik', 0.561062753200531), ('stimulatori', 0.5583595037460327), ('experiencetim', 0.5569223165512085), ('doip', 0.5451580882072449), ('bod', 0.5305292010307312), ('nu', 0.5302093029022217), ('potlik', 0.5272364616394043), ('enboh', 0.5238556861877441), ('drowsy', 0.5235028266906738)]
/
[-5.27226985e-01  2.57536232e-01 -1.77391768e-01  2.90087253e-01
  3.98414850e-01 -7.05772117e-02  2.37836838e-02 -2.06744343e-01
 -3.16789746e-01 -2.52498090e-01  5.

In [80]:
# Initial split to get 80% of total data
X_80, X_discard, y_80, y_discard = train_test_split(X, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42)

# Convert list of arrays into a 2D array
X_80 = np.vstack(X_80)

# Initialize StratifiedShuffleSplit
sss = StratifiedShuffleSplit(n_splits=2, test_size=0.2, random_state=42)

# Perform stratified sampling on the 80% data
for train_index, test_index in sss.split(X_80, y_80):
    X_train, X_test = X_80[train_index], X_80[test_index]
    y_train, y_test = y_80[train_index], y_80[test_index]

In [81]:
# Define pipeline for RFC and XGB
pipeline_rf = ImbPipeline([
    ('smote', SMOTE()),
    ('clf', RandomForestClassifier(n_estimators=100, max_depth=None, n_jobs=-1))
])

pipeline_xgb = ImbPipeline([
    ('smote', SMOTE()),
    ('clf', XGBClassifier(n_estimators=100, max_depth=3, n_jobs=-1, eval_metric='mlogloss'))
])

In [82]:
print("\nTraining Random Forest...")
# Fit the pipeline
pipeline_rf.fit(X_train, y_train)

# Predict on the train and test set
y_train_pred = pipeline_rf.predict(X_train)
y_test_pred = pipeline_rf.predict(X_test)

# Generate and print the classification report for both train and test
print("Train Classification Report:")
print(classification_report(y_train, y_train_pred, zero_division=1))
print("Test Classification Report:")
print(classification_report(y_test, y_test_pred, zero_division=1))

# Pickle the results
with open("/content/drive/MyDrive/Colab Notebooks/Data/random_forest_train.pkl", "wb") as f:
    pickle.dump((y_train, y_train_pred), f)
with open("/content/drive/MyDrive/Colab Notebooks/Data/random_forest_test.pkl", "wb") as f:
    pickle.dump((y_test, y_test_pred), f)



Training Random Forest...
Train Classification Report:
              precision    recall  f1-score   support

           0       0.08      1.00      0.16      3667
           1       1.00      0.00      0.00      1447
           2       1.00      0.00      0.00      1690
           3       1.00      0.00      0.00      4122
           4       1.00      0.00      0.00      2238
           5       1.00      0.00      0.00      1562
           6       1.00      0.00      0.00      3943
           7       1.00      0.00      0.00      4185
           8       1.00      0.00      0.00     18057
           9       1.00      0.00      0.00      2298

    accuracy                           0.08     43209
   macro avg       0.91      0.10      0.02     43209
weighted avg       0.92      0.08      0.01     43209

Test Classification Report:
              precision    recall  f1-score   support

           0       0.08      1.00      0.16       917
           1       1.00      0.00      0.00     

In [67]:
print("\nTraining XGBoost...")
# Fit the pipeline
pipeline_xgb.fit(X_train, y_train)

# Predict on the train and test set
y_train_pred = pipeline_xgb.predict(X_train)
y_test_pred = pipeline_xgb.predict(X_test)

# Generate and print the classification report for both train and test
print("Train Classification Report:")
print(classification_report(y_train, y_train_pred, zero_division=1))
print("Test Classification Report:")
print(classification_report(y_test, y_test_pred, zero_division=1))

# Pickle the results
with open("/content/drive/MyDrive/Colab Notebooks/Data/xgboost_train.pkl", "wb") as f:
    pickle.dump((y_train, y_train_pred), f)
with open("/content/drive/MyDrive/Colab Notebooks/Data/xgboost_test.pkl", "wb") as f:
    pickle.dump((y_test, y_test_pred), f)



Training XGBoost...
Train Classification Report:
              precision    recall  f1-score   support

           0       0.08      1.00      0.16      3667
           1       1.00      0.00      0.00      1447
           2       1.00      0.00      0.00      1690
           3       1.00      0.00      0.00      4122
           4       1.00      0.00      0.00      2238
           5       1.00      0.00      0.00      1562
           6       1.00      0.00      0.00      3943
           7       1.00      0.00      0.00      4185
           8       1.00      0.00      0.00     18057
           9       1.00      0.00      0.00      2298

    accuracy                           0.08     43209
   macro avg       0.91      0.10      0.02     43209
weighted avg       0.92      0.08      0.01     43209

Test Classification Report:
              precision    recall  f1-score   support

           0       0.08      1.00      0.16       917
           1       1.00      0.00      0.00       362


In [None]:
# Define the parameter grid
param_grid = {'C': [0.1, 1, 10, 100], 'penalty': ['l1', 'l2']}

# Create a GridSearchCV object
grid_search = GridSearchCV(lr, param_grid, cv=5)

# Perform grid search
grid_search.fit(X_train, y_train)

# Print the best parameters and the best score
print(grid_search.best_params_)
print(grid_search.best_score_)


In [None]:
# Define the parameter grid
param_grid = {
    'n_components': [5, 10, 15],  # Number of topics
    'learning_method': ['batch', 'online'],  # Learning method
    'doc_topic_prior': [0.1, 0.5, 1.0],  # Alpha parameter
    'topic_word_prior': [0.01, 0.1, 1.0]  # Beta parameter
}

# Create the GridSearchCV object
grid_search = GridSearchCV(lda, param_grid, cv=3)

# Fit the grid search to the data
grid_search.fit(X)

# Print the best parameters and score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

In [None]:

# Define the file paths where you saved your pickle files
logistic_regression_file_path = '/content/drive/MyDrive/Colab Notebooks/Data/logistic_regression.pkl'
random_forest_file_path = '/content/drive/MyDrive/Colab Notebooks/Data/random_forest.pkl'
xgboost_file_path = '/content/drive/MyDrive/Colab Notebooks/Data/xgboost.pkl'