<a href="https://colab.research.google.com/github/jbloewencolon/Psychedelic-Trip-Generator/blob/main/Modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pickle

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

In [2]:
# Read the pre-processed experience reports from the Google Drive folder.
processed_csv_path = '/content/drive/MyDrive/Colab Notebooks/Data/processed.csv'
df = pd.read_csv(processed_csv_path)

# Print column dtypes and non-null counts as a quick sanity check of the load.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67516 entries, 0 to 67515
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   title             67516 non-null  object 
 1   drug              67516 non-null  object 
 2   dosage            67516 non-null  object 
 3   delivery          67516 non-null  object 
 4   weight            67516 non-null  float64
 5   year              67516 non-null  int64  
 6   gender            67516 non-null  object 
 7   report            67516 non-null  object 
 8   processed_report  67516 non-null  object 
 9   mixed             67516 non-null  int64  
 10  drug_category     67516 non-null  object 
 11  language          67516 non-null  object 
dtypes: float64(1), int64(2), object(9)
memory usage: 6.2+ MB


In [3]:
# Show the first five rows so the raw and processed text columns can be eyeballed.
df.head(n=5)

Unnamed: 0,title,drug,dosage,delivery,weight,year,gender,report,processed_report,mixed,drug_category,language
0,The Happiest I Remember Feeling,mdma,1 tablet,oral,140.0,2013,female,"I had taken Ecstasy once before, but it was to...",taken ecstasi low dose experi fulli felt happi...,0,Entactogen/Empathogen,en
1,A Short Trip to Extravagance,cannabis,1 cig.,smoked,175.0,2005,male,It all started at about 6:00AM one morning aft...,start 600am one morn gotten readi work pull fa...,1,Cannabinoid,en
2,The Yins and Yangs of the Speedball,heroin,0.1 g,IV,170.0,2007,male,INTRODUCTION:I am not just writing a ``I got f...,introductioni write got fuck report guid tour ...,1,Opioid,en
3,Well What Do You Know About That?,cannabis,1 bowl,smoked,145.0,2011,male,8:00 eat 160 mg of butylone on a mostly empty ...,800 eat 160 mg butylon mostli empti stomach al...,1,Cannabinoid,en
4,Quick and Satisfying,tea,,oral,130.0,2012,female,Due to the recent lack of availability of mimo...,due recent lack avail mimosa hostili bark jure...,1,Other,en


In [4]:
# Target: the drug category of each report (string labels).
y = df['drug_category']

# Integer-encoded copy of the labels. The splits below keep the original
# string labels; `label_encoder` / `y_encoded` are available for estimators
# that need numeric class ids.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Feature: the pre-processed report text.
X = df['processed_report']

# Keep a stratified 80% of the full data set; the remaining 20% is set aside
# and not used in the splits below.
X_80, X_discard, y_80, y_discard = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Single stratified 80/20 train/test split of the retained data.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_index, test_index = next(sss.split(X_80, y_80))

# `split` yields *positional* indices, while X_80 / y_80 still carry the
# shuffled row labels of `df`. Plain `X_80[train_index]` would look the
# positions up as labels (KeyError or the wrong rows), so select with .iloc.
X_train, X_test = X_80.iloc[train_index], X_80.iloc[test_index]
y_train, y_test = y_80.iloc[train_index], y_80.iloc[test_index]

# Sanity checks: the split is a partition and features/labels stay aligned.
assert len(X_train) + len(X_test) == len(X_80)
assert X_train.index.equals(y_train.index) and X_test.index.equals(y_test.index)

# Number of features for feature selection
k = 1000

In [5]:
def make_text_pipeline(classifier):
    """Wrap ``classifier`` in the text-classification pipeline shared by all models.

    Steps, in order:
      * ``'tfidf'`` - ``TfidfVectorizer`` with default settings,
      * ``'chi2'``  - ``SelectKBest(chi2)`` keeping the top ``k`` features
        (``k`` is read from the notebook namespace when this is called),
      * ``'clf'``   - the estimator passed in.

    New transformer instances are created on every call, so the returned
    pipelines do not share any fitted state.
    """
    return Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('chi2', SelectKBest(chi2, k=k)),
        ('clf', classifier),
    ])


# The estimators are seeded (matching the random_state=42 used for the data
# splits) so that re-running the notebook reproduces the same models.
pipeline_lr = make_text_pipeline(
    LogisticRegression(max_iter=500, C=1.0, solver='saga', n_jobs=-1, random_state=42)
)

pipeline_rf = make_text_pipeline(
    RandomForestClassifier(n_estimators=100, max_depth=None, n_jobs=-1, random_state=42)
)

pipeline_xgb = make_text_pipeline(
    XGBClassifier(n_estimators=100, max_depth=3, n_jobs=-1, eval_metric='mlogloss',
                  use_label_encoder=False, random_state=42)
)

In [6]:
print("\nTraining Logistic Regression...")

# Train the TF-IDF -> chi2 -> logistic-regression pipeline on the training split.
pipeline_lr.fit(X_train, y_train)

# Score both splits so training and held-out performance can be compared.
y_train_pred = pipeline_lr.predict(X_train)
y_test_pred = pipeline_lr.predict(X_test)

# Print one classification report per split (train first, then test).
lr_reports = (
    ("Train Classification Report:", y_train, y_train_pred),
    ("Test Classification Report:", y_test, y_test_pred),
)
for report_heading, true_labels, predicted_labels in lr_reports:
    print(report_heading)
    print(classification_report(true_labels, predicted_labels, zero_division=1))

# Persist the (true labels, predictions) pair of each split to Drive.
lr_outputs = (
    ("/content/drive/MyDrive/Colab Notebooks/Data/logistic_regression_train.pkl", (y_train, y_train_pred)),
    ("/content/drive/MyDrive/Colab Notebooks/Data/logistic_regression_test.pkl", (y_test, y_test_pred)),
)
for pickle_path, label_pair in lr_outputs:
    with open(pickle_path, "wb") as out_file:
        pickle.dump(label_pair, out_file)


In [None]:
print("\nTraining Random Forest...")

# Train the TF-IDF -> chi2 -> random-forest pipeline on the training split.
pipeline_rf.fit(X_train, y_train)

# Score both splits so training and held-out performance can be compared.
y_train_pred = pipeline_rf.predict(X_train)
y_test_pred = pipeline_rf.predict(X_test)

# Print one classification report per split (train first, then test).
rf_reports = (
    ("Train Classification Report:", y_train, y_train_pred),
    ("Test Classification Report:", y_test, y_test_pred),
)
for report_heading, true_labels, predicted_labels in rf_reports:
    print(report_heading)
    print(classification_report(true_labels, predicted_labels, zero_division=1))

# Persist the (true labels, predictions) pair of each split to Drive.
rf_outputs = (
    ("/content/drive/MyDrive/Colab Notebooks/Data/random_forest_train.pkl", (y_train, y_train_pred)),
    ("/content/drive/MyDrive/Colab Notebooks/Data/random_forest_test.pkl", (y_test, y_test_pred)),
)
for pickle_path, label_pair in rf_outputs:
    with open(pickle_path, "wb") as out_file:
        pickle.dump(label_pair, out_file)


In [None]:
print("\nTraining XGBoost...")

# XGBClassifier expects integer class ids 0..n_classes-1 rather than string
# labels, so encode the training labels for the fit. The encoder is fitted on
# y_train itself so the ids are contiguous for exactly the classes seen in training.
xgb_label_encoder = LabelEncoder().fit(y_train)

# Fit the pipeline
pipeline_xgb.fit(X_train, xgb_label_encoder.transform(y_train))

# Predict on the train and test set, decoding the integer predictions back to
# category names so they are directly comparable with y_train / y_test.
y_train_pred = xgb_label_encoder.inverse_transform(pipeline_xgb.predict(X_train))
y_test_pred = xgb_label_encoder.inverse_transform(pipeline_xgb.predict(X_test))

# Generate and print the classification report for both train and test
print("Train Classification Report:")
print(classification_report(y_train, y_train_pred, zero_division=1))
print("Test Classification Report:")
print(classification_report(y_test, y_test_pred, zero_division=1))

# Pickle the results as (true labels, predicted labels) pairs
with open("/content/drive/MyDrive/Colab Notebooks/Data/xgboost_train.pkl", "wb") as f:
    pickle.dump((y_train, y_train_pred), f)
with open("/content/drive/MyDrive/Colab Notebooks/Data/xgboost_test.pkl", "wb") as f:
    pickle.dump((y_test, y_test_pred), f)


In [None]:
# Hyper-parameter grid for the logistic-regression step. X_train is raw text,
# so the search runs over the whole `pipeline_lr` (TF-IDF -> chi2 -> clf)
# rather than a bare estimator; the `clf__` prefix routes each parameter to
# the pipeline's 'clf' step. The pipeline's 'saga' solver supports both the
# 'l1' and 'l2' penalties listed here.
param_grid = {'clf__C': [0.1, 1, 10, 100], 'clf__penalty': ['l1', 'l2']}

# Create a GridSearchCV object (5-fold CV; the pipeline is cloned per fit, so
# the vectoriser and feature selector are re-fitted inside each fold).
grid_search = GridSearchCV(pipeline_lr, param_grid, cv=5)

# Perform grid search
grid_search.fit(X_train, y_train)

# Print the best parameters and the best cross-validated score
print(grid_search.best_params_)
print(grid_search.best_score_)


In [None]:
# Topic model: `X` holds raw report text, so it is turned into token counts
# before LatentDirichletAllocation sees it. Wrapping both steps in a pipeline
# lets GridSearchCV re-fit the vocabulary inside each fold.
# NOTE(review): the original cell searched over an undefined `lda` object;
# this count-vectoriser + LDA pipeline is an assumed reconstruction - confirm
# it matches the intended setup.
lda_pipeline = Pipeline([
    ('counts', CountVectorizer()),
    ('lda', LatentDirichletAllocation(random_state=42)),
])

# Define the parameter grid (the `lda__` prefix targets the pipeline's 'lda' step)
param_grid = {
    'lda__n_components': [5, 10, 15],  # Number of topics
    'lda__learning_method': ['batch', 'online'],  # Learning method
    'lda__doc_topic_prior': [0.1, 0.5, 1.0],  # Alpha parameter
    'lda__topic_word_prior': [0.01, 0.1, 1.0]  # Beta parameter
}

# Create the GridSearchCV object. No `scoring` is given, so each candidate is
# ranked by the estimator's own `score` method.
grid_search = GridSearchCV(lda_pipeline, param_grid, cv=3)

# Fit the grid search to the data (unsupervised: no labels are passed)
grid_search.fit(X)

# Print the best parameters and score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

In [None]:

# Google Drive locations of the per-model pickle files.
# NOTE(review): the training cells in this notebook write "<model>_train.pkl"
# and "<model>_test.pkl"; none of them writes the un-suffixed file names used
# here - confirm these files exist before loading them.
(
    logistic_regression_file_path,
    random_forest_file_path,
    xgboost_file_path,
) = (
    '/content/drive/MyDrive/Colab Notebooks/Data/logistic_regression.pkl',
    '/content/drive/MyDrive/Colab Notebooks/Data/random_forest.pkl',
    '/content/drive/MyDrive/Colab Notebooks/Data/xgboost.pkl',
)