In [1]:
import util, scrape

import warnings
warnings.filterwarnings("ignore")

from tqdm.auto import tqdm
tqdm.pandas()

import collections
import itertools
import re
import pickle
import csv
import multiprocessing
import operator

import numpy as np
import pandas as pd
import seaborn as sns
import nltk
import spacy


In [3]:
# Create trip reports dataframe of all drugs with at least MIN_NUM_TRIP_REPORTS trip reports

MIN_NUM_TRIP_REPORTS = 10

df = pd.read_csv(util.TRIP_REPORTS_FILE)

# drug -> number of trip reports, ordered from the fewest reports to the most
drug_to_trip_reports_count_dict = dict(sorted(collections.Counter(df["drug"]).items(), key=lambda x: x[1]))
drugs_to_ignore = [drug for drug, count in drug_to_trip_reports_count_dict.items() if count < MIN_NUM_TRIP_REPORTS]

# Drop all under-represented drugs with a single boolean mask instead of
# re-filtering the whole dataframe once per ignored drug.
df = df[~df["drug"].isin(drugs_to_ignore)]

# Shuffle the rows and renumber the index 0..n-1.
# NOTE(review): no random_state is passed, so the row order differs on every run.
df = df.sample(frac=1).reset_index(drop=True)

In [4]:
# display the filtered, shuffled trip reports dataframe (the notebook renders a truncated preview)
df

Unnamed: 0,drug,trip_report
0,LSD,During the Summer of 2004 I experienced what I...
1,2CD,This is a report about my first time trying 2C...
2,AMT,"It was the start of Superbowl Weekand, and I w..."
3,Morning_Glory,I’ve always heard how Morning Glory seeds can ...
4,2CE,"Background: I am male, at the time of this exp..."
...,...,...
5592,LSD,To start off I would consider myself a very ex...
5593,Salvia_divinorum,I was home alone in my little apartment in Lon...
5594,2CT21,Setting: Alone on a weekday evening. My comfor...
5595,LSD,It started off as a harmless birthday party. M...


In [5]:
# remove square brackets and their contents (messages left by erowid administrators)
def preprocess(text):
    """Clean the raw text of one trip report.

    Every ``[...]`` span (brackets included) is replaced by a single space,
    and every zero-width space (U+200B) is replaced by a regular space.

    Args:
        text: raw trip report text.

    Returns:
        The cleaned text.
    """
    # Non-greedy so that each bracketed note is removed on its own rather than
    # everything between the first "[" and the last "]". The result is assigned
    # back to `text`; previously it was written to an unused variable (and the
    # pattern matched parentheses), so nothing was actually removed.
    text = re.sub(r"\[.*?\]", " ", text)

    # remove zero-width space
    text = text.replace("\u200b", " ")

    return text

# apply preprocess() to every trip report (progress_apply is the tqdm-enabled
# variant of apply registered by tqdm.pandas())
preprocessed = df["trip_report"].progress_apply(preprocess)


HBox(children=(IntProgress(value=0, max=5597), HTML(value='')))




In [6]:
# run spacy pipeline (spacy is an NLP library) with added custom component

# collapse each multi-word entity into one token rather than one token per word
class EntityRetokenizeComponent:
    """Pipeline component that merges every entity span of a doc into a single token."""

    def __init__(self, pipeline):
        # The pipeline argument is accepted but neither used nor stored.
        pass

    def __call__(self, doc):
        """Merge each span listed in ``doc.ents`` and return the same doc.

        The LEMMA attribute of each merged token is set to the string form of
        the span it was built from.
        """
        with doc.retokenize() as merger:
            for entity in doc.ents:
                entity_span = doc[entity.start:entity.end]
                merger.merge(entity_span, attrs={"LEMMA": str(entity_span)})
        return doc

# create spacy pipeline
spacy_pipeline = spacy.load('en')
retokenizer = EntityRetokenizeComponent(spacy_pipeline) 
# last=True appends the component at the end of the pipeline — presumably so that
# doc.ents has already been populated when the component runs; confirm.
# NOTE(review): the pipe name 'merge_enitities' looks like a typo of
# 'merge_entities'; it is left unchanged because the name is a runtime identifier.
spacy_pipeline.add_pipe(retokenizer, name='merge_enitities', last=True)

# apply: run the full pipeline on every preprocessed report and store the result
# in a new dataframe column
df["trip_report_spacy"] = preprocessed.progress_apply(spacy_pipeline)


HBox(children=(IntProgress(value=0, max=5597), HTML(value='')))




In [23]:
# load custom stop words from file
custom_stop_words = []
with open(util.CUSTOM_STOP_WORDS_FILE) as f:
    custom_stop_words = f.readlines()
# strip surrounding whitespace (including the trailing newline) from each line
custom_stop_words = [w.strip() for w in custom_stop_words]

# augment custom stop words (e.g. add "pre-", "post-", "-esque", "-type" to each word)
# NOTE(review): the list loaded above is not passed to this call and is replaced by
# its return value — presumably scrape.augment_custom_stop_words() loads the stop
# words itself; confirm, otherwise the words read from the file are silently discarded.
custom_stop_words = scrape.augment_custom_stop_words()


In [24]:
# tokenize trip reports using spacy docs

def tokenize(doc):
    """Return the lemmas of the tokens of ``doc.doc`` that pass every filter.

    A token is kept only if it is alphabetic, ASCII, has language ``'en'``,
    is not whitespace / punctuation / a digit, has ``ent_type == 0``, has a
    non-empty lemma, its lowercased text is not flagged ``is_stop`` in the
    pipeline vocabulary, and neither its lowercased text nor its lemma is in
    ``custom_stop_words``.

    Args:
        doc: a processed document as produced by ``spacy_pipeline``.

    Returns:
        A list of lemma strings, in document order.
    """
    # Build a set once per document: membership is tested twice per token, and a
    # set lookup avoids scanning the whole stop word collection each time.
    stop_words = set(custom_stop_words)
    vocab = spacy_pipeline.vocab

    tokens = []
    for w in doc.doc:
        # Cheap attribute checks first; `and` short-circuits, so the vocabulary
        # and stop word lookups only run for tokens that survive them.
        if not (
            w.is_alpha
            and w.lang_ == 'en'
            and w.is_ascii
            and not w.is_space
            and not w.is_punct
            and not w.is_digit
            and w.ent_type == 0
        ):
            continue

        lemma = w.lemma_
        if lemma == "":
            continue

        lowered = w.text.lower()
        if vocab[lowered].is_stop:
            continue
        if lowered in stop_words or lemma in stop_words:
            continue

        tokens.append(str(lemma))
    return tokens

# apply tokenize() to every spacy doc; yields one list of lemmas per trip report
tokenized = df["trip_report_spacy"].progress_apply(tokenize)


HBox(children=(IntProgress(value=0, max=5597), HTML(value='')))




In [25]:
# filter out words that appear less than MIN_WORD_COUNT times in the entire corpus 

MIN_WORD_COUNT = 10

# word -> number of occurrences over all reports, ordered from most to least frequent.
# Tokens are counted straight from a chained iterator, so the whole corpus is never
# copied into one flat list and `vocabulary` is only ever bound to a set.
word_count_dict = dict(collections.Counter(itertools.chain.from_iterable(tokenized)).most_common())

# words frequent enough to keep
vocabulary = {w for w, count in word_count_dict.items() if count >= MIN_WORD_COUNT}

def filter_uncommon_words(tokens):
    """Return, in order, the tokens found in ``vocabulary``, each converted to ``str``."""
    kept = []
    for token in tokens:
        if token in vocabulary:
            kept.append(str(token))
    return kept

# apply: keep only in-vocabulary tokens and store the result as a new dataframe column
df["trip_report_tokenized"] = tokenized.progress_apply(filter_uncommon_words)

HBox(children=(IntProgress(value=0, max=5597), HTML(value='')))




In [26]:
# size of vocabulary (number of distinct words kept after the frequency filter)
print(f"Vocabulary size: {len(vocabulary)}")


Vocobulary size: 9712


In [28]:
# remove spacy column from dataframe so that we can efficiently save it to disk
# (guarded so that re-running this cell does not raise a KeyError)
if "trip_report_spacy" in df.columns:
    del df["trip_report_spacy"]


In [32]:
# save trip reports dataframe 
# (serialized with pickle in binary mode; the file must be read back with pickle as well)
with open(util.TRIP_REPORTS_DATAFRAME_FILE, "wb") as f:
    pickle.dump(df, f)
    