In [1]:
import glob
import io
import math
import os
import re
import urllib.request
from collections import Counter, defaultdict

import nltk
import numpy as np
import pandas as pd
import requests
import urllib3
from bs4 import BeautifulSoup as bs

In [None]:
# Parent url of the Erowid experience pages; the relative experience links
# collected further down are appended to it to form absolute urls.
erowid_url = "https://erowid.org/experiences/"


In [None]:
#helper functions:

def read_links(start_page):
    """
    Fetch a page and return the target of every anchor on it.

    Parameters
    ----------
    start_page : str
        Url of the page to download.

    Returns
    -------
    list of str
        The raw ``href`` value of each ``<a>`` tag that has one, in document
        order. Relative links are returned as found (they are not resolved).

    Raises
    ------
    urllib.error.URLError
        If the page cannot be fetched.
    """
    # the context manager closes the HTTP response even if reading/parsing fails
    with urllib.request.urlopen(start_page) as response:
        # convert to soup in order to strain out the anchors and the urls
        soup = bs(response.read(), "lxml")
    # these are the links to the drug experiences (plus any other link on the page)
    return [a['href'] for a in soup.findAll('a') if a.has_attr('href')]

def convert_local_to_absolute_url(parent_url, url_list):
    """
    Turn relative urls into absolute ones by prefixing the parent url.

    parent_url -- string placed in front of every entry
    url_list   -- iterable of relative urls

    Returns a new list with one absolute url per input entry, in input order.
    """
    # plain string concatenation: no separator is inserted between the parts
    return [parent_url + relative for relative in url_list]

In [None]:
# The query string asks the listing page for every experience at once
# (Start=0&Max=<count>); Max is hard-coded to the number of experiences
# available when this was written.
# LATER -> read the Max parameter from the web page and append it to the query
# so that all experiences are always retrieved.
all_experience = "https://erowid.org/experiences/exp.cgi?ShowViews=0&Cellar=0&Start=0&Max=28678"

# Regular expression for relative experience urls, e.g. exp.php?ID=107831.
# The prefix is matched literally ("." and "?" are escaped because they are
# regex metacharacters) and the id may have any number of digits, so short ids
# such as exp.php?ID=894 are matched as well.
experience_regex = r'exp\.php\?ID=[0-9]+'
  
def get_experiences(url, regex):
    """
    Collect the links on a page whose beginning matches a regular expression.

    url   -- page whose anchors are read (via read_links)
    regex -- pattern applied with re.match, i.e. anchored at the start of
             each link

    Returns the matching links as a list, in the order they appear on the page.
    """
    return [link for link in read_links(url) if re.match(regex, link)]

In [None]:
# Relative urls of every drug experience linked from the listing page
# (links are kept exactly as they appear in the page's href attributes).
relative_urls =  get_experiences(all_experience, experience_regex)

In [None]:
# absolute url of each experience: parent url + relative link
absolute_urls = convert_local_to_absolute_url(erowid_url, relative_urls)

In [None]:
# sanity check: number of experience urls collected
len(absolute_urls)

In [None]:
# accumulator for the parsed trip reports of the first batch
trips = []

In [None]:
# Scrape the first 100 experiences (list indices 0-99).
# NOTE: parse_trip_report is defined in a later cell, which has to be run first.
# parse_trip_report returns None when a page cannot be opened or the final url
# differs from the requested one, so `trips` may contain None entries.
for i, url in enumerate(absolute_urls[0:100], start=1):
    trips.append(parse_trip_report(url))
    print("trip %s scraped" % i)

In [None]:
# Save the list of scraped trip reports into a pickle file.
import pickle

# the with-block guarantees the file is flushed and closed after writing
with open("trips0_100.p", "wb") as fh:
    pickle.dump(trips, fh)

In [None]:
# start a fresh accumulator for the next batch of trip reports
trips = []


In [None]:
# Scrape experiences 101-1000 (counted from 1), i.e. list indices 100-999.
# The slice starts at 100 because the first batch, absolute_urls[0:100], ends
# at index 99; starting at 101 would silently skip one experience.
for i, url in enumerate(absolute_urls[100:1000], start=1):
    trips.append(parse_trip_report(url))

    # report progress for every 10th report only
    if i % 10 == 0:
        print("trip %s scraped" % i)

# the with-block guarantees the file is flushed and closed after writing
with open("trips101_1000.p", "wb") as fh:
    pickle.dump(trips, fh)

In [None]:
import sys
# Raise the interpreter's maximum recursion depth to 30000.
# NOTE(review): presumably needed so that pickle can serialise the deeply
# nested BeautifulSoup objects stored in `trips` — confirm.
sys.setrecursionlimit(30000)

In [None]:
# write the second batch to disk; the with-block closes the file afterwards
with open("trips101_1000.p", "wb") as fh:
    pickle.dump(trips, fh)

In [None]:
# Load a previously saved batch of trip reports.
# NOTE: only unpickle files you created yourself — pickle.load can execute
# arbitrary code contained in the file.
# NOTE(review): no visible cell writes "trips1001_2000.p" — confirm the file exists.
with open("trips1001_2000.p", "rb") as fh:
    trips = pickle.load(fh)

In [None]:
def parse_trip_report(url):
    """
    Download one experience page and extract its fields.

    Parameters
    ----------
    url : str
        Absolute url of the experience report.

    Returns
    -------
    list or None
        ``[title, author, url, dosechart, bodyweight, footdata, body_text]``
        where the three tables are whatever ``parse_table`` returns for the
        'dosechart', 'bodyweight' and 'footdata' tables, and ``body_text`` is
        the BeautifulSoup element with class 'report-text-surround'.
        ``None`` if the request fails or the final url differs from `url`.

    Raises
    ------
    IndexError, AttributeError
        If the page lacks the title, author or report-text element.
    """
    try:
        # a timeout keeps one unresponsive page from stalling a whole scraping run
        res = requests.get(url, timeout=30)
    except requests.RequestException:
        # only network/HTTP failures are swallowed; programming errors propagate
        print("Couldn't open url")
        return None

    # a different final url means we did not land on the requested report
    # (e.g. after a redirect), so there is nothing to parse
    if res.url != url:
        return None

    soup = bs(res.content, 'lxml')
    title = soup.findAll('div', {'class': 'title'})[0].contents[0]
    dosechart = soup.findAll('table', {'class': 'dosechart'})
    bodyweight = soup.findAll('table', {'class': 'bodyweight'})
    footdata = soup.findAll('table', {'class': 'footdata'})
    author = soup.findAll('div', {'class': 'author'})[0].a.text
    body_text = soup.findAll('div', {'class': 'report-text-surround'})[0]

    # ttype codes understood by parse_table: 0 dosechart, 1 bodyweight, 2 footdata
    dosechart_dt = parse_table(table=dosechart, ttype=0)
    bodyweight_dt = parse_table(table=bodyweight, ttype=1)
    footdata_dt = parse_table(table=footdata, ttype=2)

    return [title, author, url, dosechart_dt, bodyweight_dt, footdata_dt, body_text]


In [None]:
def parse_table(table, ttype):
    """
    Route a scraped table to the parser that matches its type code.

    table -- the table object handed through unchanged to the parser
    ttype -- 0 for the dose chart, 1 for the body weight table,
             2 for the foot data table

    Returns the selected parser's result, or None for any other ttype.
    """
    if ttype == 0:  # dosechart
        return parse_dosechart(table)
    elif ttype == 1:  # bodyweight
        return parse_bodyweight(table)
    elif ttype == 2:  # footdata
        return parse_footdata(table)
    return None
        

In [None]:
def parse_dosechart(table):
    """
    Convert the scraped dose-chart table into a five-column DataFrame.

    Parameters
    ----------
    table : sequence
        The 'dosechart' table elements found on the page; may be empty.

    Returns
    -------
    pandas.DataFrame
        Columns ``Time, Dose, Method, Substance, Form``. When `table` is empty
        the frame holds a single all-NaN row.
    """
    columns = ['Time', 'Dose', 'Method', 'Substance', 'Form']
    if not table:
        return pd.DataFrame(np.nan, index=[0], columns=columns)

    # read_html wants a file-like object; passing literal HTML is deprecated
    chart = pd.read_html(io.StringIO(str(table)))[0]

    # Keep at most the first five columns and pad missing ones with NaN, so a
    # chart of any width comes back in the same layout instead of as None.
    chart = chart.iloc[:, :len(columns)].copy()
    chart.columns = columns[:chart.shape[1]]
    return chart.reindex(columns=columns)

In [None]:
def parse_bodyweight(table):
    """
    Convert the scraped body-weight table into a one-column DataFrame.

    Parameters
    ----------
    table : sequence
        The 'bodyweight' table elements found on the page; may be empty.

    Returns
    -------
    pandas.DataFrame
        A single column named ``bodyweight``. When `table` is empty the frame
        holds one NaN row.
    """
    columns = ['bodyweight']
    if not table:
        return pd.DataFrame(np.nan, index=[0], columns=columns)

    # read_html wants a file-like object; passing literal HTML is deprecated
    weight = pd.read_html(io.StringIO(str(table)))[0]
    # Drop the first column (label 0). The axis has to be given by keyword:
    # positional `drop(0, 1)` raises TypeError on pandas >= 2.0.
    weight = weight.drop(columns=0)
    weight.columns = columns
    return weight
    

In [None]:
def _text_after_colon(cell):
    """Return the stripped text after the first ':' of a 'Label: value' cell, or NaN if there is none."""
    if not isinstance(cell, str) or ":" not in cell:
        return np.nan
    # maxsplit=1 keeps values that themselves contain a colon intact
    return cell.split(":", 1)[1].strip()


def parse_footdata(table):
    """
    Convert the scraped foot-data table into a one-row DataFrame.

    Parameters
    ----------
    table : sequence
        The 'footdata' table elements found on the page; may be empty.

    Returns
    -------
    pandas.DataFrame
        One row with the columns ``exp_date, gender, age, publish_date, tags,
        exp_id, views``. All values are NaN when `table` is empty; a single
        value is NaN when its cell has no 'Label: value' text.
    """
    df = pd.DataFrame(np.nan, index=[0], columns=['exp_date', 'gender', 'age', 'publish_date', 'tags', 'exp_id', 'views'])
    if not table:
        return df

    # read_html wants a file-like object; passing literal HTML is deprecated
    footdata = pd.read_html(io.StringIO(str(table)))[0]
    # cells are addressed as footdata[column][row]
    df['exp_date'] = _text_after_colon(footdata[0][0])  # experience date
    df['gender'] = _text_after_colon(footdata[0][1])  # gender
    df['age'] = _text_after_colon(footdata[0][2])  # age
    df['publish_date'] = _text_after_colon(footdata[0][3])  # publish date
    df['tags'] = footdata[0][5]  # tags (cell stored as-is)
    df['exp_id'] = _text_after_colon(footdata[1][0])  # exp id
    df['views'] = _text_after_colon(footdata[1][3])  # views
    return df

In [None]:
# Exploratory check: fetch a single experience page and pull out its title.
url = 'https://erowid.org/experiences/exp.php?ID=46265'
res = requests.get(url)
soup = bs(res.content,'lxml')
# first child of the first <div class="title"> element on the page
title = soup.findAll('div', {'class' : 'title'})[0].contents[0]


In [None]:
# display the title extracted in the previous cell
title

In [None]:
# Try the parser on one report; the commented-out lines are alternative test pages.
#trip_report = parse_trip_report("https://erowid.org/experiences/exp.php?ID=894")
#trip_report = parse_trip_report("https://erowid.org/experiences/exp.php?ID=107477")
#trip_report = parse_trip_report("https://erowid.org/experiences/exp.php?ID=46265")
trip_report = parse_trip_report('https://erowid.org/experiences/exp.php?ID=60495')

In [None]:
# display the parsed report (a list, or None if the page could not be parsed)
trip_report

In [None]:
# NOTE(review): no top-level `df` is assigned in the visible cells (only a
# local variable inside parse_footdata) — this presumably raises NameError on
# a fresh kernel; confirm.
df

In [None]:
#df = pd.DataFrame(data, columns=['Time', 'Dose', 'Method', 'Substance', 'Form'])
#df = pd.DataFrame(np.nan, index=[0], columns=['Time', 'Dose', 'Method', 'Substance', 'Form'])

In [None]:
# Run the extraction on the first url only.
# NOTE(review): retrieve_drug_and_gender is not defined in the visible part of
# this notebook — confirm where it comes from.
data_set = retrieve_drug_and_gender(absolute_urls[0:1])

In [None]:
# Exploratory cell: download one experience page and grab its dose-chart
# table(s). The commented-out lines are alternative pages / page elements.
# Note that this rebinds the names `res`, `soup` and `table`.
import pandas as pd
import requests
from bs4 import BeautifulSoup
 
#res = requests.get("https://erowid.org/experiences/exp.php?ID=111195")
res = requests.get("https://erowid.org/experiences/exp.php?ID=16151")
#res = requests.get("https://erowid.org/experiences/exp.php?ID=894")
soup = BeautifulSoup(res.content,'lxml')
#table = soup.find_all('table')
# every <table class="dosechart"> element on the page
table = soup.findAll('table', {'class' : 'dosechart'})
#table = soup.findAll('table', {'class' : 'footdata'})
#table = soup.findAll('table', {'class' : 'bodyweight'})
#trip_report = soup.findAll('div', {'class' : 'report-text-surround'})[0]
#author = soup.findAll('div', {'class' : 'author'})

 

In [None]:
# Parse the scraped table element(s) into a DataFrame (first table found).
# read_html wants a file-like object; passing literal HTML is deprecated.
# Note: this rebinds `table`, so the cell cannot be run twice in a row.
table = pd.read_html(io.StringIO(str(table)))[0]

In [None]:
# Drop the column labelled 5. The axis has to be given by keyword:
# positional `drop(5, 1)` raises TypeError on pandas >= 2.0.
table = table.drop(columns=5)

In [None]:
# name the five remaining columns (fails unless the frame has exactly five)
table.columns = ['Time', 'Dose', 'Method', 'Substance', 'Form']

In [None]:
# number of columns in the dose-chart frame
len(table.columns)

In [None]:
s = "123123STRINGabcabc"

def find_between( s, first, last ):
    """
    Return the text between the first occurrence of `first` and the next
    occurrence of `last` after it; "" when either marker is missing.
    """
    try:
        lo = s.index( first ) + len( first )
        hi = s.index( last, lo )
    except ValueError:
        # one of the markers does not occur
        return ""
    else:
        return s[lo:hi]

def find_between_r( s, first, last ):
    """
    Return the text between the last occurrence of `first` and the last
    occurrence of `last` after it; "" when either marker is missing.
    """
    try:
        lo = s.rindex( first ) + len( first )
        hi = s.rindex( last, lo )
    except ValueError:
        # one of the markers does not occur
        return ""
    else:
        return s[lo:hi]


#print(find_between( s, "123", "abc" ))
# Print the part of the report between the body markers.
# find_between_r works on strings, while parse_trip_report returns a list (or
# None), so the report is converted to text first.
# NOTE(review): assumes the markers survive in the text form of the parsed page — confirm.
print(find_between_r(str(trip_report), "<!-- Start Body -->", "<!-- End Body -->"))

In [None]:
# NOTE(review): when the notebook is run top to bottom, `table` is already a
# DataFrame here (rebound by the earlier read_html cell), so str(table) is
# presumably no longer HTML — confirm that this cell is still needed.
pd.read_html(str(table))[0]

In [None]:
# Check Dose Chart
# Clean Dose Chart
# Check Body Weight
# Clean Body Weight
# Check Experience Table
# Clean Experience Table

In [None]:
# peek at the first absolute experience url
absolute_urls[0]

In [None]:
# A list of the drugs and genders for every experience url; count_gender
# below reads the drugs from item[0] and the gender strings from item[1].
# NOTE(review): retrieve_drug_and_gender is not defined in the visible part of
# this notebook — confirm where it comes from.
data_set = retrieve_drug_and_gender(absolute_urls)

In [None]:
def count_gender(drug_gender_list):
    """
    Count, per drug, how many reports come from female and from male authors.

    drug_gender_list -- output of retrieve_drug_and_gender: a sequence of
                        items where item[0] holds the drug names of one report
                        and item[1] holds its gender strings

    Returns a tuple (female_dict, male_dict); each dict maps a drug name to
    the number of matching reports. A drug listed several times in one report
    is counted once for that report.
    """
    female_dict = {}
    male_dict = {}
    # substring looked for in the gender text -> tally it feeds
    tallies = (("Male", male_dict), ("Female", female_dict))

    for entry in drug_gender_list:
        for gender in entry[1]:
            for label, tally in tallies:
                if label not in gender:
                    continue
                # set() removes repeated drug names within a single report
                for drug in set(entry[0]):
                    tally[drug] = tally.get(drug, 0) + 1
    return female_dict, male_dict

# count_gender returns a tuple of two dicts: the first holds the per-drug
# count of female users, the second the per-drug count of male users
complete_data_dict = count_gender(data_set)

#print complete_data_dict

In [None]:
#display the results as bar plots

%matplotlib inline
import matplotlib.pyplot as plt

#get the output dict from the counted gender
dicts = count_gender(data_set)

#the first dict is female drug users
female_drugs = dicts[0]
#the second dict is male drug users
male_drugs = dicts[1]

#get the top 5 used drugs from male and female users
top_female_drugs = Counter(female_drugs).most_common(5)
top_male_drugs = Counter(male_drugs).most_common(5)

#display the results as a bar chart
#female drug users plotted onto a graph
plt.bar(range(len(top_female_drugs)), [x[1] for x in top_female_drugs], align="center")
plt.xticks(range(len(top_female_drugs)), [x[0] for x in top_female_drugs])
#label the axis
plt.ylabel('amount of users')
plt.title('Female drug users')

plt.show()

#male drug users plotted onto a drug
plt.bar(range(len(top_male_drugs)), [y[1] for y in top_male_drugs], align="center", rotation='vertical')
plt.xticks(range(len(top_male_drugs)), [y[0] for y in top_male_drugs])
#label the axis 
plt.ylabel('amount of users')
plt.title('Male drug users')

plt.show()

In [None]:
# number of experience urls collected (same check as earlier in the notebook)
len(absolute_urls)

In [None]:
### Collecting Chemical information
#content-body-frame > div.content-section > div.summary-card > div.summary-card-text-surround > div.sum-effects
start_page = 'https://erowid.org/chemicals/5meo_dmt/'
# the context manager closes the HTTP response once the page has been read
with urllib.request.urlopen(start_page) as response:
    #convert to soup in order to strain out the anchors and the urls
    soup = bs(response.read(), "lxml")

In [None]:
def get_drug_links(base_url):
    """
    Fetch an index page and return the target of every anchor on it.

    Parameters
    ----------
    base_url : str
        Url of the page to download.

    Returns
    -------
    list of str
        The raw ``href`` value of each ``<a>`` tag that has one, in document
        order (the same result read_links gives for the same url).

    Raises
    ------
    urllib.error.URLError
        If the page cannot be fetched.
    """
    # the context manager closes the HTTP response even if reading/parsing fails
    with urllib.request.urlopen(base_url) as response:
        soup = bs(response.read(), 'lxml')
    return [a['href'] for a in soup.findAll('a') if a.has_attr('href')]

In [None]:
def get_drug_effects(url):
    """
    Fetch a substance page and return the text of its effects summary.

    Parameters
    ----------
    url : str
        Url of the substance page.

    Returns
    -------
    str
        Text of the first ``<div class="sum-effects">`` element on the page.

    Raises
    ------
    urllib.error.URLError
        If the page cannot be fetched.
    IndexError
        If the page has no element with class 'sum-effects'.
    """
    # the context manager closes the HTTP response even if reading/parsing fails
    with urllib.request.urlopen(url) as response:
        soup = bs(response.read(), 'lxml')
    effects_classification = soup.findAll('div', {'class' : 'sum-effects'})[0].text
    return effects_classification
    

In [None]:
# the pyOpenSSL distribution is imported under the module name `OpenSSL`
import OpenSSL

In [12]:
base_url = 'http://psychonautwiki.org/wiki/Bufotenin'
http = urllib3.PoolManager()
# With the default preload_content=True the body is read eagerly into
# response.data and the connection goes back to the pool automatically, so no
# explicit release is needed.
response = http.request('GET', base_url)
soup = bs(response.data, 'lxml')




In [13]:
# every <table id="InfoTable"> element of the page currently held in `soup`
table = soup.findAll('table', {'id' : 'InfoTable'})

In [17]:
# Show the info table as a DataFrame (first table found).
# read_html wants a file-like object; passing literal HTML is deprecated.
pd.read_html(io.StringIO(str(table)))[0]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,Bufotenin,,,,,,,,,,...,,,,,,,,,,
1,Chemical Nomenclature,,,,,,,,,,...,,,,,,,,,,
2,Common names,"Bufotenin, 5-HO-DMT",,,,,,,,,...,,,,,,,,,,
3,Substitutive name,"N,N-dimethylserotonin",,,,,,,,,...,,,,,,,,,,
4,Systematic name,3-[2-(Dimethylamino)ethyl]-1H-indol-5-ol,,,,,,,,,...,,,,,,,,,,
5,Class Membership,,,,,,,,,,...,,,,,,,,,,
6,Psychoactive class,Psychedelic,,,,,,,,,...,,,,,,,,,,
7,Chemical class,Tryptamine,,,,,,,,,...,,,,,,,,,,
8,Routes of Administration,,,,,,,,,,...,,,,,,,,,,
9,WARNING: Always start with lower doses due to ...,WARNING: Always start with lower doses due to ...,Smoked,Dosage,Threshold,2 - 5 mg,Light,5 - 20 mg,Common,20 - 40 mg,...,15 - 90 minutes,Onset,15 - 60 seconds,Peak,1 - 5 minutes,Offset,5 - 10 minutes,After effects,10 - 60 minutes,DISCLAIMER: PW's dosage information is gathere...


In [3]:
import ssl
# print the version of the OpenSSL library the Python ssl module is using
print(ssl.OPENSSL_VERSION)

OpenSSL 0.9.8zh 14 Jan 2016


In [4]:
import urllib3

In [8]:
# request the page without pre-loading the body; it is read in a later cell
r = http.request('GET', base_url, preload_content=False)



In [10]:
soup = bs(r.read(), 'lxml')
# the request was made with preload_content=False, which leaves the connection
# checked out; hand it back to the pool now that the body has been read
r.release_conn()

In [11]:
# display the parsed page (prints the complete HTML document)
soup

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en-GB">
<head>
<meta charset="utf-8"/>
<title>Psychoactive substance index - PsychonautWiki</title>
<style>.property-page-results tr.value-row:nth-child(even){}.smwtable-striped tbody > tr:nth-child(even){background-color:#f5f5f5}.smwtable-striped tbody > tr:nth-child(odd){background-color:#fff}@-webkit-keyframes rotation{from{-webkit-transform:rotate(0deg)}to{-webkit-transform:rotate(359deg)}}@-moz-keyframes rotation{from{-moz-transform:rotate(0deg)}to{-moz-transform:rotate(359deg)}}@-o-keyframes rotation{from{-o-transform:rotate(0deg)}to{-o-transform:rotate(359deg)}}@keyframes rotation{from{transform:rotate(0deg)}to{transform:rotate(359deg)}}.qtip:not(.ie9haxors) div.qtip-content,.qtip:not(.ie9haxors) div.qtip-titlebar{filter:none;-ms-filter:none}.uls-trigger{background:transparent no-repeat scroll left center;background-image:url(https://psychonautwiki.global.ssl.fastly.net/w/extensions/UniversalLanguageSelector/lib/jquery.ul

In [None]:
# Text of the first <div class="sum-effects"> element in `soup`.
# NOTE(review): raises IndexError when the page currently held in `soup` has
# no such element — confirm which page this cell is meant to run against.
soup.findAll('div', {'class' : 'sum-effects'})[0].text

In [None]:
# every href found on the Erowid chemicals index page
chemicals = get_drug_links("https://erowid.org/chemicals/")

In [None]:
# every href found on the Erowid plants index page
plants = get_drug_links("https://erowid.org/plants/")

In [None]:
# every href found on the Erowid pharms index page
pharms = get_drug_links("https://erowid.org/pharms/")

In [None]:
# every href found on the Erowid herbs index page
herbs = get_drug_links("https://erowid.org/herbs/")

In [None]:
# every href found on the Erowid smarts index page
smarts = get_drug_links("https://erowid.org/smarts/")

In [None]:
# every href found on the Erowid animals index page
animals = get_drug_links("https://erowid.org/animals/")

In [None]:
# inspect links 12-16 of the animals page
# NOTE(review): hard-coded bounds, presumably chosen by hand to skip non-substance links — confirm
animals[12:17]

In [None]:
# inspect links 12-37 of the smarts page
# NOTE(review): hard-coded bounds, presumably chosen by hand to skip non-substance links — confirm
smarts[12:38]

In [None]:
# inspect links 12-61 of the herbs page
# NOTE(review): hard-coded bounds, presumably chosen by hand to skip non-substance links — confirm
herbs[12:62]

In [None]:
# inspect links 14-89 of the pharms page
# NOTE(review): hard-coded bounds, presumably chosen by hand to skip non-substance links — confirm
pharms[14:90]

In [None]:
# inspect links 12-84 of the plants page
# NOTE(review): hard-coded bounds, presumably chosen by hand to skip non-substance links — confirm
plants[12:85]

In [None]:
# inspect the link at index 11 of the chemicals page
chemicals[11]