## Extracting nutritional supplements and respective effects information from examine.com

[Examine.com](http://www.examine.com/supplements) offers a curated perspective on most supplements by reviewing most of the scientific information available. This is highly valuable information, not only from a consumer's point of view but also from a scientific one. My objective in extracting each supplement and its respective effects is purely for research purposes. The final dataset can be downloaded [here](https://drive.google.com/file/d/0B-yLJQAPPRwiYy1EelRUSm9MZ00/view?usp=sharing).

In [12]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import re
import time
import urllib
import urllib.error
import urllib.request

import pandas as pd
from bs4 import BeautifulSoup

In [5]:
def soupLinkfinder(url, string):
    """Return every href found at *url* that contains *string*.

    Parameters
    ----------
    url : str
        Either an ``http://`` / ``https://`` address, which is downloaded,
        or a path to a local HTML file, which is read from disk.
    string : str
        Substring an anchor's ``href`` must contain to be returned.

    Returns
    -------
    list of str
        The matching ``href`` values, in the order ``find_all`` yields
        them. Duplicates are kept.
    """
    # Anchor the scheme test at the start of the string so that a local
    # path which merely contains "http://" is still opened as a file.
    if re.match(r"https?://", url):
        # Read inside ``with`` so the connection is closed once parsed.
        with urllib.request.urlopen(url) as response:
            html = response.read()
    else:
        with open(url, 'r') as page:
            html = page.read()

    soup = BeautifulSoup(html, "lxml")

    return [a['href']
            for a in soup.find_all('a', href=True)
            if string in a['href']]

In [11]:
# '/supplements.html' has no http(s) prefix, so soupLinkfinder reads it as a
# local file. Each matching link is reduced to its second-to-last
# '/'-separated component, which is the supplement's slug.
supplist = [
    href.split('/')[-2]
    for href in soupLinkfinder('/supplements.html', '/supplements/')
]
supplist[:15]

['1%2c3-dimethylamylamine',
 '2%2c4-dinitrophenol',
 '5-htp',
 '7%2c8-dihydroxyflavone',
 '7-keto-dhea',
 'acorus-calamus',
 'adrafinil',
 'aframomum-melegueta',
 'agmatine',
 'alanine',
 'alanylglutamine',
 'alcohol',
 'aloe-vera',
 'alpha-gpc',
 'alpha-lipoic-acid']

In [13]:
# '/topics.html' has no http(s) prefix, so soupLinkfinder reads it as a
# local file. Each matching link is reduced to its second-to-last
# '/'-separated component, which is the topic's slug.
topicslist = [
    href.split('/')[-2]
    for href in soupLinkfinder('/topics.html', '/topics/')
]
topicslist[:15]

['adhd-in-children',
 'adiponectin',
 'adrenaline',
 'aerobic-exercise',
 'aggression',
 'alertness',
 'allergies',
 'ammonia',
 'anaerobic-running-capacity',
 'anti-oxidant-enzyme-profile',
 'anxiety',
 'apolipoprotein-a',
 'apolipoprotein-b',
 'appetite',
 'asthma']

In [21]:
# Empty score matrix: one row per effect (topic) and one column per
# supplement. No data is supplied, so every cell starts out missing.
df = pd.DataFrame(index=topicslist, columns=supplist)

# Peek at five random rows to check the layout.
df.sample(n=5)

Unnamed: 0,1%2c3-dimethylamylamine,2%2c4-dinitrophenol,5-htp,7%2c8-dihydroxyflavone,7-keto-dhea,acorus-calamus,adrafinil,aframomum-melegueta,agmatine,alanine,...,coffee,eca,egg-%28chicken%29,energy-drinks,olive-oil,pomegranate,safflower-oil,tea-%28camellia-sinensis%29,wine,zma
treatment-of-hepatic-encephalopathy,,,,,,,,,,,...,,,,,,,,,,
power-output,,,,,,,,,,,...,,,,,,,,,,
protection-from-smoking,,,,,,,,,,,...,,,,,,,,,,
glycemic-control,,,,,,,,,,,...,,,,,,,,,,
length-of-sickness,,,,,,,,,,,...,,,,,,,,,,


In [None]:
# Substring of a score icon's markup -> signed strength of the effect.
_ICON_SCORES = {
    "down-3": -3,
    "down-2": -2,
    "down-1": -1,
    "up-1": 1,
    "up-2": 2,
    "up-3": 3,
}


def _icon_score(img):
    """Return the score encoded in an <img> tag, or None if unrecognised."""
    markup = str(img)
    for fragment, value in _ICON_SCORES.items():
        if fragment in markup:
            return value
    return None


def _effect_scores(soup):
    """Return (topic slug, score) pairs parsed from a page's effect matrix.

    An empty list is returned when the page has no
    ``<article id="effect-matrix">`` element. A score of None means the
    icon did not match any known arrow.
    """
    effectmat = soup.find("article", {'id': "effect-matrix"})
    if effectmat is None:
        return []

    # Take the slug straight from the href: the last non-empty path
    # component of a '/topics/<slug>/' link. Tokenising the tag markup
    # would cut hyphenated slugs short.
    topics = [a['href'].rstrip('/').rsplit('/', 1)[-1]
              for a in effectmat.find_all('a', href=True)
              if '/topics/' in a['href']]

    icons = [img for img in effectmat.find_all("img") if ".svg" in str(img)]

    # Topics and icons are paired purely by position, so a count mismatch
    # means the pairing may be unreliable; report it instead of hiding it.
    if len(topics) != len(icons):
        print("Warning: {} topic links but {} score icons; pairing by "
              "position".format(len(topics), len(icons)))

    return [(topic, _icon_score(icon)) for topic, icon in zip(topics, icons)]


# NOTE: supplist is never modified while it is being iterated; links found
# on a supplement page are not followed.
for suppl in supplist:

    # Fetch and parse the effect matrix of this supplement.
    url = 'https://examine.com/supplements/' + suppl + '#effect-matrix'
    try:
        with urllib.request.urlopen(url) as response:
            html = response.read()
    except urllib.error.URLError as err:
        # One unreachable page should not abort the whole scrape.
        print("Skipping {}: {}".format(suppl, err))
    else:
        soup = BeautifulSoup(html, "lxml")
        for topic, score in _effect_scores(soup):
            # Unrecognised icons and topics without a row are left empty.
            if score is not None and topic in df.index:
                # Single .loc write; chained df[col][row] assignment may
                # update a temporary copy instead of df.
                df.loc[topic, suppl] = score

    # Wait 5 secs between each iteration (also after a failure) to avoid
    # getting my ip banned.
    time.sleep(5)

In [22]:
# This chunk is only relevant for Jupyter Notebook aesthetics.
from pylab import rcParams
import json
# Load the matplotlib style settings and apply them; ``with`` makes sure
# the JSON file is closed after it has been read.
with open("/home/hypathia/bmh_matplotlibrc.json") as rc_file:
    s = json.load(rc_file)
rcParams.update(s)

from IPython.core.display import HTML
def css_styling():
    """Return the contents of the custom stylesheet wrapped in ``HTML``."""
    # ``with`` closes the stylesheet file once its text has been read.
    with open("/home/hypathia/custom.css", "r") as css_file:
        styles = css_file.read()
    return HTML(styles)


css_styling()