# A WEB SCRAPER FOR THE "refractiveindex.info" WEBSITE
## Author: Innocent Abel Kirigiti, email: innocentiousabel@gmail.com
### This Jupyter notebook scrapes the refractive index values of all the books (60 classes of organic compounds) found on the refractiveindex.info website, and stores the scraped data in a CSV file named "riScrappedData.csv".

### The resulting csv has four columns: book, wl, n, and k

- book = name of the class of organic compound
- wl = wavelength (µm) of light used for measuring the refractive index
- n = the refractive index at a specific wavelength (wl)
- k = the imaginary part of the complex refractive index (the extinction coefficient) at a specific wavelength (wl)

More info on refractive index: https://en.wikipedia.org/wiki/Refractive_index




In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time # To calculate time for scrapping

In [66]:
# Function to cut string based on a pattern
def extractData(data, startPattern, endCharacters='];'):
    """Return the text that follows ``startPattern`` up to the first end character.

    Args:
        data: The string to search (e.g. the text of a ``<script>`` tag).
        startPattern: Literal marker that precedes the wanted text,
            e.g. ``'data_n=['``.
        endCharacters: Characters that each terminate the wanted text; the
            one occurring first after the marker wins.

    Returns:
        The whitespace-stripped text between the marker and the nearest end
        character. If no end character follows the marker, everything up to
        the end of ``data`` is returned. If the marker does not occur in
        ``data`` at all, an empty string is returned.
    """
    start = data.find(startPattern)
    if start == -1:
        # Without this guard the -1 from str.find() would be used as a real
        # offset and an unrelated slice of `data` would be returned.
        return ''

    contentStart = start + len(startPattern)
    # Search from the end of the marker so that an end character which is
    # part of the marker itself cannot terminate the extraction.
    endPositions = [
        pos
        for endChar in endCharacters
        if (pos := data.find(endChar, contentStart)) != -1
    ]
    # default= also covers an empty `endCharacters`, where min() would raise.
    end = min(endPositions, default=len(data))
    return data[contentStart:end].strip()


In [67]:
# Function that takes a list of 2 dictionaries holding the n & k values
# (index 0 = nDict, index 1 = kDict) and
# returns a final joined dataframe with columns (book, wl, n, k).
# This dataframe can be appended directly to the final dataframe later using a loop

def returnPageDF(pageDictionaryList):
    """Build one page's DataFrame by joining its n data with its k data.

    ``pageDictionaryList[0]`` is the n dictionary and
    ``pageDictionaryList[1]`` is the k dictionary. Each is turned into a
    DataFrame, and the k frame is left-joined onto the n frame on the
    ``'wl'`` column, so every row of the n frame is kept.
    """
    leftFrame, rightFrame = (
        pd.DataFrame(pageDictionaryList[position]) for position in (0, 1)
    )
    return pd.merge(leftFrame, rightFrame, how='left', on='wl')

## CREATING A BOOK SCRAPER (riScrapper i.e RefractiveIndexSCrapper)
## IT WILL LOOP ON EACH BOOK & PAGE TO FINALLY RETURN A FINAL DATA FRAME (dataDF)

### Code to get all books & create a booksDictionary, valuesList, realNamesList

In [68]:
# Scrape the list of books (material classes) from the "book" drop-down.
# This code only needs to run once through the lifecycle of the script.
# Any single page can be used as the source of the drop-down.
myPage = "https://refractiveindex.info/?shelf=organic&book=butanol&page=El-Kashef"

# Send the GET request. The timeout stops a stalled connection from hanging
# the notebook forever, and raise_for_status() surfaces HTTP errors here
# instead of as a confusing parsing failure further down.
myReq = requests.get(myPage, timeout=30)
myReq.raise_for_status()
mySoup = BeautifulSoup(myReq.content, "html.parser")

# Get the book list: the <select id="book"> element and its <option> tags
bookGroups = mySoup.find("select", {"id": "book"})
if bookGroups is None:
    raise RuntimeError('No <select id="book"> element found in ' + myPage)
bookOptions = bookGroups.find_all('option')

valuesList = []  # Option values; these are the keys of booksDictionary
realNamesList = []  # Display names, i.e. the values in booksDictionary

# Fill both lists in lockstep so that index i of one matches index i of the other
for option in bookOptions:
    value = option.get('value')
    # Skip options that carry no value attribute or no content; they cannot
    # be turned into a (value, real name) pair.
    if value is None or not option.contents:
        continue
    valuesList.append(value)
    realNamesList.append(option.contents[0])

# Maps option value -> real name of each book (class)
booksDictionary = dict(zip(valuesList, realNamesList))

### Function to scrape a single page

In [69]:
# Markers that introduce the four data arrays inside the page's <script> text
_DATA_MARKERS = ('data_n_wl=[', 'data_n=[', 'data_k_wl=[', 'data_k=[')


def _findDataScript(soup):
    """Return the stripped text of the first <script> holding a data array, or ''."""
    for tag in soup.find_all('script'):
        text = tag.text.strip()
        if any(marker in text for marker in _DATA_MARKERS):
            return text
    return ''


def _parseFloatList(script, startPattern):
    """Parse the array that follows ``startPattern`` in ``script`` into floats."""
    # Checked here so the result does not depend on how extractData treats a
    # marker that is missing from the script.
    if startPattern not in script:
        return []
    tokens = (tok.strip() for tok in extractData(script, startPattern).split(","))
    # Blank tokens come from an empty array ("[]"); they carry no value.
    return [float(tok) for tok in tokens if tok]


# Takes input of book (class name), page name
def pageScraper(book, page):
    """Scrape one page of one book into a DataFrame.

    Args:
        book: Value of the book as used in the URL, e.g. ``"heptane"``. It is
            also looked up in the module-level ``booksDictionary`` to get the
            real name stored in the ``book`` column; if it is not a key
            there, ``book`` itself is stored.
        page: Value of the page as used in the URL, e.g. ``"Kerl-293K"``.

    Returns:
        The DataFrame produced by ``returnPageDF`` from the n data
        (``book``, ``wl``, ``n``) and the k data (``wl``, ``k``). All lists
        are empty when no script containing a data array is found.

    Raises:
        requests.RequestException: If the request times out, fails, or the
            server answers with an HTTP error status.
        ValueError: If an array entry cannot be converted to ``float``.
    """
    # Generating a page url
    page_url = "https://refractiveindex.info/?shelf=organic&book={}&page={}".format(book, page)

    # Send the GET request; the timeout keeps a stalled connection from
    # blocking a long scraping run indefinitely.
    req = requests.get(page_url, timeout=30)
    req.raise_for_status()
    soup = BeautifulSoup(req.content, "html.parser")

    # The position of the data script among the <script> tags differs between
    # pages, so it is located by content rather than by a fixed index.
    script = _findDataScript(soup)

    n_wl_list = _parseFloatList(script, 'data_n_wl=[')
    n_list = _parseFloatList(script, 'data_n=[')
    k_wl_list = _parseFloatList(script, 'data_k_wl=[')
    k_list = _parseFloatList(script, 'data_k=[')

    # One copy of the book's real name per n row
    realName = booksDictionary.get(book, book)
    book_list = [realName] * len(n_wl_list)

    # Dictionary for n, i.e. with keys book, wl, n
    nDict = {'book': book_list, 'wl': n_wl_list, 'n': n_list}

    # Dictionary for k, i.e. with keys wl, k
    kDict = {'wl': k_wl_list, 'k': k_list}

    # returnPageDF expects index 0 = nDict, index 1 = kDict
    return returnPageDF([nDict, kDict])

In [70]:
# Smoke test: scrape the heptane page "Kerl-293K" and display the result
book, page = "heptane", "Kerl-293K"
pageDictionary = pageScraper(book=book, page=page)
pageDictionary

Unnamed: 0,book,wl,n,k
0,C7H16 (Heptane),0.3260,1.413190,
1,C7H16 (Heptane),0.3292,1.412476,
2,C7H16 (Heptane),0.3324,1.411784,
3,C7H16 (Heptane),0.3355,1.411133,
4,C7H16 (Heptane),0.3387,1.410482,
...,...,...,...,...
96,C7H16 (Heptane),0.6313,1.387570,
97,C7H16 (Heptane),0.6345,1.387487,
98,C7H16 (Heptane),0.6376,1.387407,
99,C7H16 (Heptane),0.6408,1.387327,


In [71]:
# Smoke test: scrape the heptane page "Kerl-313K" and display the result
book, page = "heptane", "Kerl-313K"
pageDictionary = pageScraper(book=book, page=page)
pageDictionary

Unnamed: 0,book,wl,n,k
0,C7H16 (Heptane),0.3260,1.402155,
1,C7H16 (Heptane),0.3292,1.401461,
2,C7H16 (Heptane),0.3324,1.400789,
3,C7H16 (Heptane),0.3355,1.400156,
4,C7H16 (Heptane),0.3387,1.399523,
...,...,...,...,...
96,C7H16 (Heptane),0.6313,1.377273,
97,C7H16 (Heptane),0.6345,1.377192,
98,C7H16 (Heptane),0.6376,1.377115,
99,C7H16 (Heptane),0.6408,1.377037,


In [72]:
# Smoke test: scrape the heptane page "Kerl-333K" and display the result
book, page = "heptane", "Kerl-333K"
pageDictionary = pageScraper(book=book, page=page)
pageDictionary

Unnamed: 0,book,wl,n,k
0,C7H16 (Heptane),0.3260,1.391120,
1,C7H16 (Heptane),0.3292,1.390446,
2,C7H16 (Heptane),0.3324,1.389793,
3,C7H16 (Heptane),0.3355,1.389179,
4,C7H16 (Heptane),0.3387,1.388564,
...,...,...,...,...
96,C7H16 (Heptane),0.6313,1.366976,
97,C7H16 (Heptane),0.6345,1.366897,
98,C7H16 (Heptane),0.6376,1.366823,
99,C7H16 (Heptane),0.6408,1.366747,


## The refractive index scrapper

#### A function that takes a book & returns a list of all pages in that book

In [73]:
def pagesListGenerator(book):
    """Return the page values listed for ``book`` on refractiveindex.info.

    The book's page on the "organic" shelf is fetched and the ``value``
    attribute of every ``<option>`` inside ``<select id="page">`` is
    collected, in document order.

    Args:
        book: Value of the book as used in the URL, e.g. ``"methane"``.

    Returns:
        A list of page values. The list is empty when the page contains no
        ``<select id="page">`` element.

    Raises:
        requests.RequestException: If the request times out, fails, or the
            server answers with an HTTP error status.
    """
    # Generating a book url
    book_url = "https://refractiveindex.info/?shelf=organic&book={}".format(book)

    # Send the GET request; the timeout keeps a stalled connection from
    # blocking a long scraping run indefinitely.
    response = requests.get(book_url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    # Get the pages list
    pageSelect = soup.find("select", {"id": "page"})
    if pageSelect is None:
        # Nothing to enumerate; avoids calling find_all() on None.
        return []

    # Options without a value attribute cannot be used to build a page URL
    return [
        option['value']
        for option in pageSelect.find_all('option')
        if option.has_attr('value')
    ]

In [74]:
# Testing pagesListGenerator: list the pages of the 'methane' book
pagesListGenerator('methane')

['Rollefson',
 'Loria',
 'Martonchik-liquid-111K',
 'Martonchik-liquid-90K',
 'Martonchik-solid-90K',
 'Martonchik-solid-30K']

#### Takes in book_list = valuesList i.e Available 60 classes of organic compounds in the refractiveindex.info website
#### & returns a dataFrame with all scrapped data

In [75]:
#Takes in book_list i.e Available 60 classes of organic compounds in the refractiveindex.info website 

def riScrapper(book_list):
    """Scrape every page of every book in ``book_list`` into one DataFrame.

    For each book the page list is obtained with ``pagesListGenerator`` and
    each page is scraped with ``pageScraper``. A progress line is printed
    once a book's pages have all been scraped, followed at the end by the
    number of books processed and the elapsed time.

    Args:
        book_list: Iterable of book values as used in the URL
            (e.g. ``valuesList``).

    Returns:
        A DataFrame holding the rows of all scraped pages. Row labels are
        kept as returned per page, so they repeat across pages. When nothing
        was scraped, an empty DataFrame with the columns ``book``, ``wl``,
        ``n`` and ``k`` is returned.
    """
    pageFrames = []  # Non-empty per-page frames, concatenated once at the end
    totalScrappedClasses = 0  # Keep track of scrapped books
    # perf_counter() is monotonic, so the elapsed time cannot be distorted by
    # system clock adjustments during a long run.
    scrapperStartTime = time.perf_counter()

    # Loop through each book in book list
    for book in book_list:
        for page in pagesListGenerator(book):
            pageDF = pageScraper(book, page)
            # Empty frames add no rows; leaving them out keeps the result
            # dtypes determined by real data only.
            if not pageDF.empty:
                pageFrames.append(pageDF)

        # Report and count the book only after all of its pages are done
        print('Finished scrapping: ',book)
        totalScrappedClasses += 1

    scrapperFinishTime = time.perf_counter()

    # A single concat avoids re-copying the accumulated data on every page
    if pageFrames:
        dataDF = pd.concat(pageFrames)
    else:
        dataDF = pd.DataFrame(columns=['book', 'wl', 'n', 'k'])

    # Print total no of scrapped classes (books)
    print('\nriScrapper has Finished scrapping: "', totalScrappedClasses, '" books(classes)')

    # Print time to run our scrapper
    print("\nScrapping time: '", round((scrapperFinishTime - scrapperStartTime), 2), "' Seconds")

    return dataDF

### Calling riScrapper to populate our dataframe with the final data

In [76]:
# Scrape every book in valuesList; this issues one request per book plus one per page.
# For a quick trial run, pass a short list instead, e.g. ['benzene', 'hexane'].
dataDF = riScrapper(valuesList)

Finished scrapping:  methane
Finished scrapping:  ethane
Finished scrapping:  pentane
Finished scrapping:  hexane
Finished scrapping:  heptane
Finished scrapping:  octane
Finished scrapping:  acetylene
Finished scrapping:  ethylene
Finished scrapping:  methanol
Finished scrapping:  ethanol
Finished scrapping:  propanol
Finished scrapping:  butanol
Finished scrapping:  pentanol
Finished scrapping:  ethylene_glycol
Finished scrapping:  propylene_glycol
Finished scrapping:  glycerol
Finished scrapping:  ethyl_acetate
Finished scrapping:  methyl_salicylate
Finished scrapping:  ethyl_salicylate
Finished scrapping:  ethyl_cinnamate
Finished scrapping:  diethyl_phthalate
Finished scrapping:  cyclohexane
Finished scrapping:  benzene
Finished scrapping:  styrene
Finished scrapping:  toluene
Finished scrapping:  trichlorobenzene
Finished scrapping:  nitrobenzene
Finished scrapping:  dioxane
Finished scrapping:  oxathiane
Finished scrapping:  acetic_acid
Finished scrapping:  pentanediol
Finished 

In [77]:
# Display the combined DataFrame returned by riScrapper
dataDF

Unnamed: 0,book,wl,n,k
0,CH4 (Methane),1.68,1.000436,
1,CH4 (Methane),1.94,1.000436,
2,CH4 (Methane),2.67,1.000433,
3,CH4 (Methane),2.79,1.000431,
4,CH4 (Methane),2.96,1.000428,
...,...,...,...,...
612,"(C37H24O6N2)n (Polyetherimide, PEI)",18.786,1.69899,0.048
613,"(C37H24O6N2)n (Polyetherimide, PEI)",19.062,1.6964,0.0419
614,"(C37H24O6N2)n (Polyetherimide, PEI)",19.347,1.69831,0.0378
615,"(C37H24O6N2)n (Polyetherimide, PEI)",19.64,1.70015,0.0311


## Creating our csv from dataDF

In [78]:
# Write the scraped data to riScrappedData.csv in the working directory, leaving out the row index
dataDF.to_csv('./riScrappedData.csv', index = False)