In [1]:
import requests

from bs4 import BeautifulSoup, NavigableString
from html_table_extractor.extractor import Extractor
import pandas as pd
import time
import collections
import csv
collections.Callable = collections.abc.Callable

In [2]:
def extract_content_from_h2(soup_test):
    """Return the text of the first <i> inside the first <h2 class="title">.

    Args:
        soup_test: Parsed document exposing ``find`` (a BeautifulSoup object
            in this notebook).

    Returns:
        The whitespace-stripped text of the first <i> tag found inside the
        heading, or None when the heading lookup is falsy or the heading
        holds no <i> tags.
    """
    heading = soup_test.find('h2', class_='title')
    if not heading:
        return None

    italic_tags = heading.find_all('i')
    if not italic_tags:
        return None

    return italic_tags[0].text.strip()

In [3]:
def extract_table_content(soup_test, table_index=3):
    """Collect the text of every <li> found in the rows of one table.

    Args:
        soup_test: Parsed document exposing ``find_all`` (a BeautifulSoup
            object in this notebook).
        table_index: Zero-based position of the table to read among all
            <table> tags in the document. Defaults to 3, i.e. the fourth
            table, which is the one this notebook has always read.

    Returns:
        A list of single-element lists, one per <li>, each holding the
        whitespace-stripped text of that list item in document order.
        An empty list is returned when the document has no table at
        ``table_index``.
    """
    tables = soup_test.find_all('table')

    # Indexing used to happen unguarded, so a page with fewer tables than
    # expected raised IndexError and aborted the whole crawl. Treat a missing
    # table as "nothing to extract" instead.
    try:
        target_table = tables[table_index]
    except IndexError:
        return []

    content_list = []
    for row in target_table.find_all('tr'):
        for li in row.find_all('li'):
            # Each entry stays wrapped in its own list because downstream
            # code reads the text as entry[0].
            content_list.append([li.get_text(strip=True)])

    return content_list

In [11]:
# Define the base URL pattern; "{}" is replaced by the numeric herb ID
base_url = "https://indiaflora-ces.iisc.ac.in/herbsheet.php?id={}&cat=13"

# Define the User-Agent header
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# Initialize an empty list to store the extracted data (one nested list per plant)
all_data = []

# Loop over the herb IDs to scrape. Only ID 1 is fetched here; widen the range
# for a full crawl (an earlier note on this loop mentioned IDs 1 to 13122).
for herb_id in range(1, 2):
    # Construct the URL with the current herb ID
    url = base_url.format(herb_id)

    # Send an HTTP GET request with the defined headers. The timeout keeps a
    # stalled connection from hanging the crawl, and network errors are
    # reported instead of aborting the remaining IDs.
    try:
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as error:
        response = None
        print(f"Failed to retrieve data for ID {herb_id}. Error: {error}")

    if response is None:
        pass  # already reported above; fall through to the delay
    elif response.status_code == 200:
        # Parse the HTML code using BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')
        one_plant = []
        pid = 'Plant_id: ' + str(herb_id)
        # The heading lookup returns None when the page has no title <i> tag;
        # fall back to an empty name so the concatenation below cannot raise.
        plant_name = extract_content_from_h2(soup) or ''
        scientific_name = 'Scientific_name: ' + plant_name
        one_plant.append([pid])
        one_plant.append([scientific_name])
        one_plant.extend(extract_table_content(soup))
        # Append this plant's list of entries to the overall results
        all_data.append(one_plant)
    else:
        print(f"Failed to retrieve data for ID {herb_id}. Status code: {response.status_code}")

    # Adding a delay between requests to avoid rate limiting
    time.sleep(1)  # Sleep for 1 second between requests


In [12]:
# Display the scraped records: one nested list of single-string entries per plant.
all_data

[[['Plant_id: 1'],
  ['Scientific_name: Abelia chinensis'],
  ['Family:CAPRIFOLIACEAE(Honeysuckle Family)'],
  ['Family (Hindi name): RASHINA FAMILY'],
  ['Family (as per The APG System III):Caprifoliaceae'],
  ['Synonym(s): Linnaea chinensis (R.Br.) A.Braun & Vatke'],
  ['Species Name(as per The IPNI):Linnaea chinensis (R.Br.) A.Braun & Vatke'],
  ['Habitat: Cultivated'],
  ['Key identification features: An upright-rounded, multi-stemmed, deciduous shrub of the honeysuckle family that typically grows on arching branches to 5-8’ tall and to 3-5’ wide. This is an outstanding flowering shrub whose main claim to fame is a lengthy early summer to fall bloom of very fragrant tubular white flowers with showy soft pink bracts. Flowers may be the most fragrant found on any species of Abelia. This shrub is native to lower alpine areas up to about 6000’ in elevation in southeastern China. Branchlets are covered with soft reddish wooly hairs when young. Mature branches are grayish brown. Toothed 

In [6]:
# Number of plant records currently held in all_data.
len(all_data)

1

In [15]:
def _distribution_states(item, position):
    """Return the comma-joined state names that follow a 'Distribution:' entry.

    The number of colons in the Distribution text (after its own label) is
    used as the count of entries to read after ``position``; the text before
    the first colon of each of those entries is taken as a state name.
    Fewer entries are read if the record ends first.
    """
    states_str = item[position][0].split(':', 1)[1].strip()
    expected = states_str.count(':')
    following = item[position + 1:position + 1 + expected]
    return ', '.join(entry[0].split(':')[0].strip() for entry in following)


def extract_data_to_csv(scraped_data, csv_filename):
    """Write scraped plant records to a CSV file, one row per plant.

    Args:
        scraped_data: Iterable of plant records. Each record is a list of
            single-element lists whose only element is a "Label: value"
            string (the shape built by the scraping loop in this notebook).
        csv_filename: Path of the CSV file to create or overwrite.

    Returns:
        None. The file receives a header row followed by one row per record.
        Fields whose label is absent from a record are written as empty
        strings; if a label occurs more than once, the last occurrence wins.
    """
    # Define the header row for the CSV file
    header = ['Plant_Id', 'Scientific_Name', 'Family', 'Family_Hindi_Name', 'Species_Name', 'Common_Name', 'Habit', 'Habitat', 'Comments', 'Flower_Fruit', 'Distribution', 'Key_Feature']

    # Label prefix -> CSV column for every field that is a plain "Label: value"
    simple_fields = (
        ('Plant_id:', 'Plant_Id'),
        ('Scientific_name:', 'Scientific_Name'),
        ('Family:', 'Family'),
        ('Family (Hindi name):', 'Family_Hindi_Name'),
        ('Species Name', 'Species_Name'),
        ('Common name:', 'Common_Name'),
        ('Habit:', 'Habit'),
        ('Habitat:', 'Habitat'),
        ('Comments / notes:', 'Comments'),
        ('Flower, Fruit:', 'Flower_Fruit'),
        ('Key identification features:', 'Key_Feature'),
    )

    # Explicit UTF-8 so non-ASCII characters in the scraped text do not depend
    # on the platform's default encoding.
    with open(csv_filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)

        # Write the header row to the CSV file
        writer.writerow(header)

        for item in scraped_data:
            row = dict.fromkeys(header, '')

            for position, entry in enumerate(item):
                text = entry[0]

                if text.startswith('Distribution:'):
                    # The entries consumed here are still visited by this
                    # loop afterwards, exactly as before; they simply match
                    # no label unless they happen to start with one.
                    row['Distribution'] = _distribution_states(item, position)
                    continue

                for prefix, column in simple_fields:
                    if text.startswith(prefix):
                        # Keep everything after the FIRST colon so values
                        # that contain colons themselves are not truncated;
                        # partition also yields '' when there is no colon.
                        row[column] = text.partition(':')[2].strip()
                        break

            # Write the extracted information to the CSV file as a row
            writer.writerow([row[column] for column in header])

In [16]:
# Output path for the CSV export (relative to the notebook's working directory)
csv_filename = 'extracted_data_india_flora_1.csv'

# Write the header plus one CSV row per scraped plant record in all_data
extract_data_to_csv(all_data, csv_filename)
