In [26]:
import collections
import collections.abc
import csv
import getpass
import os
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup, NavigableString
from html_table_extractor.extractor import Extractor
collections.Callable = collections.abc.Callable

In [27]:
def extract_content_from_h1(soup_test):
    """Return the plant name shown in the page's italic ``<h1>`` heading.

    Args:
        soup_test: Parsed page exposing a BeautifulSoup-style ``find`` method.

    Returns:
        str: The whitespace-stripped text of the first
        ``<h1 style="font-style:italic;">`` element, or the sentinel string
        ``"invalid"`` when the page has no such heading.
    """
    h1_tag = soup_test.find('h1', style='font-style:italic;')

    # Every page without the heading is treated the same way, so there is no
    # need to look for a separate error marker before giving up.
    if not h1_tag:
        return "invalid"

    return h1_tag.text.strip()

In [28]:
def group_elements_with_colon(elements):
    """Join consecutive strings into groups that each begin at a ':' element.

    An element containing ':' opens a new group; elements without one are
    appended to the group currently being built. Elements that appear before
    the first ':' element form a group of their own.

    Args:
        elements: Iterable of strings.

    Returns:
        list[str]: One ``', '``-joined string per group, in input order.
    """
    groups = []

    for piece in elements:
        if ':' in piece or not groups:
            # A label (or the very first element) opens a fresh group
            groups.append([piece])
        else:
            groups[-1].append(piece)

    return [', '.join(members) for members in groups]

In [29]:
def extract_table_content(soup_test):
    """Collect a page's plant-detail text as grouped "label, value" strings.

    Every string found among the descendants of each
    ``<dl class="plant-details-text">`` element is gathered in document order,
    trimmed, and passed to ``group_elements_with_colon``.

    Args:
        soup_test: Parsed page exposing a BeautifulSoup-style ``find_all``.

    Returns:
        list[str]: The grouped strings returned by
        ``group_elements_with_colon``.
    """
    # All matching definition lists on the page, not just the first one
    detail_lists = soup_test.find_all('dl', class_='plant-details-text')

    cleaned_text = []
    for dl_tag in detail_lists:
        for node in dl_tag.descendants:
            # Keep only string items (presumably the text nodes); skip the rest
            if not isinstance(node, str):
                continue
            # Trim whitespace, then any leading "-"/space bullet characters
            text = node.strip().lstrip('- ')
            # Filter AFTER trimming so a bare bullet or comma cannot slip
            # through as an empty entry inside a group
            if text and text != ',':
                cleaned_text.append(text)

    return group_elements_with_colon(cleaned_text)

In [38]:
# Base URL shared by the login, plant-detail and logout requests
base_url = "https://www.indiaplants.com/"

# Browser-like User-Agent header sent with every request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# One session for the whole notebook so cookies set at login are reused
session = requests.Session()

# Login endpoint URL for existing users
login_url = base_url + "login-check.php"

# Credentials are read from the environment (falling back to an interactive
# prompt) so that they are never stored in the notebook itself.
username = os.environ.get("INDIAPLANTS_USERNAME") or input("indiaplants.com username: ")
password = os.environ.get("INDIAPLANTS_PASSWORD") or getpass.getpass("indiaplants.com password: ")

# Login form payload for existing users
login_data = {
    'userName': username,
    'userPwd': password,
    'action': 'Login'
}

# Submit the login form; the timeout keeps a stalled connection from hanging the cell
login_response = session.post(login_url, data=login_data, headers=headers, timeout=30)

# NOTE(review): HTTP 200 only confirms that the request completed. Confirm that
# the response really indicates an authenticated session before trusting this.
if login_response.status_code == 200:
    print("Login successful.")
else:
    print(f"Failed to login. Status code: {login_response.status_code}")


Login successful.


In [33]:
# One list of "Key: value" strings per successfully scraped plant
all_data = []

# IDs whose request failed or whose page had no plant heading
invalid_plant = []

# Scrape plant IDs 3201 through 4000 (the range end is exclusive)
for herb_id in range(3201, 4001):
    # Construct the URL with the current herb ID
    url = base_url + f"plant-details.php?plant={herb_id}"

    # Fetch the page with the session cookies; a network error is recorded
    # for this ID instead of aborting the whole run
    try:
        response = session.get(url, headers=headers, timeout=30)
    except requests.RequestException as exc:
        response = None
        invalid_plant.append(herb_id)
        print(f"Failed to retrieve data for ID {herb_id}. Error: {exc}")

    if response is None:
        pass
    elif response.status_code == 200:
        # Parse the HTML code using BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')
        plant_name = extract_content_from_h1(soup)
        if plant_name == "invalid":
            invalid_plant.append(herb_id)
        else:
            pid = 'Plant_id: ' + str(herb_id)
            scientific_name = 'Scientific_name: ' + plant_name
            one_plant = [pid, scientific_name]
            one_plant.extend(extract_table_content(soup))
            all_data.append(one_plant)
    else:
        invalid_plant.append(herb_id)
        print(f"Failed to retrieve data for ID {herb_id}. Status code: {response.status_code}")

    # Always pause between requests, including after invalid pages, so the
    # site is never hit back-to-back
    time.sleep(1)


In [34]:
# Preview only the first few records instead of dumping the whole list
all_data[:5]

[]

In [35]:
# Number of plants that were scraped successfully
len(all_data)

0

In [36]:
# Number of IDs recorded as invalid or failed
len(invalid_plant)

800

In [10]:
# IDs recorded as invalid or failed.
# NOTE(review): the saved output lists 14 IDs in the 3007-3196 range, which does
# not match the 3201-4000 scrape (800 invalid IDs); it appears to come from an
# earlier run, so re-run this cell before relying on it.
invalid_plant

[3007,
 3020,
 3021,
 3022,
 3023,
 3047,
 3061,
 3087,
 3093,
 3103,
 3112,
 3113,
 3179,
 3196]

In [12]:
def extract_data_to_csv(scraped_data, csv_filename):
    """Write scraped plant records to a CSV file, one row per plant.

    Each record is a list of strings such as ``'Plant_id: 12'`` or
    ``'Common name:, Rose'``. A string is matched to a column by its prefix;
    strings matching no known prefix are ignored, columns with no matching
    string are left empty, and a later match for the same column overwrites
    an earlier one.

    Args:
        scraped_data: Iterable of records, each an iterable of strings.
        csv_filename: Path of the CSV file to create or overwrite.

    Returns:
        None. The file is written as UTF-8 with a header row first.
    """
    # Define the header row for the CSV file
    header = ['Plant_Id', 'Scientific_Name', 'Common_Name', 'Regional_Name', 'Category', 'Family', 'Light', 'Water', 'Primary_Grown_for', 'Flowering_Season', 'Foliage_Color', 'Height_or_Length', 'Spread_or_Width', 'Plant_Form', 'Lifespan', 'Special_Feature', 'Plant_Description', 'Growing_Tips']

    # Prefixes in the same order as the header columns. Prefixes ending in ','
    # carry a separator comma right after the colon, which is dropped from the value.
    prefixes = [
        'Plant_id:',
        'Scientific_name:',
        'Common name:,',
        'Regional name:,',
        'Category:,',
        'Family:,',
        'Light:,',
        'Water:,',
        'Primarily grown for:,',
        'Flowering season:,',
        'Foliage color:,',
        'Plant Height or length:,',
        'Plant Spread or Width:,',
        'Plant Form:,',
        'Estimated Life Span:,',
        'Special Character:,',
        'Plant Description:,',
        'Growing tips:,',
    ]

    # Explicit UTF-8 so non-ASCII names do not depend on the platform default
    with open(csv_filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)

        # Write the header row to the CSV file
        writer.writerow(header)

        for item in scraped_data:
            row = [''] * len(header)

            for entry in item:
                for column, prefix in enumerate(prefixes):
                    if not entry.startswith(prefix):
                        continue
                    # Split on the FIRST colon only, so a value that itself
                    # contains ':' is kept whole instead of being truncated
                    value = entry.partition(':')[2]
                    if prefix.endswith(','):
                        value = value[1:]
                    row[column] = value.strip()
                    break

            # Write the extracted information to the CSV file as a row
            writer.writerow(row)

In [13]:
# Destination CSV for the scraped records.
# NOTE(review): the file name says 3201-3400 but the scraping loop covers IDs
# 3201-4000; confirm which range is intended.
csv_filename = './Login_IndiaPlant/data_indiaplant_3201_3400.csv'

# Opening a file for writing does not create missing parent directories,
# so make sure the target folder exists first
os.makedirs(os.path.dirname(csv_filename), exist_ok=True)

# Call the function to extract data to CSV
extract_data_to_csv(all_data, csv_filename)


In [None]:
#https://www.indiaplants.com/customer-login.php#3310

In [39]:
# Build the logout endpoint from the shared base URL
logout_url = f"{base_url}logout.php"

# Request the logout page through the same session used for scraping
logout_response = session.get(logout_url, headers=headers)

# Report the outcome based on the HTTP status code (200 is treated as success)
if logout_response.status_code != 200:
    print(f"Failed to logout. Status code: {logout_response.status_code}")
else:
    print("Logout successful.")

# Release the session's connections
session.close()

Logout successful.
