In [213]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [214]:
# Define the column names
columns = [
    'Course Link',
    'Title',
    'Description',
    'Duration',
    'Timing',
    'Course Start Date',
    'What you will learn',
    'Skills',
    'Target Students',
    'Prerequisites/Eligibility criteria',
    'Content',
    'Faculty 1 Name',
    'Faculty 1 Designation',
    'Faculty 1 Description',
    'Faculty 2 Name',
    'Faculty 2 Designation',
    'Faculty 2 Description',
    'Institute Name',
    'Fee in INR',
    'Fee in USD'
]

# Create an empty DataFrame with specified columns
df = pd.DataFrame(columns=columns)

Note: There are two different html structures are present for different course. So I have created two seperate scraper functions to handle each.

In [215]:
def scrap1(url,df):
    response = requests.get(url)

    soup = BeautifulSoup(response.text, 'html.parser')

    # Extracting specific information based on HTML structure
    title = soup.find('h1').text.strip()
    description = soup.find('div', class_='el-lap-section2-inner el-ln-p').text.strip()
    duration_num = soup.find('div', class_='months pb-1 pt-2').find('span').text.strip()
    duration_days = soup.find('div', class_='months pb-1 pt-2').find('p').text.strip()
    timing = None
    start_date = soup.find('h2', class_='no').text.strip()
    what_you_learn = list(soup.find('div', class_='el-lap-section3-inner').find_all('strong')[:])
    x=" "
    for i in what_you_learn:
      x+=i.text.strip()
      x+=" | "
    what_you_learn=x[:-3]
    skills = soup.find('div', class_ ='el-lap-section4-inner').find_all('div', class_='el-lap-section4-repeater')[:]
    x=" "
    for i in skills:
      x+=i.text.strip()
      x+=" | "
    skills=x[:-3]
    target_students = soup.find('div', class_='tab-content current').find('ul').text.strip()
    prerequisites = soup.find('div', class_='el-lap-section cornell-eligibility').find('ul').text.strip()
    content = soup.find('div', class_ ='el-lap-section4-inner').find_all('div', class_='el-lap-section4-repeater')[:]
    x=" "
    for i in content:
      x+=i.text.strip()
      x+=", "
    content=x[:-2]
    faculty_1_name = soup.find_all('div', class_='el-fec-right')[0].find('h4').text.strip()
    faculty_1_designation = soup.find_all('div', class_='el-fec-right')[0].find('p').text.strip()
    faculty_1_description = soup.find_all('div', class_='el-rd readMoreHide')[0].text.strip()
    if soup.find_all('div', class_='el-fec-right')[1].find('h4') is None:
      faculty_2_name = None
    else:    
      faculty_2_name = soup.find_all('div', class_='el-fec-right')[1].find('h4').text.strip()
    if soup.find_all('div', class_='el-fec-right')[1].find('p') is None:
      faculty_2_designation = None
    else:    
      faculty_2_designation = soup.find_all('div', class_='el-fec-right')[1].find('p').text.strip()
    if soup.find_all('div', class_='el-rd readMoreHide')[1] is None:
      faculty_2_description = None
    else:    
      faculty_2_description = soup.find_all('div', class_='el-rd readMoreHide')[1].text.strip()
    institute_name = soup.find('div', class_='el-lap-sec-hd white').text.strip()
    institute_name=' '.join(institute_name.split(" ")[1:])
    fee_inr = soup.find('td', text = 'Balance Payment').find_next_sibling('td').text.strip()
    fee_usd = "USD "+str(int(fee_inr.split(" ")[1])/83)+" +GST"
    
    # Create a dictionary with the values we have extracted
    new_row_data = {
    'Course Link': url,
    'Title': title,
    'Description': description,
    'Duration': duration_num + duration_days,
    'Timing': timing,
    'Course Start Date': start_date,
    'What you will learn': what_you_learn,
    'Skills': skills,
    'Target Students': target_students,
    'Prerequisites/Eligibility criteria': prerequisites,
    'Content': content,
    'Faculty 1 Name': faculty_1_name,
    'Faculty 1 Designation': faculty_1_designation,
    'Faculty 1 Description': faculty_1_description,
    'Faculty 2 Name': faculty_2_name,
    'Faculty 2 Designation': faculty_2_designation,
    'Faculty 2 Description': faculty_2_description,
    'Institute Name': institute_name,
    'Fee in INR': fee_inr,
    'Fee in USD': fee_usd
    }
    
    # Append the dictionary to the DataFrame
    df = pd.concat([df, pd.DataFrame([new_row_data])], ignore_index=True)

    return df

In [216]:
def scrap2(url,df):
    # Make a GET request to fetch the raw HTML content
    response = requests.get(url)
    
    # Parse the html content
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extracting specific information based on HTML structure
    title = soup.find('h1').text.strip() 
    description = soup.find('div', class_='desc').text.strip()
    duration = soup.find('div', class_='duration-of-course').find('strong').text.strip().split(" ")
    duration = duration[1]+" "+duration[2]
    timing = soup.find('div', class_='duration-of-course').find_all('p')[1].text.strip().split(" ")
    timing = ' '.join(timing[1:])
    start_date = soup.find('div', class_='duration-of-course').find_all('strong')[1].text.strip()
    what_you_learn = soup.find('div', class_='pl-deeper-undstnd to_flex_ul').find_all('li')[:]
    x=" "
    for i in what_you_learn:
      x+=i.text.strip()
      x+=" | "
    what_you_learn=x[:-2]
    skills = soup.find('div', class_='key-skills-sec').find_all('li')[:]
    x=" "
    for i in skills:
      x+=i.text.strip()
      x+=" | "
    skills=x[:-3]
    target_students = soup.find('h4', class_='cs-titlec').text.strip()
    prerequisites = soup.find('div', class_='eligible-right-top-list').find_all('ul')[:]
    x=" "
    for i in prerequisites:
      x+=i.text.strip()
      x+=", "
    prerequisites=x[:-2]
    content = soup.find('div', class_='sylab-tab-ul').find_all('li')[:]
    x=" "
    for i in content:
      x+=i.text.strip()
      x+=" | "
    content=x[:-2]
    faculty_1_name = soup.find_all('h4', class_='best-fname')[0].text.strip()
    faculty_1_designation = soup.find_all('p', class_='best-fdesingnation')[0].text.strip()
    faculty_1_description = soup.find_all('p', class_='best-fdesingnation')[0].text.strip()
    if soup.find_all('h4', class_='best-fname')[1].text.strip() is None:
      faculty_2_name = None
    else:  
      faculty_2_name = soup.find_all('h4', class_='best-fname')[1].text.strip()
    if soup.find_all('p', class_='best-fdesingnation')[1].text.strip() is None:
      faculty_2_designation = None
    else:     
      faculty_2_designation = soup.find_all('p', class_='best-fdesingnation')[1].text.strip()
    if soup.find_all('p', class_='best-fdesingnation')[1].text.strip() is None:
      faculty_2_description = None
    else:    
      faculty_2_description = soup.find_all('p', class_='best-fdesingnation')[1].text.strip()
    institute_name = soup.find('h4', class_='about-ititle').text.strip()
    fee_inr = soup.find('div', class_ = 'program-details-total-pay-amt-right').text.strip().replace(" ", "").replace("\n", " ").split(" ")[1]
    fee_usd = str(int(fee_inr)/83)

    # Create a dictionary containing the data we collected
    new_row_data = {
    'Course Link': url,
    'Title': title,
    'Description': description,
    'Duration': duration,
    'Timing': timing,
    'Course Start Date': start_date,
    'What you will learn': what_you_learn,
    'Skills': skills,
    'Target Students': target_students,
    'Prerequisites/Eligibility criteria': prerequisites,
    'Content': content,
    'Faculty 1 Name': faculty_1_name,
    'Faculty 1 Designation': faculty_1_designation,
    'Faculty 1 Description': faculty_1_description,
    'Faculty 2 Name': faculty_2_name,
    'Faculty 2 Designation': faculty_2_designation,
    'Faculty 2 Description': faculty_2_description,
    'Institute Name': institute_name,
    'Fee in INR': fee_inr,
    'Fee in USD': fee_usd
    }
    
    # Append the dictionary to the DataFrame
    df = pd.concat([df, pd.DataFrame([new_row_data])], ignore_index=True)

    return df
    

In [217]:
url1 = "https://talentedge.com/ecornell/certificate-course-strategic-human-resources-leadership"
df = scrap1(url1,df)



In [218]:
url2 = "https://talentedge.com/ecornell/certificate-course-executive-leadership"
df = scrap1(url2,df)

In [219]:
url3 = "https://talentedge.com/ecornell/certificate-course-human-resources-management"
df = scrap1(url3,df)

In [220]:
url4 = "https://talentedge.com/iim-kozhikode/professional-certificate-program-marketing-sales-management-iim-kozhikode"
df = scrap2(url4,df)

In [221]:
url5 = "https://talentedge.com/goa-institute-of-management/exectuive-pg-program-in-health-care-management"
df = scrap2(url5,df)

In [222]:
url6 = "https://talentedge.com/esgci-school-of-management-paris/doctorate-of-business-administration-esgci"
df = scrap2(url6,df)

In [223]:
url7 = "https://talentedge.com/opjindal-global-business-school/masters-of-business-administration-opj-global-university"
df = scrap2(url7,df)

In [224]:
url8 = "https://talentedge.com/iim-lucknow/supply-chain-management"
df = scrap2(url8,df)

In [225]:
url9 = "https://talentedge.com/golden-gate-university/doctor-of-business-administration"
df = scrap2(url9,df)

In [226]:
url10 = "https://talentedge.com/iim-kozhikode/professional-certificate-program-marketing-sales-management-iim-kozhikode"
df = scrap2(url10,df)

In [227]:
df

Unnamed: 0,Course Link,Title,Description,Duration,Timing,Course Start Date,What you will learn,Skills,Target Students,Prerequisites/Eligibility criteria,Content,Faculty 1 Name,Faculty 1 Designation,Faculty 1 Description,Faculty 2 Name,Faculty 2 Designation,Faculty 2 Description,Institute Name,Fee in INR,Fee in USD
0,https://talentedge.com/ecornell/certificate-co...,Strategic Human Resources Leadership,Align HR Strategies with Organizational Strate...,3months,,31/01/2024,Effectively Influence Business Strategy as an...,Human Resources Leadership | Aligning HR Stra...,HR Managers\nHR Supervisors/Directors,Graduates (10+2+3) or Diploma Holders (only 10...,"Human Resources Leadership, Aligning HR Strat...",Christopher Collins,"Associate Professor, Cornell ILR School",Christopher J. Collins is an Associate Profess...,Lisa Nishii,"Associate Professor, Cornell ILR School",Lisa Nishii joined the faculty of the Human Re...,eCornell,INR 61525 +GST,USD 741.2650602409639 +GST
1,https://talentedge.com/ecornell/certificate-co...,Executive Leadership,Transform into the finest version of you!\nTal...,4months,,31/01/2024,Improve the Motivation and Quality of Managem...,Quality and Service Excellence | Leading Coll...,High-potential mid-level managers with 5+ year...,Graduates (10+2+3) or Diploma Holders (only 10...,"Quality and Service Excellence, Leading Colla...",Glen Dowell,"Professor, Cornell SC Johnson College Of Business",Professor Glen Dowell is an associate professo...,Tony Simons,"Professor, Cornell SC Johnson College Of Business",Professor Tony Simons teaches organizational b...,eCornell,INR 61525 +GST,USD 741.2650602409639 +GST
2,https://talentedge.com/ecornell/certificate-co...,Human Resources Management,Build on your people management skills\nTalent...,4.5months,,31/01/2024,360-Degree Implementation of HR Frameworks | ...,Aligning Employee Performance with Organizati...,"Global HR professionals from for-profits, NGOs...",Graduates (10+2+3) or Diploma Holders (only 10...,Aligning Employee Performance with Organizati...,Alex Colvin,"Professor, Cornell ILR School",Alexander Colvin is Associate Dean for Academi...,Christopher Collins,"Associate Professor, Cornell ILR School",Christopher J. Collins is an Associate Profess...,eCornell,INR 61525 +GST,USD 741.2650602409639 +GST
3,https://talentedge.com/iim-kozhikode/professio...,Professional Certificate Program In Marketing ...,"This certificate course in ""Marketing & Sales""...",12 months,sunday from 9:30 AM to 12:30 PM IST,"07 Apr, 2024",Foundations of Marketing | Customer Relations...,Marketing Management | Analytical Skills | De...,"Marketing & Sales Executives, Working Professi...",For Indian Participants – Graduates (10+2+3) ...,Role of Marketing Research for Better Marketi...,Dr. M. Geetha,Professor,Professor,Dr. Sreejesh S.,Associate Professor,Associate Professor,IIM Kozhikode,165000,1987.9518072289156
4,https://talentedge.com/goa-institute-of-manage...,Executive Post Graduate Program In Health Care...,India's healthcare industry is expected to gro...,11 months,hours/week,"31 Mar, 2024",Hospital Planning & Services | Medico-Legal a...,Hospital Planning and Services | Ethical & Le...,"Freshers, Early-age Professionals, Medical Pro...",Professionals/freshers holding any bachelor’s...,Program Introduction | Fundamentals of Manage...,Ajay Vamadevan,"Professor, Healthcare Management","Professor, Healthcare Management",Alekh Gour,"Associate Professor, Healthcare Management","Associate Professor, Healthcare Management",Goa Institute of Management,97458,1174.1927710843374
5,https://talentedge.com/esgci-school-of-managem...,Doctorate Of Business Administration,"Under the guidance of scholar-practitioners, u...",36 Months,hours of live lectures per week,"31 Mar, 2024",Recognizing and solving the business problem ...,Applied Research | Concept Selection | Manage...,Working professionals who are looking for a tr...,,Foundation Phase | Dissertation Phase,Dr. Josse Roussel,Professor,Professor,Dr. Alain Kruger,Professor,Professor,"ESGCI School of Management, Paris",740741,8924.590361445784
6,https://talentedge.com/opjindal-global-busines...,Masters Of Business Administration,This program is a one year online MBA designed...,12 months,,"31 Mar, 2024",Marketing Fundamentals | Economics: Micro & M...,Decision Making in business context | Analyti...,This programme is useful for freshers as well ...,Candidates must hold a graduation degree in a...,The Firm and the Consumer | Global Economics ...,Chris Oates,"Adjunct faculty, upGrad","Adjunct faculty, upGrad",Rakesh Godhwani,Adjunct Professor IIM Bangalore,Adjunct Professor IIM Bangalore,OPJindal Global Business School,127119,1531.55421686747
7,https://talentedge.com/iim-lucknow/supply-chai...,Executive Program In Supply Chain Management D...,The Executive Program in Supply Chain Manageme...,9 months,from 10.00 a.m. to 01.15 p.m. IST (3 Hours). D...,"31 Mar, 2024",Supply chain management issues in a firm | Lo...,Supply Chain Management | Operation Managemen...,"Mid-level Professionals, Senior-level Executiv...",For Indian Participants – Graduates (10+2+3) ...,Fundamentals of Operations Management | Deman...,Dr. Yash Daultani,Program Director,Program Director,Dr. Suresh K Jakhar,Program Director - IIM Lucknow,Program Director - IIM Lucknow,IIM Lucknow,1500,18.072289156626507
8,https://talentedge.com/golden-gate-university/...,Doctor Of Business Administration,"Go beyond the boundaries of your job, without ...",36 Months,hours/week,"15 Feb, 2024",Recognizing and solving the business problem ...,Applied Research | Concept Selection | Manage...,Working professionals who are looking for a tr...,,Foundation (12 Credits) | Concentration (12 C...,Dr. Mick Mcgee,"Director, DBA Program","Director, DBA Program",Dr. Judith Lee,"Professor & Director, Academic Innovation","Professor & Director, Academic Innovation",Golden Gate University,1226438,14776.361445783132
9,https://talentedge.com/iim-kozhikode/professio...,Professional Certificate Program In Marketing ...,"This certificate course in ""Marketing & Sales""...",12 months,sunday from 9:30 AM to 12:30 PM IST,"07 Apr, 2024",Foundations of Marketing | Customer Relations...,Marketing Management | Analytical Skills | De...,"Marketing & Sales Executives, Working Professi...",For Indian Participants – Graduates (10+2+3) ...,Role of Marketing Research for Better Marketi...,Dr. M. Geetha,Professor,Professor,Dr. Sreejesh S.,Associate Professor,Associate Professor,IIM Kozhikode,165000,1987.9518072289156


In [228]:
# Save the DataFrame to Excel file
df.to_excel(f'D:/Assistant Chatbot/output.xlsx', index=False)