In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

# Web Scraper

In [2]:
from bs4 import BeautifulSoup
import requests

The only required input is the URL of the page to scrape.

In [3]:
# Download the Britannica page that lists the U.S. state capitals.
url = 'https://www.britannica.com/topic/list-of-state-capitals-in-the-United-States-2119210'
page = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page.
page.raise_for_status()
# Name the parser explicitly: 'html' is only a feature hint, so the parser that
# actually gets used would depend on what happens to be installed.
soup = BeautifulSoup(page.text, 'html.parser')
# Preview the start of the document rather than dumping the whole page.
print(str(soup)[:2000])

<!DOCTYPE html>
<html class="topic-desktop ui-unknown0 ui-unknown" data-ytrk-page="TOPIC PAGINATED SMALL" lang="en">
<head prefix="og: https://ogp.me/ns# fb: https://ogp.me/ns/fb#">
<meta charset="utf-8"/>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<meta content="74442380906" property="fb:pages"/>
<link href="https://cdn.britannica.com/mendel-resources/3-111" rel="dns-prefetch"/>
<link href="https://cdn.britannica.com/mendel-resources/3-111" rel="preconnect"/>
<link as="script" href="https://www.googletagservices.com/tag/js/gpt.js" rel="preload"/>
<link href="/favicon.png" rel="icon"/>
<meta content="This is a list of the cities that are state capitals in the United States, ordered alphabetically by state. This list also provides the most recent U.S. census population for each city as well as an estimated population. (This list does not include the capital of the United States, Washington,

Find all tables in the page. This page contains only one table, so picking the right one is simple.

In [4]:
# List every <table> element found in the parsed page.
soup.find_all('table')

[<table> <thead> <tr> <th>state</th> <th>capital</th> <th>population of capital: census</th> <th>population of capital: estimated</th> </tr> </thead> <tbody> <tr> <td><a class="md-crosslink" data-show-preview="true" href="https://www.britannica.com/place/Alabama-state">Alabama</a></td> <td><a class="md-crosslink" data-show-preview="true" href="https://www.britannica.com/place/Montgomery-Alabama">Montgomery</a></td> <td>(2020) 200,603</td> <td>(2021 est.) 198,665</td> </tr> <tr> <td><a class="md-crosslink" data-show-preview="true" href="https://www.britannica.com/place/Alaska">Alaska</a></td> <td><a class="md-crosslink" data-show-preview="true" href="https://www.britannica.com/place/Juneau">Juneau</a></td> <td>(2020) 32,255</td> <td>(2021 est.) 31,973</td> </tr> <tr> <td><a class="md-crosslink" data-show-preview="true" href="https://www.britannica.com/place/Arizona-state">Arizona</a></td> <td><a class="md-crosslink" data-show-preview="true" href="https://www.britannica.com/place/Phoenix

Let's first get just the column titles (the header cells) of the table.

*Note: `<th>` tag defines a header cell in an HTML table*

In [5]:
# Collect every <th> (header cell) element in the parsed page.
titles = soup.find_all('th')
titles

[<th>state</th>,
 <th>capital</th>,
 <th>population of capital: census</th>,
 <th>population of capital: estimated</th>]

Since we do not need the tags, let's clean up the data.

In [6]:
# Keep only the text of each header cell, stripping surrounding whitespace so
# the values can be used directly as clean column names.
titles_list = [title.text.strip() for title in titles]
titles_list

['state',
 'capital',
 'population of capital: census',
 'population of capital: estimated']

If the output still contains newlines or other unwanted characters, you can clean it further with, for example, `.strip()`.

Next, create a dataframe

In [7]:
import pandas as pd

# Start with an empty DataFrame whose columns are the scraped header titles.
df = pd.DataFrame(columns = titles_list)
df

  from pandas.core import (
Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Unnamed: 0,state,capital,population of capital: census,population of capital: estimated


Let's scrape the remaining data and fill this table!

In [8]:
# Every <tr> (table row) element in the page, header row included.
rows = soup.find_all('tr')
len(rows)

51

The data we are interested in is inside the **td** tags.

*Note: `<td>` tag defines a standard data cell in an HTML table.*

In [9]:
# Build one list of <td> cells per <tr>; a row that contains no <td> cells
# produces an empty list.

# -- Long version --
# row_data = []
# for row in rows:
#   row_data.append(row.find_all('td'))

# -- Short version --
row_data = [row.find_all('td') for row in rows]
row_data

[[],
 [<td><a class="md-crosslink" data-show-preview="true" href="https://www.britannica.com/place/Alabama-state">Alabama</a></td>,
  <td><a class="md-crosslink" data-show-preview="true" href="https://www.britannica.com/place/Montgomery-Alabama">Montgomery</a></td>,
  <td>(2020) 200,603</td>,
  <td>(2021 est.) 198,665</td>],
 [<td><a class="md-crosslink" data-show-preview="true" href="https://www.britannica.com/place/Alaska">Alaska</a></td>,
  <td><a class="md-crosslink" data-show-preview="true" href="https://www.britannica.com/place/Juneau">Juneau</a></td>,
  <td>(2020) 32,255</td>,
  <td>(2021 est.) 31,973</td>],
 [<td><a class="md-crosslink" data-show-preview="true" href="https://www.britannica.com/place/Arizona-state">Arizona</a></td>,
  <td><a class="md-crosslink" data-show-preview="true" href="https://www.britannica.com/place/Phoenix-Arizona">Phoenix</a></td>,
  <td>(2020) 1,608,139</td>,
  <td>(2021 est.) 1,624,569</td>],
 [<td><a class="md-crosslink" data-show-preview="true" hr

The first row collected is the header row: it contains only `<th>` cells and no `<td>` cells, so it yields an empty list.

In [10]:
# The header row holds <th> cells only, so no <td> cells were collected for it.
row_data[0]

[]

It turns out the first data row is at index 1.

In [11]:
# First row that actually carries <td> data cells.
row_data[1]

[<td><a class="md-crosslink" data-show-preview="true" href="https://www.britannica.com/place/Alabama-state">Alabama</a></td>,
 <td><a class="md-crosslink" data-show-preview="true" href="https://www.britannica.com/place/Montgomery-Alabama">Montgomery</a></td>,
 <td>(2020) 200,603</td>,
 <td>(2021 est.) 198,665</td>]

However, we only want the text portion.

In [12]:
# Print the plain text of every cell in the first data row (one per line),
# looping over the cells instead of repeating one print call per index.
for cell in row_data[1]:
    print(cell.text)

Alabama
Montgomery
(2020) 200,603
(2021 est.) 198,665


Therefore, more cleaning is necessary.

Add the remaining rows to the dataframe.

But does this code work?

In [13]:
# Collect the text of every data row first, then build the DataFrame in one
# step. Appending with df.loc[len(df)] grows the frame row by row and, when the
# cell is re-run, appends every row a second time; rebuilding from scratch makes
# the cell idempotent.
records = [
    [cell.text.strip() for cell in cells]
    for cells in row_data
    if cells  # the header row has no <td> cells, so it is skipped
]

# Reuse the column names of the (empty) frame created from the header titles.
df = pd.DataFrame(records, columns=df.columns)


In [14]:
# Display the populated capitals table.
df

Unnamed: 0,state,capital,population of capital: census,population of capital: estimated
0,Alabama,Montgomery,"(2020) 200,603","(2021 est.) 198,665"
1,Alaska,Juneau,"(2020) 32,255","(2021 est.) 31,973"
2,Arizona,Phoenix,"(2020) 1,608,139","(2021 est.) 1,624,569"
3,Arkansas,Little Rock,"(2020) 202,591","(2021 est.) 201,998"
4,California,Sacramento,"(2020) 524,943","(2021 est.) 525,041"
5,Colorado,Denver,"(2020) 715,522","(2021 est.) 711,463"
6,Connecticut,Hartford,"(2020) 121,054","(2021 est.) 120,576"
7,Delaware,Dover,"(2020) 39,403","(2021 est.) 38,992"
8,Florida,Tallahassee,"(2020) 196,068","(2021 est.) 197,102"
9,Georgia,Atlanta,"(2020) 498,715","(2021 est.) 496,461"


TODO:

- Scrape another table from Wikipedia
- Generate a new table/tables using dataframe
- Feel free to use other html tags
- Clean & preprocess

---

Other websites (for instance)
- https://www.timesjobs.com/
- https://www.tripadvisor.com/

### [NBA Teams](https://en.wikipedia.org/wiki/National_Basketball_Association)

#### Data Scraping

In [15]:
# Download and parse the Wikipedia article on the NBA.
url = 'https://en.wikipedia.org/wiki/National_Basketball_Association'
page = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page.
page.raise_for_status()
# Name the parser explicitly: 'html' is only a feature hint, so the parser that
# actually gets used would depend on what happens to be installed.
soup = BeautifulSoup(page.text, 'html.parser')

In [16]:
# Take the second table carrying the "wikitable" class (index 1).
# NOTE(review): the positional index depends on the article's current layout —
# confirm it still points at the intended table before relying on it.
table = soup.find_all('table', class_="wikitable")[1]

In [17]:
# First <tr> of the table, used below as the header row.
table_header = table.find('tr')

In [18]:
# Empty DataFrame whose column names are the stripped texts of the header
# row's <th> cells.
nba_df = pd.DataFrame(columns = [title.text.strip() for title in table_header.find_all('th')])
nba_df

Unnamed: 0,Teams,Win,Loss,Total,Year(s) won,Year(s) runner-up


In [19]:
# All rows of the table except the first one (the header row).
table_body = table.find_all('tr')[1:]

In [20]:
# Stripped text of every data cell, one list per table row. Rows without any
# <td> cell are skipped so they do not turn into all-empty records.
nba_data = []
for row in table_body:
    cells = [cell.text.strip() for cell in row.find_all('td')]
    if cells:
        nba_data.append(cells)

# Reuse the column names already scraped from the header row instead of
# re-typing them by hand.
nba_df = pd.DataFrame(nba_data, columns=nba_df.columns)
nba_df

Unnamed: 0,Teams,Win,Loss,Total,Year(s) won,Year(s) runner-up
0,Minneapolis/Los Angeles Lakers,17,15,32,"1949, 1950, 1952, 1953, 1954, 1972, 1980, 1982...","1959, 1962, 1963, 1965, 1966, 1968, 1969, 1970..."
1,Boston Celtics,17,5,22,"1957, 1959, 1960, 1961, 1962, 1963, 1964, 1965...","1958, 1985, 1987, 2010, 2022"
2,Philadelphia/San Francisco/Golden State Warriors,7,5,12,"1947, 1956, 1975, 2015, 2017, 2018, 2022","1948, 1964, 1967, 2016, 2019"
3,Chicago Bulls,6,0,6,"1991, 1992, 1993, 1996, 1997, 1998",—
4,San Antonio Spurs,5,1,6,"1999, 2003, 2005, 2007, 2014",2013
5,Syracuse Nationals/Philadelphia 76ers,3,6,9,"1955, 1967, 1983","1950, 1954, 1977, 1980, 1982, 2001"
6,Fort Wayne/Detroit Pistons,3,4,7,"1989, 1990, 2004","1955, 1956, 1988, 2005"
7,Miami Heat,3,4,7,"2006, 2012, 2013","2011, 2014, 2020, 2023"
8,New York Knicks,2,6,8,"1970, 1973","1951, 1952, 1953, 1972, 1994, 1999"
9,Houston Rockets,2,2,4,"1994, 1995","1981, 1986"


#### Clean & Preprocess

In [21]:
import numpy as np

def _parse_years(cell):
    """Turn a comma-separated year string into a list of ints.

    A value that is already a list is returned unchanged, so re-running the
    cell does not fail. A missing value or the '—' placeholder becomes an
    empty list. Any other non-numeric token still raises ValueError.
    """
    if isinstance(cell, list):
        return cell
    if not isinstance(cell, str) or cell.strip() == '—':
        return []
    return [int(token.strip()) for token in cell.split(',') if token.strip()]


year_cols = ['Year(s) won', 'Year(s) runner-up']
count_cols = ['Win', 'Loss', 'Total']
other_cols = [col for col in nba_df.columns if col not in year_cols]

# Work on a copy so the frame shown by the previous cell is left untouched.
nba_df = nba_df.copy()

# '—' marks "no value"; turn it into NaN everywhere except the year columns,
# where _parse_years maps it to an empty list instead.
nba_df[other_cols] = nba_df[other_cols].replace('—', np.nan)

for col in year_cols:
    nba_df[col] = nba_df[col].apply(_parse_years)

nba_df[count_cols] = nba_df[count_cols].apply(pd.to_numeric)

nba_df

Unnamed: 0,Teams,Win,Loss,Total,Year(s) won,Year(s) runner-up
0,Minneapolis/Los Angeles Lakers,17,15,32,"[1949, 1950, 1952, 1953, 1954, 1972, 1980, 198...","[1959, 1962, 1963, 1965, 1966, 1968, 1969, 197..."
1,Boston Celtics,17,5,22,"[1957, 1959, 1960, 1961, 1962, 1963, 1964, 196...","[1958, 1985, 1987, 2010, 2022]"
2,Philadelphia/San Francisco/Golden State Warriors,7,5,12,"[1947, 1956, 1975, 2015, 2017, 2018, 2022]","[1948, 1964, 1967, 2016, 2019]"
3,Chicago Bulls,6,0,6,"[1991, 1992, 1993, 1996, 1997, 1998]",[]
4,San Antonio Spurs,5,1,6,"[1999, 2003, 2005, 2007, 2014]",[2013]
5,Syracuse Nationals/Philadelphia 76ers,3,6,9,"[1955, 1967, 1983]","[1950, 1954, 1977, 1980, 1982, 2001]"
6,Fort Wayne/Detroit Pistons,3,4,7,"[1989, 1990, 2004]","[1955, 1956, 1988, 2005]"
7,Miami Heat,3,4,7,"[2006, 2012, 2013]","[2011, 2014, 2020, 2023]"
8,New York Knicks,2,6,8,"[1970, 1973]","[1951, 1952, 1953, 1972, 1994, 1999]"
9,Houston Rockets,2,2,4,"[1994, 1995]","[1981, 1986]"


### [Jobs for Software Engineer](https://www.timesjobs.com/candidate/job-search.html?from=submit&funcAreaSpec=35115&luceneResultSize=25&postWeek=3&searchType=Home_Search&cboPresFuncArea=35&pDate=Y&sequence=1&startPage=1)

#### Data Scraping

In [22]:
# One dict per job posting, collected across result pages 1-10.
data_list = []

for sequence in range(1, 11):
    url = f'https://www.timesjobs.com/candidate/job-search.html?from=submit&funcAreaSpec=35115&luceneResultSize=25&postWeek=3&searchType=Home_Search&cboPresFuncArea=35&pDate=Y&sequence={sequence}&startPage=1'
    page = requests.get(url, timeout=30)
    # Fail loudly on an HTTP error instead of silently parsing an error page.
    page.raise_for_status()
    soup = BeautifulSoup(page.text, 'html.parser')

    job_lists = soup.find_all('ul', class_="new-joblist")
    if not job_lists:
        # No result list on this page: skip it instead of raising IndexError.
        continue
    job_cards = job_lists[0].find_all('li', class_="clearfix job-bx wht-shd-bx")

    # Use a separate name for each card so the page-level `soup` is not overwritten.
    for job_card in job_cards:
        job_title = job_card.find('h2').text.strip()
        company_name = job_card.find(class_='joblist-comp-name').text.strip()

        # Top detail list: the first item is read as experience and the last
        # as location; when a rupee icon is present the middle item is salary.
        top_dtl = job_card.find('ul', class_='top-jd-dtl clearfix')
        top_items = top_dtl.find_all('li')
        experience = top_dtl.li.text.strip()
        location = top_items[-1].text.strip()
        if top_dtl.find('i', class_='rupee'):
            salary = top_items[len(top_items) // 2].text.strip()
        else:
            salary = None

        # Lower detail list: item 0 is the description, item 1 the key skills.
        detail_items = job_card.find('ul', class_='list-job-dtl clearfix').find_all('li')
        job_description = detail_items[0].text.strip()
        key_skills = detail_items[1].text.strip()
        posted_date = job_card.find('span', class_='sim-posted').text.strip()

        data_list.append({
            'Job Title': job_title,
            'Company Name': company_name,
            'Experience': experience,
            'Location': location,
            'Salary (₹)': salary,
            'Job Description': job_description,
            'Key Skills': key_skills,
            'Posted Date': posted_date
        })

jobs_df = pd.DataFrame(data_list)

#### Clean & Preprocess

In [23]:
import datetime

# Normalise the scraped text columns. Every pattern below is a literal string,
# and several contain regex metacharacters ('(', ')', '.'), so regex=False is
# passed explicitly instead of relying on the pandas-version-dependent default.
jobs_df['Job Title'] = jobs_df['Job Title'].str.title()
jobs_df['Company Name'] = jobs_df['Company Name'].str.replace('\r\n     (More Jobs)', '', regex=False).str.strip().str.title()
jobs_df['Experience'] = jobs_df['Experience'].str.replace('card_travel', '', regex=False).str.strip()
jobs_df['Location'] = jobs_df['Location'].str.replace('location_on\n', '', regex=False).str.strip()
jobs_df['Job Description'] = jobs_df['Job Description'].str.replace('Job Description:\r\n', '', regex=False).str.replace('... More Details', '', regex=False).str.strip()
jobs_df['Key Skills'] = jobs_df['Key Skills'].str.replace('KeySkills:\n\r\n      \r\n          ', '', regex=False).str.strip()

def convert_posted_date(date_str):
    """Convert a scraped "Posted ..." label into a timestamp where possible.

    Parameters
    ----------
    date_str : str or any
        Label such as "Posted 2 days ago" or "Posted today". Values that are
        not strings (for example an already converted ``pd.Timestamp``) are
        returned unchanged.

    Returns
    -------
    pd.Timestamp or str
        * "<N> ... ago" -> now minus N days.
        * text containing "today" -> now.
        * anything else (including "few days ago", which has no numeric
          count) -> the label text without the leading "Posted" word.

    Only a leading "Posted" word is removed, so applying the function to its
    own output gives the same result again.
    """
    if not isinstance(date_str, str):
        return date_str

    words = date_str.split()
    if words and words[0].lower() == 'posted':
        words = words[1:]
    text = ' '.join(words)

    if 'ago' in text:
        try:
            days_ago = int(words[0])
        except ValueError:
            # No numeric day count (e.g. "few days ago"): keep the text.
            return text
        # pd.Timedelta keeps this independent of how `datetime` is imported
        # elsewhere in the notebook.
        return pd.Timestamp.today() - pd.Timedelta(days=days_ago)
    if 'today' in text:
        return pd.Timestamp.today()
    return text

# Convert the "Posted ..." labels once; the conversion was previously applied
# twice in a row.
jobs_df['Posted Date'] = jobs_df['Posted Date'].apply(convert_posted_date)

# The patterns are literal text ('Lacs p.a.' contains dots), so disable regex
# matching explicitly.
jobs_df['Salary (₹)'] = jobs_df['Salary (₹)'].str.replace('₹Rs ', '', regex=False).str.replace('Lacs p.a.', 'Lakhs per annum', regex=False).str.strip()

In [24]:
# Display the cleaned job postings.
jobs_df

Unnamed: 0,Job Title,Company Name,Experience,Location,Salary (₹),Job Description,Key Skills,Posted Date
0,Servicenow Sme (Grc),Ltimindtree Ltd.,8 - 13 yrs,"Bengaluru / Bangalore, Chennai, Delhi, Hyde...",,"Job Title -ServiceNow SME (GRC)Skills - GRC, I...","GRC , IRM , Business Analyst",2024-02-14 15:35:13.183900
1,Software Engineer ( Senior Java Developer),Alpha Tech,0 - 3 yrs,"Bengaluru / Bangalore, Chennai, Hyderabad/Se...",4.50 - 7.50 Lakhs per annum,Job description1. Excellent written and verbal...,"Waterfall , Scrum , Agile Methodologies , Soft...",2024-02-14 15:35:13.183900
2,Software Engineer Bengaluru / Hyderabad / Che...,Alpha Tech,0 - 1 yrs,"Bengaluru / Bangalore, Chennai, Hyderabad/Se...",4.50 - 7.50 Lakhs per annum,Job descriptionEffectively collaborate on cros...,"Agile , Javascript , Html , c ++ , AWS , Testi...",2024-02-14 15:35:13.183900
3,Software Testing Engineer/ Qa Tester,Alpha Tech,0 - 1 yrs,"Bengaluru / Bangalore, Chennai, Hyderabad/Se...",4.50 - 7.50 Lakhs per annum,Job descriptionStrong understanding of Web and...,"Testing , Qa , Qc , Automation Testing , Selen...",2024-02-14 15:35:13.183900
4,Network Engineer Iii With Ot - J47625,Sampoorna Consultants Pvt Ltd,9 - 14 yrs,Chennai,20.00 - 40.00 Lakhs per annum,"OT InfrastructureDESIGN, BUILD, AND RUNThe res...","OT Network , Firewall",2024-02-14 15:35:13.183900
...,...,...,...,...,...,...,...,...
245,Module Lead Java Full Stack,Hyrefox Consultants,5 - 8 yrs,"Mumbai, Pune",,Front-end skills: Must have worked on ZK (Java...,"rest , css , html5 , javascript , spring boot ...",2024-02-12 15:35:13.185902
246,Lead Java Full Stack,Hyrefox Consultants,0 - 3 yrs,"Mumbai, Pune",,Front-end skills: Must have worked on ZK (Java...,"rest , css , html5 , javascript , spring boot ...",2024-02-12 15:35:13.185902
247,Manger Java Full Stack,Hyrefox Consultants,0 - 3 yrs,"Mumbai, Pune",,Front-end skills: Must have worked on ZK (Java...,"rest , css , html5 , javascript , spring boot ...",2024-02-12 15:35:13.185902
248,Sr Software Engineer Java Full Stack,Hyrefox Consultants,5 - 8 yrs,"Mumbai, Pune",,Front-end skills: Must have worked on ZK (Java...,"rest , css , html5 , javascript , spring boot ...",2024-02-12 15:35:13.185902


### [MyAnimeList - Fall 2023](https://myanimelist.net/anime/season/2023/fall)

#### Data Scraping

In [25]:
# Download and parse the MyAnimeList seasonal page for Fall 2023.
url = 'https://myanimelist.net/anime/season/2023/fall'
page = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page.
page.raise_for_status()
# Name the parser explicitly: 'html' is only a feature hint, so the parser that
# actually gets used would depend on what happens to be installed.
soup = BeautifulSoup(page.text, 'html.parser')

In [26]:
# Every <div> whose class attribute is exactly this string; each match is
# iterated below as one show category (its 'anime-header' text is read there).
tmp = soup.find_all('div' ,class_='seasonal-anime-list js-seasonal-anime-list js-seasonal-anime-list-key-1')

In [27]:
from datetime import datetime

def convert_date(date_str):
    """Reformat a compact 'YYYYMMDD' date string as 'Month DD, YYYY'.

    Raises ValueError (from ``strptime``) when the input does not match the
    '%Y%m%d' layout.
    """
    return datetime.strptime(date_str, '%Y%m%d').strftime('%B %d, %Y')

# One dict per show, collected across every category block found in `tmp`.
anime_details = []
for tv_types in tmp:
    # Category heading; it ends up in the 'Status' column.
    tv_type = tv_types.find('div', class_='anime-header').text.strip()
    shows = tv_types.find_all('div', class_='js-anime-category-producer seasonal-anime js-seasonal-anime js-anime-type-all js-anime-type-1')
    for anime in shows:
        title = anime.find('h2', class_='h2_anime_title').text.strip()
        big_div = anime.find('div', class_='synopsis js-synopsis')
        synopsis = big_div.find('p', class_='preline').text.strip()

        # Extract studios from the properties section.
        studios = []
        studios_section = big_div.find('div', class_='properties')
        if studios_section:
            # `string=` replaces the deprecated `text=` filter argument.
            studios_span = studios_section.find('span', class_='caption', string='Studios')
            if studios_span:
                studios = [
                    studio_link.text
                    for studio_link in studios_span.find_next_siblings('span', class_='item')
                ]
            else:
                # No caption reading exactly 'Studios' (presumably the
                # single-studio case): fall back to the first item in the
                # properties section.
                studio_item = studios_section.find('span', class_='item')
                if studio_item:
                    studios = [studio_item.text]

        # Extract genres; the list stays empty when the section is missing.
        genres_section = anime.find('div', class_='genres-inner')
        genres = (
            [genre_span.text.strip() for genre_span in genres_section.find_all('span', class_='genre')]
            if genres_section
            else []
        )

        # The member count was read before but never stored, so it is no
        # longer extracted.
        score = anime.find('span', class_='js-score').text.strip()
        start_date = convert_date(anime.find('span', class_='js-start_date').text.strip())
        anime_details.append({'Title': title, 'Studio': studios, 'Score': score, 'Genre': genres, 'Start Date': start_date, 'Status': tv_type, 'Synopsis': synopsis})

anime_df = pd.DataFrame(anime_details)

#### Clean & Preprocess

In [28]:
def _drop_unknown_studio(studio_list):
    """Return an empty list for the ['Unknown'] placeholder, else the list itself."""
    if studio_list == ['Unknown']:
        return []
    return studio_list


# Placeholder studio list -> empty list.
anime_df['Studio'] = anime_df['Studio'].map(_drop_unknown_studio)

# Placeholder synopsis text -> '-'.
synopsis_col = anime_df['Synopsis']
anime_df['Synopsis'] = synopsis_col.mask(synopsis_col == '(No synopsis yet.)', '-')

# A score of '0' -> '-'.
score_col = anime_df['Score']
anime_df['Score'] = score_col.mask(score_col == '0', '-')

In [29]:
# Display the cleaned seasonal anime table.
anime_df

Unnamed: 0,Title,Studio,Score,Genre,Start Date,Status,Synopsis
0,Sousou no Frieren,[Madhouse],9.13,"[Adventure, Drama, Fantasy]","September 29, 2023",TV (New),During their decade-long quest to defeat the D...
1,Spy x Family Season 2,"[CloverWorks, Wit Studio]",8.08,"[Action, Comedy]","October 07, 2023",TV (New),"With her ability to read minds, Anya Forger is..."
2,Tate no Yuusha no Nariagari Season 3,[Kinema Citrus],7.13,"[Action, Adventure, Drama, Fantasy]","October 06, 2023",TV (New),"After defeating the Spirit Tortoise, Naofumi h..."
3,Goblin Slayer II,[LIDENFILMS],7.24,"[Action, Adventure, Fantasy]","October 06, 2023",TV (New),Second season of Goblin Slayer.
4,Kage no Jitsuryokusha ni Naritakute! 2nd Season,[Nexus],8.41,"[Action, Comedy, Fantasy]","October 04, 2023",TV (New),"Enticed by its rumored wealth, Cid Kagenou hap..."
...,...,...,...,...,...,...,...
93,Kirin the Noop,[],5.13,[Adventure],"February 06, 2014",TV (Continuing),A shadow painting anime about a timid giraffe ...
94,Manul no Yuube,[],-,[Slice of Life],"November 11, 2018",TV (Continuing),"Within the nature ""Darwin Kita! Kikimono Shin ..."
95,Shin Nippon History,[],-,[Comedy],"January 06, 2020",TV (Continuing),-
96,Shuwawan!,[KOO-KI],-,[Slice of Life],"March 22, 2023",TV (Continuing),Shuwawan! chronicles the family life of real-l...
