In [None]:
!mamba install bs4==4.10.0 -y
!mamba install html5lib==1.1 -y

In [None]:
from bs4 import BeautifulSoup
import html5lib
import requests
import pandas as pd

## Extract Data Using Web Scraping

### Webpage Contents

Gather the contents of the webpage in text format using the `requests` library and assign it to the variable <code>html_data</code>

The wikipedia webpage https://web.archive.org/web/20200318083015/https://en.wikipedia.org/wiki/List_of_largest_banks provides information about largest banks in the world by various parameters. Scrape the data from the table 'By market capitalization' and store it in a JSON file.


In [None]:
import json
import requests
from bs4 import BeautifulSoup

url = 'https://web.archive.org/web/20200318083015/https://en.wikipedia.org/wiki/List_of_largest_banks'
response = requests.get(url)

if response.status_code == 200:
    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.find('table', {'class': 'wikitable'})
    data_list = [{'Bank Name': td[0].text.strip(), 'Market Cap': td[1].text.strip()} for td in
                 [row.find_all('td') for row in table.find_all('tr')[1:]] if len(td) >= 2]

    with open('largest_banks.json', 'w') as json_file:
        json.dump(data_list, json_file, indent=4)

    html_data = response.text
else:
    print("Failed to retrieve the webpage")


In [None]:
data_list

[{'Bank Name': '1', 'Market Cap': 'Industrial and Commercial Bank of China'},
 {'Bank Name': '2', 'Market Cap': 'China Construction Bank'},
 {'Bank Name': '3', 'Market Cap': 'Agricultural Bank of China'},
 {'Bank Name': '4', 'Market Cap': 'Bank of China'},
 {'Bank Name': '5', 'Market Cap': 'Mitsubishi UFJ Financial Group'},
 {'Bank Name': '6', 'Market Cap': 'HSBC Holdings PLC'},
 {'Bank Name': '7', 'Market Cap': 'JPMorgan Chase'},
 {'Bank Name': '8', 'Market Cap': 'Bank of America'},
 {'Bank Name': '9', 'Market Cap': 'BNP Paribas'},
 {'Bank Name': '10', 'Market Cap': 'Crédit Agricole'},
 {'Bank Name': '11', 'Market Cap': 'Citigroup Inc.'},
 {'Bank Name': '12', 'Market Cap': 'Japan Post Bank'},
 {'Bank Name': '13', 'Market Cap': 'Wells Fargo'},
 {'Bank Name': '14', 'Market Cap': 'Sumitomo Mitsui Financial Group'},
 {'Bank Name': '15', 'Market Cap': 'Mizuho Financial Group'},
 {'Bank Name': '16', 'Market Cap': 'Banco Santander'},
 {'Bank Name': '17', 'Market Cap': 'Deutsche Bank'},
 {'Ba

In [None]:
# html_data[760:783]

### Scraping the Data

<b> Question 2</b> Using the contents and `beautiful soup` load the data from the `By market capitalization` table into a `pandas` dataframe. The dataframe should have the bank `Name` and `Market Cap (US$ Billion)` as column names.  Display the first five rows using head.


In [None]:
#Replace the dots below
# soup="......"



import pandas as pd
from bs4 import BeautifulSoup

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(html_data, 'html.parser')

# Find the table with the desired data (By market capitalization)
table = soup.find('table', {'class': 'wikitable'})

# Initialize an empty list to store the data
data_list = []

# Iterate through the rows of the table
for row in table.find_all('tr')[1:]:  # Skip the header row
    columns = row.find_all('td')
    if len(columns) >= 2:  # Ensure that there are at least 2 columns with data
        bank_name = columns[0].text.strip()
        market_cap = columns[1].text.strip()
        data_list.append({'Bank Name': bank_name, 'Market Cap (US$ Billion)': market_cap})

# Create a Pandas DataFrame from the scraped data
df = pd.DataFrame(data_list)

# Display the first five rows using head
print(df.head())


  Bank Name                 Market Cap (US$ Billion)
0         1  Industrial and Commercial Bank of China
1         2                  China Construction Bank
2         3               Agricultural Bank of China
3         4                            Bank of China
4         5           Mitsubishi UFJ Financial Group



### Loading the Data

Load the `pandas` dataframe created above into a JSON named `bank_market_cap.json` using the `to_json()` function.


In [None]:
# Save the DataFrame to a JSON file
df.to_json('bank_market_cap.json', orient='records', lines=True)
df

Unnamed: 0,Bank Name,Market Cap (US$ Billion)
0,1,Industrial and Commercial Bank of China
1,2,China Construction Bank
2,3,Agricultural Bank of China
3,4,Bank of China
4,5,Mitsubishi UFJ Financial Group
...,...,...
96,97,Swedbank
97,98,BayernLB
98,99,State Street Corporation
99,100,China Zheshang Bank
