### Scraping Data from Website

##### Step 1. Importing Libraries

In [1]:
from bs4 import BeautifulSoup
import requests

##### Step 2. Creating BeautifulSoup Object

In [2]:
# Download the Wikipedia page and parse its HTML into a BeautifulSoup tree.
url = 'https://en.wikipedia.org/wiki/List_of_largest_companies_in_the_United_States_by_revenue'
# A timeout keeps the cell from hanging indefinitely if the server never answers.
page = requests.get(url, timeout=30)
# Stop on an HTTP error (4xx/5xx) instead of parsing an error page, which would
# otherwise surface later as a confusing failure when looking up the tables.
page.raise_for_status()
soup = BeautifulSoup(page.text, 'html')

In [3]:
# print(soup)

##### Step 3. Finding the tag that is going to be extracted

In [4]:
# soup.find_all('table')

In [5]:
# soup.find('table', class_='wikitable sortable')

##### Step 4. Finding the wanted table and assigning the wanted tags to variables

In [6]:
# Select the second <table> element found on the page (index 1).
# NOTE(review): this positional index depends on the page's current layout —
# confirm it still points at the intended table if the page changes.
table = soup.find_all('table')[1]

In [7]:
# Gather every <th> (header cell) tag inside the selected table.
world_titles = table.find_all('th')
# Print the raw tags so they can be inspected before their text is extracted.
print(world_titles)

[<th>Rank
</th>, <th>Name
</th>, <th>Industry
</th>, <th>Revenue <br/>(USD millions)
</th>, <th>Revenue growth
</th>, <th>Employees
</th>, <th>Headquarters
</th>]


##### Step 5. Extracting data from the tags by iterating over them

In [8]:
# Reduce each <th> tag to its text with surrounding whitespace trimmed.
world_table_titles = list(map(lambda header: header.text.strip(), world_titles))
print(world_table_titles)

['Rank', 'Name', 'Industry', 'Revenue (USD millions)', 'Revenue growth', 'Employees', 'Headquarters']


##### Step 6. Importing pandas to create a DataFrame

In [9]:
import pandas as pd

In [10]:
# Start from an empty DataFrame whose columns are the scraped header names.
df = pd.DataFrame(columns = world_table_titles)
# Display the (still empty) frame to check the column layout.
df

Unnamed: 0,Rank,Name,Industry,Revenue (USD millions),Revenue growth,Employees,Headquarters


##### Step 7. Finding the wanted data in the table and assigning it to a variable

In [11]:
# Collect every <tr> (table row) tag of the selected table.
column_data = table.find_all('tr')
# Uncomment the next line to inspect the raw row tags.
# column_data

##### Step 8. Inserting row data into the DataFrame with a for loop

In [12]:
# Build the DataFrame from the table rows in one step.
#
# Rows are first collected in a plain list and the frame is constructed once,
# instead of appending to `df` one row at a time. This makes the cell idempotent
# (re-running it no longer appends duplicate rows) and avoids the repeated
# reallocation that row-by-row enlargement causes.
scraped_rows = []
# The first <tr> is skipped; presumably it is the header row holding the <th> cells.
for row in column_data[1:]:
    cells = [data.text.strip() for data in row.find_all('td')]
    if not cells:
        # A row without any <td> cells carries no data, so ignore it.
        continue
    if len(cells) != len(world_table_titles):
        # Fail loudly on a malformed row rather than silently padding it.
        raise ValueError(
            f"Row has {len(cells)} cells but {len(world_table_titles)} columns were expected: {cells}"
        )
    scraped_rows.append(cells)

df = pd.DataFrame(scraped_rows, columns=world_table_titles)

In [13]:
df

Unnamed: 0,Rank,Name,Industry,Revenue (USD millions),Revenue growth,Employees,Headquarters
0,1,Walmart,Retail,611289,6.7%,2100000,"Bentonville, Arkansas"
1,2,Amazon,Retail and cloud computing,513983,9.4%,1540000,"Seattle, Washington"
2,3,ExxonMobil,Petroleum industry,413680,44.8%,62000,"Spring, Texas"
3,4,Apple,Electronics industry,394328,7.8%,164000,"Cupertino, California"
4,5,UnitedHealth Group,Healthcare,324162,12.7%,400000,"Minnetonka, Minnesota"
...,...,...,...,...,...,...,...
95,96,Best Buy,Retail,46298,10.6%,71100,"Richfield, Minnesota"
96,97,Bristol-Myers Squibb,Pharmaceutical industry,46159,0.5%,34300,"New York City, New York"
97,98,United Airlines,Airline,44955,82.5%,92795,"Chicago, Illinois"
98,99,Thermo Fisher Scientific,Laboratory instruments,44915,14.5%,130000,"Waltham, Massachusetts"


##### Step 9. Creating a CSV file from the DataFrame, using the `index` option

In [14]:
# df.to_csv(r'C:\Users\comra\Jupyter Project\Alex_project\companies.csv', index=False)

##### Creating another DataFrame by repeating Steps 1 to 9

In [15]:
# Select the third <table> element found on the page (index 2).
# NOTE(review): this positional index depends on the page's current layout —
# confirm it still points at the intended table if the page changes.
table2 = soup.find_all('table')[2]
# Uncomment the next line to inspect the raw table markup.
# table2

In [16]:
# Gather every <th> (header cell) tag inside the second selected table.
industry_titles = table2.find_all('th')
# Print the raw tags so they can be inspected before their text is extracted.
print(industry_titles)

[<th>Rank
</th>, <th>Name
</th>, <th>Industry
</th>, <th>Revenue <br/>(USD billions)
</th>, <th>Employees
</th>, <th>Headquarters
</th>]


In [17]:
# Reduce each <th> tag to its text with surrounding whitespace trimmed.
industry_table_titles = list(map(lambda header: header.text.strip(), industry_titles))
print(industry_table_titles)

['Rank', 'Name', 'Industry', 'Revenue (USD billions)', 'Employees', 'Headquarters']


In [18]:
# Start from an empty DataFrame whose columns are the second table's header names.
df2 = pd.DataFrame(columns = industry_table_titles)
# Display the (still empty) frame to check the column layout.
df2

Unnamed: 0,Rank,Name,Industry,Revenue (USD billions),Employees,Headquarters


In [19]:
# Collect every <tr> (table row) tag of the second selected table.
col_data = table2.find_all('tr')
# Uncomment the next line to inspect the raw row tags.
# print(col_data)

In [20]:
# Build the second DataFrame from the table rows in one step.
#
# Rows are first collected in a plain list and the frame is constructed once,
# instead of appending to `df2` one row at a time. This makes the cell idempotent
# (re-running it no longer appends duplicate rows) and avoids the repeated
# reallocation that row-by-row enlargement causes.
industry_rows = []
# The first <tr> is skipped; presumably it is the header row holding the <th> cells.
for row in col_data[1:]:
    cells = [data.text.strip() for data in row.find_all('td')]
    if not cells:
        # A row without any <td> cells carries no data, so ignore it.
        continue
    if len(cells) != len(industry_table_titles):
        # Fail loudly on a malformed row rather than silently padding it.
        raise ValueError(
            f"Row has {len(cells)} cells but {len(industry_table_titles)} columns were expected: {cells}"
        )
    industry_rows.append(cells)

df2 = pd.DataFrame(industry_rows, columns=industry_table_titles)

In [21]:
df2

Unnamed: 0,Rank,Name,Industry,Revenue (USD billions),Employees,Headquarters
0,1,Cargill,Food industry,165.0,155000,"Minnetonka, Minnesota"
1,2,Koch Industries,Conglomerate,125.0,120000,"Wichita, Kansas"
2,3,Publix Super Markets,Retail,48.0,230000,"Winter Haven, Florida"
3,4,"Mars, Incorporated",Food industry,45.0,140000,"McLean, Virginia"
4,5,Pilot Corporation,Petroleum industry and Retail,41.9,30000,"Knoxville, Tennessee"
5,6,H-E-B,Retail,38.9,145000,"San Antonio, Texas"
6,7,Reyes Holdings,Wholesaling,35.3,33000,"Rosemont, Illinois"
7,8,C&S Wholesale Grocers,Wholesaling,33.0,14000,"Keene, New Hampshire"
8,9,Enterprise Holdings,Car rental,30.0,80000,"Clayton, Missouri"
9,10,Love's,Petroleum industry and Retail,25.5,38000,"Oklahoma City, Oklahoma"


In [22]:
# df2.to_csv(r'C:\Users\comra\Jupyter Project\Alex_project\industry.csv', index=False)