# Data Preparation Week 9 and 10
## Joshua Greenert
## DSC540-T301 Data Preparation
## 11/1/2022

## Step 1

In [33]:
# Import required libraries.
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

# Download the Wikipedia page that lists the largest companies by revenue.
# The timeout keeps the notebook from hanging on a stalled connection, and
# raise_for_status() surfaces HTTP errors instead of parsing an error page.
page = requests.get(
    "https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue",
    timeout=30,
)
page.raise_for_status()
content = page.content

# Create the page parser
soup = BeautifulSoup(content, 'html.parser')

# Get the companies table (the first table whose class is 'wikitable sortable').
companies_table = soup.find('table', class_='wikitable sortable')
if companies_table is None:
    raise ValueError(
        "Could not find a 'wikitable sortable' table on the page; "
        "the page layout may have changed."
    )

# Columns of the resulting dataframe, in display order.
company_columns = ['Rank', 'Name', 'Industry', 'Revenue',
                   'Profit', 'Employees', 'Headquarters']

# Number of <td> cells a complete data row is expected to contain.
expected_cell_count = 7

# Fall back to the table itself when the markup has no explicit <tbody>.
table_body = companies_table.tbody
if table_body is None:
    table_body = companies_table

# Collect the rows in a plain list and build the dataframe once at the end;
# appending to a dataframe row by row re-allocates it on every insert.
company_records = []
for row in table_body.find_all('tr'):
    # Find all data cells for this row.
    cells = row.find_all('td')

    # Skip header rows (no <td>) and rows where data is missing for any field.
    if len(cells) != expected_cell_count:
        continue

    # Only the first six cells are stored; any remaining cell is ignored.
    name, industry, revenue, profit, employees, headquarters = (
        cell.text.strip() for cell in cells[:6]
    )

    company_records.append({
        # Rank is the running position among the rows that were kept.
        'Rank': len(company_records) + 1,
        'Name': name,
        'Industry': industry,
        'Revenue': revenue,
        'Profit': profit,
        'Employees': employees,
        'Headquarters': headquarters,
    })

# Passing `columns` keeps the column order (and the columns themselves)
# even when no rows were collected.
df_companies = pd.DataFrame(company_records, columns=company_columns)

# Print the head to show the data.
df_companies.head(5)

Unnamed: 0,Rank,Name,Industry,Revenue,Profit,Employees,Headquarters
0,1,Walmart,Retail,"$572,754","$13,673",2300000,United States
1,2,Amazon,Retail,"$469,822","$33,364",1608000,United States
2,3,State Grid,Electricity,"$460,616.9","$7,137.8",871145,China
3,4,China National Petroleum,Oil and gas,"$411,692.9","$9,637.5",1090345,China
4,5,Sinopec Group,Oil and gas,"$401,313.5","$8,316.1",542286,China


## Step 2

In [34]:
# Lowercase every company name.
lowercase_names = df_companies['Name'].str.lower()
df_companies['Name'] = lowercase_names

## Step 3

In [35]:
# Lowercase the categorical columns too, so that values differing only in
# case produce the same dummy variables later on.
for categorical_column in ('Industry', 'Headquarters'):
    df_companies[categorical_column] = df_companies[categorical_column].str.lower()

## Step 4

In [36]:
# Remove the currency punctuation from the revenue and profit values and
# convert both columns to floats.
import re  # NOTE: not used in this cell; kept in case other cells rely on it.

# Revenue: drop the dollar sign and thousands separators, then convert.
# regex=False matters here: '$' is an end-of-string anchor when treated as a
# regular expression, so it must be matched literally.
revenue_text = (
    df_companies['Revenue']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
)
df_companies['Revenue'] = revenue_text.astype(float)

# Profit: same clean-up. The typographic minus sign (U+2212) is also mapped to
# an ASCII '-' because float parsing does not accept it.
profit_text = (
    df_companies['Profit']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .str.replace('\u2212', '-', regex=False)
)
# Profit used to be stored back as text without being parsed, so anything that
# still cannot be parsed becomes NaN instead of aborting the cell.
df_companies['Profit'] = pd.to_numeric(profit_text, errors='coerce')
    

## Step 5

In [37]:
# Strip the thousands separators from the employee counts.
# The values are left as strings; only the commas are removed.
df_companies['Employees'] = df_companies['Employees'].map(
    lambda employee_text: employee_text.replace(",", "")
)

df_companies.head(5)

Unnamed: 0,Rank,Name,Industry,Revenue,Profit,Employees,Headquarters
0,1,walmart,retail,572754.0,13673.0,2300000,united states
1,2,amazon,retail,469822.0,33364.0,1608000,united states
2,3,state grid,electricity,460616.9,7137.8,871145,china
3,4,china national petroleum,oil and gas,411692.9,9637.5,1090345,china
4,5,sinopec group,oil and gas,401313.5,8316.1,542286,china


## Ethical Considerations

The data pulled into this document comes from Wikipedia, a website that allows users to make edits that are typically reviewed prior to publication.  When this information is pulled again in the future, it is entirely possible that it will have changed before its day of use.  Additionally, almost all companies that have their main headquarters in China are state-owned; this means that such a business may hold a monopoly within its industry, which could impact the findings of the exploratory data analysis.  With this consideration in mind, the analysis may require removal of state-owned companies to reduce the possibility of skewed data.