# Data Preparation Term Project
## Joshua Greenert
## DSC540-T301 Data Preparation
## 11/10/2022

## Prepare Flat File Data

In [9]:
# Imports, plus a filter that silences FutureWarning messages.
import pandas as pd
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Read the data into a dataframe.
df_companies_flat = pd.read_csv('Fortune 1000 Companies by Revenue.csv')

# Rename the columns to snake_case identifiers without spaces.  Assigning to
# ``columns`` replaces ``set_axis(..., inplace=True)``, whose ``inplace``
# argument was removed in pandas 2.0.
df_companies_flat.columns = [
    "rank", "name", "revenues", "revenue_percent_change", "profits",
    "profits_percent_change", "assets", "market_value", "change_in_rank",
    "employees",
]

# Reduce all names to lowercase.
df_companies_flat['name'] = df_companies_flat['name'].str.lower()

# Strip surrounding white space from every string (object) column.
df_obj = df_companies_flat.select_dtypes(['object'])
df_companies_flat[df_obj.columns] = df_obj.apply(lambda x: x.str.strip())

# Columns that hold numbers formatted as text, e.g. "$1,234.5" or "$(1,234.5)".
numeric_text_columns = ["revenues", "profits", "assets", "market_value", "employees"]

# Normalise the formatting of every numeric text column in one pass:
#   * drop dollar signs and thousands separators,
#   * turn an opening parenthesis into a minus sign and drop the closing one.
# ``regex=False`` is required: '$', '(' and ')' are regex metacharacters, so the
# replacements must be treated as literal text regardless of the pandas version.
for column in numeric_text_columns:
    df_companies_flat[column] = (
        df_companies_flat[column]
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
        .str.replace('(', '-', regex=False)
        .str.replace(')', '', regex=False)
    )

# Drop all rows that don't have profits or market value listed (a lone "-"
# is what remains for those rows).  ``.copy()`` gives an independent frame so
# the column assignments below do not write into a view of the unfiltered data.
has_profits = df_companies_flat["profits"] != "-"
has_market_value = df_companies_flat["market_value"] != "-"
df_companies_flat = df_companies_flat.loc[has_profits & has_market_value].copy()

# Update all numeric values to be numbers instead of strings.
for column in numeric_text_columns:
    df_companies_flat[column] = pd.to_numeric(df_companies_flat[column])

# Remove the change in rank column.
df_companies_flat = df_companies_flat.drop('change_in_rank', axis = 1)
df_companies_flat.head(5)

Unnamed: 0,rank,name,revenues,revenue_percent_change,profits,profits_percent_change,assets,market_value,employees
0,1,walmart,572754.0,2.40%,13673.0,1.20%,244860.0,409795.0,2300000
1,2,amazon,469822.0,21.70%,33364.0,56.40%,420549.0,1658807.3,1608000
2,3,apple,365817.0,33.30%,94680.0,64.90%,351002.0,2849537.6,154000
3,4,cvs health,292111.0,8.70%,7910.0,10.20%,232999.0,132839.2,258000
4,5,unitedhealth group,287597.0,11.80%,17285.0,12.20%,212206.0,479830.3,350000


## Prepare Website Data

In [10]:
import numpy as np
import requests
from bs4 import BeautifulSoup
import re

# Get the content from the page.  The timeout stops the cell from hanging
# indefinitely, and raise_for_status() fails loudly on an HTTP error instead of
# parsing an error page.
page = requests.get(
    "https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue",
    timeout=30,
)
page.raise_for_status()
content = page.content

# Create the page parser
soup = BeautifulSoup(content, 'html.parser')

# Get the companies table.
companies_table = soup.find('table', class_='wikitable sortable')
if companies_table is None:
    raise ValueError("Could not find the 'wikitable sortable' table on the page.")

# Rows are looked up under <tbody> when the markup has one, otherwise directly
# under the table.
table_body = companies_table.tbody or companies_table

# Only rows with exactly this many <td> cells are kept; the first six cells are
# the fields stored below.
EXPECTED_CELL_COUNT = 7

# Collect one record per usable row, then build the dataframe in a single step
# (growing a dataframe row by row is slow and leaves every column as object).
website_columns = ['Rank', 'Name', 'Industry', 'Revenue',
                   'Profit', 'Employees', 'Headquarters']
website_records = []
for table_row in table_body.find_all('tr'):
    cells = table_row.find_all('td')

    # Skip header rows (no <td>) and rows where data is missing for any field.
    if len(cells) != EXPECTED_CELL_COUNT:
        continue

    name, industry, revenue, profit, employees, headquarters = (
        cell.text.strip() for cell in cells[:6]
    )
    website_records.append({
        # Rank is the running position among the rows that were kept.
        'Rank': len(website_records) + 1,
        'Name': name,
        'Industry': industry,
        'Revenue': revenue,
        'Profit': profit,
        'Employees': employees,
        'Headquarters': headquarters,
    })

df_companies_website = pd.DataFrame(website_records, columns=website_columns)

# Convert the company names and the categorical variables to lowercase so
# they produce matching dummies in the future.
for column in ('Name', 'Industry', 'Headquarters'):
    df_companies_website[column] = df_companies_website[column].str.lower()

# Remove the dollar signs and commas from the revenue values and convert them
# to floats.  A value that is not a valid number raises here.
df_companies_website['Revenue'] = (
    df_companies_website['Revenue']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(float)
)

# Do the same for profit.  A Unicode minus sign (U+2212) is rewritten as an
# ASCII '-' first because float parsing does not accept it.  Anything that
# still is not a number becomes NaN rather than aborting the cell.
df_companies_website['Profit'] = pd.to_numeric(
    df_companies_website['Profit']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .str.replace('\u2212', '-', regex=False),
    errors='coerce',
)

# Remove all commas from the employees values as well (kept as text).
df_companies_website['Employees'] = (
    df_companies_website['Employees'].str.replace(',', '', regex=False)
)

df_companies_website.head(5)

Unnamed: 0,Rank,Name,Industry,Revenue,Profit,Employees,Headquarters
0,1,walmart,retail,572754.0,13673.0,2300000,united states
1,2,amazon,retail,469822.0,33364.0,1608000,united states
2,3,state grid,electricity,460616.9,7137.8,871145,china
3,4,china national petroleum,oil and gas,411692.9,9637.5,1090345,china
4,5,sinopec group,oil and gas,401313.5,8316.1,542286,china


## Prepare API Data

In [11]:
from urllib.request import urlopen
import json

# Download the JSON document.  The context manager closes the HTTP response
# once it has been read, and the timeout keeps the cell from hanging.
with urlopen("https://netimpactreport.com/data/generated/FortuneGlobal500NIR.json",
             timeout=30) as response:
    data_json = json.loads(response.read())

api_columns = ['Rank', 'Company', 'Country', 'Revenue (M$)',
               'Net Impact Ratio', 'Largest Benefit', 'Largest Cost',
               'Society', 'Knowledge', 'Health', 'Environment']

# Build one record per company.  Iterating over the items themselves (rather
# than a fixed range(500)) handles a feed with fewer or more entries.
api_records = []
for item in data_json["items"]:
    biggest_impacts = item["biggestImpacts"]
    dimension_totals = item["dimensionTotals"]
    api_records.append({
        'Rank': item["rank"],
        'Company': item["name"],
        'Country': item["country"],
        'Revenue (M$)': item["revenueUSD"],
        'Net Impact Ratio': item["netImpactRatio"],
        # "P" and "N" are the keys read for the largest benefit and largest cost.
        'Largest Benefit': biggest_impacts["P"]["impactCategoryTitle"],
        'Largest Cost': biggest_impacts["N"]["impactCategoryTitle"],
        'Society': dimension_totals['S'],
        'Knowledge': dimension_totals['K'],
        'Health': dimension_totals['H'],
        'Environment': dimension_totals['E'],
    })

# Create the dataframe in one step so each column gets a proper dtype.
df_companies_api = pd.DataFrame(api_records, columns=api_columns)

# Convert revenue from dollars to whole millions of dollars.  Integer division
# by 1,000,000 gives the same result as keeping the first 5 digits of an
# 11-digit value or the first 6 digits of a 12-digit value, and also scales
# values of any other magnitude instead of leaving them in raw dollars.
# The result is stored as float.
df_companies_api['Revenue (M$)'] = (
    df_companies_api['Revenue (M$)'].astype('int64') // 1_000_000
).astype(float)

# Round the net impact ratio and the S, K, H, and E values to two decimals.
for column in ('Net Impact Ratio', 'Society', 'Knowledge', 'Health', 'Environment'):
    df_companies_api[column] = df_companies_api[column].round(decimals = 2)

# Convert the company names and the categorical variables to lowercase so
# they produce matching dummies in the future.
for column in ('Company', 'Country', 'Largest Benefit', 'Largest Cost'):
    df_companies_api[column] = df_companies_api[column].str.lower()

# Confirm dataset was updated.
df_companies_api.head(5)

Unnamed: 0,Rank,Company,Country,Revenue (M$),Net Impact Ratio,Largest Benefit,Largest Cost,Society,Knowledge,Health,Environment
0,1,thermo fisher scientific,united states of america,25542.0,0.74,diseases,scarce human capital,2.09,1.44,4.16,-0.49
1,2,abbott laboratories,united states of america,31904.0,0.72,diseases,scarce human capital,2.15,-0.36,5.58,-0.45
2,3,veolia environnement,france,30431.0,0.7,waste,ghg emissions,3.46,-0.08,0.3,2.82
3,4,medtronic,ireland,30557.0,0.69,diseases,scarce human capital,2.36,-1.66,4.86,-0.23
4,5,iberdrola,spain,40783.0,0.68,societal infrastructure,scarce human capital,6.62,0.23,0.43,1.62


## Load Data Into Database

### Flat File Data

In [8]:
# Insert data into a SQLite database
import sqlite3

# Create the table using a query.  IF NOT EXISTS makes the cell safe to re-run:
# a plain CREATE TABLE raises "table flatfile already exists" the second time.
createTableQuery = """
CREATE TABLE IF NOT EXISTS flatfile(rank integer(5), name varchar(50), revenue float(20), revenue_change varchar(10), 
profits float(15), profit_change varchar(10), assets float(15), market_value float(20), employees integer(10));
"""

# Create the connection (the database file is created if it does not exist yet).
conn = sqlite3.connect('databases.sqlite')
conn.execute(createTableQuery)
conn.commit()

In [None]:
# Add the prepared flat-file data to the flatfile table.

# Dataframe columns in the same order as the columns of the flatfile table
# (rank, name, revenue, revenue_change, profits, profit_change, assets,
# market_value, employees).
flatfile_source_columns = [
    "rank", "name", "revenues", "revenue_percent_change", "profits",
    "profits_percent_change", "assets", "market_value", "employees",
]

# One plain tuple per company; itertuples(name=None) yields ordinary tuples
# whose values can be bound directly to the "?" placeholders.
data = list(
    df_companies_flat[flatfile_source_columns].itertuples(index=False, name=None)
)

# One placeholder per table column.
statement = "INSERT INTO flatfile VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?)"

# Run the delete and the inserts as a single transaction: "with conn" commits
# on success and rolls back on error.  Clearing the table first keeps the cell
# idempotent, so re-running it does not duplicate every row.
with conn:
    conn.execute("DELETE FROM flatfile")
    conn.executemany(statement, data)

### Website Data

### API Data