# Data Preparation Week 7 and 8
## Joshua Greenert
## DSC540-T301 Data Preparation
## 10/21/2022

## Step 1

In [8]:
# Perform at least 5 data transformations.
import pandas as pd
from urllib.request import urlopen
import json

# Get the content from the page.
# The site (https://netimpactreport.com/datasets/largest-500) fills its table through a JavaScript call to a
# JSON API.  That API call was found in the browser's networking tab and is requested here directly
# instead of the website URL.
# The context manager closes the HTTP response as soon as the payload has been parsed.
with urlopen("https://netimpactreport.com/data/generated/FortuneGlobal500NIR.json") as response:
    data_json = json.load(response)

# Column order of the resulting dataframe.
company_columns = ['Rank', 'Company', 'Country', 'Revenue (M$)', 'Net Impact Ratio',
                   'Largest Benefit', 'Largest Cost', 'Society', 'Knowledge', 'Health', 'Environment']

# Build one record per company.  Iterating over the items themselves (instead of a fixed range(500))
# means a payload with fewer entries no longer raises an IndexError; every company returned is included.
company_rows = [
    {
        'Rank': item["rank"],
        'Company': item["name"],
        'Country': item["country"],
        'Revenue (M$)': item["revenueUSD"],
        'Net Impact Ratio': item["netImpactRatio"],
        # "P" holds the largest positive impact and "N" the largest negative impact.
        'Largest Benefit': item["biggestImpacts"]["P"]["impactCategoryTitle"],
        'Largest Cost': item["biggestImpacts"]["N"]["impactCategoryTitle"],
        'Society': item["dimensionTotals"]['S'],
        'Knowledge': item["dimensionTotals"]['K'],
        'Health': item["dimensionTotals"]['H'],
        'Environment': item["dimensionTotals"]['E'],
    }
    for item in data_json["items"]
]

# Create the dataframe in a single step.  Appending row by row with .loc re-allocates the frame on every
# insert; constructing it from the full list of records avoids that and lets pandas infer column dtypes once.
df_companies = pd.DataFrame(company_rows, columns=company_columns)

# Show the new dataframe.
df_companies.head(5)


Unnamed: 0,Rank,Company,Country,Revenue (M$),Net Impact Ratio,Largest Benefit,Largest Cost,Society,Knowledge,Health,Environment
0,1,Thermo Fisher Scientific,United States of America,25542000000.0,0.74237,Diseases,Scarce human capital,2.087694,1.442365,4.159877,-0.493861
1,2,Abbott Laboratories,United States of America,31904000000.0,0.719987,Diseases,Scarce human capital,2.147632,-0.355803,5.576975,-0.446213
2,3,Veolia Environnement,France,30431000000.0,0.698317,Waste,GHG emissions,3.457366,-0.077464,0.297366,2.817365
3,4,Medtronic,Ireland,30557000000.0,0.686935,Diseases,Scarce human capital,2.356223,-1.661113,4.8582,-0.230345
4,5,Iberdrola,Spain,40783000000.0,0.68244,Societal infrastructure,Scarce human capital,6.618977,0.234801,0.427617,1.618268


## Step 2

In [12]:
# Convert revenue from raw US dollars to millions of dollars so the values match the 'Revenue (M$)' label.
# Floor division gives the same whole-million result the earlier digit slicing produced for 11- and
# 12-digit figures, but it also converts values of any other magnitude instead of silently leaving
# them in raw dollars.
# NOTE: run this cell only once per freshly loaded dataframe; re-running it divides the values again.
df_companies['Revenue (M$)'] = df_companies['Revenue (M$)'] // 1_000_000

# Confirm updates were successful.
df_companies.head(5)

Unnamed: 0,Rank,Company,Country,Revenue (M$),Net Impact Ratio,Largest Benefit,Largest Cost,Society,Knowledge,Health,Environment
0,1,Thermo Fisher Scientific,United States of America,25542.0,0.74237,Diseases,Scarce human capital,2.087694,1.442365,4.159877,-0.493861
1,2,Abbott Laboratories,United States of America,31904.0,0.719987,Diseases,Scarce human capital,2.147632,-0.355803,5.576975,-0.446213
2,3,Veolia Environnement,France,30431.0,0.698317,Waste,GHG emissions,3.457366,-0.077464,0.297366,2.817365
3,4,Medtronic,Ireland,30557.0,0.686935,Diseases,Scarce human capital,2.356223,-1.661113,4.8582,-0.230345
4,5,Iberdrola,Spain,40783.0,0.68244,Societal infrastructure,Scarce human capital,6.618977,0.234801,0.427617,1.618268


## Step 3

In [13]:
# Round the net impact ratio to two decimal places so it reads as a tidy ratio.
df_companies['Net Impact Ratio'] = df_companies['Net Impact Ratio'].round(2)

## Step 4

In [14]:
# Round the Society, Knowledge, Health, and Environment scores to two decimal places.
for dimension_column in ('Society', 'Knowledge', 'Health', 'Environment'):
    df_companies[dimension_column] = df_companies[dimension_column].round(2)

## Step 5

In [15]:
# Normalize company names to lowercase.
company_names = df_companies['Company']
df_companies['Company'] = company_names.str.lower()

## Step 6

In [16]:
# Lowercase every categorical column as well, so that dummy variables created later
# are not split by differences in letter case.
for categorical_column in ('Country', 'Largest Benefit', 'Largest Cost'):
    df_companies[categorical_column] = df_companies[categorical_column].str.lower()

In [17]:
# Preview the first five rows to verify the transformations above were applied.
df_companies.head(n=5)

Unnamed: 0,Rank,Company,Country,Revenue (M$),Net Impact Ratio,Largest Benefit,Largest Cost,Society,Knowledge,Health,Environment
0,1,thermo fisher scientific,united states of america,25542.0,0.74,diseases,scarce human capital,2.09,1.44,4.16,-0.49
1,2,abbott laboratories,united states of america,31904.0,0.72,diseases,scarce human capital,2.15,-0.36,5.58,-0.45
2,3,veolia environnement,france,30431.0,0.7,waste,ghg emissions,3.46,-0.08,0.3,2.82
3,4,medtronic,ireland,30557.0,0.69,diseases,scarce human capital,2.36,-1.66,4.86,-0.23
4,5,iberdrola,spain,40783.0,0.68,societal infrastructure,scarce human capital,6.62,0.23,0.43,1.62


## Ethical Considerations

When using data from a source without an established reputation, there is a potential for the creator's bias to value certain outcomes over others (e.g., treating GHG emissions as worse than scarce human capital). It is therefore wise to keep this in mind when considering the Society, Knowledge, Health, and Environment features.  Moreover, the rank in this dataframe may itself reflect that bias, since it is based on the aforementioned factors.  Beyond those key considerations and their potential bias, the data here is fairly straightforward.  No other factors, such as revenue, raise ethical considerations that need to be explored beyond the general scope of the exploratory data analysis.