# Data Preparation Week 5 and 6
## Joshua Greenert
## DSC540-T301 Data Preparation
## 10/7/2022

In [25]:
# Assignment goal: perform at least 5 data transformations.
# The optional suggestions are listed in the string literal at the end of this cell.
import pandas as pd
import warnings
# Suppress every FutureWarning raised from this point on, so deprecation-style
# notices do not clutter the cell outputs.
warnings.simplefilter(action='ignore', category=FutureWarning)
# Bare string expression: being the last expression in the cell, its repr is
# echoed as the cell's output. It is a checklist, not executable logic.
'''
Replace Headers
Format data into a more readable format
Identify outliers and bad data
Find duplicates
Fix casing or inconsistent values
Conduct Fuzzy Matching
'''

'\nReplace Headers\nFormat data into a more readable format\nIdentify outliers and bad data\nFind duplicates\nFix casing or inconsistent values\nConduct Fuzzy Matching\n'

## Step 1

In [26]:
# Read the data into a dataframe.
df_companies = pd.read_csv('Fortune 1000 Companies by Revenue.csv')

# Replace the original headers with snake_case names that contain no spaces.
# Assigning to `.columns` renames positionally and raises ValueError if the
# number of names does not match the number of columns in the file.
# (`set_axis(..., inplace=True)` was deprecated in pandas 1.5 and removed in
# pandas 2.0, so direct assignment is used to stay version-independent.)
df_companies.columns = [
    "rank",
    "name",
    "revenues",
    "revenue_percent_change",
    "profits",
    "profits_percent_change",
    "assets",
    "market_value",
    "change_in_rank",
    "employees",
]

## Step 2

In [27]:
# Normalise every company name to lowercase so casing is consistent.
lowercase_names = df_companies['name'].str.lower()
df_companies['name'] = lowercase_names

# Spot-check the first ten names to confirm the update took effect.
df_companies['name'].head(10)

0               walmart
1                amazon
2                 apple
3            cvs health
4    unitedhealth group
5           exxon mobil
6    berkshire hathaway
7              alphabet
8              mckesson
9     amerisourcebergen
Name: name, dtype: object

## Step 3

In [28]:
# Strip leading/trailing whitespace from every text (object-dtype) column.
text_columns = df_companies.select_dtypes(['object']).columns
df_companies[text_columns] = df_companies[text_columns].apply(lambda col: col.str.strip())

# Columns that hold numbers stored as formatted text.
numeric_text_columns = ["revenues", "profits", "assets", "market_value", "employees"]

# Literal (old, new) substitutions, applied in this order to each column:
#   - drop dollar signs
#   - drop thousands-separator commas
#   - turn an opening parenthesis into a minus sign
#   - drop the closing parenthesis
character_replacements = [
    ('$', ''),
    (',', ''),
    ('(', '-'),
    (')', ''),
]

for column in numeric_text_columns:
    for old, new in character_replacements:
        # regex=False is explicit on purpose: '$', '(' and ')' are regex
        # metacharacters, and the default value of `regex` differs between
        # pandas versions. Literal matching is what is intended here.
        df_companies[column] = df_companies[column].str.replace(old, new, regex=False)

df_companies.head(5)

Unnamed: 0,rank,name,revenues,revenue_percent_change,profits,profits_percent_change,assets,market_value,change_in_rank,employees
0,1,walmart,572754,2.40%,13673,1.20%,244860,409795.0,-,2300000
1,2,amazon,469822,21.70%,33364,56.40%,420549,1658807.3,-,1608000
2,3,apple,365817,33.30%,94680,64.90%,351002,2849537.6,-,154000
3,4,cvs health,292111,8.70%,7910,10.20%,232999,132839.2,-,258000
4,5,unitedhealth group,287597,11.80%,17285,12.20%,212206,479830.3,-,350000


## Step 4

In [39]:
# Keep only the rows that report both profits and market value; a lone "-"
# in either column is treated as "not listed".
has_profits = df_companies["profits"] != "-"
has_market_value = df_companies["market_value"] != "-"
df_companies = df_companies.loc[has_profits & has_market_value]

## Step 5

In [41]:
# Convert the cleaned text columns from strings to numeric dtypes.
for column in ("revenues", "profits", "assets", "market_value", "employees"):
    df_companies[column] = pd.to_numeric(df_companies[column])

# Print a known row to confirm the conversion (and earlier cleaning) worked.
is_known_company = df_companies["name"] == "amerisourcebergen"
print(df_companies[is_known_company])

  rank               name  revenues revenue_percent_change  profits  \
9   10  amerisourcebergen  213988.8                 12.70%   1539.9   

  profits_percent_change   assets  market_value change_in_rank  employees  
9                      -  57337.8       32355.7             -2      40000  


## Step 6

In [52]:
# Drop the change-in-rank column, then display the resulting frame.
df_companies = df_companies.drop(columns=['change_in_rank'])
df_companies

Unnamed: 0,rank,name,revenues,revenue_percent_change,profits,profits_percent_change,assets,market_value,employees
0,1,walmart,572754.0,2.40%,13673.0,1.20%,244860.0,409795.0,2300000
1,2,amazon,469822.0,21.70%,33364.0,56.40%,420549.0,1658807.3,1608000
2,3,apple,365817.0,33.30%,94680.0,64.90%,351002.0,2849537.6,154000
3,4,cvs health,292111.0,8.70%,7910.0,10.20%,232999.0,132839.2,258000
4,5,unitedhealth group,287597.0,11.80%,17285.0,12.20%,212206.0,479830.3,350000
...,...,...,...,...,...,...,...,...,...
995,996,vizio holding,2124.0,4%,-39.4,-138.40%,935.8,1705.1,800
996,997,1-800-flowers.com,2122.2,42.50%,118.7,101.10%,1076.7,830.0,4800
997,998,cowen,2112.8,30.20%,295.6,36.60%,8748.8,744.1,1534
998,999,ashland global holdings,2111.0,-11.20%,220.0,-,6612.0,5601.9,4100
