# Ishaan Thakker - CORGIS Billionaire Dataset Rework

Using the data from the billionaire dataset on CORGIS, I will be reworking this data to one that I found which is what I will be using to guide me in creating a new dataset.

[Reference Dataset](https://github.com/civisanalytics/potato/tree/master/data/billion)

In [80]:
#Importing
import pandas as pd
import numpy as np

In [81]:
df = pd.read_csv("billionaire-corgis.csv")
df.head()

Unnamed: 0,name,age,children,marital-status,country,gender,industry,source,net-worth
0,charlene-de-carvalho-heineken,60.0,5,Married,Netherlands,Female,Food & Beverage,Inherited,9.8
1,beny-steinmetz,58.0,4,Married,Israel,Male,Metals & Mining,Self-made,2.7
2,lee-shau-kee,86.0,5,Divorced,Hong Kong,Male,Real Estate,Self-made,22.1
3,david-thomson,57.0,4,Divorced,Canada,Male,Media,Inherited,24.3
4,bidzina-ivanishvili,58.0,4,Married,Georgia,Male,Diversified,Self-made,5.2


In [82]:
existing = pd.read_csv("References/billionaires-corgis.csv")
existing.head()

Unnamed: 0,Bill Gates,1,1996,1975,Microsoft,founder,Software,new,40,male,...,8100000000000.0,North America,founder non-finance,18.5,New Sectors,True,Technology-Computer,not inherited,True.1,False
0,Bill Gates,1,2001,1975,Microsoft,founder,Software,new,45,male,...,10600000000000.0,North America,founder non-finance,58.7,New Sectors,True,Technology-Computer,not inherited,True,False
1,Bill Gates,1,2014,1975,Microsoft,founder,Software,new,58,male,...,0.0,North America,founder non-finance,76.0,New Sectors,True,Technology-Computer,not inherited,True,False
2,Warren Buffett,2,1996,1962,Berkshire Hathaway,founder,Finance,new,65,male,...,8100000000000.0,North America,founder non-finance,15.0,Traded Sectors,True,Consumer,not inherited,True,False
3,Warren Buffett,2,2001,1962,Berkshire Hathaway,founder,Finance,new,70,male,...,10600000000000.0,North America,founder non-finance,32.3,Traded Sectors,True,Consumer,not inherited,True,False
4,Carlos Slim Helu,2,2014,1990,Telmex,founder,Communications,privatization,74,male,...,0.0,Latin America,privatized and resources,72.0,Non-Traded Sectors,False,Media,not inherited,True,False


In [83]:
#Renaming columns to be visually better
df.rename(columns = {"name":"Name", "age":"Age", "children":"Children", "marital-status":"Married?",
                     "country":"Country", "gender":"Gender", "industry":"Industry", "source":"Source",
                     "net-worth":"Net-Worth"}, inplace = True)
df.columns

Index(['Name', 'Age', 'Children', 'Married?', 'Country', 'Gender', 'Industry',
       'Source', 'Net-Worth'],
      dtype='object')

In [84]:
#Before formatting data
df.head()

Unnamed: 0,Name,Age,Children,Married?,Country,Gender,Industry,Source,Net-Worth
0,charlene-de-carvalho-heineken,60.0,5,Married,Netherlands,Female,Food & Beverage,Inherited,9.8
1,beny-steinmetz,58.0,4,Married,Israel,Male,Metals & Mining,Self-made,2.7
2,lee-shau-kee,86.0,5,Divorced,Hong Kong,Male,Real Estate,Self-made,22.1
3,david-thomson,57.0,4,Divorced,Canada,Male,Media,Inherited,24.3
4,bidzina-ivanishvili,58.0,4,Married,Georgia,Male,Diversified,Self-made,5.2


In [85]:
# After formatting data
df['Married?'] = df['Married?'].replace(['Married'], 'Yes')
df['Married?'] = df['Married?'].replace(['Divorced'], 'No')
df['Name'] = df['Name'].str.title()
df['Name'] = df['Name'].replace('-', ' ', regex = True)
df.head()

Unnamed: 0,Name,Age,Children,Married?,Country,Gender,Industry,Source,Net-Worth
0,Charlene De Carvalho Heineken,60.0,5,Yes,Netherlands,Female,Food & Beverage,Inherited,9.8
1,Beny Steinmetz,58.0,4,Yes,Israel,Male,Metals & Mining,Self-made,2.7
2,Lee Shau Kee,86.0,5,No,Hong Kong,Male,Real Estate,Self-made,22.1
3,David Thomson,57.0,4,No,Canada,Male,Media,Inherited,24.3
4,Bidzina Ivanishvili,58.0,4,Yes,Georgia,Male,Diversified,Self-made,5.2


In [100]:
#Adding most important values to new dataframe
new = pd.DataFrame()
#new = pd.concat([df, existing])
new['Name'] = existing['Bill Gates']
new['Age'] = existing['40']
new.replace(0, np.nan, inplace=True)
new['Gender'] = existing['male']
new['Net_Worth'] = existing['18.5']
new['Region'] = existing['North America']
new['Company'] = existing['Microsoft']


new = new.drop(columns = '')


#Sorting the data by net worth so names are not repeating, makes for easier data manipulation
new = new.sort_values(by = ["Net_Worth"], ascending = False)

new

Unnamed: 0,Name,Age,Gender,Net_Worth,Region,Company
1,Bill Gates,58.0,male,76.0,North America,Microsoft
4,Carlos Slim Helu,74.0,male,72.0,Latin America,Telmex
7,Amancio Ortega,77.0,male,64.0,Europe,Zara
0,Bill Gates,45.0,male,58.7,North America,Microsoft
10,Warren Buffett,83.0,male,58.2,North America,Berkshire Hathaway
...,...,...,...,...,...,...
1437,Jerome Kohlberg,75.0,male,1.0,North America,Kohberg Kravis Roberts & Co
1436,James France,56.0,male,1.0,North America,NASCAR
1435,Jacques Servier,,male,1.0,Europe,Laboratories Servier
1434,Jacob and Yehudith Richter,,married couple,1.0,Middle East/North Africa,Medinol


In [101]:
new.to_csv("billionaires - corgis.csv", index = True)