### Importando bibliotecas

In [41]:
#Passo 1: Importar as bibliotecas
import pandas as pd
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import statistics  as sts
from geopy.geocoders import Nominatim
import folium
from folium.plugins import MarkerCluster
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import numbers

### Carregando e entendendo dados

In [42]:
#Passo 2: Carregar base de dados
df = pd.read_csv("forbes_billionaires.csv")
#Passo 3: Entender  e editar os tipo dos dados
df.info()
df.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2755 entries, 0 to 2754
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Name         2755 non-null   object 
 1   NetWorth     2755 non-null   float64
 2   Country      2755 non-null   object 
 3   Source       2755 non-null   object 
 4   Rank         2755 non-null   int64  
 5   Age          2630 non-null   float64
 6   Residence    2715 non-null   object 
 7   Citizenship  2739 non-null   object 
 8   Status       2090 non-null   object 
 9   Children     1552 non-null   float64
 10  Education    1409 non-null   object 
 11  Self_made    2737 non-null   object 
dtypes: float64(3), int64(1), object(8)
memory usage: 258.4+ KB


Unnamed: 0,Name,NetWorth,Country,Source,Rank,Age,Residence,Citizenship,Status,Children,Education,Self_made
0,Jeff Bezos,177.0,United States,Amazon,1,57.0,"Seattle, Washington",United States,In Relationship,4.0,"Bachelor of Arts/Science, Princeton University",True
1,Elon Musk,151.0,United States,"Tesla, SpaceX",2,49.0,"Austin, Texas",United States,In Relationship,7.0,"Bachelor of Arts/Science, University of Pennsy...",True


### Limpeza e tratamento de dados

In [43]:
#Passo 4: Limpeza e tratamento de dados
for column in df:

    nan_values = df[column].isnull().sum()
    
    if (is_numeric_dtype(df[column]) and nan_values>0):
        median = sts.median(df[column])
        df[column].fillna(median, inplace=True)
        #print("column:{}, NaN values: {}, media:{}".format(column,nan_values,median))
        
    if (is_string_dtype(df[column]) and nan_values>0):
        mode = sts.mode(df[column])
        if(isinstance(mode, float)): #Se a moda for = "NaN"
            mode = "Uninformed"
        df[column].fillna(mode, inplace=True)
        #print("column:{}, NaN values: {}, mode:{}".format(column,nan_values,mode))

#Alterando tipos de dados
df["NetWorth"] = df["NetWorth"].astype("float64")
df["Age"] = df["Age"].astype("int64")
df["Children"] = df["Children"].astype("int64")
df["Education"] = df["Education"].astype("str")

#Mostrando tipo dos dados
df.info()
df.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2755 entries, 0 to 2754
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Name         2755 non-null   object 
 1   NetWorth     2755 non-null   float64
 2   Country      2755 non-null   object 
 3   Source       2755 non-null   object 
 4   Rank         2755 non-null   int64  
 5   Age          2755 non-null   int64  
 6   Residence    2755 non-null   object 
 7   Citizenship  2755 non-null   object 
 8   Status       2755 non-null   object 
 9   Children     2755 non-null   int64  
 10  Education    2755 non-null   object 
 11  Self_made    2755 non-null   bool   
dtypes: bool(1), float64(1), int64(3), object(7)
memory usage: 239.6+ KB


Unnamed: 0,Name,NetWorth,Country,Source,Rank,Age,Residence,Citizenship,Status,Children,Education,Self_made
0,Jeff Bezos,177.0,United States,Amazon,1,57,"Seattle, Washington",United States,In Relationship,4,"Bachelor of Arts/Science, Princeton University",True
1,Elon Musk,151.0,United States,"Tesla, SpaceX",2,49,"Austin, Texas",United States,In Relationship,7,"Bachelor of Arts/Science, University of Pennsy...",True


### Incrementando dados de localização (latitude e longitude)

In [44]:
#state residence
df["StateResidence"] = df["Residence"].str.rsplit(', ', expand=True)[1]

#lat e long dos estados
geolocator = Nominatim(user_agent="teste")
states = df['StateResidence'].unique()
lat_coord_dictionary = dict(zip(states, pd.Series(states).apply(geolocator.geocode).apply(lambda x: (x.latitude))))
long_coord_dictionary = dict(zip(states, pd.Series(states).apply(geolocator.geocode).apply(lambda x: (x.longitude))))

df['Lat_StateCoord'] = df['StateResidence'].map(lat_coord_dictionary)
df['Long_StateCoord'] = df['StateResidence'].map(long_coord_dictionary)
df.head(2)

Unnamed: 0,Name,NetWorth,Country,Source,Rank,Age,Residence,Citizenship,Status,Children,Education,Self_made,StateResidence,Lat_StateCoord,Long_StateCoord
0,Jeff Bezos,177.0,United States,Amazon,1,57,"Seattle, Washington",United States,In Relationship,4,"Bachelor of Arts/Science, Princeton University",True,Washington,38.894992,-77.036558
1,Elon Musk,151.0,United States,"Tesla, SpaceX",2,49,"Austin, Texas",United States,In Relationship,7,"Bachelor of Arts/Science, University of Pennsy...",True,Texas,31.816038,-99.512099


### Análise Exploratória

Qual a distribuição de pessoas na lista da forbes 2021 por pais?

In [45]:
world_map= folium.Map(tiles="cartodbpositron")

marker_cluster = MarkerCluster().add_to(world_map)

#cria um circlemarker para cada coordenada
for i in range(len(df)):
        lat = df.iloc[i]['Lat_StateCoord']
        long = df.iloc[i]['Long_StateCoord']
        radius=5
        popup_text = """Country : {}<br>"""
        popup_text = popup_text.format(df.iloc[i]['Country'])
        
        folium.CircleMarker(location = [lat, long], radius=radius, popup= popup_text, fill =True).add_to(marker_cluster)
world_map


In [46]:
df2 = pd.DataFrame(df["Country"].value_counts())
df2 = df2.reset_index()
df2.columns = ['Country', 'Pop']
#df2
df3 = df2.nlargest(15,'Pop')
grafico1 = px.bar(df3, x="Country", y="Pop",
                  text="Pop", width= 1200)
grafico1.update_layout(
    title="Destribuição por pais",
    xaxis_title="Country",
    yaxis_title="População",
    font=dict(size=9.5,
             family="Arial")
    )
grafico1.show()

Qual a distribuição e média de idade?

In [47]:
fig = make_subplots(rows=1, cols=2,
                   subplot_titles=("Destribuição de idades","Média de idade"))

fig.add_trace(
    go.Histogram(x=df["Age"]),
    row=1, col=1
)

fig.add_trace(
    go.Box(y=df["Age"]),
    row=1, col=2
)

fig.update_layout(height=600, width=900)
fig.show()

Qual a destribuição de grau de escolaridade? (Obs: Refinar o que foi desenvolvido abaixo,vai ser necessário uma mineração de texto)

Ex: 

Bachelor: 10
Master: 8
High School: 7
Drop Out, High School: 7
Drop Out, University: 5
Ph.D: 4

In [48]:
df2 = pd.DataFrame(df["Education"].value_counts())
df2 = df2.reset_index()
df2.columns = ['Education', 'Pop']
df2
#df3 = df2.nlargest(15,'Pop')
#df3

Unnamed: 0,Education,Pop
0,Uninformed,1346
1,"Diploma, High School",23
2,"Bachelor of Arts/Science, Stanford University",13
3,"Bachelor of Arts/Science, Harvard University",11
4,"Bachelor of Arts/Science, University of Southe...",9
...,...,...
1116,"Bachelor of Arts/Science, New York University;...",1
1117,"Bachelor of Arts/Science, University of Maryland",1
1118,"Drop Out, West Virginia University",1
1119,"Doctorate, University of Chicago; Master of Bu...",1


In [49]:
lista1 = ["Drop Out", "Bachelor", "Master", "Doctorate", "Doctor", "Ph.D", "Uninformed"]
lista2 = ["University", "School"]

for i in lista1:
    if i == "Drop Out":
        for f in lista2:
            if [df2.Education.str.contains(i) & df2.Education.str.contains(f)]:
                #print(i,f)
                df2["EducationType"] = i+" "+f
    elif [df2.Education.str.contains(i)]:
            #print(i)
            df2["EducationType"] = i

df2.head()

Unnamed: 0,Education,Pop,EducationType
0,Uninformed,1346,Uninformed
1,"Diploma, High School",23,Uninformed
2,"Bachelor of Arts/Science, Stanford University",13,Uninformed
3,"Bachelor of Arts/Science, Harvard University",11,Uninformed
4,"Bachelor of Arts/Science, University of Southe...",9,Uninformed


In [None]:
df.loc

Qual a destribuição de "Fortuna feita pelo próprio individuo" e "Fortuna não feita pelo próprio individuo"?

**Possiveis próximos passos com essa analise:**<br>
. Qual a faixa de idade de pessoas que fazem fortunas por si próprios?<br>
. Essas pessoas tem qual grau de escolaridade?<br>
. Essas pessoas em sua maioria são de quais paises?<br>
. Quantos filhos essas pessoas tem?<br>
. Essas pessoas são casadas?<br>

In [50]:
df2 = pd.DataFrame(df["Self_made"].value_counts())
df2 = df2.reset_index()
df2.columns = ['Self_made', 'Pop']
df2


Unnamed: 0,Self_made,Pop
0,True,1978
1,False,777


In [51]:
#Passo 6: Conclusões