# Exploratory Analysis. Top World Billionaires

In [None]:
import numpy as np 
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt 

In [None]:
# Load the billionaires dataset from the project-relative CSV file.
data=pd.read_csv("data/billionaires.csv")
# Preview the first rows to check that the file was parsed as expected.
data.head()

In [None]:
# List the column names available in the dataset.
data.columns

In [None]:
# Dataset size as (number of rows, number of columns).
data.shape

In [None]:
# Print each column's dtype and non-null count to spot missing values.
data.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

In [None]:
# Number of distinct values per column; a count of 1 marks a constant column.
data.nunique()

Remove the columns that are of no use because they contain only one unique value.

In [None]:
# Show the distinct values held by each of the suspected constant columns.
for column_name in (
    'Wealth How Was Political',
    'Wealth How Was Founder',
    'Wealth How From Emerging',
):
    print(data[column_name].unique())

In [None]:
# Pie chart: share of rows falling under each 'Wealth Type' category.
category_counts = data['Wealth Type'].value_counts()

wealth_fig, wealth_ax = plt.subplots(figsize=(5, 5))
wealth_ax.pie(
    category_counts,
    labels=category_counts.index,
    autopct='%1.1f%%',  # print each slice's percentage with one decimal
)
wealth_ax.set_title('Wealth Type')
plt.show()

In [None]:
# Pie chart: share of rows falling under each 'Demographics Gender' category.
category_counts = data['Demographics Gender'].value_counts()

gender_fig, gender_ax = plt.subplots(figsize=(5, 5))
gender_ax.pie(
    category_counts,
    labels=category_counts.index,
    autopct='%1.1f%%',  # print each slice's percentage with one decimal
)
gender_ax.set_title('Demographics Gender')
plt.show()

In [None]:
# Five most frequent citizenships for each year, one pie chart per year.
# Percentages are relative to the five slices shown, not to all rows of that year.

fig, axs = plt.subplots(1, 3, figsize=(10, 5))

for year_ax, year in zip(axs, (1996, 2001, 2014)):
    # value_counts() sorts by descending frequency, so head(5) is the top five.
    top_countries = (
        data.loc[data['Year'] == year, 'Location Citizenship']
        .value_counts()
        .head(5)
    )
    year_ax.pie(top_countries, labels=top_countries.index, autopct='%1.1f%%')
    year_ax.set_title(f'Billionaires in {year}')

plt.show()

In [None]:
from wordcloud import WordCloud

# Word cloud of company sectors, each weighted by the summed net worth
# ('Wealth Worth In Billions') of the rows in that sector.
df = data[['Wealth Worth In Billions', 'Company Sector']]
df_grouped = df.groupby('Company Sector').sum()

sector_totals = df_grouped['Wealth Worth In Billions']
cloud_canvas = WordCloud(width=800, height=800, background_color='white')
# generate_from_frequencies() returns the WordCloud instance itself.
wordcloud = cloud_canvas.generate_from_frequencies(sector_totals)

plt.figure(figsize=(8, 8))
plt.imshow(wordcloud)
plt.axis('off')  # the axes carry no meaning for an image
plt.show()

In [None]:
# Top 5 company sectors by total net worth.
#
# The per-sector totals are recomputed from `data` here instead of reusing
# `df_grouped`: that name is rebound to per-industry totals by a later cell,
# so re-running this cell afterwards would silently rank industries rather
# than sectors.
sector_totals = data.groupby('Company Sector')[['Wealth Worth In Billions']].sum()
df_sorted = sector_totals.sort_values(by='Wealth Worth In Billions', ascending=False)
df_sorted.head()

In [None]:
# Bar chart of the ten industries with the largest summed net worth.

df = data[['Wealth Worth In Billions', 'Wealth How Industry']]
df_grouped = df.groupby('Wealth How Industry').sum()
df_sorted = df_grouped.sort_values(by='Wealth Worth In Billions', ascending=False)

# Rows are already in descending order, so the first ten are the top ten.
industries_ax = df_sorted.head(10).plot.bar(color='green')
industries_ax.set(
    title='Top 10 Industries with Highest Wealth Worth',
    xlabel='Industry',
    ylabel='SUM',
)
plt.show()

In [None]:
# Average net worth per gender category, shown as a bar chart.

wealth_by_gender = data.groupby('Demographics Gender')['Wealth Worth In Billions']
mean_wealth = wealth_by_gender.mean()

ax = mean_wealth.plot.bar(color=['blue', 'orange', 'green', 'pink'])
ax.set(
    xlabel='gender',
    ylabel='wealth worth in billions',
    title='Mean Wealth by Gender',
)
plt.show()

In [None]:
# Five industries with the most rows whose gender is 'female'.

female_rows = data.loc[data['Demographics Gender'] == 'female']
result = female_rows.groupby('Wealth How Industry').size()
sorted_result = result.sort_values(ascending=False)
sorted_result.head()

In [None]:
# Bucket ages into ten-year groups, store them in data['Age_Group'], and plot
# how many rows fall into each group.

# pd.cut intervals are right-inclusive by default, so the edges below produce
# (9, 19], (19, 29], ..., (89, 99]. Ages outside that range, and missing ages,
# become NaN and are left out of the counts.
age_bins = [9, 19, 29, 39, 49, 59, 69, 79, 89, 99]
# Derive the labels from the edges ('10-19', ..., '90-99') so the two lists
# cannot drift apart when the bins are edited.
age_labels = [f'{low + 1}-{high}' for low, high in zip(age_bins, age_bins[1:])]

data['Age_Group'] = pd.cut(data['Demographics Age'], bins=age_bins, labels=age_labels)
age_groups = data["Age_Group"].value_counts()

# sort_index() puts the bars in age order instead of frequency order.
age_ax = age_groups.sort_index().plot(kind='bar')
age_ax.set_title('Age Group Distribution')
age_ax.set_xlabel('age group')
age_ax.set_ylabel('count')
# End with plt.show(), like the other chart cells, so the notebook does not
# echo the Axes repr as the cell's output.
plt.show()

In [None]:
# Five most common age groups for each year, one pie chart per year.
# Requires the data['Age_Group'] column to exist already.

fig, axs = plt.subplots(1, 3, figsize=(10, 5))

for year_ax, year in zip(axs, (1996, 2001, 2014)):
    # value_counts() sorts by descending frequency, so head(5) is the top five.
    top_age_groups = (
        data.loc[data['Year'] == year, 'Age_Group']
        .value_counts()
        .head(5)
    )
    year_ax.pie(top_age_groups, labels=top_age_groups.index, autopct='%1.1f%%')
    year_ax.set_title(f'Top 5 Age Groups in {year}')

plt.show()

In [None]:
# Pairwise correlations between age, net worth and GDP, shown as a heatmap.
numeric_columns = ['Demographics Age', 'Wealth Worth In Billions', 'Location GDP']
corr_matrix = data[numeric_columns].corr()

heatmap_fig, heatmap_ax = plt.subplots(figsize=(5, 5))
# annot=True writes each coefficient inside its cell.
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=heatmap_ax)
heatmap_ax.set_title('Correlation Matrix')
plt.show()