### Import Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Import Dataset

In [2]:
df = pd.read_csv("/kaggle/input/demographic-data-analyzer/adult_data.csv")
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


### Dataset Information

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   marital-status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loss    32561 non-null  int64 
 12  hours-per-week  32561 non-null  int64 
 13  native-country  32561 non-null  object
 14  salary          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


### Dataset Statistics

In [4]:
df.describe()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


# Exploratory Data Analysis

**1. How many people of each race are represented in this dataset**

In [5]:
race = df["race"].value_counts().to_frame().reset_index()
race.columns = ['Race', 'People']
race

Unnamed: 0,Race,People
0,White,27816
1,Black,3124
2,Asian-Pac-Islander,1039
3,Amer-Indian-Eskimo,311
4,Other,271


**2. What is the average age of men**

In [6]:
men = df[df["sex"] == 'Male']
men_avg_age = men['age'].agg('mean').round(2)
print('Average Age of Men is', men_avg_age)

Average Age of Men is 39.43


**3. What is the percentage of people who have a Bachelor's Degree**

In [7]:
education = df['education'].value_counts().to_frame().reset_index()
education['Percentage'] = (education['count'] / education['count'].sum() * 100).round(2).astype(str) + '%'
education.columns = ['Education', 'People', 'Percentage']
bachelors = education[education['Education'] == 'Bachelors']
bachelors

Unnamed: 0,Education,People,Percentage
2,Bachelors,5355,16.45%


**4. What percentage of people with advanced education (Bachelors, Masters, or Doctorate) make more than 50K?**

In [8]:
bmd = df[df["education"].isin(['Bachelors', 'Masters', 'Doctorate'])]
bmd_salary = bmd[bmd['salary'] == '>50K']
bmd_people_percentage = round((len(bmd_salary) / len(df)) * 100, 2)
print('Percentage of people with advanced education (Bachelors, Masters, or Doctorate) make more than 50K =', str(bmd_people_percentage) + '%')

Percentage of people with advanced education (Bachelors, Masters, or Doctorate) make more than 50K = 10.71%


**5. What percentage of people without advanced education make more than 50K?**

In [9]:
salary_50k = df[df['salary'] == '>50K']
salary_50k_low_education = salary_50k[~salary_50k.isin(['Bachelors', 'Masters', 'Doctorate'])]
salary_50k_low_education_percentage =round(((salary_50k_low_education.value_counts().sum()) / len(df)) * 100, 2)
print('Percentage of people with advanced education make more than 50K =', str(salary_50k_low_education_percentage) + '%')

Percentage of people with advanced education make more than 50K = 13.37%


**6. What is the minimum number of hours a person works per week?**

In [10]:
min_hours = df['hours-per-week'].min()
print('Minimum number of hours a person works per week =', min_hours)

Minimum number of hours a person works per week = 1


**7. What percentage of the people who work the minimum number of hours per week have a salary of more than 50K?**

In [11]:
min_hours_50K = df[(df['hours-per-week'] == df['hours-per-week'].min()) & (df['salary'] == '>50K')]
min_hours_50K_percentage = round((len(min_hours_50K) / len(df)) * 100, 3)
print('Percentage of the people who work the minimum number of hours per week have a salary of more than 50K =', str(min_hours_50K_percentage) + '%')

Percentage of the people who work the minimum number of hours per week have a salary of more than 50K = 0.006%


**8. What country has the highest percentage of people that earn >50K and what is that percentage?**

In [12]:
salary_50k = df[df['salary'] == '>50K']
salary_50k_country = salary_50k['native-country'].value_counts().to_frame().reset_index()
salary_50k_country.columns = ['Native Country', 'People']
salary_50k_country['Percentage'] = round((salary_50k_country['People'] / salary_50k_country['People'].sum()) * 100, 2)
salary_50k_country = salary_50k_country[salary_50k_country['Percentage'] == salary_50k_country['Percentage'].max()]
salary_50k_country

Unnamed: 0,Native Country,People,Percentage
0,United-States,7171,91.46


**9. Identify the most popular occupation for those who earn >50K in India.**

In [13]:
ind_50k = df[(df['native-country'] == 'India') & (df['salary'] == '>50K')]
ind_50k_ocu = ind_50k['occupation'].value_counts().to_frame().reset_index()
ind_50k_ocu.columns = ['Occupation', 'People']
ind_50k_ocu

Unnamed: 0,Occupation,People
0,Prof-specialty,25
1,Exec-managerial,8
2,Other-service,2
3,Tech-support,2
4,Transport-moving,1
5,Sales,1
6,Adm-clerical,1
