# Demographic Data Analyzer

## Dataset source
Dua, D. and Graff, C. (2019). UCI Machine Learning Repository. Irvine, CA: University of California, School of Information and Computer Science.

In [145]:
import math
import os

import numpy as np
import pandas as pd

In [146]:
# Location of the adult.data.csv file. The default is the machine-specific
# download path this notebook was originally run with; set the ADULT_DATA_CSV
# environment variable to point at another copy instead of editing this cell.
ADULT_DATA_CSV = os.environ.get('ADULT_DATA_CSV', r'\Users\loren\Downloads\adult.data.csv')

demographic_data_df = pd.read_csv(ADULT_DATA_CSV)

In [147]:
# Size of the loaded frame as a (rows, columns) tuple.
demographic_data_df.shape

(32561, 15)

In [148]:
# Display the frame; the rendered output below shows only the first and last rows.
demographic_data_df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [149]:
# Print per-column non-null counts and dtypes to check for missing values.
demographic_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   marital-status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loss    32561 non-null  int64 
 12  hours-per-week  32561 non-null  int64 
 13  native-country  32561 non-null  object
 14  salary          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


## Q1
##### How many people of each race are represented in this dataset? This should be a Pandas series with race names as the index labels. (race column)


In [150]:
# Tally the rows belonging to each race; the group keys become the index
# labels of the resulting Series.
rows_by_race = demographic_data_df.groupby('race')
race = rows_by_race['race'].count()

print(race)
type(race)

race
Amer-Indian-Eskimo      311
Asian-Pac-Islander     1039
Black                  3124
Other                   271
White                 27816
Name: race, dtype: int64


pandas.core.series.Series

## Q2
##### What is the average age of men?



In [151]:
# Restrict to the rows whose sex is "Male", then average the age column.
is_male = demographic_data_df['sex'] == "Male"
all_men = demographic_data_df[is_male]
all_men_age_mean = all_men['age'].mean()

print('The average age of men is', round(all_men_age_mean, 1))

The average age of men is 39.4


## Q3
##### What is the percentage of people who have a Bachelor's degree?


In [152]:
# Keep only the rows whose education value is exactly "Bachelors".
has_bachelors = demographic_data_df['education'] == "Bachelors"
bachelor_df = demographic_data_df[has_bachelors]
bachelor_df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K
11,30,State-gov,141297,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,India,>50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32530,35,?,320084,Bachelors,13,Married-civ-spouse,?,Wife,White,Female,0,0,55,United-States,>50K
32531,30,?,33811,Bachelors,13,Never-married,?,Not-in-family,Asian-Pac-Islander,Female,0,0,99,United-States,<=50K
32533,54,Private,337992,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,Asian-Pac-Islander,Male,0,0,50,Japan,>50K
32536,34,Private,160216,Bachelors,13,Never-married,Exec-managerial,Not-in-family,White,Female,0,0,55,United-States,>50K


In [172]:
# Number of non-null education entries: Bachelors holders vs. the whole dataset.
bachelor_number = bachelor_df['education'].count()
total_number = demographic_data_df['education'].count()

print(bachelor_number)
print(total_number)

# Only the last expression of a cell is displayed, so a single type check is kept.
type(bachelor_number)

5355
32561


numpy.int64

In [250]:
# Share of Bachelors holders among all respondents, scaled to a percentage.
bachelor_share = bachelor_number / total_number
bachelor_percentage = bachelor_share * 100

# Round to one decimal place for the report line only.
rounded_bachelor_percentage = round(bachelor_percentage, 1)
print('The percentage of people who have a degree is {}%'.format(rounded_bachelor_percentage))

The percentage of people who have a degree is 16.4%


## Q4
##### What percentage of people with advanced education (Bachelors, Masters, or Doctorate) make more than 50K?


In [282]:
edu_options = ['Bachelors', 'Masters', 'Doctorate']

# Rows whose education level is one of the advanced degrees listed above
has_advanced_degree = demographic_data_df['education'].isin(edu_options)
higher_education = demographic_data_df[has_advanced_degree]

# Of those rows, the ones with a salary above 50K
advanced_earns_over_50k = higher_education['salary'] == '>50K'
higher_salary_educated = higher_education[advanced_earns_over_50k]

In [255]:
# Display the advanced-degree rows whose salary is '>50K' (defined above).
higher_salary_educated

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K
11,30,State-gov,141297,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,India,>50K
19,43,Self-emp-not-inc,292175,Masters,14,Divorced,Exec-managerial,Unmarried,White,Female,0,0,45,United-States,>50K
20,40,Private,193524,Doctorate,16,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,60,United-States,>50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32533,54,Private,337992,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,Asian-Pac-Islander,Male,0,0,50,Japan,>50K
32536,34,Private,160216,Bachelors,13,Never-married,Exec-managerial,Not-in-family,White,Female,0,0,55,United-States,>50K
32538,38,Private,139180,Bachelors,13,Divorced,Prof-specialty,Unmarried,Black,Female,15020,0,45,United-States,>50K
32539,71,?,287372,Doctorate,16,Married-civ-spouse,?,Husband,White,Male,0,0,10,United-States,>50K


In [270]:
#Calculating %
rich_percentage = round((higher_salary_educated.value_counts().sum() / higher_education.value_counts().sum()) * 100,1)

print("The percentage of people with advanced education (Bachelors, Masters, or Doctorate) who make more than 50K is {} %".format(rich_percentage))

The percentage of people with advanced education (Bachelors, Masters, or Doctorate) who make more than 50K is 46.5 %


## Q5
##### What percentage of people without advanced education make more than 50K?


In [288]:
edu_options_to_exlude = ['Bachelors','Masters','Doctorate']

# People WITHOUT higher education. The exclusion list defined in this cell is
# used for the filter, so the cell does not depend on a variable from another cell.
lower_education = demographic_data_df[~demographic_data_df['education'].isin(edu_options_to_exlude)]

# People with lower education who make more than 50k
lower_but_not_poor = lower_education[lower_education['salary'] == '>50K']

# Calculating %
# Rows are counted with len(): DataFrame.value_counts() groups on every column
# and, by default, leaves out rows that contain a missing value.
lower_ed_high_salary_percentage = round((len(lower_but_not_poor) / len(lower_education)) * 100, 1)


In [290]:
# Report the Q5 percentage (already rounded to one decimal place where it is computed).
print("The percentage of people with lower education who make more than 50K is {} %".format(lower_ed_high_salary_percentage))

The percentage of people with lower education who make more than 50K is 17.4 %


## Q6
##### What is the minimum number of hours a person works per week?


In [138]:
# Smallest value found in the hours-per-week column.
min_hours = demographic_data_df['hours-per-week'].min()

In [140]:
# Report the Q6 answer stored in min_hours.
print("The minimum number of hours a person works per week is {} hours".format(min_hours))

The minimum number of hours a person works per week is 1 hours


## Q7
##### What percentage of the people who work the minimum number of hours per week have a salary of more than 50K?


In [296]:
#Filtering - People working min hours per week
working_min_hours = demographic_data_df[demographic_data_df['hours-per-week'] == 1]

#Filtering - People working min hours and making more than 50k
working_min_earning_max = working_min_hours[working_min_hours['salary'] == '>50K']

#Calculating percentage
working_min_earning_max_perc = (working_min_earning_max.value_counts().sum() / working_min_hours.value_counts().sum())*100

In [298]:
# Report the Q7 percentage.
# NOTE(review): the recorded output under this cell ("2 %") does not look like a value
# produced by the percentage expression in the cell above; it may come from an earlier
# kernel state. Re-run both cells in order to confirm.
print("The percentage of people that work minimum hours and earn more than 50k is {} %".format(working_min_earning_max_perc))

The percentage of people that work minimum hours and earn more than 50k is 2 %


## Q8
##### What country has the highest percentage of people that earn >50K and what is that percentage?


In [428]:
# '?' is the placeholder used in the data for an unknown native country.
values_to_exclude = ['?']

# Everyone earning more than 50K, then the same rows without the unknown-country
# placeholder. The cleaned frame is built from rich_countries (defined on the line
# above) so the cell runs on a fresh kernel.
rich_countries = demographic_data_df[demographic_data_df['salary'] == '>50K']
rich_countries_cleaned = rich_countries[~rich_countries['native-country'].isin(values_to_exclude)]

# Per-country share of >50K earners. The division aligns on the country label;
# labels missing from the numerator (the excluded '?' and any country with no
# >50K earners) become NaN, which idxmax()/max() skip by default.
wealth_percentages = (
    rich_countries_cleaned["native-country"].value_counts()
    / demographic_data_df["native-country"].value_counts()
) * 100
wealth_country = wealth_percentages.idxmax()
print("The country which has the highest percentage of people that earn >50k is", wealth_country)
print("The percentage is", round(wealth_percentages.max(), 1))


The country which has the highest percentage of people that earn >50k is Iran
The percentage is 41.9


## Q9
##### Identify the most popular occupation for those who earn >50K in India.


In [371]:
# Narrow to the rows whose native country is India, then to the >50K earners,
# and count how often each occupation appears among them.
from_india = demographic_data_df['native-country'] == "India"
india = demographic_data_df[from_india]

india_earns_over_50k = india['salary'] == ">50K"
rich_indians = india[india_earns_over_50k]

rich_indians['occupation'].value_counts()


Prof-specialty      25
Exec-managerial      8
Other-service        2
Tech-support         2
Transport-moving     1
Sales                1
Adm-clerical         1
Name: occupation, dtype: int64

The most popular occupation among Indians who earn more than 50K is Prof-specialty (25 people).