<div align="center">
    <h1>Demographic Data Analyzer</h1>
    <img src="https://i0.wp.com/persona.qcri.org/blog/wp-content/uploads/2021/03/persona_demographics.png?fit=625%2C419&ssl=1" alt="Demographic Picture" width="500">
</div>

<div>
    <h2>Overview</h2>
    <p>This is my personal solution to <a href="https://www.freecodecamp.org/" target="_blank">freeCodeCamp's</a> second <a href="https://www.freecodecamp.org/learn/data-analysis-with-python/data-analysis-with-python-projects/demographic-data-analyzer" target="_blank">Data Analysis with Python Challenge</a>.</p>
    <p>It is a simple data analysis of demographic data extracted from a 1994 Census database.</p>
</div>

In [2]:
# Import dependencies
import pandas as pd

In [3]:
# Load the census CSV (path is relative to the notebook's working directory)
demo_data = pd.read_csv(filepath_or_buffer="adult.data.csv")

In [4]:
# Check whether any cell in the frame is missing (NaN/None).
# NOTE(review): this only detects real NaN/None values; if the file encodes
# missing entries with a placeholder string, they are not flagged — confirm.
demo_data.isna().to_numpy().any()

False

In [5]:
# Explore the dataset: .shape is a (number of rows, number of columns) tuple
demo_data.shape

(32561, 15)

In [6]:
# List the column labels available in the dataset
demo_data.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'salary'],
      dtype='object')

In [7]:
# Preview the first five rows of the dataset
demo_data.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [8]:
# Preview the last five rows of the dataset
demo_data.tail(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K
32560,52,Self-emp-inc,287927,HS-grad,9,Married-civ-spouse,Exec-managerial,Wife,White,Female,15024,0,40,United-States,>50K


In [9]:
# Count how many people belong to each race.
# value_counts() returns a pandas Series indexed by the race label.
race_count = demo_data.loc[:, "race"].value_counts()
race_count

race
White                 27816
Black                  3124
Asian-Pac-Islander     1039
Amer-Indian-Eskimo      311
Other                   271
Name: count, dtype: int64

In [10]:
# Average age of men

# Keep only the rows whose "sex" column equals "Male"
men_data = demo_data.loc[demo_data["sex"].eq("Male")]

# Mean of the "age" column over that subset
average_age_men = men_data.loc[:, "age"].mean()
average_age_men

39.43354749885268

In [11]:
# Percentage of all people whose education level is "Bachelors"
bachelor_people = demo_data.loc[demo_data["education"].eq("Bachelors")]

# Row count of the subset relative to the row count of the full dataset
bachelor_percentage = 100 * bachelor_people.shape[0] / demo_data.shape[0]
bachelor_percentage

16.446055096587944

In [12]:
# Among people with advanced education (Bachelors, Masters, or Doctorate),
# find the percentage who earn more than 50K.

# Select the people with advanced education
advanced_edu = demo_data.loc[demo_data["education"].isin(["Bachelors", "Masters", "Doctorate"])]

# Of those, keep the ones whose salary is ">50K"
advanced_edu_more_than_50 = advanced_edu.loc[advanced_edu["salary"].eq(">50K")]

# Share of high earners within the advanced-education group, as a percentage
advanced_edu_more_than_50_per = 100 * advanced_edu_more_than_50.shape[0] / advanced_edu.shape[0]
advanced_edu_more_than_50_per

46.535843011613935

In [13]:
# Smallest value found in the "hours-per-week" column
min_weekly_hours = demo_data.loc[:, "hours-per-week"].min()
min_weekly_hours

1

In [14]:
# Percentage of people working the minimum number of hours per week who earn more than 50K

# People whose weekly hours equal the minimum computed above
min_weekly_people = demo_data.loc[demo_data["hours-per-week"].eq(min_weekly_hours)]

# Of those, the ones whose salary is ">50K"
min_weekly_people_more_than_50 = min_weekly_people.loc[min_weekly_people["salary"].eq(">50K")]

min_weekly_people_more_than_50_per = 100 * min_weekly_people_more_than_50.shape[0] / min_weekly_people.shape[0]
min_weekly_people_more_than_50_per

10.0

In [21]:
# Find the country with the highest percentage of people earning more than 50K

# Number of people per country
country_pop = demo_data.loc[:, "native-country"].value_counts()

# Number of people per country whose salary is ">50K"
country_pop_more_than_50 = demo_data.loc[demo_data["salary"].eq(">50K"), "native-country"].value_counts()

# Per-country percentage; the division aligns both Series on the country label,
# so a country absent from the ">50K" counts yields NaN rather than 0
country_pop_more_than_50_per = country_pop_more_than_50.mul(100).div(country_pop)

# Label of the largest percentage (idxmax skips NaN by default)
highest_per_country = country_pop_more_than_50_per.idxmax()
highest_per_country

'Iran'

In [22]:
# Largest per-country percentage of people earning more than 50K
# (NaN entries, if any, are ignored)
highest_per = country_pop_more_than_50_per.max(skipna=True)
highest_per

41.86046511627907

In [33]:
# Most common occupation among people from India who earn more than 50K

# Rename the hyphenated column so it can be referenced as a plain identifier in query()
ex_demo_data = demo_data.rename({"native-country": "country"}, axis="columns")

# People from India whose salary is ">50K"
rich_indians_data = ex_demo_data.query("country == 'India' and salary == '>50K'")

# mode() returns a Series of the most frequent value(s); take the first entry
riom = rich_indians_data.loc[:, "occupation"].mode().iat[0]
riom

'Prof-specialty'