# FreeCodeCamp Challenge 02: Demographic Data Analyser

In [1]:
# Dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Reading csv files
df = pd.read_csv("adult.data.csv")
df.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
# Information about the dataframe
df.dtypes

age                int64
workclass         object
fnlwgt             int64
education         object
education-num      int64
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital-gain       int64
capital-loss       int64
hours-per-week     int64
native-country    object
salary            object
dtype: object

In [4]:
# Cleaning data. Droping rows with non values.
df.dropna(inplace = True) 

In [5]:
# Transforming education, race, sex, native-country and salary data from object to category type
df = df.astype({
    "education": "category",
    "occupation": "category",
    "race": "category",
    "sex": "category",
    "native-country": "category",
    "salary": "category"
    })
df.dtypes

age                  int64
workclass           object
fnlwgt               int64
education         category
education-num        int64
marital-status      object
occupation        category
relationship        object
race              category
sex               category
capital-gain         int64
capital-loss         int64
hours-per-week       int64
native-country    category
salary            category
dtype: object

### 01. Number of race in the dataset

In [6]:
race_count = df["race"].value_counts()
race_count

White                 27816
Black                  3124
Asian-Pac-Islander     1039
Amer-Indian-Eskimo      311
Other                   271
Name: race, dtype: int64

### 02. Average age of men

In [7]:
average_age_men = np.round(np.mean(df["age"][df["sex"] == "Male"]), 1)
average_age_men

39.4

### 03. Percentage of people with bachelor

In [8]:
percentage_bachelors = np.round((len(df["education"][df["education"] == "Bachelors"]) / len(df["education"])) * 100, 1)
percentage_bachelors

16.4

### 04. Percentage of people with or without advanced education that make more 50K

In [9]:
# People with advanced education (with Bachelors, Masters or Doctorate)
higher_education_filter = (df["education"] == "Bachelors") | (df["education"] == "Masters") | (df["education"] == "Doctorate")
higher_education = df[higher_education_filter]
higher_education.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K


In [10]:
# People with lower education
lower_education_filter = ~ higher_education_filter
lower_education = df[lower_education_filter]
lower_education.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
10,37,Private,280464,Some-college,10,Married-civ-spouse,Exec-managerial,Husband,Black,Male,0,0,80,United-States,>50K


In [11]:
# percentage of people with advanced education and salary >50K
number_higher_education = len(higher_education["education"])
higher_education_rich = np.round(len(higher_education[higher_education["salary"] == ">50K"]) / number_higher_education * 100, 1)
higher_education_rich

46.5

In [12]:
# percentage of people without advanced education and salary >50K
number_lower_education = len(lower_education["education"])
lower_education_rich = np.round(len(lower_education[lower_education["salary"] == ">50K"]) / number_lower_education * 100, 1)
lower_education_rich

17.4

### 05. Minimum number of hours a person works per week (hours-per-week feature)

In [13]:
min_work_hours = np.min(df["hours-per-week"])
min_work_hours

1

In [14]:

database_min_work_hours = df[df["hours-per-week"] == min_work_hours]
database_min_work_hours.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
189,58,State-gov,109567,Doctorate,16,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,1,United-States,>50K
1036,66,Self-emp-inc,150726,9th,5,Married-civ-spouse,Exec-managerial,Husband,White,Male,1409,0,1,?,<=50K
1262,69,?,195779,Assoc-voc,11,Widowed,?,Not-in-family,White,Female,0,0,1,United-States,<=50K
5590,78,?,363134,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,0,1,United-States,<=50K
5632,45,?,189564,Masters,14,Married-civ-spouse,?,Wife,White,Female,0,0,1,United-States,<=50K


### 06. Percentage of the people who work the minimum number of hours per week and have a salary of >50K

In [15]:
# Number of people who work the minimun number of hours per week
num_min_workers = len(database_min_work_hours["salary"])
num_min_workers

20

In [16]:
# Percentage of these people who have a salary >50K
num_min_workers_filter = database_min_work_hours["salary"] == ">50K"
rich_percentage = np.round(len(database_min_work_hours[num_min_workers_filter]) / num_min_workers * 100, 1)
rich_percentage

10.0

### 07. Country with the highest percentage of people that earn >50K

In [17]:
country_workers_more50k = df[df["salary"] == ">50K"] 
country_workers_more50k.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K
10,37,Private,280464,Some-college,10,Married-civ-spouse,Exec-managerial,Husband,Black,Male,0,0,80,United-States,>50K
11,30,State-gov,141297,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,India,>50K


In [18]:
serie_country_more50k = country_workers_more50k["native-country"].value_counts()
serie_country_more50k.head(5)

United-States    7171
?                 146
Philippines        61
Germany            44
India              40
Name: native-country, dtype: int64

In [19]:
# Number of people per country
df["ones"] = 1
people_per_country = df.groupby("native-country")["ones"].count().sort_values(ascending=False)
people_per_country.head(5)

native-country
United-States    29170
Mexico             643
?                  583
Philippines        198
Germany            137
Name: ones, dtype: int64

In [20]:
# Percentage of people per country that ears >50K
percentage_earning_country = np.round((serie_country_more50k / people_per_country) * 100, 1).sort_values(ascending=False)
percentage_earning_country.head(5)

Iran      41.9
France    41.4
India     40.0
Taiwan    39.2
Japan     38.7
dtype: float64

In [21]:
# Highest percentage of people that earn >50K
highest_earning_country = percentage_earning_country.idxmax()
highest_earning_country

'Iran'

### 08. Percentage of people that earn >50K in the country 

In [22]:
# Percentage of people that earn >50K in the country
highest_earning_country_percentage = percentage_earning_country.max()
highest_earning_country_percentage 

41.9

### 09. The most popular occupation for those who earn >50K in India

In [23]:
india_database = df[(df["native-country"] == "India") & (df["salary"] == ">50K")]
india_database.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary,ones
11,30,State-gov,141297,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,India,>50K,1
968,48,Private,164966,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,Asian-Pac-Islander,Male,0,0,40,India,>50K,1
1327,52,Private,168381,HS-grad,9,Widowed,Other-service,Unmarried,Asian-Pac-Islander,Female,0,0,40,India,>50K,1
7258,42,State-gov,102343,Prof-school,15,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,72,India,>50K,1
7285,54,State-gov,93449,Masters,14,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,India,>50K,1


In [24]:
top_IN_occupation = india_database["occupation"].value_counts().idxmax()
top_IN_occupation

'Prof-specialty'