<a href="https://colab.research.google.com/github/lilaceri/Working-with-data-/blob/main/Census_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Census Data

adult.data.csv is a dataset of demographic data that was extracted from the 1994 Census database.  

Here is a sample of what the data looks like:

|    |   age | workclass        |   fnlwgt | education   |   education-num | marital-status     | occupation        | relationship   | race   | sex    |   capital-gain |   capital-loss |   hours-per-week | native-country   | salary   |
|---:|------:|:-----------------|---------:|:------------|----------------:|:-------------------|:------------------|:---------------|:-------|:-------|---------------:|---------------:|-----------------:|:-----------------|:---------|
|  0 |    39 | State-gov        |    77516 | Bachelors   |              13 | Never-married      | Adm-clerical      | Not-in-family  | White  | Male   |           2174 |              0 |               40 | United-States    | <=50K    |
|  1 |    50 | Self-emp-not-inc |    83311 | Bachelors   |              13 | Married-civ-spouse | Exec-managerial   | Husband        | White  | Male   |              0 |              0 |               13 | United-States    | <=50K    |
|  2 |    38 | Private          |   215646 | HS-grad     |               9 | Divorced           | Handlers-cleaners | Not-in-family  | White  | Male   |              0 |              0 |               40 | United-States    | <=50K    |
|  3 |    53 | Private          |   234721 | 11th        |               7 | Married-civ-spouse | Handlers-cleaners | Husband        | Black  | Male   |              0 |              0 |               40 | United-States    | <=50K    |
|  4 |    28 | Private          |   338409 | Bachelors   |              13 | Married-civ-spouse | Prof-specialty    | Wife           | Black  | Female |              0 |              0 |               40 | Cuba             | <=50K    |


# pandas

pandas is an open-source library of data structures and functions for practical, real-world data analysis in Python.

Create an alias to the pandas library and read the data from the census data file.

In [None]:
import pandas as pd

def get_dataframe():
    """Download the census CSV file and return its contents as a DataFrame."""
    source = "https://raw.githubusercontent.com/freeCodeCamp/boilerplate-demographic-data-analyzer/master/adult.data.csv"
    return pd.read_csv(source)

df = get_dataframe()
df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


---
Find the number of each race represented in the dataset.

*Hint: This should be a Pandas series with race names as the index labels.*

In [None]:
def get_race_count():
    """Return how many rows of the census data belong to each race.

    Returns:
        A pandas Series indexed by race name holding integer counts, ordered
        from the most to the least frequent race. Rows with a missing race
        are not counted.
    """
    # value_counts() already yields a Series indexed by race name. Rebuilding
    # it over df["race"].unique() would add a NaN label (and turn the counts
    # into floats) whenever the column contains missing values.
    return df["race"].value_counts()

get_race_count()

---
Find the average age of men

In [None]:
def find_average_age_men():
    """Return the mean age of the male respondents, rounded to one decimal."""
    is_male = df["sex"] == "Male"
    male_ages = df.loc[is_male, "age"]
    return round(male_ages.mean(), 1)

find_average_age_men()

---
Find the percentage of people who have a Bachelor's degree.

In [None]:
def find_percentage(frame, col, val):
    """Return the share of entries in ``frame[col]`` that equal ``val``.

    Args:
        frame: DataFrame to inspect.
        col: Label of the column to look at.
        val: Value to match in that column.

    Returns:
        The matching entries as a percentage of the column's non-null
        entries, rounded to one decimal place.
    """
    column = frame[col]
    hits = (column == val).sum()
    return round(hits / column.count() * 100, 1)

# The task asks for Bachelor's degree holders, whose label in the
# "education" column is "Bachelors" (not "HS-grad").
find_percentage(df, "education", "Bachelors")

16.4

---
Find the percentage of people with advanced education (`Bachelors`, `Masters`, or `Doctorate`) who make more than 50K

In [None]:
def percentage_higher_rich():
    """Return the percentage of advanced-degree holders earning more than 50K.

    "Advanced" means an education value of Bachelors, Masters or Doctorate.
    The result is rounded to one decimal place.
    """
    advanced = df["education"].isin(["Bachelors", "Masters", "Doctorate"])
    over_50k = df["salary"] == ">50K"
    group_size = df.loc[advanced, "education"].count()
    rich_size = df.loc[advanced & over_50k, "education"].count()
    return round(rich_size / group_size * 100, 1)

percentage_higher_rich()

46.5

---
Find the percentage of people without advanced education who make more than 50K 

In [None]:
def percentage_lower_rich():
    """Return the percentage of people without advanced education earning >50K.

    Anyone whose education is not Bachelors, Masters or Doctorate counts as
    "without advanced education". The result is rounded to one decimal place.
    """
    not_advanced = ~df["education"].isin(["Bachelors", "Masters", "Doctorate"])
    over_50k = df["salary"] == ">50K"
    group_size = df.loc[not_advanced, "education"].count()
    rich_size = df.loc[not_advanced & over_50k, "education"].count()
    return round(rich_size / group_size * 100, 1)

percentage_lower_rich()

---
Find the minimum number of hours a person works per week (the `hours-per-week` feature) and the percentage of the people working that minimum number of hours who have a salary of >50K.

In [None]:
def find_percentage_min_hours_rich():
    """Return the percentage of minimum-hours workers who earn more than 50K.

    The minimum is the smallest value in the "hours-per-week" column; only
    people working exactly that many hours are considered.

    Returns:
        The percentage, rounded to one decimal place like the other
        percentage helpers in this notebook.
    """
    hours = df["hours-per-week"]
    works_min_hours = hours == hours.min()
    earns_over_50k = df["salary"] == ">50K"
    min_workers = works_min_hours.sum()
    rich_min_workers = (works_min_hours & earns_over_50k).sum()
    return round(rich_min_workers / min_workers * 100, 1)

find_percentage_min_hours_rich()

10.0

---
Find the country with the highest percentage of people who earn >50K

In [None]:
def find_highest_rich():
    """Return the country with the largest share of people earning >50K.

    Returns:
        A ``(country, percentage)`` tuple, where the percentage is the share
        of that country's rows with a ">50K" salary, rounded to one decimal.
    """
    is_rich = df["salary"] == ">50K"
    rich_by_country = df.loc[is_rich, "native-country"].value_counts()
    rich_by_country = rich_by_country.sort_values(ascending=False)
    people_by_country = df["native-country"].value_counts()
    # Countries with no >50K earners become NaN here and sort to the end.
    share = (rich_by_country / people_by_country * 100).sort_values(ascending=False)
    top_country = share.index[0]
    top_share = round(share.iloc[0], 1)
    return top_country, top_share

find_highest_rich()

('Iran', 41.9)

---
Find the most popular occupation for those who earn >50K in a given country

In [None]:
def find_top_occupation_rich(country):
    """Return the most common occupation among >50K earners from ``country``.

    Args:
        country: Value to match in the "native-country" column.

    Returns:
        The occupation held by the largest number of people from ``country``
        whose salary is ">50K", or None when no such person exists.
    """
    from_country = df["native-country"] == country
    earns_over_50k = df["salary"] == ">50K"
    # Restrict to high earners: without the salary condition the result would
    # be the most common occupation of everyone from that country.
    occupations = df.loc[from_country & earns_over_50k, "occupation"].value_counts()
    if occupations.empty:
        return None
    # value_counts() sorts by descending frequency, so the first label wins.
    return occupations.index[0]

find_top_occupation_rich("India")

'Prof-specialty'

In [None]:
def show_result(x, limit):
    """Print the first value, counting up from ``x`` in steps of 2, that is not below ``limit``.

    When ``x`` is already at or above ``limit`` it is printed unchanged.
    """
    current = x
    while current < limit:
        current += 2
    print(current)

show_result(1,100)

101
