In [1]:
import os
import pandas as pd

# The dataset file has no header row, so the column labels are supplied explicitly
# (in file order) when it is read.
COLUMN_NAMES = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
    "salary",
]
df = pd.read_csv('/kaggle/input/helllooo/adult_data (1).csv', names=COLUMN_NAMES)

# You must use Pandas to answer the following questions:

**How many people of each race are represented in this dataset? This should be a Pandas series with race names as the index labels. (race column)**

**What is the average age of men?**

**What is the percentage of people who have a Bachelor's degree?**

**What percentage of people with advanced education (Bachelors, Masters, or Doctorate) make more than 50K?**

**What percentage of people without advanced education make more than 50K?**

**What is the minimum number of hours a person works per week?**

**What percentage of the people who work the minimum number of hours per week have a salary of more than 50K?**

**What country has the highest percentage of people that earn >50K and what is that percentage?**

**Identify the most popular occupation for those who earn >50K in India.**

# Question 1

In [2]:
# Strip leading/trailing whitespace from every text column.
#
# Why: some values in the file carry stray spaces, and an exact comparison
# against a padded value would silently match nothing in the questions below.
#
# The check uses is_string_dtype on the column's dtype rather than
# `dtype == "object"`, so text columns are still recognised when pandas stores
# them with its dedicated string dtype instead of plain `object`.
# Re-running this cell is harmless: stripping already-clean text is a no-op.
df = df.apply(
    lambda col: col.str.strip() if pd.api.types.is_string_dtype(col.dtype) else col
)


In [3]:
# How many people of each race are represented in this dataset?
# The answer is required to be a pandas Series with race names as the index
# labels, so the value_counts() result is kept as a Series (not converted to a
# dict or a formatted string) and displayed as the cell's output.
race_count = df['race'].value_counts()
race_count


White: 27816
Black: 3124
Asian-Pac-Islander: 1039
Amer-Indian-Eskimo: 311
Other: 271


# Question 2

In [4]:
# Inspect the distinct labels stored in the `sex` column (M/F, Male/Female, etc.)
# so that later filters compare against the exact spelling used in the data.
df['sex'].unique()

array(['Male', 'Female'], dtype=object)

In [5]:
# What is the average age of men?
# Build the row mask first, then pull the `age` values for those rows in a
# single .loc selection and average them.
is_male = df['sex'] == 'Male'
avg_men_age = df.loc[is_male, 'age'].mean()
f"Average age of men: {avg_men_age}"

'Average age of men: 39.43354749885268'

# Question 3

In [6]:
# What is the percentage of people who have a Bachelor's degree?
# Numerator: rows whose education is exactly 'Bachelors'.
# Denominator: number of non-null education entries.
bachelors_count = df['education'].eq('Bachelors').sum()
education_count = df['education'].count()
per = round(bachelors_count / education_count * 100, 2)
f"Percentage of people who have a Bachelor's degree: {per}%"

"Percentage of people who have a Bachelor's degree: 16.45%"

In [7]:
# Inspect the distinct labels stored in the `salary` column so that later
# comparisons use the exact strings present in the data.
df['salary'].unique()

array(['<=50K', '>50K'], dtype=object)

In [8]:
# Inspect the distinct labels stored in the `education` column so that the
# degree names used in later filters match the data exactly.
df['education'].unique()

array(['Bachelors', 'HS-grad', '11th', 'Masters', '9th', 'Some-college',
       'Assoc-acdm', 'Assoc-voc', '7th-8th', 'Doctorate', 'Prof-school',
       '5th-6th', '10th', '1st-4th', 'Preschool', '12th'], dtype=object)

# Question 4

In [35]:
# What percentage of people with advanced education (Bachelors, Masters, or Doctorate) make more than 50K?
advanced_degrees = ['Bachelors', 'Masters', 'Doctorate']
has_advanced_degree = df['education'].isin(advanced_degrees)

# Restrict to the advanced-education rows, then compare the number of '>50K'
# salaries with the number of non-null salary entries in that subset.
adv_edu_people = df[has_advanced_degree]
adv_salaries = adv_edu_people['salary']
total = adv_salaries.count()
earning_plus50K = adv_salaries.eq('>50K').sum()
percentage = round(earning_plus50K / total * 100, 2)

f"Percentage of people with advanced education making more than 50K: {percentage}%"

'Percentage of people with advanced education making more than 50K: 46.54%'

# Question 5

In [66]:
# What percentage of people without advanced education make more than 50K?
advanced_degrees = ['Bachelors', 'Masters', 'Doctorate']
lacks_advanced_degree = ~df['education'].isin(advanced_degrees)

# Restrict to everyone whose education is NOT one of the advanced degrees, then
# compare the number of '>50K' salaries with the non-null salary entries.
not_adv_people = df[lacks_advanced_degree]
not_adv_salaries = not_adv_people['salary']
total = not_adv_salaries.count()
earning_plus50K = not_adv_salaries.eq('>50K').sum()
percentage = round(earning_plus50K / total * 100, 2)

f"Percentage of people without advanced education making more than 50K: {percentage}%"

'Percentage of people without advanced education making more than 50K: 17.37%'

# Question 6

In [71]:
# What is the minimum number of hours a person works per week?
# `min_hours` is kept as a notebook-level name because the Question 7 cell reads it.
weekly_hours = df['hours-per-week']
min_hours = weekly_hours.min()
f"Minimum number of work hours per week: {min_hours}"

'Minimum number of work hours per week: 1'

# Question 7

In [79]:
# List the column labels to confirm the exact spelling of the names used in the filters.
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'salary'],
      dtype='object')

In [98]:
# What percentage of the people who work the minimum number of hours per week have a salary of more than 50K?
# Relies on `min_hours`, which the Question 6 cell computes as the minimum of `hours-per-week`.
works_min_hours = df['hours-per-week'] == min_hours
total_people = df[works_min_hours]

# Of those minimum-hours rows, keep the ones whose salary label is '>50K'.
is_high_earner = total_people['salary'].eq('>50K')
earning_plus50K = total_people[is_high_earner]

percentage = round(len(earning_plus50K) / len(total_people) * 100, 2)
f"Percentage of the people with minimum number of hours/week having salary more than 50K: {percentage}%"

'Percentage of the people with minimum number of hours/week having salary more than 50K: 10.0%'

# Question 8

In [101]:
# NOTE(review): `time` is not used anywhere in this cell; the import is kept only
# in case another cell relies on it. Consider moving it to the imports cell or removing it.
import time
# What country has the highest percentage of people that earn >50K and what is that percentage?
#
# The mean of a boolean mask equals the fraction of True values, so grouping the
# "salary == '>50K'" mask by country gives each country's share of high earners
# directly, using pandas' built-in group mean instead of a Python lambda per group.
# The result is a Series indexed by country with values in [0, 1].
percentage_data = df["salary"].eq(">50K").groupby(df["native-country"]).mean()
print("Country with highest percentage of people that earn >50K:")
print(f"Country: {percentage_data.idxmax()}, Percentage: {round(percentage_data.max()*100, 2)}%")


Country with highest percentage of people that earn >50K:
Country: Iran, Percentage: 41.86%


# Question 9

In [105]:
# Identify the most popular occupation for those who earn >50K in India.

# Combine the two row conditions into one mask. Despite its name, `indians`
# holds only the rows whose native-country is 'India' AND whose salary is '>50K'.
from_india = df['native-country'] == 'India'
earns_over_50k = df['salary'] == '>50K'
indians = df[from_india & earns_over_50k]
# mode() returns the most frequent value(s); the entry labelled 0 is taken as the answer.
# (Alternative considered by the author: indians['occupation'].value_counts().idxmax())
occupation = indians['occupation'].mode()[0]
f"The most popular occupation for those who earn >50K in India: {occupation}"


'The most popular occupation for those who earn >50K in India: Prof-specialty'