# **Pandas Intermediate**

### **Import pandas**

In [5]:
import pandas as pd

### **Importing SQL Database**

In [1]:
import sqlite3

In [6]:
# Create Connection
conn = sqlite3.connect('census_data.db')

In [7]:
census_df = pd.read_sql_query("SELECT * FROM individuals", conn)
census_df

Unnamed: 0,individual_id,age,workclass,fnlwgt,education,educational_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income
0,1,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,2,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,3,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,4,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,5,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,48838,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
48838,48839,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
48839,48840,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
48840,48841,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


## **Data Inspection** 

Data inspection is the initial review of a dataset to find missing values, 
incorrect data types, and gather basic statistics, providing insights into its quality and structure.

In [None]:
# Step 1: Identify missing values
census_df.isnull()

Unnamed: 0,individual_id,age,workclass,fnlwgt,education,educational_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
48838,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
48839,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
48840,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [None]:
census_df.isnull().sum()

individual_id      0
age                0
workclass          0
fnlwgt             0
education          0
educational_num    0
marital_status     0
occupation         0
relationship       0
race               0
gender             0
capital_gain       0
capital_loss       0
hours_per_week     0
native_country     0
income             0
dtype: int64

In [None]:
(census_df == "?").sum()

individual_id         0
age                   0
workclass          2799
fnlwgt                0
education             0
educational_num       0
marital_status        0
occupation         2809
relationship          0
race                  0
gender                0
capital_gain          0
capital_loss          0
hours_per_week        0
native_country      857
income                0
dtype: int64

In [None]:
# Check data type for each columns
census_df.dtypes
census_df.describe()

Unnamed: 0,individual_id,age,educational_num,capital_gain,capital_loss,hours_per_week
count,48842.0,48842.0,48842.0,48842.0,48842.0,48842.0
mean,24421.5,38.643585,10.078089,1079.067626,87.502314,40.422382
std,14099.615261,13.71051,2.570973,7452.019058,403.004552,12.391444
min,1.0,17.0,1.0,0.0,0.0,1.0
25%,12211.25,28.0,9.0,0.0,0.0,40.0
50%,24421.5,37.0,10.0,0.0,0.0,40.0
75%,36631.75,48.0,12.0,0.0,0.0,45.0
max,48842.0,90.0,16.0,99999.0,4356.0,99.0


## **Cleaning Data**

Cleaning data involves eliminating or rectifying inaccuracies, inconsistencies, 
and missing values within your dataset, utilizing techniques such as handling 
missing values via deletion or imputation, rectifying data types, and detecting 
and eliminating duplicate entries, ultimately resulting in more precise and dependable analysis.

In [None]:
# replace(old value, new value, inplace=True)
census_df.replace("?", pd.NA, inplace=True)

In [None]:
census_df

Unnamed: 0,individual_id,age,workclass,fnlwgt,education,educational_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income
0,1,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,2,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,3,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,4,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,5,18,,103497,Some-college,10,Never-married,,Own-child,White,Female,0,0,30,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,48838,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
48838,48839,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
48839,48840,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
48840,48841,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [None]:
census_df.isnull().sum()

individual_id         0
age                   0
workclass          2799
fnlwgt                0
education             0
educational_num       0
marital_status        0
occupation         2809
relationship          0
race                  0
gender                0
capital_gain          0
capital_loss          0
hours_per_week        0
native_country      857
income                0
dtype: int64

In [None]:
# Fixing inconsistency
# fillna()
census_df["occupation"].fillna("Unemployed", inplace=True)

In [None]:
census_df

Unnamed: 0,individual_id,age,workclass,fnlwgt,education,educational_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income
0,1,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,2,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,3,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,4,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,5,18,,103497,Some-college,10,Never-married,Unemployed,Own-child,White,Female,0,0,30,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,48838,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
48838,48839,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
48839,48840,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
48840,48841,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [None]:
# dropna(): Drop/Delete rows/columns that has missing values
census_df.dropna(inplace=True)

In [None]:
census_df.isnull().sum()

individual_id      0
age                0
workclass          0
fnlwgt             0
education          0
educational_num    0
marital_status     0
occupation         0
relationship       0
race               0
gender             0
capital_gain       0
capital_loss       0
hours_per_week     0
native_country     0
income             0
dtype: int64

### **Trimming and Cleaning Text Data**

In [8]:
# strip()
census_df["workclass"].str.strip()
census_df["workclass"]

0             Private
1             Private
2           Local-gov
3             Private
4                   ?
             ...     
48837         Private
48838         Private
48839         Private
48840         Private
48841    Self-emp-inc
Name: workclass, Length: 48842, dtype: object

In [9]:
census_df["occupation"].unique()

array(['Machine-op-inspct', 'Farming-fishing', 'Protective-serv', '?',
       'Other-service', 'Prof-specialty', 'Craft-repair', 'Adm-clerical',
       'Exec-managerial', 'Tech-support', 'Sales', 'Priv-house-serv',
       'Transport-moving', 'Handlers-cleaners', 'Armed-Forces'],
      dtype=object)

In [11]:
# Goal: Improve Data Clarity (Occupation)
# {key:value}
occupation_data = {
    "Machine-op-inspct": "Machine Operator",
    "Farming-fishing": "Farming and Fishing",
    "Protective-serv": "Protective Services",
    "Other-service": "Other Service",
    "Prof-specialty": "Professional Specialty",
    "Craft-repair": "Craft Repair",
    "Adm-clerical": "Admin Clerical",
    "Exec-managerial": "Executive and Managerial",
    "Tech-support": "Tech Support",
    "Priv-house-serv": "Private Household Services",
    "Transport-moving": "Transportation and Moving",
    'Handlers-cleaners': "Handlers and Cleaners",
    'Armed-Forces': "Armed Forces"
}

# map()
# fillna()
census_df["occupation"] = census_df["occupation"].map(occupation_data).fillna(census_df["occupation"])
census_df.head()

Unnamed: 0,individual_id,age,workclass,fnlwgt,education,educational_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income
0,1,25,Private,226802,11th,7,Never-married,Machine Operator,Own-child,Black,Male,0,0,40,United-States,<=50K
1,2,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming and Fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,3,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective Services,Husband,White,Male,0,0,40,United-States,>50K
3,4,44,Private,160323,Some-college,10,Married-civ-spouse,Machine Operator,Husband,Black,Male,7688,0,40,United-States,>50K
4,5,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [12]:
# regex - Regular Expression
census_df["marital_status"] = census_df["marital_status"].replace("-", " ", regex=True)
census_df

Unnamed: 0,individual_id,age,workclass,fnlwgt,education,educational_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income
0,1,25,Private,226802,11th,7,Never married,Machine Operator,Own-child,Black,Male,0,0,40,United-States,<=50K
1,2,38,Private,89814,HS-grad,9,Married civ spouse,Farming and Fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,3,28,Local-gov,336951,Assoc-acdm,12,Married civ spouse,Protective Services,Husband,White,Male,0,0,40,United-States,>50K
3,4,44,Private,160323,Some-college,10,Married civ spouse,Machine Operator,Husband,Black,Male,7688,0,40,United-States,>50K
4,5,18,?,103497,Some-college,10,Never married,?,Own-child,White,Female,0,0,30,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,48838,27,Private,257302,Assoc-acdm,12,Married civ spouse,Tech Support,Wife,White,Female,0,0,38,United-States,<=50K
48838,48839,40,Private,154374,HS-grad,9,Married civ spouse,Machine Operator,Husband,White,Male,0,0,40,United-States,>50K
48839,48840,58,Private,151910,HS-grad,9,Widowed,Admin Clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
48840,48841,22,Private,201490,HS-grad,9,Never married,Admin Clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [None]:
# to make the text into Title case
census_df["marital_status"] = census_df["marital_status"].str.title()
census_df

Unnamed: 0,individual_id,age,workclass,fnlwgt,education,educational_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income
0,1,25,Private,226802,11th,7,Never Married,Machine Operator,Own-child,Black,Male,0,0,40,United-States,<=50K
1,2,38,Private,89814,HS-grad,9,Married Civ Spouse,Farming and Fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,3,28,Local-gov,336951,Assoc-acdm,12,Married Civ Spouse,Protective Services,Husband,White,Male,0,0,40,United-States,>50K
3,4,44,Private,160323,Some-college,10,Married Civ Spouse,Machine Operator,Husband,Black,Male,7688,0,40,United-States,>50K
4,5,18,?,103497,Some-college,10,Never Married,?,Own-child,White,Female,0,0,30,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,48838,27,Private,257302,Assoc-acdm,12,Married Civ Spouse,Tech Support,Wife,White,Female,0,0,38,United-States,<=50K
48838,48839,40,Private,154374,HS-grad,9,Married Civ Spouse,Machine Operator,Husband,White,Male,0,0,40,United-States,>50K
48839,48840,58,Private,151910,HS-grad,9,Widowed,Admin Clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
48840,48841,22,Private,201490,HS-grad,9,Never Married,Admin Clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [14]:
census_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   individual_id    48842 non-null  int64 
 1   age              48842 non-null  int64 
 2   workclass        48842 non-null  object
 3   fnlwgt           48842 non-null  object
 4   education        48842 non-null  object
 5   educational_num  48842 non-null  int64 
 6   marital_status   48842 non-null  object
 7   occupation       48842 non-null  object
 8   relationship     48842 non-null  object
 9   race             48842 non-null  object
 10  gender           48842 non-null  object
 11  capital_gain     48842 non-null  int64 
 12  capital_loss     48842 non-null  int64 
 13  hours_per_week   48842 non-null  int64 
 14  native_country   48842 non-null  object
 15  income           48842 non-null  object
dtypes: int64(6), object(10)
memory usage: 6.0+ MB


In [15]:
census_df["workclass"] = census_df["workclass"].astype('category')
census_df["education"] = census_df["education"].astype('category')
census_df["educational_num"] = census_df["educational_num"].astype('category')
census_df["marital_status"] = census_df["marital_status"].astype('category')
census_df["occupation"] = census_df["occupation"].astype('category')
census_df["relationship"] = census_df["relationship"].astype('category')
census_df["race"] = census_df["race"].astype('category')
census_df["gender"] = census_df["gender"].astype('category')
census_df["native_country"] = census_df["native_country"].astype('category')
census_df["income"] = census_df["income"].astype('category')
census_df["fnlwgt"] = census_df["fnlwgt"].astype('int64')
census_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   individual_id    48842 non-null  int64   
 1   age              48842 non-null  int64   
 2   workclass        48842 non-null  category
 3   fnlwgt           48842 non-null  int64   
 4   education        48842 non-null  category
 5   educational_num  48842 non-null  category
 6   marital_status   48842 non-null  category
 7   occupation       48842 non-null  category
 8   relationship     48842 non-null  category
 9   race             48842 non-null  category
 10  gender           48842 non-null  category
 11  capital_gain     48842 non-null  int64   
 12  capital_loss     48842 non-null  int64   
 13  hours_per_week   48842 non-null  int64   
 14  native_country   48842 non-null  category
 15  income           48842 non-null  category
dtypes: category(10), int64(6)
memory usage: 

### **Renaming columns and Reindexing**

In [16]:
census_df.columns

Index(['individual_id', 'age', 'workclass', 'fnlwgt', 'education',
       'educational_num', 'marital_status', 'occupation', 'relationship',
       'race', 'gender', 'capital_gain', 'capital_loss', 'hours_per_week',
       'native_country', 'income'],
      dtype='object')

In [17]:
columns = {
    "fnlwgt": "Final_weight_of_record",
    "educational_num": "educational_level",
    "hours_per_week": "working_hours_per_week",
    "income": "income_bracket"
}
# rename(columns={}, inplace=True)
census_df.rename(columns=columns, inplace=True)
census_df.columns

Index(['individual_id', 'age', 'workclass', 'Final_weight_of_record',
       'education', 'educational_level', 'marital_status', 'occupation',
       'relationship', 'race', 'gender', 'capital_gain', 'capital_loss',
       'working_hours_per_week', 'native_country', 'income_bracket'],
      dtype='object')

In [18]:
# Reindexing - change order of the column
# reindex
columns_list = ["age", "gender", "workclass", "education"]
reindex_df = census_df.reindex(columns=columns_list)
reindex_df

Unnamed: 0,age,gender,workclass,education
0,25,Male,Private,11th
1,38,Male,Private,HS-grad
2,28,Male,Local-gov,Assoc-acdm
3,44,Male,Private,Some-college
4,18,Female,?,Some-college
...,...,...,...,...
48837,27,Female,Private,Assoc-acdm
48838,40,Male,Private,HS-grad
48839,58,Female,Private,HS-grad
48840,22,Male,Private,HS-grad


### **Filtering and Selecting Data**

Filtering and selecting data are fundamental for focusing analysis on specific segments.

**Example**

1. Select individuals working more than 40 hours per week but earning '<=50K'.
2. Find divorced individuals in the Private sector.

In [19]:
census_df.head()

Unnamed: 0,individual_id,age,workclass,Final_weight_of_record,education,educational_level,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,working_hours_per_week,native_country,income_bracket
0,1,25,Private,226802,11th,7,Never Married,Machine Operator,Own-child,Black,Male,0,0,40,United-States,<=50K
1,2,38,Private,89814,HS-grad,9,Married Civ Spouse,Farming and Fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,3,28,Local-gov,336951,Assoc-acdm,12,Married Civ Spouse,Protective Services,Husband,White,Male,0,0,40,United-States,>50K
3,4,44,Private,160323,Some-college,10,Married Civ Spouse,Machine Operator,Husband,Black,Male,7688,0,40,United-States,>50K
4,5,18,?,103497,Some-college,10,Never Married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [20]:
census_df[(census_df["working_hours_per_week"] > 40) & (census_df["income_bracket"] == "<=50K")]

Unnamed: 0,individual_id,age,workclass,Final_weight_of_record,education,educational_level,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,working_hours_per_week,native_country,income_bracket
1,2,38,Private,89814,HS-grad,9,Married Civ Spouse,Farming and Fishing,Husband,White,Male,0,0,50,United-States,<=50K
23,24,25,Private,220931,Bachelors,13,Never Married,Professional Specialty,Not-in-family,White,Male,0,0,43,Peru,<=50K
27,28,23,Private,134446,HS-grad,9,Separated,Machine Operator,Unmarried,Black,Male,0,0,54,United-States,<=50K
29,30,32,Self-emp-not-inc,109282,Some-college,10,Never Married,Professional Specialty,Not-in-family,White,Male,0,0,60,United-States,<=50K
31,32,56,Self-emp-not-inc,186651,11th,7,Widowed,Other Service,Unmarried,White,Female,0,0,50,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48818,48819,30,Private,345898,HS-grad,9,Never Married,Craft Repair,Not-in-family,Black,Male,0,0,46,United-States,<=50K
48824,48825,45,Local-gov,119199,Assoc-acdm,12,Divorced,Professional Specialty,Unmarried,White,Female,0,0,48,United-States,<=50K
48829,48830,65,Self-emp-not-inc,99359,Prof-school,15,Never Married,Professional Specialty,Not-in-family,White,Male,1086,0,60,United-States,<=50K
48831,48832,43,Self-emp-not-inc,27242,Some-college,10,Married Civ Spouse,Craft Repair,Husband,White,Male,0,0,50,United-States,<=50K


In [22]:
# 2. Find divorced individuals in the Private sector.
census_df[
    (census_df["marital_status"] == "Divorced") & 
    (census_df["workclass"] == "Private")
    
    ]

Unnamed: 0,individual_id,age,workclass,Final_weight_of_record,education,educational_level,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,working_hours_per_week,native_country,income_bracket
34,35,26,Private,43311,HS-grad,9,Divorced,Executive and Managerial,Unmarried,White,Female,0,0,40,United-States,<=50K
51,52,39,Private,280215,HS-grad,9,Divorced,Handlers and Cleaners,Own-child,Black,Male,0,0,40,United-States,<=50K
61,62,39,Private,118429,Some-college,10,Divorced,Sales,Not-in-family,White,Male,0,0,40,United-States,<=50K
119,120,43,Private,179866,Bachelors,13,Divorced,Sales,Unmarried,White,Female,0,0,40,United-States,>50K
123,124,41,Private,110732,Some-college,10,Divorced,Tech Support,Not-in-family,White,Female,0,0,40,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48804,48805,30,Private,77266,HS-grad,9,Divorced,Transportation and Moving,Not-in-family,White,Male,0,0,55,United-States,<=50K
48815,48816,37,Private,179137,Some-college,10,Divorced,Admin Clerical,Unmarried,White,Female,0,0,39,United-States,<=50K
48819,48820,38,Private,139180,Bachelors,13,Divorced,Professional Specialty,Unmarried,Black,Female,15020,0,45,United-States,>50K
48825,48826,31,Private,199655,Masters,14,Divorced,Other Service,Not-in-family,Other,Female,0,0,30,United-States,<=50K


### **Removing Columns and Rows**

In [23]:
# Drop/Delete the first row/record
census_df.drop(0, inplace=True)
census_df.head()

Unnamed: 0,individual_id,age,workclass,Final_weight_of_record,education,educational_level,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,working_hours_per_week,native_country,income_bracket
1,2,38,Private,89814,HS-grad,9,Married Civ Spouse,Farming and Fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,3,28,Local-gov,336951,Assoc-acdm,12,Married Civ Spouse,Protective Services,Husband,White,Male,0,0,40,United-States,>50K
3,4,44,Private,160323,Some-college,10,Married Civ Spouse,Machine Operator,Husband,Black,Male,7688,0,40,United-States,>50K
4,5,18,?,103497,Some-college,10,Never Married,?,Own-child,White,Female,0,0,30,United-States,<=50K
5,6,34,Private,198693,10th,6,Never Married,Other Service,Not-in-family,White,Male,0,0,30,United-States,<=50K


In [24]:
# Remove multiple row/records
remove_rows = [1,3,7]
census_df.drop(index=remove_rows, inplace=True)
census_df

Unnamed: 0,individual_id,age,workclass,Final_weight_of_record,education,educational_level,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,working_hours_per_week,native_country,income_bracket
2,3,28,Local-gov,336951,Assoc-acdm,12,Married Civ Spouse,Protective Services,Husband,White,Male,0,0,40,United-States,>50K
4,5,18,?,103497,Some-college,10,Never Married,?,Own-child,White,Female,0,0,30,United-States,<=50K
5,6,34,Private,198693,10th,6,Never Married,Other Service,Not-in-family,White,Male,0,0,30,United-States,<=50K
6,7,29,?,227026,HS-grad,9,Never Married,?,Unmarried,Black,Male,0,0,40,United-States,<=50K
8,9,24,Private,369667,Some-college,10,Never Married,Other Service,Unmarried,White,Female,0,0,40,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,48838,27,Private,257302,Assoc-acdm,12,Married Civ Spouse,Tech Support,Wife,White,Female,0,0,38,United-States,<=50K
48838,48839,40,Private,154374,HS-grad,9,Married Civ Spouse,Machine Operator,Husband,White,Male,0,0,40,United-States,>50K
48839,48840,58,Private,151910,HS-grad,9,Widowed,Admin Clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
48840,48841,22,Private,201490,HS-grad,9,Never Married,Admin Clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [None]:
# axis/axes
# 1st list - List of indexes for each Row
# 2nd list - List of Columns name
census_df.axes
census_df.axes[0] # Rows
census_df.axes[1] # Columns

Index(['individual_id', 'age', 'workclass', 'Final_weight_of_record',
       'education', 'educational_level', 'marital_status', 'occupation',
       'relationship', 'race', 'gender', 'capital_gain', 'capital_loss',
       'working_hours_per_week', 'native_country', 'income_bracket'],
      dtype='object')

In [None]:
# this will drop or delete the column Final_weight_of_record
census_df.drop("Final_weight_of_record", axis=1, inplace=True)
census_df.columns

Index(['individual_id', 'age', 'workclass', 'education', 'educational_level',
       'marital_status', 'occupation', 'relationship', 'race', 'gender',
       'capital_gain', 'capital_loss', 'working_hours_per_week',
       'native_country', 'income_bracket'],
      dtype='object')

In [None]:
# this will delete multiple columns
census_df.drop(columns=["capital_gain", "capital_loss", "relationship"], inplace=True)
census_df.columns

## **Handling Duplicates**

Identifying and removing duplicate records are crucial for maintaining data quality.

In [None]:
# checking of duplicates
census_df.duplicated()

2        False
4        False
5        False
6        False
8        False
         ...  
48837    False
48838    False
48839    False
48840    False
48841    False
Length: 48838, dtype: bool

In [None]:
#  checking the total number of duplicate rows

census_df.duplicated().sum()

np.int64(0)

In [32]:
# Remove duplicates
census_df.drop_duplicates(inplace=True)

### **Aggregating Data** (.groupby)

Aggregating data involves summarizing data points into meaningful statistics, 
such as averages, sums, or counts, which can be achieved using GroupBy operations or pivot tables. 
This helps in understanding the dataset at a higher level.

In [33]:
census_df.head()

Unnamed: 0,individual_id,age,workclass,education,educational_level,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,working_hours_per_week,native_country,income_bracket
2,3,28,Local-gov,Assoc-acdm,12,Married Civ Spouse,Protective Services,Husband,White,Male,0,0,40,United-States,>50K
4,5,18,?,Some-college,10,Never Married,?,Own-child,White,Female,0,0,30,United-States,<=50K
5,6,34,Private,10th,6,Never Married,Other Service,Not-in-family,White,Male,0,0,30,United-States,<=50K
6,7,29,?,HS-grad,9,Never Married,?,Unmarried,Black,Male,0,0,40,United-States,<=50K
8,9,24,Private,Some-college,10,Never Married,Other Service,Unmarried,White,Female,0,0,40,United-States,<=50K


In [34]:
# count the number of records of each unique value
census_df["occupation"].value_counts()

occupation
Professional Specialty        6171
Craft Repair                  6112
Executive and Managerial      6086
Admin Clerical                5611
Sales                         5504
Other Service                 4923
Machine Operator              3020
?                             2809
Transportation and Moving     2355
Handlers and Cleaners         2072
Farming and Fishing           1489
Tech Support                  1446
Protective Services            983
Private Household Services     242
Armed Forces                    15
Name: count, dtype: int64

In [35]:
# Task: Average age of an individual working for each occupation
average_age = census_df.groupby('occupation')['age'].mean().round(2)
average_age

  average_age = census_df.groupby('occupation')['age'].mean().round(2)


occupation
?                             40.07
Admin Clerical                37.19
Armed Forces                  31.47
Craft Repair                  39.01
Executive and Managerial      42.20
Farming and Fishing           41.28
Handlers and Cleaners         32.65
Machine Operator              37.74
Other Service                 35.11
Private Household Services    43.40
Professional Specialty        40.56
Protective Services           38.90
Sales                         37.41
Tech Support                  37.15
Transportation and Moving     40.65
Name: age, dtype: float64

In [36]:
# Task: Youngest age of an individual working for each occupation
# sort_values()
youngest_age = census_df.groupby('occupation')['age'].min().sort_values()
youngest_age

  youngest_age = census_df.groupby('occupation')['age'].min().sort_values()


occupation
?                             17
Admin Clerical                17
Craft Repair                  17
Executive and Managerial      17
Handlers and Cleaners         17
Farming and Fishing           17
Machine Operator              17
Other Service                 17
Sales                         17
Private Household Services    17
Professional Specialty        17
Protective Services           17
Transportation and Moving     17
Tech Support                  17
Armed Forces                  23
Name: age, dtype: int64

In [None]:
oldest_age = census_df.groupby('occupation')['age'].max().sort_values()
oldest_age