# **Pandas Intermediate**

### **Import pandas**

In [1]:
import pandas as pd

### **Importing and Exporting Data**

Pandas supports reading from and writing to a variety of file formats, 
including CSV, Excel, and SQL databases, which makes it easy to integrate into data analysis workflows.

In [2]:
# Load the contents of example.csv into a DataFrame, then display it
csv_df = pd.read_csv(filepath_or_buffer="example.csv")
csv_df

Unnamed: 0,A,B,C
0,1.0,5.0,10.0
1,2.0,6.5,11.0
2,2.333333,6.5,12.0
3,4.0,8.0,11.0


### **Install openpyxl**

In [3]:
# Install openpyxl into the kernel's environment before the Excel export below
# (%pip is used rather than !pip so the package lands in the environment this kernel runs in)
%pip install openpyxl

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
# Write csv_df to an Excel workbook on a sheet named "csv_df";
# index=False leaves the row index out of the file
csv_df.to_excel(
    excel_writer="exported_csv_df.xlsx",
    sheet_name="csv_df",
    index=False,
)

In [9]:
# Write csv_df back out as a CSV file, again without the row index
csv_df.to_csv(path_or_buf="exported_csv_df.csv", index=False)

### **Importing SQL Database**

In [10]:
import sqlite3

## **Data Inspection** 

Data inspection is the initial review of a dataset to find missing values, 
incorrect data types, and gather basic statistics, providing insights into its quality and structure.

In [14]:
# Open the SQLite database and load every row of the "individuals" table into a DataFrame.
# The connection is closed as soon as the query has run so the database file is not left open.
conn = sqlite3.connect("census_data.db")
try:
    census_df = pd.read_sql_query("SELECT * FROM individuals", conn)
finally:
    # Runs even if the query raises, so the connection is always released
    conn.close()
census_df

Unnamed: 0,individual_id,age,workclass,fnlwgt,education,educational_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income
0,1,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,2,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,3,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,4,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,5,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,48838,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
48838,48839,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
48839,48840,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
48840,48841,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [15]:
# Boolean mask with the same shape as the frame: True wherever a cell holds a missing value
census_df.isna()

Unnamed: 0,individual_id,age,workclass,fnlwgt,education,educational_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
48838,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
48839,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
48840,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [16]:
# Number of missing values per column (each True in the mask counts as 1)
census_df.isna().sum()

individual_id      0
age                0
workclass          0
fnlwgt             0
education          0
educational_num    0
marital_status     0
occupation         0
relationship       0
race               0
gender             0
capital_gain       0
capital_loss       0
hours_per_week     0
native_country     0
income             0
dtype: int64

In [18]:
# Count, per column, the cells that contain the literal string "?".
# The null check above reports 0 everywhere, yet "?" appears in several columns
census_df.eq("?").sum()

individual_id         0
age                   0
workclass          2799
fnlwgt                0
education             0
educational_num       0
marital_status        0
occupation         2809
relationship          0
race                  0
gender                0
capital_gain          0
capital_loss          0
hours_per_week        0
native_country      857
income                0
dtype: int64

In [19]:
# Check the data type of each column in the DataFrame
# (text columns are reported as "object")
census_df.dtypes

individual_id       int64
age                 int64
workclass          object
fnlwgt             object
education          object
educational_num     int64
marital_status     object
occupation         object
relationship       object
race               object
gender             object
capital_gain        int64
capital_loss        int64
hours_per_week      int64
native_country     object
income             object
dtype: object

In [20]:
# Summary statistics (count, mean, std, min, quartiles, max).
# Only the int64 columns are reported here; object columns such as fnlwgt are left out
census_df.describe()

Unnamed: 0,individual_id,age,educational_num,capital_gain,capital_loss,hours_per_week
count,48842.0,48842.0,48842.0,48842.0,48842.0,48842.0
mean,24421.5,38.643585,10.078089,1079.067626,87.502314,40.422382
std,14099.615261,13.71051,2.570973,7452.019058,403.004552,12.391444
min,1.0,17.0,1.0,0.0,0.0,1.0
25%,12211.25,28.0,9.0,0.0,0.0,40.0
50%,24421.5,37.0,10.0,0.0,0.0,40.0
75%,36631.75,48.0,12.0,0.0,0.0,45.0
max,48842.0,90.0,16.0,99999.0,4356.0,99.0


In [21]:
# Replace the "?" placeholder with pd.NA, pandas' missing-value marker,
# so that those cells are treated as missing values
census_df = census_df.replace("?", pd.NA)
census_df

Unnamed: 0,individual_id,age,workclass,fnlwgt,education,educational_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income
0,1,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,2,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,3,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,4,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,5,18,,103497,Some-college,10,Never-married,,Own-child,White,Female,0,0,30,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,48838,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
48838,48839,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
48839,48840,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
48840,48841,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


## **Cleaning Data**

Cleaning data means removing or correcting inaccuracies, inconsistencies, 
and missing values in your dataset. Common techniques include handling 
missing values by deletion or imputation, correcting data types, and detecting 
and removing duplicate entries. The result is a more precise and dependable analysis.

In [22]:
# DataFrame.replace(to_replace, value) swaps every occurrence of a value in the frame.
# Here the null placeholder "?" is turned into a real null:
#   to_replace = "?"    -> the text to look for
#   value      = pd.NA  -> the null value of pandas
# The result is assigned back to census_df so the change is kept.

# Step 1: Turn the '?' into a null value
census_df = census_df.replace("?", pd.NA)
census_df

Unnamed: 0,individual_id,age,workclass,fnlwgt,education,educational_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income
0,1,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,2,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,3,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,4,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,5,18,,103497,Some-college,10,Never-married,,Own-child,White,Female,0,0,30,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,48838,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
48838,48839,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
48839,48840,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
48840,48841,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [23]:
# Count the null values in every column (isna() flags them as True, sum() adds them up)
census_df.isna().sum()

individual_id         0
age                   0
workclass          2799
fnlwgt                0
education             0
educational_num       0
marital_status        0
occupation         2809
relationship          0
race                  0
gender                0
capital_gain          0
capital_loss          0
hours_per_week        0
native_country      857
income                0
dtype: int64

In [25]:
# Rectifying inconsistency
# fillna() fills missing values with the value it is given.
# Every null in the occupation column becomes "Unemployed"; other columns are untouched
census_df = census_df.assign(
    occupation=census_df["occupation"].fillna("Unemployed")
)
census_df

Unnamed: 0,individual_id,age,workclass,fnlwgt,education,educational_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income
0,1,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,2,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,3,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,4,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,5,18,,103497,Some-college,10,Never-married,Unemployed,Own-child,White,Female,0,0,30,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,48838,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
48838,48839,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
48839,48840,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
48840,48841,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [26]:
# dropna() removes rows or columns that contain missing values.
# With default arguments it drops every row that has at least one null;
# the surviving rows keep their original index labels
census_df = census_df.dropna()
census_df

Unnamed: 0,individual_id,age,workclass,fnlwgt,education,educational_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income
0,1,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,2,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,3,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,4,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
5,6,34,Private,198693,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,48838,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
48838,48839,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
48839,48840,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
48840,48841,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [27]:
# Confirm that every row with a null value was dropped: all counts should be 0
census_df.isna().sum()

individual_id      0
age                0
workclass          0
fnlwgt             0
education          0
educational_num    0
marital_status     0
occupation         0
relationship       0
race               0
gender             0
capital_gain       0
capital_loss       0
hours_per_week     0
native_country     0
income             0
dtype: int64

### **Trimming and Cleaning Text Data**

In [29]:
# Remove the spaces at the beginning and at the end of each workclass string.
# .str.strip() returns a new Series and does not modify the column, so the
# result must be assigned back; otherwise the stripped values are discarded.
census_df["workclass"] = census_df["workclass"].str.strip()
census_df["workclass"]

0             Private
1             Private
2           Local-gov
3             Private
5             Private
             ...     
48837         Private
48838         Private
48839         Private
48840         Private
48841    Self-emp-inc
Name: workclass, Length: 45232, dtype: object

In [31]:
# List the distinct occupation values to see which labels need tidying
census_df.loc[:, "occupation"].unique()

array(['Machine-op-inspct', 'Farming-fishing', 'Protective-serv',
       'Other-service', 'Prof-specialty', 'Craft-repair', 'Adm-clerical',
       'Exec-managerial', 'Tech-support', 'Sales', 'Priv-house-serv',
       'Transport-moving', 'Handlers-cleaners', 'Armed-Forces',
       'Unemployed'], dtype=object)

In [33]:
# Lookup table: raw occupation code from the dataset -> human-readable label.
# Values without an entry here (for example "Sales" and "Unemployed") have no
# replacement label defined in this table.
occupation_mapping = {
    "Machine-op-inspct": "Machine Operator",
    "Farming-fishing": "Farming and Fishing",
    "Protective-serv": "Protective Services",
    "Other-service": "Other Service",
    "Prof-specialty": "Professional Specialty",
    "Craft-repair": "Craft Repair",
    "Adm-clerical": "Admin Clerical",
    "Exec-managerial": "Executive and Managerial",
    "Tech-support": "Tech Support",
    "Priv-house-serv": "Private Household Services",
    "Transport-moving": "Transportation and Moving",
    "Handlers-cleaners": "Handlers and Cleaners",
    "Armed-Forces": "Armed Forces",
}

In [34]:
# Translate the raw occupation codes into readable labels.
# .map() yields NaN for any value that has no entry in occupation_mapping,
# so those positions fall back to the original value.
raw_occupation = census_df["occupation"]
readable_occupation = raw_occupation.map(occupation_mapping)
census_df["occupation"] = readable_occupation.where(readable_occupation.notna(), raw_occupation)
census_df

Unnamed: 0,individual_id,age,workclass,fnlwgt,education,educational_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income
0,1,25,Private,226802,11th,7,Never-married,Machine Operator,Own-child,Black,Male,0,0,40,United-States,<=50K
1,2,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming and Fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,3,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective Services,Husband,White,Male,0,0,40,United-States,>50K
3,4,44,Private,160323,Some-college,10,Married-civ-spouse,Machine Operator,Husband,Black,Male,7688,0,40,United-States,>50K
5,6,34,Private,198693,10th,6,Never-married,Other Service,Not-in-family,White,Male,0,0,30,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,48838,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech Support,Wife,White,Female,0,0,38,United-States,<=50K
48838,48839,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine Operator,Husband,White,Male,0,0,40,United-States,>50K
48839,48840,58,Private,151910,HS-grad,9,Widowed,Admin Clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
48840,48841,22,Private,201490,HS-grad,9,Never-married,Admin Clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [None]:
# Remove the space before and after each workclass value.
# .str.strip() returns a new Series, so it is assigned back to the column;
# calling it without assignment would leave census_df unchanged.
census_df["workclass"] = census_df["workclass"].str.strip()
census_df["workclass"]


0             Private
1             Private
2           Local-gov
3             Private
5             Private
             ...     
48837         Private
48838         Private
48839         Private
48840         Private
48841    Self-emp-inc
Name: workclass, Length: 45232, dtype: object

In [38]:
# Swap every hyphen in workclass for a space (e.g. "Local-gov" -> "Local gov").
# "-" is matched as a literal character
census_df["workclass"] = census_df["workclass"].str.replace("-", " ", regex=False)
census_df

Unnamed: 0,individual_id,age,workclass,fnlwgt,education,educational_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income
0,1,25,Private,226802,11th,7,Never-married,Machine Operator,Own-child,Black,Male,0,0,40,United-States,<=50K
1,2,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming and Fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,3,28,Local gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective Services,Husband,White,Male,0,0,40,United-States,>50K
3,4,44,Private,160323,Some-college,10,Married-civ-spouse,Machine Operator,Husband,Black,Male,7688,0,40,United-States,>50K
5,6,34,Private,198693,10th,6,Never-married,Other Service,Not-in-family,White,Male,0,0,30,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,48838,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech Support,Wife,White,Female,0,0,38,United-States,<=50K
48838,48839,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine Operator,Husband,White,Male,0,0,40,United-States,>50K
48839,48840,58,Private,151910,HS-grad,9,Widowed,Admin Clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
48840,48841,22,Private,201490,HS-grad,9,Never-married,Admin Clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [39]:
# Tidy marital_status in one chain: hyphens become spaces, then every word is
# capitalised (e.g. "Never-married" -> "Never Married")
census_df["marital_status"] = (
    census_df["marital_status"]
    .str.replace("-", " ", regex=False)
    .str.title()
)
census_df

Unnamed: 0,individual_id,age,workclass,fnlwgt,education,educational_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income
0,1,25,Private,226802,11th,7,Never Married,Machine Operator,Own-child,Black,Male,0,0,40,United-States,<=50K
1,2,38,Private,89814,HS-grad,9,Married Civ Spouse,Farming and Fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,3,28,Local gov,336951,Assoc-acdm,12,Married Civ Spouse,Protective Services,Husband,White,Male,0,0,40,United-States,>50K
3,4,44,Private,160323,Some-college,10,Married Civ Spouse,Machine Operator,Husband,Black,Male,7688,0,40,United-States,>50K
5,6,34,Private,198693,10th,6,Never Married,Other Service,Not-in-family,White,Male,0,0,30,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,48838,27,Private,257302,Assoc-acdm,12,Married Civ Spouse,Tech Support,Wife,White,Female,0,0,38,United-States,<=50K
48838,48839,40,Private,154374,HS-grad,9,Married Civ Spouse,Machine Operator,Husband,White,Male,0,0,40,United-States,>50K
48839,48840,58,Private,151910,HS-grad,9,Widowed,Admin Clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
48840,48841,22,Private,201490,HS-grad,9,Never Married,Admin Clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [42]:
# Check all of our columns: .columns returns the column labels as an Index
census_df.columns

Index(['individual_id', 'age', 'workclass', 'fnlwgt', 'education',
       'educational_num', 'marital_status', 'occupation', 'relationship',
       'race', 'gender', 'capital_gain', 'capital_loss', 'hours_per_week',
       'native_country', 'income'],
      dtype='object')

In [43]:
# Changing Column Data Types
# Convert the listed columns to the "category" dtype with a single astype() call;
# dict.fromkeys pairs every listed column name with the same target dtype.
census_df = census_df.astype(
    dict.fromkeys(
        [
            "workclass",
            "education",
            "educational_num",
            "marital_status",
            "occupation",
            "relationship",
            "race",
            "gender",
            "native_country",
            "income",
        ],
        "category",
    )
)
# NOTE(review): fnlwgt is not converted here and stays object dtype although its
# displayed values look numeric - confirm whether it should be cast as well.
census_df.dtypes

individual_id         int64
age                   int64
workclass          category
fnlwgt               object
education          category
educational_num    category
marital_status     category
occupation         category
relationship       category
race               category
gender             category
capital_gain          int64
capital_loss          int64
hours_per_week        int64
native_country     category
income             category
dtype: object

### **Renaming columns and Reindexing**

In [44]:
# Check all of our columns before renaming, so the old names can be compared with the new ones
census_df.columns

Index(['individual_id', 'age', 'workclass', 'fnlwgt', 'education',
       'educational_num', 'marital_status', 'occupation', 'relationship',
       'race', 'gender', 'capital_gain', 'capital_loss', 'hours_per_week',
       'native_country', 'income'],
      dtype='object')

In [45]:
# Give four tersely named columns more descriptive names (old name -> new name);
# columns that are not listed keep their current name
census_df = census_df.rename(
    columns={
        "hours_per_week": "working_hours_per_week",
        "fnlwgt": "final_weight_of_the_record",
        "educational_num": "educational_level",
        "income": "income_bracket",
    }
)
census_df.columns

Index(['individual_id', 'age', 'workclass', 'final_weight_of_the_record',
       'education', 'educational_level', 'marital_status', 'occupation',
       'relationship', 'race', 'gender', 'capital_gain', 'capital_loss',
       'working_hours_per_week', 'native_country', 'income_bracket'],
      dtype='object')

In [46]:
# This is the column layout before reindexing:
# info() lists every column with its non-null count and dtype
census_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 45232 entries, 0 to 48841
Data columns (total 16 columns):
 #   Column                      Non-Null Count  Dtype   
---  ------                      --------------  -----   
 0   individual_id               45232 non-null  int64   
 1   age                         45232 non-null  int64   
 2   workclass                   45232 non-null  category
 3   final_weight_of_the_record  45232 non-null  object  
 4   education                   45232 non-null  category
 5   educational_level           45232 non-null  category
 6   marital_status              45232 non-null  category
 7   occupation                  45232 non-null  category
 8   relationship                45232 non-null  category
 9   race                        45232 non-null  category
 10  gender                      45232 non-null  category
 11  capital_gain                45232 non-null  int64   
 12  capital_loss                45232 non-null  int64   
 13  working_hours_per_wee

In [47]:
# Build a new frame that keeps only these four columns, in the order given;
# census_df itself is left unchanged
reindex_df = census_df.reindex(
    ["age", "gender", "workclass", "education"],
    axis="columns",
)
reindex_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 45232 entries, 0 to 48841
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   age        45232 non-null  int64   
 1   gender     45232 non-null  category
 2   workclass  45232 non-null  category
 3   education  45232 non-null  category
dtypes: category(3), int64(1)
memory usage: 840.4 KB


### **Filtering and Selecting Data**

Filtering and selecting data are fundamental for focusing analysis on specific segments.

**Example**

1. Select individuals working more than 40 hours per week but earning '<=50K'.
2. Find divorced individuals in the Private sector.

In [50]:
# Display the current state of the frame before filtering it
census_df

Unnamed: 0,individual_id,age,workclass,final_weight_of_the_record,education,educational_level,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,working_hours_per_week,native_country,income_bracket
0,1,25,Private,226802,11th,7,Never Married,Machine Operator,Own-child,Black,Male,0,0,40,United-States,<=50K
1,2,38,Private,89814,HS-grad,9,Married Civ Spouse,Farming and Fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,3,28,Local gov,336951,Assoc-acdm,12,Married Civ Spouse,Protective Services,Husband,White,Male,0,0,40,United-States,>50K
3,4,44,Private,160323,Some-college,10,Married Civ Spouse,Machine Operator,Husband,Black,Male,7688,0,40,United-States,>50K
5,6,34,Private,198693,10th,6,Never Married,Other Service,Not-in-family,White,Male,0,0,30,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,48838,27,Private,257302,Assoc-acdm,12,Married Civ Spouse,Tech Support,Wife,White,Female,0,0,38,United-States,<=50K
48838,48839,40,Private,154374,HS-grad,9,Married Civ Spouse,Machine Operator,Husband,White,Male,0,0,40,United-States,>50K
48839,48840,58,Private,151910,HS-grad,9,Widowed,Admin Clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
48840,48841,22,Private,201490,HS-grad,9,Never Married,Admin Clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [53]:
# 1. Select individuals working more than 40 hours per week but earning '<=50K'.
census_df[(census_df["working_hours_per_week"] > 40) & (census_df["income_bracket"] == "<=50K")]

Unnamed: 0,individual_id,age,workclass,final_weight_of_the_record,education,educational_level,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,working_hours_per_week,native_country,income_bracket
1,2,38,Private,89814,HS-grad,9,Married Civ Spouse,Farming and Fishing,Husband,White,Male,0,0,50,United-States,<=50K
23,24,25,Private,220931,Bachelors,13,Never Married,Professional Specialty,Not-in-family,White,Male,0,0,43,Peru,<=50K
27,28,23,Private,134446,HS-grad,9,Separated,Machine Operator,Unmarried,Black,Male,0,0,54,United-States,<=50K
29,30,32,Self emp not inc,109282,Some-college,10,Never Married,Professional Specialty,Not-in-family,White,Male,0,0,60,United-States,<=50K
31,32,56,Self emp not inc,186651,11th,7,Widowed,Other Service,Unmarried,White,Female,0,0,50,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48818,48819,30,Private,345898,HS-grad,9,Never Married,Craft Repair,Not-in-family,Black,Male,0,0,46,United-States,<=50K
48824,48825,45,Local gov,119199,Assoc-acdm,12,Divorced,Professional Specialty,Unmarried,White,Female,0,0,48,United-States,<=50K
48829,48830,65,Self emp not inc,99359,Prof-school,15,Never Married,Professional Specialty,Not-in-family,White,Male,1086,0,60,United-States,<=50K
48831,48832,43,Self emp not inc,27242,Some-college,10,Married Civ Spouse,Craft Repair,Husband,White,Male,0,0,50,United-States,<=50K


In [55]:
# Look at the weekly working hours column on its own
census_df.loc[:, "working_hours_per_week"]

0        40
1        50
2        40
3        40
5        30
         ..
48837    38
48838    40
48839    40
48840    20
48841    40
Name: working_hours_per_week, Length: 45232, dtype: int64

### **Removing Columns and Rows**

## **Handling Duplicates**

Identifying and removing duplicate records are crucial for maintaining data quality.

### **Aggregating Data** (.groupby)

Aggregating data involves summarizing data points into meaningful statistics, 
such as averages, sums, or counts, which can be achieved using group by operations.