In [1]:
# DuckDB Library

In [2]:
# One-time setup: uncomment the next line to install DuckDB into this kernel's environment.
# %pip install duckdb

Collecting duckdb
  Downloading duckdb-1.3.2-cp312-cp312-macosx_12_0_arm64.whl.metadata (7.0 kB)
Downloading duckdb-1.3.2-cp312-cp312-macosx_12_0_arm64.whl (15.5 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 15.5/15.5 MB 65.1 MB/s eta 0:00:00
Installing collected packages: duckdb
Successfully installed duckdb-1.3.2
Note: you may need to restart the kernel to use updated packages.


In [6]:
import duckdb
import pandas as pd
import os

In [7]:
# Directory that holds the practice workbook. The trailing slash is kept so
# that plain string concatenation onto this path yields a valid file path.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
projectFolder ='/Users/du/dup/analytics/projects/piit/data/'
# List the directory contents to confirm the workbook is present.
os.listdir(projectFolder)

['sqlpractise1.xlsx', '~$sqlpractise1.xlsx']

In [8]:
# Build the workbook path with os.path.join so the result is correct whether
# or not projectFolder ends with a path separator.
file_path = os.path.join(projectFolder, "sqlpractise1.xlsx")

# Load the two worksheets that the SQL examples query into separate DataFrames.
employees_df = pd.read_excel(file_path, sheet_name="Employees")
departments_df = pd.read_excel(file_path, sheet_name="Departments")

In [9]:
employees_df

Unnamed: 0,emp_id,first_name,last_name,dept_id,salary,is_active
0,1,Alice,Smith,101,50000,1
1,2,Bob,Jones,102,60000,1
2,3,Charlie,Brown,101,55000,0
3,4,David,Wilson,103,70000,1
4,5,Eva,Taylor,102,65000,1


In [11]:
departments_df

Unnamed: 0,dept_id,dept_name,manager_id
0,101,HR,1
1,102,Engineering,2
2,103,Marketing,4


In [12]:
# Open a DuckDB connection (no database path given) and register both
# DataFrames so SQL can refer to them by the names "employees_df" and
# "departments_df".
con = duckdb.connect()
con.register("employees_df", employees_df)
# register() is the cell's last expression, so its return value (the
# connection object) is what the notebook displays.
con.register("departments_df", departments_df)

<duckdb.duckdb.DuckDBPyConnection at 0x148a20b30>

## Select all employees

In [13]:
# Fetch every column of every employee row and show the result as a DataFrame.
all_employees_sql = "SELECT * FROM employees_df"
con.execute(all_employees_sql).df()

Unnamed: 0,emp_id,first_name,last_name,dept_id,salary,is_active
0,1,Alice,Smith,101,50000,1
1,2,Bob,Jones,102,60000,1
2,3,Charlie,Brown,101,55000,0
3,4,David,Wilson,103,70000,1
4,5,Eva,Taylor,102,65000,1


## Employees with salary > 55000

In [15]:
# Keep only the employees whose salary is strictly greater than 55000.
high_salary_sql = "SELECT * FROM employees_df WHERE salary > 55000"
con.execute(high_salary_sql).df()

Unnamed: 0,emp_id,first_name,last_name,dept_id,salary,is_active
0,2,Bob,Jones,102,60000,1
1,4,David,Wilson,103,70000,1
2,5,Eva,Taylor,102,65000,1


## Join Employees and Departments

In [16]:
# Pair each employee with the name of their department via an inner join on
# dept_id, then show the result as a DataFrame.
employee_department_sql = """
    SELECT e.first_name, e.last_name, d.dept_name
    FROM employees_df e
    JOIN departments_df d ON e.dept_id = d.dept_id
"""
con.execute(employee_department_sql).df()

Unnamed: 0,first_name,last_name,dept_name
0,Alice,Smith,HR
1,Bob,Jones,Engineering
2,Charlie,Brown,HR
3,David,Wilson,Marketing
4,Eva,Taylor,Engineering


## Count Employees Per Department

In [17]:
# Count the employees in each department. The join is an inner join, so only
# departments with at least one matching employee appear. The query has no
# ORDER BY, so the order of the result rows is not guaranteed.
department_headcount_sql = """
    SELECT d.dept_name, COUNT(e.emp_id) AS employee_count
    FROM employees_df e
    JOIN departments_df d ON e.dept_id = d.dept_id
    GROUP BY d.dept_name
"""
con.execute(department_headcount_sql).df()

Unnamed: 0,dept_name,employee_count
0,Marketing,1
1,Engineering,2
2,HR,2


## Inactive Employees

In [18]:
# Select the employees whose is_active flag equals 0.
inactive_employees_sql = "SELECT * FROM employees_df WHERE is_active = 0"
con.execute(inactive_employees_sql).df()

Unnamed: 0,emp_id,first_name,last_name,dept_id,salary,is_active
0,3,Charlie,Brown,101,55000,0


## DuckDB Advantages:
- Faster and more powerful than pandasql
- Supports full SQL syntax including subqueries, window functions, DML, and more
- Can even query directly from CSV or Parquet without loading into pandas
