# Python is used to run the tasks in a data pipeline

In [None]:
# Shell escape: run ../setup.py with the Python interpreter before the rest of the notebook.
# NOTE(review): presumably this prepares the databases queried further down — confirm.
! python ../setup.py

## Know Python basics

### Use the right data structure for your data access needs

#### `List` for iteration and `dict` for lookup

1. `Lists`: In Python, lists are ideal for storing a collection of items that you want to iterate over. Lists are ordered, mutable, and can contain duplicate elements.
2. `Dictionaries`: Dictionaries (dict) are perfect for situations where you need fast lookups by key. A dictionary stores key-value pairs and provides average O(1) time complexity for lookups.

In [None]:
# List for Iteration: a list keeps its items in order, so the loop
# visits the names in exactly the order they are written here.
names = ["Alice", "Bob", "Charlie"]
for name in names:
    greeting = f"Hello, {name}!"
    print(greeting)

In [None]:
# Dict for Lookup: each name (key) maps to an age (value)
age_lookup = dict(Alice=30, Bob=25, Charlie=35)

# Fast lookup by key: no need to scan the entries one by one
print(f"Alice's age is {age_lookup['Alice']}")

#### Functions allow you to reuse blocks of code

A function is a block of code that can be reused as needed. This lets us define logic in one place, making it easy to maintain and use.

In [None]:
def gt_three(input_list):
    """Return a new list with the elements of ``input_list`` greater than 3.

    The input is left unmodified and the matching elements keep their
    original relative order.
    """
    return [elt for elt in input_list if elt > 3]

In [None]:
# Call gt_three on a sample list of integers; as the last expression
# in the cell, the returned list is shown as the cell output.
gt_three([1,2,3,4,5,6])

#### Define a blueprint with a `Class` and create `Objects` from it

Think of a class as a blueprint, and objects as the things created from that blueprint.

In [None]:
class DataExtractor:
    """Blueprint for an extractor identified by ``extractor_id``.

    The connection methods are placeholders: they only print a message
    that includes the extractor's id.
    """

    def __init__(self, extractor_id):
        # Stored on the instance; interpolated into the messages printed below
        self.extractor_id = extractor_id

    def get_connection(self):
        """Print a message announcing the connection is being fetched; returns ``None``."""
        message = f'Getting {self.extractor_id}s connection'
        print(message)
        # Some logic

    def close_connection(self):
        """Print a message announcing the connection is being closed; returns ``None``."""
        message = f'Closing {self.extractor_id}s connection'
        print(message)
        # Some logic

In [None]:
# Create one object from the DataExtractor blueprint, identified as "csv",
# then call its get_connection method.
csv_data_extractor = DataExtractor("csv")
csv_data_extractor.get_connection()

In [None]:
# A second, independent object from the same DataExtractor class, identified as "json";
# its printed message uses its own extractor_id.
json_data_extractor = DataExtractor("json")
json_data_extractor.get_connection()

## Python can push data to/pull data from any system

### Interact with databases using their specific Python packages 

In [None]:
import sqlite3

# Connect to an SQLite database (or create it)
conn = sqlite3.connect('example.db')
try:
    cursor = conn.cursor()

    # Query data (pull)
    cursor.execute('SELECT * FROM users') # We can run any SQL query here
    rows = cursor.fetchall()
    for row in rows:
        print(row)
finally:
    # Close the connection even if the query above raises (for example when
    # the users table does not exist), so the database handle is never leaked
    conn.close()

### Interact with API endpoints using the `requests` package

In [None]:
import requests

POSTS_URL = 'https://jsonplaceholder.typicode.com/posts'
# Without an explicit timeout, requests can wait forever on an unresponsive server
REQUEST_TIMEOUT_SECONDS = 10

# Pull data (GET request)
response = requests.get(POSTS_URL, timeout=REQUEST_TIMEOUT_SECONDS)
response.raise_for_status()  # Raise on a 4xx/5xx reply instead of parsing an error body
data = response.json()

# Print the first post
print(f'Data pulled: {data[0]}')

# Push data (POST request)
new_post = {
    'title': 'New Post',
    'body': 'This is the content of the new post.',
    'userId': 1
}
response = requests.post(POSTS_URL, json=new_post, timeout=REQUEST_TIMEOUT_SECONDS)
response.raise_for_status()  # Make a rejected POST visible rather than printing its error payload
print(f'Data posted: {response.json()}')


### Interact with files in your filesystem with Python's standard libraries

In [None]:
import csv

# Write CSV to a local file: the first row is the header, the rest are records
header = ["Name", "Age"]
records = [["Alice", 30], ["Bob", 25]]
data = [header, *records]
filename = "sample.csv"

# newline="" leaves line endings to the csv module
with open(filename, mode="w", newline="") as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(header)
    csv_writer.writerows(records)

In [None]:
# Shell escape: print the contents of sample.csv from the current directory
! cat ./sample.csv

In [None]:
import os

# Delete the file if it exists
# `filename` is not defined in this cell; it must already be set in the kernel
# (the CSV-writing cell assigns it "sample.csv").
if os.path.exists(filename):
    os.remove(filename)

## Run SQL queries using Python

In [None]:
import sqlite3

# Connect to an SQLite database (or create it)
conn = sqlite3.connect('example.db')
try:
    cursor = conn.cursor()

    # Query data (pull)
    cursor.execute('SELECT * FROM users') # We can run any SQL query here
    rows = cursor.fetchall()
    for row in rows:
        print(row)

    # Parameterized inserts: the ? placeholders are filled from the tuple
    cursor.execute('INSERT INTO users (name, age) VALUES (?, ?)', ('Chester', 9000))
    cursor.execute('INSERT INTO users (name, age) VALUES (?, ?)', ('Geppato', 50))
    # conn.commit() # Uncomment this line, else the insert will not be committed into your database

    # Query data (pull)
    cursor.execute('SELECT id, count(*) FROM users GROUP BY id ORDER BY id') # We can run any SQL query here
    rows = cursor.fetchall()
    for row in rows:
        print(row)
finally:
    # Close the connection even if one of the statements above raises,
    # so the database handle is never leaked
    conn.close()

In [None]:
import sqlite3

# Connect to an SQLite database (or create it)
conn = sqlite3.connect('example.db')
try:
    cursor = conn.cursor()

    # Query data (pull)
    cursor.execute('SELECT * FROM users') # We can run any SQL query here
    rows = cursor.fetchall()
    for row in rows:
        print(row)
finally:
    # Close the connection even if the query above raises
    conn.close()

## DataFrames provide a Pythonic way to transform data

In [None]:
# SQL basics; select, where, group by, join, window functions in Pandas dataframe

In [58]:
import duckdb

db_file_name = './tpch.db'
conn = duckdb.connect(db_file_name)
# NOTE(review): `cursor` is not used in this cell; kept in case a later cell relies on it — confirm.
cursor = conn.cursor()

try:
    # Load TPC-H tables from DuckDB into Pandas DataFrames.
    # Every query goes through `conn`, the connection opened above
    # (the previous `con` was never defined and raised NameError on a fresh kernel).
    customer_df = conn.sql("SELECT * FROM customer").df()
    orders_df = conn.sql("SELECT * FROM orders").df()
    lineitem_df = conn.sql("SELECT * FROM lineitem").df()
    nation_df = conn.sql("SELECT * FROM nation").df()
    region_df = conn.sql("SELECT * FROM region").df()
    supplier_df = conn.sql("SELECT * FROM supplier").df()
    part_df = conn.sql("SELECT * FROM part").df()
    partsupp_df = conn.sql("SELECT * FROM partsupp").df()
finally:
    # Close the connection even if one of the loads fails
    conn.close()

In [60]:
import pandas as pd

In [61]:
# Assuming 'customer_df' is the DataFrame containing the customer table data.
# Build the boolean mask first, then keep the first 10 matching rows.
is_nation_20 = customer_df["c_nationkey"] == 20
filtered_df = customer_df[is_nation_20].head(10)
filtered_df

Unnamed: 0,c_custkey,c_name,c_address,c_nationkey,c_phone,c_acctbal,c_mktsegment,c_comment
5,6,Customer#000000006,"g1s,pzDenUEBW3O,2 pxu0f9n2g64rJrt5E",20,30-114-968-4951,7638.57,AUTOMOBILE,quickly silent asymptotes are slyly regular e...
80,81,Customer#000000081,9jUFbrThIIeoUNd8 9,20,30-165-277-3269,2023.71,BUILDING,s against the ironic packages haggle carefully...
99,100,Customer#000000100,MBy6qq3OEGpV4u,20,30-749-445-4907,9889.89,FURNITURE,dazzle carefully furiously final foxes. expres...
209,210,Customer#000000210,",XOlfSzkZDAkm96adR41j,",20,30-876-248-9750,7250.14,HOUSEHOLD,es cajole bravely across the blithely
222,223,Customer#000000223,MyQxUcG0P QCetmG00GlF,20,30-193-643-1517,7476.2,BUILDING,xcuses. silent theodolites across the carefull...
227,228,Customer#000000228,"rZ1wxvHNByT71bUJWZjXMDROzlAch6FVu,dj8Zfq",20,30-435-915-1603,6868.12,FURNITURE,es. blithely permanent sentim
246,247,Customer#000000247,eSAW4XaakYFj2WToKU,20,30-151-905-3513,8495.92,HOUSEHOLD,"tes nag according to the blithe, even packages..."
277,278,Customer#000000278,XHAfHlrYQM3elmhJ,20,30-445-570-5841,7621.56,BUILDING,ely unusual accounts. stealthily special instr...
284,285,Customer#000000285,rB6fTQKle64k3MvCCatad8DfMgR5OZA G4r,20,30-235-130-1313,7276.72,FURNITURE,slyly according to the blithely special instr...
320,321,Customer#000000321,LX0SKs3jqo9wH1yixIdGWp2ItclDiuL,20,30-114-675-9153,7718.77,FURNITURE,"ng the final, bold requests. furiously regular..."


In [63]:
# Name each condition so the AND / OR combination reads like the matching SQL WHERE clause
nation_20_with_balance = (customer_df["c_nationkey"] == 20) & (customer_df["c_acctbal"] > 1000)
nation_11 = customer_df["c_nationkey"] == 11
customer_df[nation_20_with_balance | nation_11].head(10)

Unnamed: 0,c_custkey,c_name,c_address,c_nationkey,c_phone,c_acctbal,c_mktsegment,c_comment
5,6,Customer#000000006,"g1s,pzDenUEBW3O,2 pxu0f9n2g64rJrt5E",20,30-114-968-4951,7638.57,AUTOMOBILE,quickly silent asymptotes are slyly regular e...
51,52,Customer#000000052,"UracAlAA8tSHL5V,poTZIOjh8o,",11,21-186-284-5998,5630.28,HOUSEHOLD,ts boost. carefully express waters across the ...
80,81,Customer#000000081,9jUFbrThIIeoUNd8 9,20,30-165-277-3269,2023.71,BUILDING,s against the ironic packages haggle carefully...
83,84,Customer#000000084,GB3sUmv RRXV DPzeOSbGxMIF9Z4Eq9 rop,11,21-546-818-3802,5174.71,FURNITURE,ounts. blithely express theodolites nag carefu...
99,100,Customer#000000100,MBy6qq3OEGpV4u,20,30-749-445-4907,9889.89,FURNITURE,dazzle carefully furiously final foxes. expres...
130,131,Customer#000000131,"ItdUFrHPZlzjZ, fo03sG4topAKTV",11,21-840-210-3572,8595.53,HOUSEHOLD,ly final Tiresias. slyly permanent theodolites...
133,134,Customer#000000134,6I1TTaoG7bbiogCqRcptG6BYme,11,21-200-159-5932,4608.9,BUILDING,ly regular dolphins haggle blithely.
147,148,Customer#000000148,qJ8bFn4kwiit7RzwGrwo5m,11,21-562-498-6636,2135.6,HOUSEHOLD,e carefully pending ideas detect slyly along t...
189,190,Customer#000000190,"mY30kK8AfsTGrx,L4zI QlQnnmCUxikyc8QcZ7",11,21-730-373-8193,1657.46,AUTOMOBILE,y even packages engage furiously pending p
209,210,Customer#000000210,",XOlfSzkZDAkm96adR41j,",20,30-876-248-9750,7250.14,HOUSEHOLD,es cajole bravely across the blithely


In [68]:
# Inner join: keep only orders that have at least one matching line item
inner_join_df = orders_df.merge(
    lineitem_df, left_on="o_orderkey", right_on="l_orderkey", how="inner"
)

# Keep rows whose order date lies within 5 days (inclusive) of the ship date
inner_window = pd.Timedelta(days=5)
inner_order_date = inner_join_df["o_orderdate"]
inner_ship_date = inner_join_df["l_shipdate"]
inner_in_window = (
    (inner_order_date >= inner_ship_date - inner_window)
    & (inner_order_date <= inner_ship_date + inner_window)
)
inner_join_df[inner_in_window].head(2)

Unnamed: 0,o_orderkey,o_custkey,o_orderstatus,o_totalprice,o_orderdate,o_orderpriority,o_clerk,o_shippriority,o_comment,l_orderkey,...,l_discount,l_tax,l_returnflag,l_linestatus,l_shipdate,l_commitdate,l_receiptdate,l_shipinstruct,l_shipmode,l_comment
20,7,392,O,271885.66,1996-01-10,2-HIGH,Clerk#000000470,0,", ironic packages wa",7,...,0.1,0.07,N,O,1996-01-15,1996-03-27,1996-02-03,COLLECT COD,MAIL,accounts. reque
30,32,1301,O,198665.57,1995-07-16,2-HIGH,Clerk#000000616,0,ly about the carefully express theodolites. ir...,32,...,0.04,0.03,N,O,1995-07-21,1995-09-23,1995-07-25,COLLECT COD,RAIL,ly final asymptotes. qui


In [69]:
# Left join: keep every order, with line item columns filled in where a match exists
left_join_df = orders_df.merge(
    lineitem_df, left_on="o_orderkey", right_on="l_orderkey", how="left"
)

# Keep rows whose order date lies within 5 days (inclusive) of the ship date
left_window = pd.Timedelta(days=5)
left_order_date = left_join_df["o_orderdate"]
left_ship_date = left_join_df["l_shipdate"]
left_in_window = (
    (left_order_date >= left_ship_date - left_window)
    & (left_order_date <= left_ship_date + left_window)
)
left_join_df[left_in_window].head(2)

Unnamed: 0,o_orderkey,o_custkey,o_orderstatus,o_totalprice,o_orderdate,o_orderpriority,o_clerk,o_shippriority,o_comment,l_orderkey,...,l_discount,l_tax,l_returnflag,l_linestatus,l_shipdate,l_commitdate,l_receiptdate,l_shipinstruct,l_shipmode,l_comment
20,7,392,O,271885.66,1996-01-10,2-HIGH,Clerk#000000470,0,", ironic packages wa",7,...,0.1,0.07,N,O,1996-01-15,1996-03-27,1996-02-03,COLLECT COD,MAIL,accounts. reque
30,32,1301,O,198665.57,1995-07-16,2-HIGH,Clerk#000000616,0,ly about the carefully express theodolites. ir...,32,...,0.04,0.03,N,O,1995-07-21,1995-09-23,1995-07-25,COLLECT COD,RAIL,ly final asymptotes. qui


In [70]:
# Right join: keep every line item, with order columns filled in where a match exists
right_join_df = orders_df.merge(
    lineitem_df, left_on="o_orderkey", right_on="l_orderkey", how="right"
)

# Keep rows whose order date lies within 5 days (inclusive) of the ship date
right_window = pd.Timedelta(days=5)
right_order_date = right_join_df["o_orderdate"]
right_ship_date = right_join_df["l_shipdate"]
right_in_window = (
    (right_order_date >= right_ship_date - right_window)
    & (right_order_date <= right_ship_date + right_window)
)
right_join_df[right_in_window].head(2)

Unnamed: 0,o_orderkey,o_custkey,o_orderstatus,o_totalprice,o_orderdate,o_orderpriority,o_clerk,o_shippriority,o_comment,l_orderkey,...,l_discount,l_tax,l_returnflag,l_linestatus,l_shipdate,l_commitdate,l_receiptdate,l_shipinstruct,l_shipmode,l_comment
20,7,392,O,271885.66,1996-01-10,2-HIGH,Clerk#000000470,0,", ironic packages wa",7,...,0.1,0.07,N,O,1996-01-15,1996-03-27,1996-02-03,COLLECT COD,MAIL,accounts. reque
30,32,1301,O,198665.57,1995-07-16,2-HIGH,Clerk#000000616,0,ly about the carefully express theodolites. ir...,32,...,0.04,0.03,N,O,1995-07-21,1995-09-23,1995-07-25,COLLECT COD,RAIL,ly final asymptotes. qui


In [71]:
# Full join: keep every order and every line item, matched where possible
full_join_df = orders_df.merge(
    lineitem_df, left_on="o_orderkey", right_on="l_orderkey", how="outer"
)

# Keep rows whose order date lies within 5 days (inclusive) of the ship date
full_window = pd.Timedelta(days=5)
full_order_date = full_join_df["o_orderdate"]
full_ship_date = full_join_df["l_shipdate"]
full_in_window = (
    (full_order_date >= full_ship_date - full_window)
    & (full_order_date <= full_ship_date + full_window)
)
full_join_df[full_in_window].head(2)

Unnamed: 0,o_orderkey,o_custkey,o_orderstatus,o_totalprice,o_orderdate,o_orderpriority,o_clerk,o_shippriority,o_comment,l_orderkey,...,l_discount,l_tax,l_returnflag,l_linestatus,l_shipdate,l_commitdate,l_receiptdate,l_shipinstruct,l_shipmode,l_comment
20,7,392,O,271885.66,1996-01-10,2-HIGH,Clerk#000000470,0,", ironic packages wa",7,...,0.1,0.07,N,O,1996-01-15,1996-03-27,1996-02-03,COLLECT COD,MAIL,accounts. reque
30,32,1301,O,198665.57,1995-07-16,2-HIGH,Clerk#000000616,0,ly about the carefully express theodolites. ir...,32,...,0.04,0.03,N,O,1995-07-21,1995-09-23,1995-07-25,COLLECT COD,RAIL,ly final asymptotes. qui


### The popularity of Pandas in data science has made it a critical part of most data pipelines