In [1]:
import dlt
import duckdb
from typing import Literal

## Homework

### 1. Basic generator usage

Remember the concept of a generator? Let's practice using them to further our understanding of how they work.  
Let's define a generator and then run it as practice.

The following **square_root_gen** is required for Questions 1 and 2

In [2]:
def square_root_gen(limit):
    """Lazily yield the square roots of the integers 1 through ``limit``.

    Args:
        limit: Largest integer (inclusive) whose square root is produced.
            Values below 1 make the range empty, so nothing is yielded.

    Yields:
        float: ``value ** 0.5`` for each ``value`` in ``1..limit``, ascending.
    """
    yield from (value ** 0.5 for value in range(1, limit + 1))

**Question 1**: What is the sum of the outputs of the generator for limit = 5?
- [ ] A: 10.234
- [ ] B: 7.892
- [x] C: 8.382
- [ ] D: 9.123

In [3]:
# Generators are single-use: the summing cell below consumes this object.
sqrt_generator = square_root_gen(5)

In [4]:
# sum() consumes the generator directly; no intermediate list is needed.
print(sum(sqrt_generator))

8.382332347441762


**Question 2**: What is the 13th number yielded by the generator?
- [ ] A: 4.236
- [x] B: 3.605
- [ ] C: 2.345
- [ ] D: 5.678

In [5]:
# limit=13 makes the generator produce exactly 13 values (roots of 1..13).
sqrt_generator_q2 = square_root_gen(13)

In [6]:
# Materialize every yielded value so individual positions can be indexed.
entries = list(sqrt_generator_q2)

In [7]:
# Lists are 0-indexed, so position 12 holds the 13th yielded value.
entries[12]

3.605551275463989

### 2. Write to DuckDB with Append strategy

Below you have the **citizen_gen** generator. You will use it to load data into DuckDB and answer some questions about that data.

1. Load the first generator and calculate the sum of ages of all people. Make sure to only load it once.
2. Append the second generator to the same table as the first.
3. **After correctly appending the data, calculate the sum of all ages of people.**


In [8]:
def citizen_a(num):
    """Build a "City_A" citizen record for the given number.

    Args:
        num: Number used as the record ``id``, embedded in ``name`` and added
            to a base age of 25.

    Returns:
        dict: Record with the keys ``id``, ``name``, ``age`` and ``city``.
    """
    return {
        'id': num,
        'name': f"Person_{num}",
        'age': 25 + num,
        'city': "City_A",
    }

def citizen_b(num):
    """Build a "City_B" citizen record for the given number.

    Unlike the "City_A" records, these carry an additional ``occupation`` field.

    Args:
        num: Number used as the record ``id``, embedded in ``name`` and
            ``occupation``, and added to a base age of 30.

    Returns:
        dict: Record with the keys ``id``, ``name``, ``age``, ``city`` and
        ``occupation``.
    """
    return {
        'id': num,
        'name': f"Person_{num}",
        'age': 30 + num,
        'city': "City_B",
        'occupation': f"Job_{num}",
    }

In [9]:
def citizen_gen(type: Literal["a", "b"]):
    """Yield citizen records of the requested kind.

    Args:
        type: ``"a"`` yields ``citizen_a`` records for ids 1-5; ``"b"`` yields
            ``citizen_b`` records for ids 3-8.

    Yields:
        dict: One citizen record per id, in ascending id order.

    Raises:
        RuntimeError: If ``type`` is neither ``"a"`` nor ``"b"``. Because this
            is a generator, the error surfaces on first iteration, not when
            the function is called.
    """
    if type == "a":
        yield from map(citizen_a, range(1, 6))
    elif type == "b":
        yield from map(citizen_b, range(3, 9))
    else:
        raise RuntimeError("Accepted parameters are 'a' or 'b'")

In [10]:
# Calling citizen_gen only creates lazy generator objects; no record is built yet.
# NOTE(review): the pipeline.run cells call citizen_gen(...) again instead of
# using these two variables.
citizens_a = citizen_gen('a')
citizens_b = citizen_gen('b')

**Question 3**: Append the 2 generators. After correctly appending the data, calculate the sum of all ages of people.
- [x] A: 353
- [ ] B: 365
- [ ] C: 378
- [ ] D: 390

In [11]:
# No database path is given; per the DuckDB docs this opens an in-memory database.
conn = duckdb.connect()

In [12]:
# Pipeline that loads into the "dlt" dataset of the open DuckDB connection;
# the queries below address its tables as dlt.<table_name>.
pipeline = dlt.pipeline(
    pipeline_name="py_gen_pipeline",
    destination=dlt.destinations.duckdb(conn),
    dataset_name="dlt",
)

In [13]:
# 'replace' overwrites the table contents on each run, so re-executing this
# cell does not duplicate the type "a" rows.
pipeline.run(
    citizen_gen("a"),
    table_name="citizens",
    write_disposition="replace",
);

In [14]:
# 'append' adds the type "b" rows to the rows already in the table; their extra
# 'occupation' field shows up as a new column. Re-running this cell on its own
# appends the same rows again.
pipeline.run(
    citizen_gen("b"),
    table_name="citizens",
    write_disposition="append",
);

In [15]:
# Inspect the loaded rows as a DataFrame.
conn.sql("SELECT * FROM dlt.citizens").df()

Unnamed: 0,id,name,age,city,_dlt_load_id,_dlt_id,occupation
0,1,Person_1,26,City_A,1708274457.706264,UIiGUDcGSil/fA,
1,2,Person_2,27,City_A,1708274457.706264,tQrUcD+M8UUgXw,
2,3,Person_3,28,City_A,1708274457.706264,mokduLXNK//zmw,
3,4,Person_4,29,City_A,1708274457.706264,M7zAzz2YaYNzIA,
4,5,Person_5,30,City_A,1708274457.706264,gvWnxcLiLzoW+g,
5,3,Person_3,33,City_B,1708274459.0597398,w3uiv8F3m7l+Nw,Job_3
6,4,Person_4,34,City_B,1708274459.0597398,1mCdOPWlvVxXWA,Job_4
7,5,Person_5,35,City_B,1708274459.0597398,1C5uXNbG4Qypbw,Job_5
8,6,Person_6,36,City_B,1708274459.0597398,7YKEjNqNa+I1ZA,Job_6
9,7,Person_7,37,City_B,1708274459.0597398,MNI8SHqg6POlEg,Job_7


In [16]:
# Answer to Question 3: total age across the rows of both loads.
conn.sql("SELECT sum(age) FROM dlt.citizens")

┌──────────┐
│ sum(age) │
│  int128  │
├──────────┤
│      353 │
└──────────┘

In [17]:
# Close this connection; the merge section opens a new one.
conn.close()

### 3. Write to DuckDB with Merge strategy

Re-use the generators from topic 2.

A table's primary key needs to be created from the start, so load your data to a new table with primary key ID.

Load your first generator first, and then load the second one with merge. Since they have overlapping IDs, some of the records from the first load should be replaced by the ones from the second load.

After loading, you should have a total of 8 records, and ID 3 should have age 33.

Question: **Calculate the sum of ages of all the people loaded as described above.**


**Question 4**: Merge the 2 generators using the ID column. Calculate the sum of ages of all the people loaded as described above.
- [ ] A: 205
- [x] B: 266
- [ ] C: 241
- [ ] D: 258

In [18]:
# Open a new connection for the merge exercise (the previous one was closed).
conn = duckdb.connect()

In [19]:
# Rebuild the pipeline so that it is bound to the newly opened connection.
pipeline = dlt.pipeline(
    pipeline_name="py_gen_pipeline",
    destination=dlt.destinations.duckdb(conn),
    dataset_name="dlt",
)

In [20]:
# Initial load for the merge exercise. The primary key is declared on this
# first load so the table is keyed on 'id' before the merge step runs.
pipeline.run(
    citizen_gen('a'),
    table_name='citizens',
    write_disposition='replace',
    primary_key='id',
);

In [21]:
# 'merge' on primary key 'id': incoming type "b" rows replace existing rows
# with the same id (3, 4, 5), and the remaining ids (6, 7, 8) are inserted.
pipeline.run(
    citizen_gen("b"),
    table_name="citizens",
    primary_key="id",
    write_disposition="merge",
);

In [22]:
# Inspect the merged rows as a DataFrame; each id should now appear once.
conn.sql("SELECT * FROM dlt.citizens").df()

Unnamed: 0,id,name,age,city,_dlt_load_id,_dlt_id,occupation
0,1,Person_1,26,City_A,1708274462.283322,f1+IW0TglnFYdw,
1,2,Person_2,27,City_A,1708274462.283322,PrHbEV/+y9Qt0g,
2,5,Person_5,35,City_B,1708274462.846535,tzrDCNOd5wF3LQ,Job_5
3,3,Person_3,33,City_B,1708274462.846535,F1/YjRWTRjCDKQ,Job_3
4,6,Person_6,36,City_B,1708274462.846535,Q5gVdSPfvyTXMw,Job_6
5,4,Person_4,34,City_B,1708274462.846535,QiaWaVygMsUO2w,Job_4
6,8,Person_8,38,City_B,1708274462.846535,KoiJTUdyslWnnA,Job_8
7,7,Person_7,37,City_B,1708274462.846535,rIl7gYl1xsGkOw,Job_7


In [23]:
# Answer to Question 4: total age across the merged rows.
conn.sql("SELECT sum(age) FROM dlt.citizens")

┌──────────┐
│ sum(age) │
│  int128  │
├──────────┤
│      266 │
└──────────┘

In [24]:
# Release the DuckDB connection now that the results have been displayed.
conn.close()