In [2]:
# Find all pairs of numbers in an array that sum to a given target (optimized for large datasets)

def find_pairs(nums, target):
    """Return every pair of elements in ``nums`` that adds up to ``target``.

    The input is scanned once while keeping a tally of the values already
    visited, so each element only needs a dictionary lookup to find its
    partners.

    Args:
        nums: Iterable of hashable numbers.
        target: The sum each returned pair must add up to.

    Returns:
        A list of ``(earlier_value, later_value)`` tuples, ordered by the
        position of the later element. Repeated values yield one tuple per
        matching earlier occurrence.
    """
    occurrences = {}
    matches = []

    for current in nums:
        partner = target - current
        # One pair for each earlier occurrence of the partner value
        # (zero occurrences extends the list by nothing).
        matches.extend([(partner, current)] * occurrences.get(partner, 0))
        # Record the current value only after pairing, so it never pairs with itself.
        occurrences[current] = occurrences.get(current, 0) + 1

    return matches


# Example usage
# Demo input: nine distinct values, several of which pair up to the target.
arr = [2, 4, 3, 7, 1, 5, 8, 9, 6]
target = 10
# Prints the matching pairs as (earlier_value, later_value) tuples.
print(find_pairs(arr, target))

[(3, 7), (2, 8), (1, 9), (4, 6)]


In [None]:
# You receive a daily CSV dump from a payment gateway
# Question: write Python code to 1) ignore invalid amounts, 2) calculate the net amount per user, and 3) return a dictionary {user_id: net_amount}
"""
1. Define what counts as an "invalid amount" (e.g. missing or non-numeric values).
2. Clarify the aggregation window for the net amount per user: is it a daily net or a weekly net?
"""

import csv
from collections import defaultdict

def compute_net_amount(file_path):
    """Sum the valid ``amount`` values per ``user_id`` in a CSV file.

    The file is expected to start with a header row that names at least the
    ``user_id`` and ``amount`` columns. A row is skipped when its amount is
    missing, is not parseable as a number, or is NaN/infinite, or when its
    user id is empty. If the header lacks either column, every row is skipped
    and the result is empty.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A plain ``dict`` mapping user id (str) to that user's net amount
        (float). Users without any valid row are absent.

    Raises:
        OSError: If the file cannot be opened.
    """
    import math  # local import: math is not imported anywhere else in this cell

    result = defaultdict(float)
    # newline='' is the csv module's recommended mode so quoted fields that
    # contain line breaks are parsed correctly.
    with open(file_path, 'r', newline='') as csv_file:
        # DictReader keys each row by header name. The plain csv.reader used
        # before yields lists, so row["amount"] raised an uncaught TypeError.
        csv_reader = csv.DictReader(csv_file, delimiter=',')
        for row in csv_reader:
            try:
                # Parse before touching `result` so a bad row never creates an entry.
                amount = float(row["amount"])
                user_id = row["user_id"]
            except (KeyError, TypeError, ValueError):
                # KeyError: column absent from the header.
                # TypeError: short row, DictReader filled the field with None.
                # ValueError: non-numeric text.
                continue
            if not user_id or not math.isfinite(amount):
                # NaN/inf would poison the running total; an empty id has no owner.
                continue
            result[user_id] += amount

    return dict(result)

"""
Follow-ups
How does this scale to 10 GB files?
Would you use pandas?
How would you unit test this?
How would you log bad records?
"""


Question:
You have two massive tables:
    - Payments
    - Refunds

How would you ensure net amounts never go negative incorrectly due to out-of-order ingestion?



In [None]:
"""
Problem: Streaming logic
you are given a huge file, find top 5 users by total amount

Assumptions:
The file has only two fields, user and amount, and is comma-separated.

"""

from collections import defaultdict
import heapq

def top_users(file_path, k=5):
    """Return the ``k`` users with the largest total amount in a CSV file.

    Each row is expected to hold exactly two fields, ``user,amount``. Rows
    with a different number of fields (including blank lines) and rows whose
    amount is not numeric (such as a header line) are skipped.

    Args:
        file_path: Path of the CSV file to read.
        k: How many top users to return (default 5).

    Returns:
        A list of up to ``k`` ``(user, total)`` tuples, largest total first.

    Raises:
        OSError: If the file cannot be opened.
    """
    import csv  # local import: this cell only imports defaultdict and heapq

    totals = defaultdict(float)
    # newline='' is the csv module's recommended mode for reader input.
    with open(file_path, 'r', newline='') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        for row in csv_reader:
            # csv.reader already yields a list of fields; the previous
            # row.strip().split(",") raised AttributeError on every row.
            if len(row) != 2:
                continue  # blank or malformed line
            user, amount_text = row
            try:
                # Parse before indexing `totals` so bad rows never create an entry.
                amount = float(amount_text)
            except ValueError:
                continue  # header line or non-numeric amount
            totals[user.strip()] += amount
    return heapq.nlargest(k, totals.items(), key=lambda x: x[1])



# -- alternate logic (streaming logic ... reads the file line by line )
import heapq
from collections import defaultdict

def top5_users_by_amount(file_path):
    """Return the five users with the highest summed amount in ``file_path``.

    The file is read one line at a time; each usable line has the form
    ``user_id,amount``. Lines that do not split into exactly two fields, or
    whose amount cannot be converted to ``float``, are ignored.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of up to five ``(user_id, total)`` tuples, largest total first.
    """
    totals_by_user = defaultdict(float)

    with open(file_path, 'r') as handle:
        for raw_line in handle:
            fields = raw_line.strip().split(',')
            if len(fields) == 2:
                name, raw_amount = fields
                try:
                    value = float(raw_amount)
                except ValueError:
                    pass  # unparseable amount: ignore the line
                else:
                    # Only reached with a valid number, so bad lines never
                    # create an entry for the user.
                    totals_by_user[name] += value

    # nlargest selects the five biggest totals without sorting every user.
    return heapq.nlargest(5, totals_by_user.items(), key=lambda pair: pair[1])


# Example usage
if __name__ == "__main__":
    # Demo run: reads "huge_file.csv" (a relative path) in the user_id,amount
    # format parsed by top5_users_by_amount.
    result = top5_users_by_amount("huge_file.csv")
    for user, total in result:
        # Each entry is a (user, total) pair; totals are printed with two decimals.
        print(f"User {user}: {total:.2f}")


"""
Scaling Considerations
• 	If the file is truly massive (billions of rows):
• 	Consider chunked processing (e.g. with pandas `read_csv(..., chunksize=...)`).
• 	Or use external sort/aggregation tools (Spark, Dask, Flink).
• 	If user IDs are extremely numerous:
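• 	You may need a streaming top‑k algorithm (like a min‑heap of size 5 maintained during iteration).
• 	That way you don’t store all users, only the top candidates. Caveat: a fixed-size heap is only exact when each user's total is final before it is pushed (e.g. the input is grouped or pre-aggregated by user); for unordered rows, partition by user first or use an approximate heavy-hitters sketch.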

"""

In [None]:
"""
Puzzle: Given transaction events
events = [
('txn1', 100),
('txn2', 200),
('txn3', 300),
('txn1', 100)   # duplicate
]

Task: Ensure each transaction is processed only once
"""

# Sample input: (txn_id, amount) pairs; 'txn1' appears twice on purpose.
events = [
('txn1', 100),
('txn2', 200),
('txn3', 300),
('txn1', 100)   # duplicate
]

def process_events(events):
    """Sum transaction amounts, counting each transaction id only once.

    Args:
        events: Iterable of ``(txn_id, amount)`` pairs; ``txn_id`` must be
            hashable.

    Returns:
        The total of the amounts attached to the first occurrence of each
        ``txn_id``; later occurrences of the same id are ignored.
    """
    # Must be a set: the previous `seen = ()` built a tuple, which has no
    # .add() method and raised AttributeError on the first event.
    seen = set()
    total = 0

    for txn_id, amount in events:
        if txn_id not in seen:
            seen.add(txn_id)
            total += amount
    return total


# ---- alternatively

# Ids of the transactions that have already been counted.
processed = set()
total = 0

for txn_id, amount in events:
    if txn_id in processed:
        continue  # repeated delivery of a known transaction: skip it
    # First sighting: count the amount, then remember the id.
    total += amount
    processed.add(txn_id)

print("Total processed amount:", total)
print("Processed transactions:", processed)


"""
Follow up questions

What if the process crashes ?
    If it runs stand-alone, the in-memory state (the set of processed IDs) is lost on a crash.

Where would state live ?
    - Small scale / Python script: store processed IDs in a database table or Redis set.
    - Large scale / production: use a streaming framework with checkpointing (Flink, Spark) or rely on idempotent sinks.
    - Critical systems: combine both — checkpoint state and enforce idempotency downstream.

"""


In [None]:
# Data Quality
"""
data = ["100", "abc", "", None, "200"]
Return sum of valid numbers only
"""

data = ["100", "abc", "", None, "200"]
total = 0
for val in data:
    try:
        parsed = float(val)
    except (TypeError, ValueError):
        # TypeError covers None, ValueError covers "" and non-numeric text.
        pass
    else:
        total += parsed

print(total)

In [None]:
# One messy pandas dataset problem



In [None]:
# Latest profile per user


In [3]:
import csv
from collections import defaultdict

# Sample per-user transactions: one dict per record with user_id, txn_date
# (YYYY-MM-DD string) and net_amount. Dates are not consecutive for either user.
transactions = [
    {"user_id": "U1", "txn_date": "2025-01-01", "net_amount": 100},
    {"user_id": "U1", "txn_date": "2025-01-02", "net_amount": 300},
    {"user_id": "U1", "txn_date": "2025-01-03", "net_amount": -50},
    {"user_id": "U1", "txn_date": "2025-01-05", "net_amount": 200},
    {"user_id": "U1", "txn_date": "2025-01-06", "net_amount": 150},

    {"user_id": "U2", "txn_date": "2025-01-01", "net_amount": 500},
    {"user_id": "U2", "txn_date": "2025-01-03", "net_amount": 400},
    {"user_id": "U2", "txn_date": "2025-01-04", "net_amount": -100},
    {"user_id": "U2", "txn_date": "2025-01-08", "net_amount": 250},
]


from collections import defaultdict

def rolling_avg_simple(data, window=7):
    """Compute a trailing average of ``net_amount`` for each user.

    Records are grouped by ``user_id`` and ordered by ``txn_date`` (plain
    tuple ordering, so date strings must sort chronologically, as
    ``YYYY-MM-DD`` strings do). For every record, the average covers that
    record plus up to ``window - 1`` records immediately before it.

    Note that ``window`` counts records, not calendar days: gaps between
    dates do not shrink the window.

    Args:
        data: Iterable of dicts with ``user_id``, ``txn_date`` and
            ``net_amount`` keys.
        window: Maximum number of records averaged together (default 7).

    Returns:
        A list of dicts with ``user_id``, ``txn_date`` and ``rolling_avg``
        (rounded to two decimals), grouped by user in first-seen order and
        sorted by date within each user.
    """
    grouped = defaultdict(list)
    for entry in data:
        grouped[entry["user_id"]].append((entry["txn_date"], entry["net_amount"]))

    rows = []
    for user_id, history in grouped.items():
        ordered = sorted(history)
        # `end` is the exclusive slice bound, i.e. the current position + 1.
        for end, (txn_date, _) in enumerate(ordered, start=1):
            start = max(0, end - window)
            amounts = [amount for _, amount in ordered[start:end]]
            rows.append({
                "user_id": user_id,
                "txn_date": txn_date,
                "rolling_avg": round(sum(amounts) / len(amounts), 2),
            })

    return rows


# Compute the rolling averages for the sample data and print one result dict per line.
output = rolling_avg_simple(transactions)
for row in output:
    print(row)


{'user_id': 'U1', 'txn_date': '2025-01-01', 'rolling_avg': 100.0}
{'user_id': 'U1', 'txn_date': '2025-01-02', 'rolling_avg': 200.0}
{'user_id': 'U1', 'txn_date': '2025-01-03', 'rolling_avg': 116.67}
{'user_id': 'U1', 'txn_date': '2025-01-05', 'rolling_avg': 137.5}
{'user_id': 'U1', 'txn_date': '2025-01-06', 'rolling_avg': 140.0}
{'user_id': 'U2', 'txn_date': '2025-01-01', 'rolling_avg': 500.0}
{'user_id': 'U2', 'txn_date': '2025-01-03', 'rolling_avg': 450.0}
{'user_id': 'U2', 'txn_date': '2025-01-04', 'rolling_avg': 266.67}
{'user_id': 'U2', 'txn_date': '2025-01-08', 'rolling_avg': 262.5}
