In [None]:
#  Required Libraries
import pandas as pd
from datetime import datetime

#  Input/state files (relative paths, resolved against the working directory)
DATA_PATH = "custom_data.csv"  # source CSV that gets extracted
TIMESTAMP_FILE = "last_extraction.txt"  # holds the previous run's timestamp

# ------------------------------------------------------------------------------
#  Section 1: Full Extraction
# ------------------------------------------------------------------------------

# Read the complete CSV in one go, asking pandas to parse transaction_date
# as a datetime column while loading.
df_full = pd.read_csv(DATA_PATH, parse_dates=['transaction_date'])
row_count, column_count = df_full.shape
sample_rows = df_full.head()

# Summarise what was loaded: dimensions plus a preview of the first rows
print(" Full Extraction:")
print(f"Number of rows: {row_count}")
print(f"Number of columns: {column_count}")
print("Sample data:")
print(sample_rows)

# Confirm how many rows the full extraction produced
print(f"\n Extracted {row_count} rows fully.\n")


# ------------------------------------------------------------------------------
#  Section 2: Incremental Extraction
# ------------------------------------------------------------------------------

# Step 1: Read the last extraction timestamp
# Default watermark for a first run (or an unreadable timestamp file): the
# earliest value a nanosecond-resolution pandas timestamp can hold, so the
# filter below keeps every dated row.  The stdlib datetime.min (year 1) lies
# outside that range and can raise OutOfBoundsDatetime when compared with a
# datetime64[ns] column on older pandas releases.
last_extraction_time = pd.Timestamp.min
try:
    # Keep the try body to the one operation that can raise FileNotFoundError.
    with open(TIMESTAMP_FILE, 'r') as f:
        last_extraction_str = f.read().strip()
except FileNotFoundError:
    print(" No previous extraction found. Assuming first run.")
else:
    try:
        last_extraction_time = datetime.strptime(last_extraction_str, "%Y-%m-%d %H:%M:%S")
    except ValueError:
        # An empty or corrupt timestamp file (e.g. left behind by an
        # interrupted write) should not abort the run; re-extracting
        # everything is the safe fallback.
        print(f" Could not parse last extraction time {last_extraction_str!r}. Extracting all records.")
    else:
        print(f" Last extraction time: {last_extraction_str}")

# Step 2: Perform incremental extraction
# Ensure the column is datetime-typed before comparing, then keep only rows
# strictly newer than the watermark.
df_full['transaction_date'] = pd.to_datetime(df_full['transaction_date'])
df_incremental = df_full[df_full['transaction_date'] > last_extraction_time]

# Step 3: Display incremental results
print("\n Incremental Extraction:")
print(f"Found {df_incremental.shape[0]} new or updated records.")
print(df_incremental.head())

# ------------------------------------------------------------------------------
#  Section 3: Save New Timestamp
# ------------------------------------------------------------------------------

# Take the current wall-clock time as the moment this ETL run happened and
# format it the same way the timestamp reader expects.
run_time = datetime.now()
current_timestamp = f"{run_time:%Y-%m-%d %H:%M:%S}"

# Overwrite the stored timestamp so the next run starts from this point
with open(TIMESTAMP_FILE, 'w') as ts_file:
    ts_file.write(current_timestamp)

print(f"\n Updated last extraction timestamp to: {current_timestamp}")


 Full Extraction:
Number of rows: 100
Number of columns: 6
Sample data:
  transaction_id customer_id     product    price  quantity  \
0          T0001       C9730     Printer  1031.72         5   
1          T0002       C7616  Headphones  1320.50         1   
2          T0003       C5973    Keyboard   941.38         5   
3          T0004       C8436  Headphones   334.55         2   
4          T0005       C6278       Phone  1097.91         2   

     transaction_date  
0 2025-06-02 21:24:30  
1 2025-05-30 04:29:30  
2 2025-06-02 14:44:30  
3 2025-06-04 17:57:30  
4 2025-06-03 14:46:30  

 Extracted 100 rows fully.

🕓 Last extraction time: 2025-06-06 18:39:38

 Incremental Extraction:
Found 0 new or updated records.
Empty DataFrame
Columns: [transaction_id, customer_id, product, price, quantity, transaction_date]
Index: []

 Updated last extraction timestamp to: 2025-06-06 18:40:30
