In [1]:
import pandas as pd

# Loading dataset
df = pd.read_csv("custom_data.csv")

# Showing the dataset overview
print("Full Dataset Preview:")
print(df.head())

print(f"\nDataset contains {df.shape[0]} rows and {df.shape[1]} columns.")
print(f"Extracted {df.shape[0]} rows fully.")

Full Dataset Preview:
  CustomerName Product  Quantity  Price  TotalPrice        Date     Category
0        Alice  Laptop         3    350        1050  29/01/2024  Electronics
1          Bob  Laptop         5    189         945  16/03/2024  Electronics
2        Diana  Laptop         1    195         195  28/01/2024  Electronics
3          Bob  Laptop         5    303        1515  01/04/2024  Electronics
4        Diana   Phone         4    703        2812  05/02/2024  Electronics

Dataset contains 50 rows and 7 columns.
Extracted 50 rows fully.


In [2]:
from datetime import datetime

# Converting the Date column to datetime format 
df['Date'] = pd.to_datetime(df['Date'], dayfirst=True)

# Reading the last extraction date
with open("last_extraction.txt", "r") as file:
    last_extraction_str = file.read().strip()

last_extraction_date = datetime.strptime(last_extraction_str, "%d/%m/%Y")

# Filtering rows where date is after the last extraction
incremental_df = df[df['Date'] > last_extraction_date]

print(f"\nExtracted {incremental_df.shape[0]} rows incrementally since last check.")
print(incremental_df)



Extracted 0 rows incrementally since last check.
Empty DataFrame
Columns: [CustomerName, Product, Quantity, Price, TotalPrice, Date, Category]
Index: []


In [3]:
# Getting the latest date from the dataset
new_last_extraction = df['Date'].max()

# Saving the new extraction date
with open("last_extraction.txt", "w") as file:
    file.write(new_last_extraction.strftime("%d/%m/%Y"))

print(f"\nUpdated last_extraction.txt with new date: {new_last_extraction.strftime('%d/%m/%Y')}")



Updated last_extraction.txt with new date: 07/04/2024


In [None]:
import pandas as pd

# Loading full dataset
full_df = pd.read_csv('custom_data.csv')

# 1. Cleaning: Remove duplicates
full_df = full_df.drop_duplicates()

# 2. Cleaning: Handle missing values
full_df['Quantity'] = full_df['Quantity'].fillna(0)
full_df['Price'] = full_df['Price'].fillna(0)

# 3. Enriching: Add TotalPrice
full_df['total_price'] = full_df['Quantity'] * full_df['Price']

# 4. Structuring: Convert order_date to datetime
full_df['Date'] = pd.to_datetime(full_df['Date'], errors='coerce')

# 5. Categorizing TotalPrice
bins = [0, 100, 500, 1000, float('inf')]
labels = ['Low', 'Medium', 'High', 'Very High']
full_df['PriceCategory'] = pd.cut(full_df['TotalPrice'], bins=bins, labels=labels)

# Saving transformed full dataset
full_df.to_csv('transformed_full.csv', index=False)

# Preview
full_df.head()



  full_df['Date'] = pd.to_datetime(full_df['Date'], errors='coerce')


Unnamed: 0,CustomerName,Product,Quantity,Price,TotalPrice,Date,Category,total_price,PriceCategory
0,Alice,Laptop,3,350,1050,2024-01-29,Electronics,1050,Very High
1,Bob,Laptop,5,189,945,2024-03-16,Electronics,945,High
2,Diana,Laptop,1,195,195,2024-01-28,Electronics,195,Medium
3,Bob,Laptop,5,303,1515,2024-04-01,Electronics,1515,Very High
4,Diana,Phone,4,703,2812,2024-02-05,Electronics,2812,Very High


In [24]:
import pandas as pd



# Simulating incremental data from full data 
full_df = pd.read_csv('custom_data.csv')
inc_df = full_df.tail(20).copy()  # simulate incremental data

# 1. Cleaning: Remove duplicates
inc_df = inc_df.drop_duplicates()

# 2. Cleaning: Handle missing values
inc_df['Quantity'] = inc_df['Quantity'].fillna(0)
inc_df['Price'] = inc_df['Price'].fillna(0)

# 3. Enriching: Add total_price
inc_df['TotalPrice'] = inc_df['Quantity'] * inc_df['Price']

# 4. Structuring: Convert Date to datetime
inc_df['Date'] = pd.to_datetime(inc_df['Date'], errors='coerce')

# 5. Categorizing TotalPrice
bins = [0, 100, 500, 1000, float('inf')]
labels = ['Low', 'Medium', 'High', 'Very High']
inc_df['PriceCategory'] = pd.cut(inc_df['TotalPrice'], bins=bins, labels=labels)

# Saving the transformed incremental dataset
inc_df.to_csv('transformed_incremental.csv', index=False)

# Showing sample
print("Transformed Incremental Data Sample:")
print(inc_df.head())


Transformed Incremental Data Sample:
   CustomerName  Product  Quantity  Price  TotalPrice       Date     Category  \
30        Alice   Laptop         3    999        2997 2024-09-02  Electronics   
31          Bob   Laptop         2    999        1998        NaT  Electronics   
32        Alice   Laptop         4    935        3740 2024-09-01  Electronics   
33          Bob    Phone         4    662        2648        NaT  Electronics   
34      Charlie  Monitor         2    652        1304 2024-06-04  Accessories   

   PriceCategory  
30     Very High  
31     Very High  
32     Very High  
33     Very High  
34     Very High  
