**ETL operation on the data in the link below** \
which has multible file formats `csv`, `json`, `xml` \
and four headers for each file `car_model`, `year_of_manufacture`, `price`, `fuel`
```
https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-PY0221EN-SkillsNetwork/labs/module%206/Lab%20-%20Extract%20Transform%20Load/data/datasource.zip
```

In [1]:
%%bash
mkdir data_source
cd data_source
wget https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-PY0221EN-SkillsNetwork/labs/module%206/Lab%20-%20Extract%20Transform%20Load/data/datasource.zip
unzip datasource.zip
rm datasource.zip

Archive:  datasource.zip
  inflating: used_car_prices1.csv    
  inflating: used_car_prices2.csv    
  inflating: used_car_prices3.csv    
  inflating: used_car_prices1.json   
  inflating: used_car_prices2.json   
  inflating: used_car_prices3.json   
  inflating: used_car_prices1.xml    
  inflating: used_car_prices2.xml    
  inflating: used_car_prices3.xml    


--2025-02-14 03:29:26--  https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-PY0221EN-SkillsNetwork/labs/module%206/Lab%20-%20Extract%20Transform%20Load/data/datasource.zip
Resolving cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)... 169.45.118.108
Connecting to cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)|169.45.118.108|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4249 (4.1K) [application/zip]
Saving to: ‘datasource.zip’

     0K ....                                                  100% 1.36G=0s

2025-02-14 03:29:27 (1.36 GB/s) - ‘datasource.zip’ saved [4249/4249]



In [2]:
import pandas as pd
from datetime import datetime
from glob import glob
import xml.etree.ElementTree as ET

In [3]:
target_file = "transformed_data.csv"
log_file = "log_file.txt"

In [4]:
# define the functions to read the files from the folder
def extract_from_csv(file_to_process):
    return pd.read_csv(file_to_process)

def extract_from_json(file_to_process):
    return pd.read_json(file_to_process, lines=True)

def extract_from_xml(file_to_process):
    df = pd.DataFrame(columns=['car_model', 'year_of_manufacture', 'price', 'fuel'])
    tree = ET.parse(file_to_process)
    root = tree.getroot()
    for car in root:
        car_model = car.find('car_model').text
        year_of_manufacture = int(car.find('year_of_manufacture').text)
        price = float(car.find('price').text)
        fuel = car.find('fuel').text
        car_to_data_frame = {
            'car_model': [car_model],
            'year_of_manufacture': [year_of_manufacture],
            'price': [price],
            'fuel': [fuel]
        }
        car_to_data_frame = pd.DataFrame(car_to_data_frame)
        df = pd.concat([df, car_to_data_frame], ignore_index=True)
    return df

In [5]:
# the extract function
def extract():
    extracted_data = pd.DataFrame(columns=['car_model', 'year_of_manufacture', 'price', 'fuel'])
    for csvfile in glob("data_source/*.csv"):
        extracted_data = pd.concat([extracted_data, extract_from_csv(csvfile)], ignore_index=True)

    for jsonfile in glob("data_source/*.json"):
        extracted_data = pd.concat([extracted_data, extract_from_json(jsonfile)], ignore_index=True)

    for xmlfile in glob("data_source/*.xml"):
        extracted_data = pd.concat([extracted_data, extract_from_xml(xmlfile)], ignore_index=True)

    return extracted_data

In [6]:
# the tranformation function
def tranform(data):
    data["price"] = round(data["price"], 2)
    return data

In [7]:
# the load function
def load(targetfile, data_to_load):
    data_to_load.to_csv(targetfile)

In [8]:
# the logging mechanism
def log(message):
    timestamp_format = "%Y-%h-%d-%H:%M:%S"
    now = datetime.now()
    timestamp = now.strftime(timestamp_format)
    with open(log_file, "a") as f:
        f.write(f"{timestamp} ,{message}")

In [9]:
# running the ETL
log("ETL Job Started")

log("Extract phase Started")
extracted_data = extract()
log("Extract phase Ended")
print("Extracted data")
print(extracted_data.head())
print("\n\n\n")

log("Transform phase Started")
transformed_data = tranform(extracted_data)
log("Transform phase Ended")
print("Transformed data")
print(transformed_data.head())

log("Load phase Started")
load(target_file, transformed_data)
log("Load phase Ended")

log("ETL Job Ended")

Extracted data
  car_model year_of_manufacture         price    fuel
0      ritz                2014   5000.000000  Petrol
1       sx4                2013   7089.552239  Diesel
2      ciaz                2017  10820.895522  Petrol
3   wagon r                2011   4253.731343  Petrol
4     swift                2014   6865.671642  Diesel




Transformed data
  car_model year_of_manufacture     price    fuel
0      ritz                2014   5000.00  Petrol
1       sx4                2013   7089.55  Diesel
2      ciaz                2017  10820.90  Petrol
3   wagon r                2011   4253.73  Petrol
4     swift                2014   6865.67  Diesel


  extracted_data = pd.concat([extracted_data, extract_from_csv(csvfile)], ignore_index=True)
  df = pd.concat([df, car_to_data_frame], ignore_index=True)
  df = pd.concat([df, car_to_data_frame], ignore_index=True)
  df = pd.concat([df, car_to_data_frame], ignore_index=True)


In [10]:
%%bash
head transformed_data.csv

,car_model,year_of_manufacture,price,fuel
0,ritz,2014,5000.0,Petrol
1,sx4,2013,7089.55,Diesel
2,ciaz,2017,10820.9,Petrol
3,wagon r,2011,4253.73,Petrol
4,swift,2014,6865.67,Diesel
5,vitara brezza,2018,13805.97,Diesel
6,ciaz,2015,10074.63,Petrol
7,s cross,2015,9701.49,Diesel
8,ciaz,2016,13059.7,Diesel
