In [1]:
import glob
import pandas as pd
import xml.etree.ElementTree as ET
from datetime import datetime

In [2]:
# two file paths that will be available globally in the code for all functions
log_file = "log_file.txt" # stores all the logs
target_file = "transformed_data.csv" # store the final output data that you can load to a database

In [None]:
# extract the data from different file formats
def extract_from_csv(file_to_process):
    df = pd.read_csv(file_to_process)
    return df

def extract_from_json(file_to_process):
    df = pd.read_json(file_to_process, lines=True)
    return df

def extract_from_xml(file_to_process):
    df = pd.DataFrame(columns=["car_model", "year_of_manufacture", "price", "fuel"])
    tree = ET.parse(file_to_process)
    root = tree.getroot()
    for car in root:
        car_model = car.find("car_model").text
        year_of_manufature = int(car.find("year_of_manufacture").text)
        price = float(car.find("price").text)
        fuel = car.find("fuel").text
        df = pd.concat([df, pd.DataFrame([{"car_model":car_model, "year_of_manufacture":year_of_manufature, "price":price, "fuel":fuel}])], ignore_index=True)
    return df

# extract_from_json("/home/jbyers/Data_Engineering/PerScholas_DE/Data_Engineering_Trng/Intro_Big_Data_ETL/data_source/used_car_prices1.json")

In [11]:
def extract(): 
    extracted_data = pd.DataFrame(columns=["car_model", "year_of_manufacture", "price", "fuel"]) # create an empty data frame to hold extracted data 
     
    # process all csv files, except the target file
    for csvfile in glob.glob("*.csv"): 
        if csvfile != target_file:  # check if the file is not the target file
            extracted_data = pd.concat([extracted_data, pd.DataFrame(extract_from_csv(csvfile))], ignore_index=True) 
         
    # process all json files 
    for jsonfile in glob.glob("*.json"): 
        extracted_data = pd.concat([extracted_data, pd.DataFrame(extract_from_json(jsonfile))], ignore_index=True) 
     
    # process all xml files 
    for xmlfile in glob.glob("*.xml"): 
        extracted_data = pd.concat([extracted_data, pd.DataFrame(extract_from_xml(xmlfile))], ignore_index=True) 
         
    return extracted_data 

In [7]:
# Transform the values under the 'price' header such that they are rounded to 2 decimal places
def transform(data):
    data["price"] = round(data.price, 2)
    return data

In [13]:
def load_data(target_file, transformed_data):
    transformed_data.to_csv(target_file, index=False)

In [9]:
def log_progress(message):
    timestamp_format = '%Y-%h-%d-%H:%M:%S' # Year-Monthname-Day-Hour-Minute-Second
    now = datetime.now() # get current timestamp
    timestamp = now.strftime(timestamp_format)
    with open(log_file, "a") as f:
        f.write(timestamp + ',' + message + '\n')

In [14]:
# Log the initialization of the ETL process 
log_progress("ETL Job Started") 
 
# Log the beginning of the Extraction process 
log_progress("Extract phase Started") 
extracted_data = extract() 
 
# Log the completion of the Extraction process 
log_progress("Extract phase Ended") 
 
# Log the beginning of the Transformation process 
log_progress("Transform phase Started") 
transformed_data = transform(extracted_data) 
print("Transformed Data") 
print(transformed_data) 
 
# Log the completion of the Transformation process 
log_progress("Transform phase Ended") 
 
# Log the beginning of the Loading process 
log_progress("Load phase Started") 
load_data(target_file,transformed_data) 
 
# Log the completion of the Loading process 
log_progress("Load phase Ended") 
 
# Log the completion of the ETL process 
log_progress("ETL Job Ended") 

  extracted_data = pd.concat([extracted_data, pd.DataFrame(extract_from_csv(csvfile))], ignore_index=True)
  df = pd.concat([df, pd.DataFrame([{"car_model":car_model, "year_of_manufacture":year_of_manufature, "price":price, "fuel":fuel}])], ignore_index=True)
  df = pd.concat([df, pd.DataFrame([{"car_model":car_model, "year_of_manufacture":year_of_manufature, "price":price, "fuel":fuel}])], ignore_index=True)
  df = pd.concat([df, pd.DataFrame([{"car_model":car_model, "year_of_manufacture":year_of_manufature, "price":price, "fuel":fuel}])], ignore_index=True)


Transformed Data
        car_model year_of_manufacture     price    fuel
0        alto k10                2016   4253.73  Petrol
1           ignis                2017   7313.43  Petrol
2             sx4                2011   6567.16  Petrol
3        alto k10                2014   3731.34  Petrol
4         wagon r                2013   4328.36  Petrol
..            ...                 ...       ...     ...
85        etios g                2015   5895.52  Petrol
86  corolla altis                2013   8208.96  Petrol
87        corolla                2004   2238.81  Petrol
88  corolla altis                2010   7835.82  Petrol
89       fortuner                2012  21641.79  Diesel

[90 rows x 4 columns]
