# Data Extraction

## CSV

In [0]:
# DBFS location of the raw cafe sales CSV (path taken from the file explorer).
path = "dbfs:/FileStore/sample_data/dirty_cafe_sales.csv"

In [0]:
# Read the raw cafe sales CSV into a Spark DataFrame.
#   header      -> treat the first line as column names
#   inferschema -> let Spark derive column types from the data
cafe_df = (
    spark.read
    .options(header='true', inferschema='true')
    .csv(path)
)

In [0]:
# Preview the first 10 rows. limit(10) keeps the result a Spark DataFrame, so
# display() receives a DataFrame instead of the Python list of Row objects
# that head(10) would first collect onto the driver.
display(cafe_df.limit(10))

Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
TXN_1961373,Coffee,2,2.0,4.0,Credit Card,Takeaway,2023-09-08
TXN_4977031,Cake,4,3.0,12.0,Cash,In-store,2023-05-16
TXN_4271903,Cookie,4,1.0,ERROR,Credit Card,In-store,2023-07-19
TXN_7034554,Salad,2,5.0,10.0,UNKNOWN,UNKNOWN,2023-04-27
TXN_3160411,Coffee,2,2.0,4.0,Digital Wallet,In-store,2023-06-11
TXN_2602893,Smoothie,5,4.0,20.0,Credit Card,,2023-03-31
TXN_4433211,UNKNOWN,3,3.0,9.0,ERROR,Takeaway,2023-10-06
TXN_6699534,Sandwich,4,4.0,16.0,Cash,UNKNOWN,2023-10-28
TXN_4717867,,5,3.0,15.0,,Takeaway,2023-07-28
TXN_2064365,Sandwich,5,4.0,20.0,,In-store,2023-12-31


In [0]:
# List every column as a (name, type) pair to check what schema inference produced.
# NOTE(review): the recorded output reports all columns as 'string' — presumably
# placeholder values such as 'ERROR' / 'UNKNOWN' in the data prevent numeric
# inference; confirm before casting downstream.
cafe_df.dtypes  # Check data types

[('Transaction ID', 'string'),
 ('Item', 'string'),
 ('Quantity', 'string'),
 ('Price Per Unit', 'string'),
 ('Total Spent', 'string'),
 ('Payment Method', 'string'),
 ('Location', 'string'),
 ('Transaction Date', 'string')]

In [0]:
# Inspect the full StructType of the DataFrame. In each
# StructField(name, type, nullable) the trailing boolean is the nullable flag.
cafe_df.schema #The boolean value shows if the field is nullable

StructType([StructField('Transaction ID', StringType(), True), StructField('Item', StringType(), True), StructField('Quantity', StringType(), True), StructField('Price Per Unit', StringType(), True), StructField('Total Spent', StringType(), True), StructField('Payment Method', StringType(), True), StructField('Location', StringType(), True), StructField('Transaction Date', StringType(), True)])

## JSON

In [0]:
# DBFS location of the single-line countries JSON file.
json_path = 'dbfs:/FileStore/sample_data/countries_single_line.json'

# Read the JSON file into a Spark DataFrame.
# 'header' and 'inferschema' are CSV reader options and have no effect on the
# JSON source (JSON has no header row, and its schema is inferred whenever no
# explicit schema is supplied), so they are intentionally not set here.
countries_df = (spark.read
    .format('json')
    .load(json_path))

In [0]:
# Preview the first 10 countries. limit(10) keeps the result a Spark DataFrame,
# so display() receives a DataFrame instead of the Python list of Row objects
# that head(10) would first collect onto the driver.
display(countries_df.limit(10))

AREA_KM2,CAPITAL,COUNTRY_CODE,COUNTRY_ID,INTERMEDIATE_REGION_ID,ISO_ALPHA2,NAME,NATIONALITY,ORGANIZATION_REGION_ID,POPULATION,REGION_ID,SUB_REGION_ID
652230.0,Kabul,AFG,1,,AF,Afghanistan,Afghan,30,38041754,30,30.0
28748.0,Tirana,ALB,2,,AL,Albania,Albanian,20,2880917,20,70.0
2381741.0,Algiers,DZA,3,,DZ,Algeria,Algerian,20,43053054,50,40.0
199.0,Pago Pago,ASM,4,,AS,American Samoa,American Samoan,30,55312,40,20.0
468.0,Andorra la Vella,AND,5,,AD,Andorra,Andorran,20,77142,20,70.0
1246700.0,Luanda,AGO,6,80.0,AO,Angola,Angolan,20,31825295,50,160.0
91.0,The Valley,AIA,7,60.0,AI,Anguilla,Anguillan,40,14869,10,10.0
14200000.0,McMurdo Station,ATA,8,,AQ,Antarctica,Antarctic,30,1106,40,
442.0,St. John's,ATG,9,60.0,AG,Antigua and Barbuda,Antiguan or Barbudan,40,97118,10,10.0
2780400.0,Buenos Aires,ARG,10,40.0,AR,Argentina,Argentine,40,44780677,10,10.0


# Multi Line JSON File

In [0]:
# DBFS location of the multi-line countries JSON file.
json_multi_line_path = 'dbfs:/FileStore/sample_data/countries_multi_line.json'

# Read the multi-line JSON file into a Spark DataFrame.
# 'header' and 'inferschema' are CSV reader options and have no effect on the
# JSON source, so only the option that matters is kept.
countries_df2 = (spark.read
    .format('json')
    .option("multiline","true")    # Must be true to load a multi-line JSON file.
    .load(json_multi_line_path))

In [0]:
# DBFS location of the tab-delimited countries text file.
text_path = "dbfs:/FileStore/sample_data/countries.txt"

# A .txt file is loaded through the CSV reader; the tab character is the
# column separator, the first line is the header, and types are inferred.
countries_df3 = (
    spark.read
    .options(header='true', inferschema='true', sep='\t')
    .csv(text_path)
)

# Load Data to Bronze Layer

In [0]:
# Expose cafe_df to SQL under the name "cafe_temp_view", replacing any
# existing temp view with that name, so it can be queried via spark.sql.
cafe_df.createOrReplaceTempView("cafe_temp_view")

In [0]:
# NOTE: everything below is wrapped in a triple-quoted string, so it is NOT
# executed — the cell only evaluates the string, which is echoed as the cell
# output. If the quotes were removed, the code would build the table name
# bronze_prod.bronze_db.cafe_bronze and overwrite that Delta table with the
# rows selected from cafe_temp_view.
'''

catalog = "bronze_prod"
schema = "bronze_db"
table_name = "cafe_bronze"


role_table = f"{catalog}.{schema}.{table_name}"


spark.sql("SELECT * FROM cafe_temp_view").write.format("delta").mode("overwrite").option("mergeSchema", "true").saveAsTable(role_table)

'''

'\n\ncatalog = "bronze_prod"\nschema = "bronze_db"\ntable_name = "cafe_bronze"\n\n\nrole_table = f"{catalog}.{schema}.{table_name}"\n\n\nspark.sql("SELECT * FROM role_table_view").write.format("delta").mode("overwrite").option("mergeSchema", "true").saveAsTable(role_table)\n\n'