In [1]:
from databricks.connect import DatabricksSession
from databricks.sdk import WorkspaceClient

# Remote Spark session and workspace client, both configured from the "DEFAULT" profile.
spark = DatabricksSession.builder.profile("DEFAULT").getOrCreate()
w = WorkspaceClient(profile = "DEFAULT")
dbutils = w.dbutils

# Root folder of the datalake on the Databricks volume.
datalake_root = '/Volumes/emissions_datapipeline_workspace/default/datalake'

# Create the root folder first, then one folder per datalake zone.
dbutils.fs.mkdirs(f'{datalake_root}/')
for zone in ('raw', 'curated', 'serving'):
    dbutils.fs.mkdirs(f'{datalake_root}/{zone}')


# List the root folder to confirm the zone folders are present.
display(dbutils.fs.ls(datalake_root))


[FileInfo(path='/Volumes/emissions_datapipeline_workspace/default/datalake/curated/', name='', size=None, modificationTime=None),
 FileInfo(path='/Volumes/emissions_datapipeline_workspace/default/datalake/raw/', name='', size=None, modificationTime=None),
 FileInfo(path='/Volumes/emissions_datapipeline_workspace/default/datalake/serving/', name='', size=None, modificationTime=None)]

In [2]:
from datetime import datetime
import pathlib

# Today's date as YYYYMMDD; it names the `date=` folder of the raw upload.
current_date = f'{datetime.today():%Y%m%d}'

# Absolute path of the current working directory.
current_path = pathlib.Path().resolve()

# Local folder holding the World Bank files, and the volume folder they are uploaded to.
local_path = '{}/data/world_bank_data/'.format(current_path)
dbfs_path = (
    '/Volumes/emissions_datapipeline_workspace/default/datalake/raw/'
    f'world_development_indicators/date={current_date}/'
)

In [None]:
# Use dbutils to copy the local World Bank files to the Databricks volume.
# The `file:` prefix marks the source as a path on the local machine, and
# `recurse=True` is required because the source is a folder rather than a single file.
# The destination is built from `dbfs_path` (defined alongside `local_path`) instead of
# retyping the folder, so the upload target cannot drift from that definition.

dbutils.fs.cp(
  f'file:{local_path}',
  f'dbfs:{dbfs_path}',
  recurse=True
)

In [None]:
# Alternative approach (kept disabled): use Spark instead of dbutils to load the files
# import os
# 
# csv_files = [file for file in os.listdir(local_path) if file.endswith(".csv")]
# 
# for csv_file in csv_files:
#     file_path = os.path.join(local_path, csv_file)
#     
#     df = spark.read.format("csv").option("header", "true").load(file_path)
#     
#     # Write DataFrame to DBFS
#     dbfs_file_path = os.path.join(dbfs_path, csv_file)
#     df.write.format("csv").mode("overwrite").save(dbfs_file_path)
#     
#     print(f"Successfully copied {csv_file} to DBFS.")

In [None]:
# Copy one CO2 emissions JSON file per year from the local data folder into its own
# `year=<year>` folder under the raw zone of the volume.
for year in (2017, 2018, 2019):
  file_name = f'co2_emissions_passenger_cars_{year}.json'
  source_uri = f'file:{current_path}/data/emissions_data/{file_name}'
  target_uri = (
    'dbfs:/Volumes/emissions_datapipeline_workspace/default/datalake/raw/'
    f'co2_passenger_cars_emissions/year={year}/{file_name}'
  )
  dbutils.fs.cp(source_uri, target_uri)

In [13]:
# CO2 emissions: read every JSON file below the raw folder with the multiline option enabled.
co2_reader = spark.read.option("multiline", "true")
df_co2_emissions = co2_reader.json(
    'dbfs:/Volumes/emissions_datapipeline_workspace/default/datalake/raw/co2_passenger_cars_emissions'
)

# World Development Indicators: read today's uploaded CSV, taking column names from its header row.
wdi_reader = spark.read.option('header', 'true')
df_world_development_indicators = wdi_reader.csv(
    f'dbfs:/Volumes/emissions_datapipeline_workspace/default/datalake/raw/world_development_indicators/date={current_date}/world_bank_data/WDIData.csv'
)

# Print the schema of each DataFrame, CO2 emissions first.
for frame in (df_co2_emissions, df_world_development_indicators):
    frame.printSchema()

root
 |-- At1 (mm): long (nullable = true)
 |-- At2 (mm): long (nullable = true)
 |-- Cn: string (nullable = true)
 |-- Cr: string (nullable = true)
 |-- Ct: string (nullable = true)
 |-- De: double (nullable = true)
 |-- E (g/km): string (nullable = true)
 |-- Enedc (g/km): long (nullable = true)
 |-- Er (g/km): string (nullable = true)
 |-- Ernedc (g/km): double (nullable = true)
 |-- Erwltp (g/km): double (nullable = true)
 |-- Ewltp (g/km): long (nullable = true)
 |-- Fm: string (nullable = true)
 |-- Ft: string (nullable = true)
 |-- ID: long (nullable = true)
 |-- It: string (nullable = true)
 |-- MMS: string (nullable = true)
 |-- MS: string (nullable = true)
 |-- Man: string (nullable = true)
 |-- Mh: string (nullable = true)
 |-- Mk: string (nullable = true)
 |-- Mp: string (nullable = true)
 |-- Mt: long (nullable = true)
 |-- Status: string (nullable = true)
 |-- T: string (nullable = true)
 |-- TAN: string (nullable = true)
 |-- VFN: string (nullable = true)
 |-- Va: string

In [14]:
# Report the row count of each DataFrame, one line per dataset.
for label, frame in (
    ("CO2 emissions DF", df_co2_emissions),
    ("World Development Indicators", df_world_development_indicators),
):
    print(f"Number of records  for {label}: {frame.count()}")

Number of records  for CO2 emissions DF: 300000
Number of records  for World Development Indicators: 383838


In [15]:
# Show summary statistics (count, mean, stddev, min, max) for the CO2 emissions DataFrame.
display(df_co2_emissions.describe())

Unnamed: 0,summary,At1 (mm),At2 (mm),Cn,Cr,Ct,De,E (g/km),Enedc (g/km),Er (g/km),Ernedc (g/km),Erwltp (g/km),Ewltp (g/km),Fm,Ft,ID,It,MMS,MS,Man,Mh,Mk,Mp,Mt,Status,T,TAN,VFN,Va,Ve,Vf,W (mm),Zr,ec (cm3),ep (KW),m (kg),r,version_file,year,z (Wh/km)
0,count,283776.0,276247.0,299985,299094,298302,10068.0,0.0,300000.0,0.0,7155.0,2814.0,109321.0,299993,300000,300000.0,275357,299262,299996,300000,300000,299782,300000,109779.0,300000,299974,299872,285563,299901,299844,16254.0,286685.0,0.0,299975.0,248605.0,299434.0,300000.0,300000,300000.0,57.0
1,mean,1630.8991317095174,1637.648636908274,800.1971752921536,,,-0.001682816845451,,265.74504333333334,,1.6373025856043493,0.0191897654584221,307.63519360415654,,,5457517.07155,7.285714285714286,8.0,,,,8.0,,2197.6191348072034,,474.42689563652726,111.0,,3.1930333817126228E90,322978.8396673288,0.2559985234403839,2785.987331042782,,4194.065415451288,340.59067194947806,2059.734789636447,2.302503333333333,,2018.0,217.24561403508773
2,stddev,57.18203871006447,53.35806895019124,948.4930169775346,,,0.0408104533537881,,46.16069680797576,,0.3011054880652157,0.1863295940532953,36.00059485372222,,,3902514.363239479,3.5326937743674756,0.0,,,,0.0,,517.6607731000473,,5658.785598087578,128.17175976009693,,3.349852913181437E91,684271.5794259827,0.4364344144204025,320.5153047443445,,1161.4346709683432,107.19919141481658,352.7646953486153,21.99410089550318,,0.816497941758763,21.869973464738614
3,min,501.0,501.0,,,,-0.078,,185.0,,1.0,0.0,11.0,,,307.0,,,AT,,AA-IVA,,,0.0,F,,,,,,0.0,0.0,,898.0,27.0,242.0,1.0,v15,2017.0,100.0
4,max,2895.0,2250.0,unknown,m1,m1,2.0,,671.0,,3.0,1.9,535.0,P,petrol/electric,15499148.0,e24*2,ZESZUTA,SK,VOLVO CAR CORPORATION,unknown,ZENVO,VW GROUP PC,4612.0,P,l50,e9*ks07/46*6716*01,RL-ZE1AEM57A6_0000-SJN-1,ZYX10(H),h,1.0,4762.0,,8382.0,1103.0,4160.0,3250.0,v20,2019.0,296.0


In [16]:
# Show summary statistics (count, mean, stddev, min, max) for the World Development Indicators DataFrame.
display(df_world_development_indicators.describe())

Unnamed: 0,summary,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,1970,1971,1972,1973,1974,1975,1976,1977,1978,1979,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,_c65
0,count,383838,383838,383838,383838,37535.0,42462.0,44706.0,44546.0,45098.0,47297.0,47381.0,49105.0,48666.0,50502.0,69180.0,76155.0,79268.0,78762.0,80277.0,83976.0,86306.0,90294.0,89880.0,90735.0,95682.0,98055.0,99788.0,99293.0,100047.0,101427.0,102269.0,103907.0,102374.0,104236.0,127130.0,134506.0,139588.0,140165.0,142471.0,150215.0,150058.0,152364.0,153543.0,159107.0,184652.0,178983.0,184432.0,184143.0,189666.0,200501.0,201218.0,206398.0,204140.0,205281.0,219195.0,211412.0,216002.0,210128.0,217378.0,212122.0,212169.0,208500.0,201189.0,168957.0,73036.0,0.0
1,mean,,,,,259287633643.2356,247792613531.08875,250135580810.63,259032712737.9009,276204806179.2987,299069423739.23914,321279135202.025,338477462816.7722,390654264065.4587,422617969993.30286,373340955027.03815,380325284738.45306,414766670161.5174,469156360602.7701,564715894964.4286,572076534234.965,632216949189.0945,612727947209.1635,571070115143.3666,554288265502.2534,484908020579.0727,482286803208.7772,514403104221.6781,691267188612.5896,756495277802.3131,866524143954.2793,835160397134.9722,795325997568.7358,618374205145.4916,682414679137.7161,729670555818.1024,701980987118.0205,718213558042.8477,793525279109.3323,890554524984.262,918203645666.9216,885933277023.8416,899048978405.2509,813997343843.7421,797168392629.255,724812584208.5822,780909138833.3547,944828493183.1068,915218348811.1382,1053976340116.476,1078168121524.7092,1215204924967.2417,1448502151714.482,1650161892669.7764,1719860507097.7822,1915110384249.4343,2225923255743.581,2356977933618.4224,2704258989566.2744,2856930714972.209,3049426845683.234,3311004239528.729,3527528021316.5835,4064997102008.9727,5333820836933.096,12581113484111.328,
2,stddev,,,,,10797733738476.115,11147454676350.646,11630259678816.145,12239266006399.56,13185972623906.729,14755976342961.0,16251153807308.844,17743566435030.81,20395801001277.598,23088122782073.53,21965429832077.76,23866724299488.383,26802940228178.3,29520474293516.625,33410995460145.76,33339841929436.875,38252397685288.78,36528157667116.66,31790837897702.965,29211662907997.41,23823018063523.06,23176993694507.758,25873541955899.145,40112836190047.38,50519019136850.53,67470232989690.66,64298326730174.88,54234884731967.73,28442286210096.742,31259159118196.43,40879901620072.98,35217592709358.625,36889520550068.93,43272352809412.69,51808299579281.65,53962475114428.336,40014255636221.81,38284147938832.04,34157901302299.5,36579325785800.88,39897309180472.48,46194864282991.2,38057211801649.234,51908820125511.03,45011720400733.63,49300393165402.69,51787442462570.89,55858280294237.42,61821913291943.31,65697236874938.41,76358238430763.86,87334114789140.84,93849309673361.52,108604994027778.05,118786051917433.14,125957546724442.8,139455294136478.69,151002828623132.9,176367595651078.84,224759172213319.1,408074150207942.5,
3,min,Afghanistan,ABW,ARI treatment (% of children under 5 taken to a health provider),AG.AGR.TRAC.NO,-0.0033083903,-0.0017284017979619,-0.0001556363635454,-0.0002432727263636,-0.0006494545454545,-0.0006414545444545,-0.0005541818184545,-0.0008767272734545,-0.0002550001,-0.0010000001,-0.0002970001,-0.0010000001,-0.0002699999999999,-0.0004599999966666,-0.0004959106445312,-0.0001685807853699,-0.0001015662999961,-0.000157833,-0.00013113,-0.0001515258608406,-0.0002143096932329,-0.0001831054996728,-0.00030517578125,-0.0001137062376913,-0.000213623046875,-0.0002476626484751,-0.0001658762295173,-0.0001525878000165,-0.0005785801257047,-0.000457763671875,-0.0001049041748046,-0.0003173964578639,-0.0003524249336775,-0.0003200324015287,-0.0001613954558122,-0.0001193285143601,-0.0002174283122446,-0.0002583471979619,-0.0013451272070363,-0.000213623046875,-0.0001068115234375,-0.000335693359375,-0.000585132319695,-0.0005885935938202,-0.0001220703125,-0.0001220703125,-0.0010833740234375,-0.00024414,-0.0002812000910453,-0.0002899169921875,-0.0001449688558574,-0.000244140625,-0.0002390760032983,-0.0006108586228776,-0.000213623046875,-0.0002046855513528,-0.000160862773167,-0.0001490411599625,-0.0002001327828384,-0.000101089477539,-0.0009613037109375,
4,max,Zimbabwe,ZWE,Young people (ages 15-24) newly infected with HIV,per_si_allsi.cov_q5_tot,999999900.0,9e-06,9e-06,9e-06,9e-06,9e-06,9e-06,9e-06,9e-06,9e-06,9999999.99993896,99999993.3333333,99999.9977648259,99999.9999999999,99999.9977648258,99999.9977648258,99999.9977648259,99999801300.0,9999999.74668025,9999088938.55911,999951.0,999999.973922969,99999.9977648258,999999.960884452,999999.983236194,99999.9996274711,99999.9977648259,9999.99977648258,999951022.807046,9999.99999999979,999999.985098838,99996799.5051345,99999.9977648258,9999.99977648258,99999998000.0,99999.999627471,99999.9996274709,99992073789.124,999939416700.0,99999998000.0,9999999.99999999,99999.999627471,9999878167.94802,9999.99977648258,99999.9999990463,99999.9977648258,9999999.91245567,99999100000.0,99999.999627471,99999.999627471,99999.9999961853,9999999.5,999999996.267259,9999.99977648258,99999.9977648259,99992000.0,999999.994412065,999999.98509884,999999.996274709,999999.992549419,999984698012.454,


In [17]:
# Preview the CO2 emissions DataFrame.
# NOTE(review): this passes the full DataFrame to display(); an explicit .limit(n) may be preferable — confirm.
display(df_co2_emissions)

Unnamed: 0,At1 (mm),At2 (mm),Cn,Cr,Ct,De,E (g/km),Enedc (g/km),Er (g/km),Ernedc (g/km),Erwltp (g/km),Ewltp (g/km),Fm,Ft,ID,It,MMS,MS,Man,Mh,Mk,Mp,Mt,Status,T,TAN,VFN,Va,Ve,Vf,W (mm),Zr,ec (cm3),ep (KW),m (kg),r,version_file,year,z (Wh/km)
0,,,BUGATTI BG 744,M1,M1,,,575,,,,,M,petrol,4169823,,BUGATTI F,DE,AA-IVA,AA-IVA,BUGATTI,,,F,,,,,,,,,7993.0,1103.0,1995,1,v20,2019,
1,,,BUGATTI CHIRON-DIVO,M1,M1,,,575,,,,,M,petrol,4169522,,BUGATTI F,DE,AA-IVA,AA-IVA,BUGATTI,,,P,,,,,,,,,,1103.0,1995,1,v19,2019,
2,,,BUGATTI CHIRON-DIVO,M1,M1,,,575,,,,,M,petrol,4169522,,BUGATTI F,DE,AA-IVA,AA-IVA,BUGATTI,,,F,,,,,,,,,,1103.0,1995,1,v20,2019,
3,,,BUGATTI BG 744,M1,M1,,,575,,,,,M,petrol,4169823,,BUGATTI F,DE,AA-IVA,AA-IVA,BUGATTI,,,P,,,,,,,,,7993.0,1103.0,1995,1,v19,2019,
4,,,AUDI A8,M1,M1,,,545,,,,,M,petrol,4784577,,AUDI,DE,AA-IVA,AA-IVA,AUDI,,,F,F8,,,,,,,,3993.0,420.0,3895,1,v20,2019,
5,,,AUDI A8,M1,M1,,,545,,,,,M,petrol,4784577,,AUDI,DE,AA-IVA,AA-IVA,AUDI,,,P,F8,,,,,,,,3993.0,420.0,3895,1,v19,2019,
6,1747.0,1670.0,,M1,M1,,,516,,,,505.0,M,petrol,754447,,BUGATTI,GB,BUGATTI AUTOMOBILES SAS,BUGATTI,BUGATTI,VW GROUP PC,2150.0,P,5B,e1*ks07/46*0008*09,,DALAADX1CN,TAD7AD71C002,,2711.0,,7993.0,,2070,1,v19,2019,
7,1747.0,1670.0,BUGATTI CHIRON,M1,M1,,,516,,,,506.0,M,petrol,4898524,,BUGATTI F,DE,BUGATTI AUTOMOBILES SAS,BUGATTI,BUGATTI,VW GROUP PC,2150.0,P,5B,E1*KS07/46*0008*09,,DALAADX1CN,TAD7AD71C002,,2711.0,,7993.0,1103.0,2070,1,v19,2019,
8,1747.0,1670.0,BUGATTI CHIRON,M1,M1,,,516,,,,506.0,M,petrol,4898531,,BUGATTI F,DE,BUGATTI AUTOMOBILES SAS,BUGATTI,BUGATTI,VW GROUP PC,2150.0,F,5B,E1*KS07/46*0008*09,,DALAADX1CN,TAD7AD71C002,,2711.0,,7993.0,1103.0,2070,1,v20,2019,
9,1747.0,1670.0,,M1,M1,,,516,,,,505.0,M,petrol,12889810,,BUGATTI,GB,BUGATTI AUTOMOBILES SAS,BUGATTI,BUGATTI,VW GROUP PC,2150.0,P,5B,e1*ks07/46*0008*09,,DALAADX1CN,TAD7AD71C002,,2711.0,,7993.0,,2070,1,v19,2019,


In [18]:
# Preview the World Development Indicators DataFrame.
# NOTE(review): this passes the full DataFrame to display(); an explicit .limit(n) may be preferable — confirm.
display(df_world_development_indicators)

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,1970,1971,1972,1973,1974,1975,1976,1977,1978,1979,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,_c65
0,Africa Eastern and Southern,AFE,Access to clean fuels and technologies for cooking (% of population),EG.CFT.ACCS.ZS,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,12.205985334256,12.5493332143826,12.8877052744847,13.2263347774934,13.5755915831281,13.9243539706952,14.2660310520972,14.5962971181626,14.9552860566119,15.2810233296453,15.6312911024101,15.9812561302977,16.320474865037,16.6432428558172,16.9946949543723,17.3131682773135,17.6049500630141,,,,,
1,Africa Eastern and Southern,AFE,Access to electricity (% of population),EG.ELC.ACCS.ZS,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,20.0860066130196,23.1031784776438,24.2234301519858,25.0231841252266,26.1763357723345,23.621498439778,27.2994330458403,28.737952342644,28.813735402838,27.4830612476805,28.2333728874701,28.3818359876251,32.2240274434276,32.0464780952398,31.3235788003652,33.3121631315881,38.3804331871331,39.754201347135,42.1682410699846,43.6406611120552,,
2,Africa Eastern and Southern,AFE,"Access to electricity, rural (% of rural population)",EG.ELC.ACCS.RU.ZS,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,8.49160997191625,13.2018746955682,15.2093473536694,13.8135351072229,15.4236146358427,10.9588403149893,16.7624691704676,14.417101517166,16.9779753635741,18.1733410522112,16.2410638636384,15.2959503612632,20.5253527781327,19.4613833438545,17.790698468584,16.5534701416911,23.9078966279348,24.6247249863346,26.8138996635713,28.8411502327974,,
3,Africa Eastern and Southern,AFE,"Access to electricity, urban (% of urban population)",EG.ELC.ACCS.UR.ZS,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,56.0834516422826,54.7701226470266,56.1023300404056,57.0943101230703,57.836866299283,58.9533652971298,59.5469745890136,59.5989593932235,60.9834004395019,61.2272550575374,62.5208242745159,65.4706154133,66.303599202538,66.4960101543106,65.8289881700667,66.9266915686378,68.7221844313438,71.0854179924979,71.9949328691005,73.5898862435969,,
4,Africa Eastern and Southern,AFE,Account ownership at a financial institution or with a mobile-money-service provider (% of population ages 15+),FX.OWN.TOTL.ZS,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5,Africa Eastern and Southern,AFE,"Account ownership at a financial institution or with a mobile-money-service provider, female (% of population ages 15+)",FX.OWN.TOTL.FE.ZS,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
6,Africa Eastern and Southern,AFE,"Account ownership at a financial institution or with a mobile-money-service provider, male (% of population ages 15+)",FX.OWN.TOTL.MA.ZS,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
7,Africa Eastern and Southern,AFE,"Account ownership at a financial institution or with a mobile-money-service provider, older adults (% of population ages 25+)",FX.OWN.TOTL.OL.ZS,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
8,Africa Eastern and Southern,AFE,"Account ownership at a financial institution or with a mobile-money-service provider, poorest 40% (% of population ages 15+)",FX.OWN.TOTL.40.ZS,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
9,Africa Eastern and Southern,AFE,"Account ownership at a financial institution or with a mobile-money-service provider, primary education or less (% of population ages 15+)",FX.OWN.TOTL.PL.ZS,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
