### Import necessary libraries

In [0]:
from zipfile import ZipFile
import os
import json
import pymongo
import pyspark.pandas as pd
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, BinaryType
from pyspark.sql.types import ByteType, ShortType, IntegerType, LongType, DecimalType

import jaydebeapi as jdbc
import sys


### Connect to Kaggle API
Source for all Kaggle related work: https://github.com/MrFuguDataScience/various_API_connections/blob/master/Kaggle_api_basics.ipynb

In [0]:
api_token = {"username":"kqn3ryn","key":"4824461b985d3c075664cc2ebf9fb922"}

import json

with open('/root/.kaggle/kaggle.json', 'w') as file:
    json.dump(api_token, file)

!chmod 600 ~/.kaggle/kaggle.json

In [0]:
#connect to Kaggle API
from kaggle.api.kaggle_api_extended import KaggleApi
api = KaggleApi()
api.authenticate()

Find dataset from Kaggle using the API

In [0]:
api.dataset_list_files('vivek468/superstore-dataset-final').files

Out[4]: [Sample - Superstore.csv]

Make a directory to save zip file to

In [0]:
# import os

# define the name of the directory to be created
path = os.getcwd()+"/DS3002-final"

try:
    os.mkdir(path)
except OSError:
    print ("Creation of the directory %s failed" % path)
else:
    print ("Successfully created the directory %s " % path)

Successfully created the directory /databricks/driver/DS3002-final 


Save file to path

In [0]:
api.dataset_download_files('vivek468/superstore-dataset-final', '/databricks/driver/DS3002-final')

List what is in the file and save it as an array

In [0]:
superstore=!ls /databricks/driver/DS3002-final
superstore

Out[7]: ['superstore-dataset-final.zip']

Function to find path of file

In [0]:
def os_dir_search(file):
    u=[]
    for p,n,f in os.walk(os.getcwd()):
        
        for a in f:
            a = str(a)
            if a.endswith(file): # can be (.csv) or a file like I did and search 
#                 print(a)
#                 print(p)
                t=p
    return t

os_dir_search(superstore[0])

Out[8]: '/databricks/driver/DSfinal'

Unzip file to csv and save to path

In [0]:
# from zipfile import ZipFile
# specifying the zip file name 

file_name=os_dir_search(superstore[0])+'/'+superstore[0]
  
# opening the zip file in READ mode 
with ZipFile(file_name, 'r') as zip: 
    # printing all the contents of the zip file 
    zip.printdir() 
  
    # extracting all the files 
    print('Extracting all the files now...') 
    zip.extractall('/databricks/driver/DS3002-final') 
    print('Done!')

File Name                                             Modified             Size
Sample - Superstore.csv                        2022-02-17 11:33:08      2287806
Extracting all the files now...
Done!


### Read in csv file into dataframe

In [0]:
dbutils.fs.cp("file:/databricks/driver/DS3002-final/Sample - Superstore.csv", 
   "/FileStore/Superstore.csv")
df = spark.read.csv("/FileStore/Superstore.csv")

Edit dataframe to have correct column titles and delete first row.
<br/><br/>
***Note: needed to make these changes becasue of the way the file was read in

In [0]:
# Source: https://sparkbyexamples.com/pyspark/pyspark-rename-dataframe-column/
# Ranaming columns and dropping row becasue of thw way the csv was read from kaggle API
df = df.withColumnRenamed("_c0","RowID") \
    .withColumnRenamed("_c1","OrderID") \
    .withColumnRenamed("_c2","OrderDate") \
    .withColumnRenamed("_c3","ShipDate") \
    .withColumnRenamed("_c4","ShipMode") \
    .withColumnRenamed("_c5","CustomerID") \
    .withColumnRenamed("_c6","CustomerName") \
    .withColumnRenamed("_c7","Segment") \
    .withColumnRenamed("_c8","Country") \
    .withColumnRenamed("_c9","City") \
    .withColumnRenamed("_c10","State") \
    .withColumnRenamed("_c11","PostalCode") \
    .withColumnRenamed("_c12","Region") \
    .withColumnRenamed("_c13","ProductID") \
    .withColumnRenamed("_c14","Category") \
    .withColumnRenamed("_c15","SubCategory") \
    .withColumnRenamed("_c16","ProductName") \
    .withColumnRenamed("_c17","Sales") \
    .withColumnRenamed("_c18","Quantity") \
    .withColumnRenamed("_c19","Discount") \
    .withColumnRenamed("_c20","Profit") \

df = df.filter(df.RowID!='Row ID')
df.head()

Out[11]: Row(RowID='1', OrderID='CA-2016-152156', OrderDate='11/8/2016', ShipDate='11/11/2016', ShipMode='Second Class', CustomerID='CG-12520', CustomerName='Claire Gute', Segment='Consumer', Country='United States', City='Henderson', State='Kentucky', PostalCode='42420', Region='South', ProductID='FUR-BO-10001798', Category='Furniture', SubCategory='Bookcases', ProductName='Bush Somerset Collection Bookcase', Sales='261.96', Quantity='2', Discount='0', Profit='41.9136')

### Connect to Azure SQL server

In [0]:
jdbcHostname = "kqn3ryn-sqlsvr.database.windows.net"
jdbcDatabase = "superstore"
jdbcPort = 1433
jdbcUrl = "jdbc:sqlserver://{0}:{1};database={2}".format(jdbcHostname, jdbcPort, jdbcDatabase) 

connectionProperties = {
  "user" : "kqn3ryn",
  "password" : "Quynhanh01",
  "driver" : "com.microsoft.sqlserver.jdbc.SQLServerDriver"
}

Populate 'superstore' data base with superstore_data table using csv (database was already created in Azure)

In [0]:
# Source: https://community.databricks.com/s/question/0D53f00001gsZ3QCAU/cant-write-big-dataframe-into-mssql-server-by-using-jdbc-driver-on-azure-databricks
# Populating table in superstore database using the csv
username = 'kqn3ryn'
password = 'Quynhanh01'
tablename = 'superstore_data'
batch_size = 9995

df.write \
            .format("jdbc") \
            .mode("overwrite") \
            .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver") \
            .option("url", jdbcUrl) \
            .option("dbtable", tablename) \
            .option("user", username) \
            .option("password", password) \
            .option("batchsize", batch_size) \
            .save()

Read in new table from SQL to show it has been properly populated.

In [0]:
sql_query = """
(SELECT * FROM [dbo].[superstore_data]) superstore
"""

superstore_data = spark.read.jdbc(url=jdbcUrl, table=sql_query, properties=connectionProperties)
display(superstore_data)

RowID,OrderID,OrderDate,ShipDate,ShipMode,CustomerID,CustomerName,Segment,Country,City,State,PostalCode,Region,ProductID,Category,SubCategory,ProductName,Sales,Quantity,Discount,Profit
1,CA-2016-152156,11/8/2016,11/11/2016,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,Kentucky,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136
2,CA-2016-152156,11/8/2016,11/11/2016,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,Kentucky,42420,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs, Rounded Back",731.94,3,0.0,219.582
3,CA-2016-138688,6/12/2016,6/16/2016,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,California,90036,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters by Universal,14.62,2,0.0,6.8714
4,US-2015-108966,10/11/2015,10/18/2015,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,Florida,33311,South,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.5775,5,0.45,-383.031
5,US-2015-108966,10/11/2015,10/18/2015,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,Florida,33311,South,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,22.368,2,0.2,2.5164
6,CA-2014-115812,6/9/2014,6/14/2014,Standard Class,BH-11710,Brosina Hoffman,Consumer,United States,Los Angeles,California,90032,West,FUR-FU-10001487,Furniture,Furnishings,"Eldon Expressions Wood and Plastic Desk Accessories, Cherry Wood",48.86,7,0.0,14.1694
7,CA-2014-115812,6/9/2014,6/14/2014,Standard Class,BH-11710,Brosina Hoffman,Consumer,United States,Los Angeles,California,90032,West,OFF-AR-10002833,Office Supplies,Art,Newell 322,7.28,4,0.0,1.9656
8,CA-2014-115812,6/9/2014,6/14/2014,Standard Class,BH-11710,Brosina Hoffman,Consumer,United States,Los Angeles,California,90032,West,TEC-PH-10002275,Technology,Phones,Mitel 5320 IP Phone VoIP phone,907.152,6,0.2,90.7152
9,CA-2014-115812,6/9/2014,6/14/2014,Standard Class,BH-11710,Brosina Hoffman,Consumer,United States,Los Angeles,California,90032,West,OFF-BI-10003910,Office Supplies,Binders,DXL Angle-View Binders with Locking Rings by Samsill,18.504,3,0.2,5.7825
10,CA-2014-115812,6/9/2014,6/14/2014,Standard Class,BH-11710,Brosina Hoffman,Consumer,United States,Los Angeles,California,90032,West,OFF-AP-10002892,Office Supplies,Appliances,Belkin F5C206VTEL 6 Outlet Surge,114.9,5,0.0,34.47


I have created the following dimension tables in Azure Data Studio: dim_customers, dim_products, and dim_orders. I then read each table into a data frame below and saved it as a table in the Data Bricks File System (DBFS).

In [0]:
sql_query = """
(SELECT * FROM [dbo].[dim_customers]) superstore
"""

dim_customers = spark.read.jdbc(url=jdbcUrl, table=sql_query, properties=connectionProperties)
display(dim_customers)

customerKey,customerName,customerType,country,city,state,zipCode,region,orderID
CG-12520,Claire Gute,Consumer,United States,Henderson,Kentucky,42420,South,CA-2016-152156
CG-12520,Claire Gute,Consumer,United States,Henderson,Kentucky,42420,South,CA-2016-152156
DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,California,90036,West,CA-2016-138688
SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,Florida,33311,South,US-2015-108966
SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,Florida,33311,South,US-2015-108966
BH-11710,Brosina Hoffman,Consumer,United States,Los Angeles,California,90032,West,CA-2014-115812
BH-11710,Brosina Hoffman,Consumer,United States,Los Angeles,California,90032,West,CA-2014-115812
BH-11710,Brosina Hoffman,Consumer,United States,Los Angeles,California,90032,West,CA-2014-115812
BH-11710,Brosina Hoffman,Consumer,United States,Los Angeles,California,90032,West,CA-2014-115812
BH-11710,Brosina Hoffman,Consumer,United States,Los Angeles,California,90032,West,CA-2014-115812


In [0]:
dim_customers.write.mode("overwrite").saveAsTable("dim_customers")

In [0]:
sql_query = """
(SELECT * FROM [dbo].[dim_products]) superstore
"""

dim_products = spark.read.jdbc(url=jdbcUrl, table=sql_query, properties=connectionProperties)
display(dim_products)

productKey,productCategory,productSubCategory,productName,price,quantity,discount
FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0
FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs, Rounded Back",731.94,3,0.0
OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters by Universal,14.62,2,0.0
FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.5775,5,0.45
OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,22.368,2,0.2
FUR-FU-10001487,Furniture,Furnishings,"Eldon Expressions Wood and Plastic Desk Accessories, Cherry Wood",48.86,7,0.0
OFF-AR-10002833,Office Supplies,Art,Newell 322,7.28,4,0.0
TEC-PH-10002275,Technology,Phones,Mitel 5320 IP Phone VoIP phone,907.152,6,0.2
OFF-BI-10003910,Office Supplies,Binders,DXL Angle-View Binders with Locking Rings by Samsill,18.504,3,0.2
OFF-AP-10002892,Office Supplies,Appliances,Belkin F5C206VTEL 6 Outlet Surge,114.9,5,0.0


In [0]:
dim_products.write.mode("overwrite").saveAsTable("dim_products")

In [0]:
sql_query = """
(SELECT * FROM [dbo].[dim_orders]) superstore
"""

dim_orders = spark.read.jdbc(url=jdbcUrl, table=sql_query, properties=connectionProperties)
display(dim_orders)

orderKey,orderDate,shipDate,shipMode,profit,productID
CA-2016-152156,11/8/2016,11/11/2016,Second Class,41.9136,FUR-BO-10001798
CA-2016-152156,11/8/2016,11/11/2016,Second Class,219.582,FUR-CH-10000454
CA-2016-138688,6/12/2016,6/16/2016,Second Class,6.8714,OFF-LA-10000240
US-2015-108966,10/11/2015,10/18/2015,Standard Class,-383.031,FUR-TA-10000577
US-2015-108966,10/11/2015,10/18/2015,Standard Class,2.5164,OFF-ST-10000760
CA-2014-115812,6/9/2014,6/14/2014,Standard Class,14.1694,FUR-FU-10001487
CA-2014-115812,6/9/2014,6/14/2014,Standard Class,1.9656,OFF-AR-10002833
CA-2014-115812,6/9/2014,6/14/2014,Standard Class,90.7152,TEC-PH-10002275
CA-2014-115812,6/9/2014,6/14/2014,Standard Class,5.7825,OFF-BI-10003910
CA-2014-115812,6/9/2014,6/14/2014,Standard Class,34.47,OFF-AP-10002892


In [0]:
dim_orders.write.mode("overwrite").saveAsTable("dim_orders")

Using these dimension tables, I have created a fact table below using the dimension tables saved in the DBFS. 
<br><br>
I Have also provided code to create the fact table in azure. 
<br>
***Note: Execution was unsuccessful due to azure free trial restrictions, however, the code is correct

In [0]:
%sql
SELECT c.customerKey,
	c.customerName,
    c.customerType,
    c.country,
    c.city,
    c.state,
    c.zipCode,
    c.region,
    o.orderKey,
    o.orderDate,
    o.shipDate,
    o.shipMode,
    p.productKey,
    p.productCategory,
    p.productSubCategory,
    p.productName,
    p.price,
    p.quantity,
    p.discount,
    o.profit
FROM dim_customers AS c
INNER JOIN dim_orders AS o
ON c.orderID = o.orderKey
RIGHT OUTER JOIN dim_products AS p
ON o.productID = p.productKey

customerKey,customerName,customerType,country,city,state,zipCode,region,orderKey,orderDate,shipDate,shipMode,productKey,productCategory,productSubCategory,productName,price,quantity,discount,profit
JL-15505,Jeremy Lonsdale,Consumer,United States,New York City,New York,10035,East,CA-2016-138520,4/8/2016,4/13/2016,Standard Class,FUR-BO-10002268,Furniture,Bookcases,Sauder Barrister Bookcases,388.704,6,0.2,-4.8588
JL-15505,Jeremy Lonsdale,Consumer,United States,New York City,New York,10035,East,CA-2016-138520,4/8/2016,4/13/2016,Standard Class,FUR-BO-10002268,Furniture,Bookcases,Sauder Barrister Bookcases,388.704,6,0.2,-4.8588
JL-15505,Jeremy Lonsdale,Consumer,United States,New York City,New York,10035,East,CA-2016-138520,4/8/2016,4/13/2016,Standard Class,FUR-BO-10002268,Furniture,Bookcases,Sauder Barrister Bookcases,388.704,6,0.2,-4.8588
JL-15505,Jeremy Lonsdale,Consumer,United States,New York City,New York,10035,East,CA-2016-138520,4/8/2016,4/13/2016,Standard Class,FUR-BO-10002268,Furniture,Bookcases,Sauder Barrister Bookcases,388.704,6,0.2,-4.8588
JL-15505,Jeremy Lonsdale,Consumer,United States,New York City,New York,10035,East,CA-2016-138520,4/8/2016,4/13/2016,Standard Class,FUR-BO-10002268,Furniture,Bookcases,Sauder Barrister Bookcases,388.704,6,0.2,-4.8588
JL-15505,Jeremy Lonsdale,Consumer,United States,New York City,New York,10035,East,CA-2016-138520,4/8/2016,4/13/2016,Standard Class,FUR-BO-10002268,Furniture,Bookcases,Sauder Barrister Bookcases,388.704,6,0.2,-4.8588
JL-15505,Jeremy Lonsdale,Consumer,United States,New York City,New York,10035,East,CA-2016-138520,4/8/2016,4/13/2016,Standard Class,FUR-BO-10002268,Furniture,Bookcases,Sauder Barrister Bookcases,388.704,6,0.2,-4.8588
JL-15505,Jeremy Lonsdale,Consumer,United States,New York City,New York,10035,East,CA-2016-138520,4/8/2016,4/13/2016,Standard Class,FUR-BO-10002268,Furniture,Bookcases,Sauder Barrister Bookcases,388.704,6,0.2,-4.8588
JL-15505,Jeremy Lonsdale,Consumer,United States,New York City,New York,10035,East,CA-2016-138520,4/8/2016,4/13/2016,Standard Class,FUR-BO-10002268,Furniture,Bookcases,Sauder Barrister Bookcases,388.704,6,0.2,-4.8588
JL-15505,Jeremy Lonsdale,Consumer,United States,New York City,New York,10035,East,CA-2016-138520,4/8/2016,4/13/2016,Standard Class,FUR-BO-10002268,Furniture,Bookcases,Sauder Barrister Bookcases,388.704,6,0.2,-4.8588


In [0]:
sql_query = """
(SELECT c.customerKey, 
    c.customerName,
    c.customerType,
    c.country,
    c.city, 
    c.state,
    c.zipCode,
    c.region,
    o.orderKey,
    o.orderDate,
    o.shipDate,
    o.shipMode,
    p.productKey,
    p.productCategory,
    p.productSubCategory,
    p.productName,
    p.price,
    p.quantity,
    p.discount,
    o.profit
FROM dbo.dim_customers AS c
INNER JOIN dbo.dim_orders AS o
ON c.orderID = o.orderKey
INNER JOIN dbo.dim_products AS p
ON o.productID = p.productKey) superstore
"""

fact_superstore = spark.read.jdbc(url=jdbcUrl, table=sql_query, properties=connectionProperties)
display(fact_superstore)

### Connect to MongoDB Atlas

In [0]:
atlas_cluster_name = "ds3002"
atlas_default_dbname = "superstore"
atlas_user_name = "kqn3ryn"
atlas_password = "Quynhanh01"

conn_str = f"mongodb+srv://{atlas_user_name}:{atlas_password}@ds3002.mii28.mongodb.net/{atlas_default_dbname}?retryWrites=true&w=majority"

client = pymongo.MongoClient(conn_str)
client.list_database_names()

Out[22]: ['superstore', 'admin', 'local']

In [0]:
db_name = "superstore"

db = client[db_name]
db.list_collection_names()

Out[23]: ['superstore_data']

In [0]:
collection = "superstore_data"

superstore_data = db[collection]
superstore_data.find_one()

Out[24]: {'_id': ObjectId('627d3bbaa51f038622ba0e95'),
 'Row ID': '1',
 'Order ID': 'CA-2016-152156',
 'Order Date': '11/8/2016',
 'Ship Date': '11/11/2016',
 'Ship Mode': 'Second Class',
 'Customer ID': 'CG-12520',
 'Customer Name': 'Claire Gute',
 'Segment': 'Consumer',
 'Country': 'United States',
 'City': 'Henderson',
 'State': 'Kentucky',
 'Postal Code': '42420',
 'Region': 'South',
 'Product ID': 'FUR-BO-10001798',
 'Category': 'Furniture',
 'Sub-Category': 'Bookcases',
 'Product Name': 'Bush Somerset Collection Bookcase',
 'Sales': '261.96',
 'Quantity': '2',
 'Discount': '0',
 'Profit': '41.9136'}

### Load collections onto MongoDB from json files that I uploaded in the DBFS

In [0]:
# ######################################################################################################################
# Use this Function to Create New Collections by Uploading JSON file(s) to the MongoDB Atlas server.
# ######################################################################################################################
def set_mongo_collection(user_id, pwd, cluster_name, db_name, src_file_path, json_files):
    '''Create a client connection to MongoDB'''
    mongo_uri = f"mongodb+srv://{user_id}:{pwd}@{cluster_name}.mii28.mongodb.net/{db_name}?retryWrites=true&w=majority"
    client = pymongo.MongoClient(mongo_uri)
    db = client[db_name]
    
    '''Read in a JSON file, and Use It to Create a New Collection'''
    for file in json_files:
        db.drop_collection(file)
        json_file = os.path.join(src_file_path, json_files[file])
        with open(json_file, 'r') as openfile:
            json_object = json.load(openfile)
            file = db[file]
            result = file.insert_many(json_object)

    client.close()
    
    return result

***Note: this was unsuccessful becasue I was receiving this error "FileNotFoundError: [Errno 2] No such file or directory: '/dbfs/FileStore/tables/dim_customers.json'"" for all files: dim_customers.json, dim_orders.json, and dim_products.json

In [0]:
src_dbname = "superstore"
src_dir = '/dbfs/FileStore/tables'
json_files = {"dim_customers" : "dim_customers.json"}

set_mongo_collection(atlas_user_name, atlas_password, atlas_cluster_name, src_dbname, src_dir, json_files)

[0;31m---------------------------------------------------------------------------[0m
[0;31mFileNotFoundError[0m                         Traceback (most recent call last)
[0;32m<command-4336689022240853>[0m in [0;36m<module>[0;34m[0m
[1;32m      3[0m [0mjson_files[0m [0;34m=[0m [0;34m{[0m[0;34m"dim_customers"[0m [0;34m:[0m [0;34m"dim_customers.json"[0m[0;34m}[0m[0;34m[0m[0;34m[0m[0m
[1;32m      4[0m [0;34m[0m[0m
[0;32m----> 5[0;31m [0mset_mongo_collection[0m[0;34m([0m[0matlas_user_name[0m[0;34m,[0m [0matlas_password[0m[0;34m,[0m [0matlas_cluster_name[0m[0;34m,[0m [0msrc_dbname[0m[0;34m,[0m [0msrc_dir[0m[0;34m,[0m [0mjson_files[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m
[0;32m<command-4336689022240854>[0m in [0;36mset_mongo_collection[0;34m(user_id, pwd, cluster_name, db_name, src_file_path, json_files)[0m
[1;32m     12[0m         [0mdb[0m[0;34m.[0m[0mdrop_collection[0m[0;34m([0m[0mfile[0m[0;34m)[0m[0;3

In [0]:
json_files = {"dim_customers" : "dim_orders.json"}

set_mongo_collection(atlas_user_name, atlas_password, atlas_cluster_name, src_dbname, src_dir, json_files)

[0;31m---------------------------------------------------------------------------[0m
[0;31mFileNotFoundError[0m                         Traceback (most recent call last)
[0;32m<command-4336689022240859>[0m in [0;36m<module>[0;34m[0m
[1;32m      1[0m [0mjson_files[0m [0;34m=[0m [0;34m{[0m[0;34m"dim_customers"[0m [0;34m:[0m [0;34m"dim_orders.json"[0m[0;34m}[0m[0;34m[0m[0;34m[0m[0m
[1;32m      2[0m [0;34m[0m[0m
[0;32m----> 3[0;31m [0mset_mongo_collection[0m[0;34m([0m[0matlas_user_name[0m[0;34m,[0m [0matlas_password[0m[0;34m,[0m [0matlas_cluster_name[0m[0;34m,[0m [0msrc_dbname[0m[0;34m,[0m [0msrc_dir[0m[0;34m,[0m [0mjson_files[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m
[0;32m<command-4336689022240854>[0m in [0;36mset_mongo_collection[0;34m(user_id, pwd, cluster_name, db_name, src_file_path, json_files)[0m
[1;32m     12[0m         [0mdb[0m[0;34m.[0m[0mdrop_collection[0m[0;34m([0m[0mfile[0m[0;34m)[0m[0;34m

In [0]:
json_files = {"dim_customers" : "dim_products.json"}

set_mongo_collection(atlas_user_name, atlas_password, atlas_cluster_name, src_dbname, src_dir, json_files)

[0;31m---------------------------------------------------------------------------[0m
[0;31mFileNotFoundError[0m                         Traceback (most recent call last)
[0;32m<command-4336689022240860>[0m in [0;36m<module>[0;34m[0m
[1;32m      1[0m [0mjson_files[0m [0;34m=[0m [0;34m{[0m[0;34m"dim_customers"[0m [0;34m:[0m [0;34m"dim_products.json"[0m[0;34m}[0m[0;34m[0m[0;34m[0m[0m
[1;32m      2[0m [0;34m[0m[0m
[0;32m----> 3[0;31m [0mset_mongo_collection[0m[0;34m([0m[0matlas_user_name[0m[0;34m,[0m [0matlas_password[0m[0;34m,[0m [0matlas_cluster_name[0m[0;34m,[0m [0msrc_dbname[0m[0;34m,[0m [0msrc_dir[0m[0;34m,[0m [0mjson_files[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m
[0;32m<command-4336689022240854>[0m in [0;36mset_mongo_collection[0;34m(user_id, pwd, cluster_name, db_name, src_file_path, json_files)[0m
[1;32m     12[0m         [0mdb[0m[0;34m.[0m[0mdrop_collection[0m[0;34m([0m[0mfile[0m[0;34m)[0m[0;34

However, the files do exist and I have provided proof below.

In [0]:
%fs
ls FileStore/tables

path,name,size,modificationTime
dbfs:/FileStore/tables/dim_customers.json,dim_customers.json,21702922,1652385629000
dbfs:/FileStore/tables/dim_orders.json,dim_orders.json,15532810,1652385625000
dbfs:/FileStore/tables/dim_products-1.json,dim_products-1.json,19846250,1652385704000
dbfs:/FileStore/tables/dim_products.json,dim_products.json,19846250,1652385665000
