## Load CSV Data to MongoDB

Let us understand the overall process of loading CSV data into MongoDB with attribute-level mapping.
* Read data from file into a Pandas Dataframe.
* Drop the fields that are not required and rename the remaining fields to match the defined target structure.
* Load the data into MongoDB in bulk using `insert_many`. The data can also be loaded in chunks.

In [None]:
import pandas as pd

# Read the customers file into a DataFrame. No header option is passed, so
# read_csv's default applies and the first row supplies the column names.
customers_path = '/data/ecomm/customers/part-00000'
customers = pd.read_csv(customers_path)

In [None]:
import json

# Attribute-level mapping, keyed by source CSV column name.
#   is_required       -- true: the column is kept; false: the column is dropped.
#   target_field_name -- name the column takes in the target structure
#                        (only read for columns where is_required is true).
column_mapping_str = '''{
    "customer_first_name": {"target_field_name": "FirstName", "is_required": true},
    "customer_last_name": {"target_field_name": "LastName", "is_required": true},
    "customer_email": {"target_field_name": "Email", "is_required": true},
    "product_name": {"is_required": false},
    "product_subscription": {"is_required": false}
}'''

column_mapping = json.loads(column_mapping_str)

# Source columns flagged as not required. Kept as a plain list (in mapping
# order) so it can be passed directly as the labels to drop.
columns_to_be_dropped = [
    source_name
    for source_name, spec in column_mapping.items()
    if not spec['is_required']
]

# (source column name, mapping entry) pairs for the columns that are kept.
required_columns_list = [
    (source_name, spec)
    for source_name, spec in column_mapping.items()
    if spec['is_required']
]

# Source column name -> target field name, used to rename the kept columns.
# A required entry without "target_field_name" raises KeyError here.
required_columns_mapping = {
    source_name: spec['target_field_name']
    for source_name, spec in required_columns_list
}

# Build the target DataFrame in two chained steps: remove the columns that
# are not required, then rename the remaining ones to their target names.
customers_target = (
    customers
    .drop(columns=columns_to_be_dropped)
    .rename(columns=required_columns_mapping)
)

In [None]:
import configparser
import getpass

import pymongo

# The OS login name selects both the settings file and the MongoDB user.
username = getpass.getuser()

# ConfigParser.read() skips files it cannot find without raising, so a
# missing file only shows up as a KeyError on the password lookup below.
config_path = f'/home/{username}/.jupyterenv'
config = configparser.ConfigParser()
config.read(config_path)

# Connect as the per-user scratch account; authSource names the database
# that the credentials are checked against.
scratch_user = f'{username}_scratch_user'
client = pymongo.MongoClient(
    host='pylabsmd.itversity.com',
    username=scratch_user,
    password=config['DEFAULT']['MONGO_SCRATCH_PASS'],
    authSource='admin',
)

In [None]:
# An empty filter matches every document, so this removes all documents
# from the customers collection of the user's scratch database.
customers_collection = client[f'{username}_scratch_db']['customers']
customers_collection.delete_many({})

In [None]:
# Print every document currently stored in the customers collection.
customers_collection = client[f'{username}_scratch_db']['customers']
for doc in customers_collection.find({}):
    print(doc)

In [None]:
# IPython help syntax: the trailing `?` displays the documentation for
# the DataFrame's to_dict method instead of calling it.
customers_target.to_dict?

In [None]:
# With orient='records', to_dict returns a list holding one dict per row,
# each keyed by column name.
customer_records = customers_target.to_dict(orient='records')
customer_records

In [None]:
# Convert each row to a dict, then insert all of them into the customers
# collection with a single insert_many call.
customer_docs = customers_target.to_dict(orient='records')
client[f'{username}_scratch_db']['customers'].insert_many(customer_docs)

In [None]:
# Print every document currently stored in the customers collection.
customers_collection = client[f'{username}_scratch_db']['customers']
for doc in customers_collection.find({}):
    print(doc)