### Go through all documents in the Atlas base and update the documents' Tag  field to parse the concatenated string into a list of strings

In [1]:
import pymongo
from IPython.display import display
from pymongo import MongoClient
import pandas as pd
from dotenv import dotenv_values


config = dotenv_values(".env.local")

# setup mongo connection
client = MongoClient('mongodb+srv://' + config['MONGO_USER'] + ':' + config['MONGO_PASS'] + '@final-cluster.uucno.mongodb.net')
db = client.final
col = db.StackOverflowPosts

#### Can run this update query whenever we have new data (for a more scalable workflow)

In [2]:
# Update every document to parse tha Tags field into a list of strings rather than a large concatenated string with all tags. 
# This will make indexing more performant and remove the need to parse in our future queries which will greatly improve our performance and get closer to realtime
col.update_many(
    {
        "Tags": {"$type": "string"}  # Match documents where 'Tags' is a string
    },
    [
        {
            "$set": {
                "Tags": {
                    "$cond": {
                        "if": {"$eq": [{"$type": "$Tags"}, "string"]},
                        "then": {
                            # Only split if 'Tags' is a string
                            "$split": [
                                {"$substrCP": ["$Tags", 1, {"$subtract": [{"$strLenCP": "$Tags"}, 2]}]},
                                "><"
                            ]
                        },
                        "else": "$Tags"  # Keep the original 'Tags' if it's not a string
                    }
                }
            }
        }
    ]
)

print("TAGS UPDATE COMPLETE")

WriteError: $strLenCP requires a string argument, found: array, full error: {'index': 0, 'code': 34471, 'errmsg': '$strLenCP requires a string argument, found: array'}