In [1]:
import json
import os

import pymongo
from bson import ObjectId

In [9]:
# The connection string (which embeds the database user and password) is read
# from the environment so that no credential is stored in the notebook.
# NOTE(review): any credential that was ever committed in this cell should be rotated.
MONGO_URI = os.environ.get("MONGO_URI")
if not MONGO_URI:
    # Fail early with an actionable message instead of an opaque connection error.
    raise RuntimeError(
        "Set the MONGO_URI environment variable to the MongoDB connection string."
    )

client = pymongo.MongoClient(MONGO_URI)
db = client["Raw_data"]  # database holding the raw documents
Insider_Trading_coll = db["Insider_Trading"]  # source collection read by the aggregation below

In [14]:
# --- Building blocks shared by several stages --------------------------------
_EXCLUDED_CODES = ["G", "M", "A", "F", "J"]  # transaction codes removed from the analysis
_OWNERSHIP = "Filling_data.ownershipDocument"
_TABLE = _OWNERSHIP + ".nonDerivativeTable"
_RAW_TXN = "$" + _TABLE + ".nonDerivativeTransaction"  # raw transaction(s): an array or a single object
_FILTERED = "filtered_nonDerivativeTransaction"
_FILTERED_REF = "$" + _FILTERED
_RELATIONSHIP = "$" + _OWNERSHIP + ".reportingOwner.reportingOwnerRelationship"


def _as_doubles(txn_ref):
    """Build the three numeric fields ($toDouble of the nested `.value`) for the transaction at `txn_ref`."""
    return {
        "transactionShares": {
            "$toDouble": txn_ref + ".transactionAmounts.transactionShares.value"
        },
        "transactionPricePerShare": {
            "$toDouble": txn_ref + ".transactionAmounts.transactionPricePerShare.value"
        },
        "sharesOwnedFollowingTransaction": {
            "$toDouble": txn_ref + ".postTransactionAmounts.sharesOwnedFollowingTransaction.value"
        },
    }


def _as_date(field_ref):
    """Build a $dateFromString expression that parses `field_ref` using the %Y-%m-%d format."""
    return {"$dateFromString": {"dateString": field_ref, "format": "%Y-%m-%d"}}


pipeline = [
    # 1. Keep only documents of type "4" whose nonDerivativeTable is not null.
    {
        "$match": {
            _OWNERSHIP + ".documentType": "4",
            _TABLE: {"$ne": None},
        }
    },
    # 2. Drop transactions whose code is in the excluded list. The raw field is
    #    either an array (filter element by element) or a single object
    #    (replaced by None when its code is excluded, kept unchanged otherwise).
    {
        "$addFields": {
            _FILTERED: {
                "$cond": {
                    "if": {"$isArray": _RAW_TXN},
                    "then": {
                        "$filter": {
                            "input": _RAW_TXN,
                            "as": "transaction",
                            "cond": {
                                "$not": {
                                    "$in": [
                                        "$$transaction.transactionCoding.transactionCode",
                                        _EXCLUDED_CODES,
                                    ]
                                }
                            },
                        }
                    },
                    "else": {
                        "$cond": {
                            "if": {
                                "$in": [
                                    _RAW_TXN + ".transactionCoding.transactionCode",
                                    _EXCLUDED_CODES,
                                ]
                            },
                            "then": None,
                            "else": _RAW_TXN,
                        }
                    },
                }
            }
        }
    },
    # 3. Discard documents left without any usable transaction.
    {"$match": {_FILTERED: {"$ne": None}}},
    # 4. Clean the entity name (text before the first newline), parse both dates,
    #    add numeric copies of the share/price values to every transaction and
    #    lift the reporting-owner relationship flags to the top level.
    {
        "$addFields": {
            "Entity_name_cleaned": {
                "$arrayElemAt": [{"$split": ["$Entity_name", "\n"]}, 0]
            },
            "Date_transactions": _as_date("$Date_transactions"),
            "Date_publication": _as_date("$Date_publication"),
            _FILTERED: {
                "$cond": {
                    "if": {"$isArray": _FILTERED_REF},
                    # Array case: apply the conversion to each element.
                    "then": {
                        "$map": {
                            "input": _FILTERED_REF,
                            "as": "transaction",
                            "in": {
                                "$mergeObjects": [
                                    "$$transaction",
                                    _as_doubles("$$transaction"),
                                ]
                            },
                        }
                    },
                    # Single-object case: same conversion applied directly.
                    "else": {
                        "$mergeObjects": [
                            _FILTERED_REF,
                            _as_doubles(_FILTERED_REF),
                        ]
                    },
                }
            },
            **{
                flag: _RELATIONSHIP + "." + flag
                for flag in (
                    "isDirector",
                    "isOfficer",
                    "isTenPercentOwner",
                    "isOther",
                    "officerTitle",
                    "otherText",
                )
            },
        }
    },
    # 5. Normalise the shape: arrays stay as they are, a single object becomes a
    #    one-element array, and None becomes an empty array.
    {
        "$addFields": {
            _FILTERED: {
                "$cond": {
                    "if": {"$isArray": _FILTERED_REF},
                    "then": _FILTERED_REF,
                    "else": {
                        "$cond": {
                            "if": {"$ne": [_FILTERED_REF, None]},
                            "then": [_FILTERED_REF],
                            "else": [],
                        }
                    },
                }
            }
        }
    },
    # 6. Final clean-up: remove documents whose transaction list is empty or null.
    {"$match": {"$nor": [{_FILTERED: []}, {_FILTERED: None}]}},
    # 7. Keep only the fields relevant for the analysis.
    {
        "$project": dict.fromkeys(
            [
                "Date_publication",
                "Date_transactions",
                "Entity_name_cleaned",
                "Company",
                "isDirector",
                "isOfficer",
                "isTenPercentOwner",
                "isOther",
                "officerTitle",
                "otherText",
                "filtered_nonDerivativeTransaction",
            ],
            1,
        )
    },
]

# Run the aggregation on the source collection and collect every returned document in a list.
result = [cleaned_doc for cleaned_doc in Insider_Trading_coll.aggregate(pipeline)]

# for doc in result:
#     print(doc)
#     print("***")


In [15]:
# Number of documents returned by the aggregation pipeline.
len(result)

7798

In [16]:
from pymongo import InsertOne

# Maximum number of insert operations sent to the server in one bulk_write call.
BATCH_SIZE = 1000

# Destination collection for the cleaned documents.
Insider_Trading_coll_2 = db["Insider_Trading_cleaned"]

# NOTE(review): the documents appear to keep their source `_id` (the pipeline's
# $project does not exclude it), so re-running this cell against an already
# populated collection will presumably fail with duplicate-key errors — confirm,
# or empty the target collection before a re-run.

# Pending operations for the batch currently being assembled.
bulk_operations = []

for doc in result:
    bulk_operations.append(InsertOne(doc))

    # Flush inside the loop so that at most BATCH_SIZE operations are ever queued.
    if len(bulk_operations) >= BATCH_SIZE:
        Insider_Trading_coll_2.bulk_write(bulk_operations)
        bulk_operations.clear()  # start a fresh batch

# Write the final, partially filled batch. The guard also skips the call when
# `result` is empty, since bulk_write rejects an empty operation list.
if bulk_operations:
    Insider_Trading_coll_2.bulk_write(bulk_operations)

print("Data written to Insider_Trading_cleaned successfully.")



Data written to Insider_Trading_cleaned successfully.


In [17]:
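# One-off creation of a compound index on (Date_transactions, Company).
# NOTE(review): presumably commented out after it was run once (the output below is the returned index name) — confirm.
#db["Insider_Trading_cleaned"].create_index([("Date_transactions", 1), ("Company", 1)])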

'Date_transactions_1_Company_1'