In [29]:
import json
import os
import uuid
from typing import Any, Dict, List, Optional

import pandas as pd
from bson.binary import Binary
from pymongo import MongoClient
from pymongo.errors import DuplicateKeyError

In [108]:
def get_database(DBS, uri=None):
    """Return a handle to the MongoDB database named ``DBS``.

    The connection string is resolved in this order: the explicit ``uri``
    argument, the ``MONGODB_URI`` environment variable, and finally the
    legacy connection string that was previously hard-coded here.

    Args:
        DBS: Name of the database to open on the server.
        uri: Optional MongoDB connection string. When omitted (or empty) the
            environment variable / legacy default is used.

    Returns:
        The database object obtained by indexing the client with ``DBS``.
    """
    # SECURITY: this fallback embeds a username and password in source control.
    # It is kept only so existing runs keep working; prefer setting MONGODB_URI
    # (or passing ``uri``) and rotate the credential.
    legacy_uri = 'mongodb://crinlp:123@10.230.252.3:27017/?authSource=admin&readPreference=primary&appname=MongoDB%20Compass&ssl=false'
    connection_uri = uri or os.environ.get('MONGODB_URI') or legacy_uri
    client = MongoClient(connection_uri)
    return client[DBS]

# Database that holds both the working and the archive collections.
db = get_database('preprocessing_texts')

# Working collections: NER mapping documents and the sentence documents that
# reference them.
col_ner, col_data = db['NER_Mapping_testFM'], db['v5_meta_examineresult']

# Archive collections: receive a copy of sentence / mapping documents before
# the correction workflow rewrites them.
col_store_false_ss, col_store_false_nm = (
    db['selected_sentences_v5_FalseMapping'],
    db['NER_Mapping_FalseMapping'],
)

In [116]:
class ModifyFalseMapping():
    """Correct false NER-to-company mappings and propagate the fix to sentence data.

    Typical use: ``modified_ner_mapping`` rewrites the mapping documents for
    the given NER names, then ``run`` rewrites every sentence document that
    references an updated mapping. Before a document is rewritten, a copy is
    inserted into the corresponding archive collection.

    The four collection handles are only used through ``find``, ``find_one``,
    ``insert_one`` and ``replace_one``.
    """

    # Fields that carry the same name in mapping and sentence documents.
    _COMMON_FIELDS = ['Similarity', 'Similarity_1st', 'Similarity_2nd', '1st_cleaned_ner_entity',
                      '1st_matched_cleaned_comp', '2nd_ner_entity', '2nd_matched_comp']
    # Per-entity list fields of a sentence document, and (position by position)
    # the mapping-document field that supplies the corrected value.
    _SENTENCE_FIELDS = ['Entity_id', 'Bingo_entity'] + _COMMON_FIELDS
    _MAPPING_FIELDS = ['U3_Company_Number', 'Company_Name'] + _COMMON_FIELDS
    # Sentence fields copied unchanged into the rewritten document.
    _PASSTHROUGH_FIELDS = ['_id', 'Sentence_id', 'Output_sentence1', 'Output_sentence2',
                           'Storage_date', 'Companies_econs_sectors_instruments', 'Source',
                           'Category', 'Date', 'Title', 'Link']

    def __init__(self, col_ner, col_data, col_store_false_ss, col_store_false_nm) -> None:
        """Store the collection handles.

        Args:
            col_ner: Collection of NER mapping documents (read and rewritten).
            col_data: Collection of sentence documents (read and rewritten).
            col_store_false_ss: Archive for sentence documents before rewrite.
            col_store_false_nm: Archive for mapping documents before rewrite.
        """
        self.col_ner = col_ner
        self.col_data = col_data
        self.col_store_false_ss = col_store_false_ss
        self.col_store_false_nm = col_store_false_nm

    @staticmethod
    def _archive(collection, document) -> None:
        """Insert ``document`` into ``collection``, tolerating an existing copy.

        Only a duplicate ``_id`` is ignored; any other failure propagates so
        the caller never rewrites a document whose backup could not be stored.
        """
        try:
            collection.insert_one(document)
        except DuplicateKeyError:
            # Already archived by an earlier run; keep that first copy.
            pass

    def extract_ner_data(self, u3_id_list: Optional[List[int]] = None) -> List[dict]:
        """Return the mapping documents flagged ``Updated == 1``.

        Args:
            u3_id_list: Restrict the result to these ``U3_Company_Number``
                values. An empty list or ``None`` means "all updated documents".

        Returns:
            The matching documents from ``self.col_ner`` as a list.
        """
        query = {'Updated': 1}
        if u3_id_list:
            query['U3_Company_Number'] = {"$in": list(u3_id_list)}
        # Use the instance's collection, not a notebook-level global.
        return list(self.col_ner.find(query))

    def convert_uuid(self, x: str) -> uuid.UUID:
        """Parse the string ``x`` into a ``uuid.UUID`` object."""
        return uuid.UUID(x)

    def modify_data(self, data_in: dict, ner_data: dict, entity_index: List[int]) -> dict:
        """Build corrected per-entity lists for one sentence document.

        For every sentence field in ``_SENTENCE_FIELDS`` the list from
        ``data_in`` is copied, except that positions listed in ``entity_index``
        receive the value of the paired mapping field from ``ner_data``
        (inserted as a single element, whatever its type).

        Args:
            data_in: Sentence document holding one list per sentence field.
            ner_data: Mapping document supplying the corrected values.
            entity_index: Positions inside the lists that must be replaced.

        Returns:
            Dict keyed by the sentence field names with the corrected lists.

        Raises:
            KeyError: If a required field is missing from either document.
        """
        positions = set(entity_index)  # O(1) membership inside the inner loop
        outdata = {}
        for ss_field, ner_field in zip(self._SENTENCE_FIELDS, self._MAPPING_FIELDS):
            current_values = data_in[ss_field]
            replacement = ner_data[ner_field]
            outdata[ss_field] = [
                replacement if position in positions else value
                for position, value in enumerate(current_values)
            ]
        return outdata

    def modified_ner_mapping(self, ner_name_list: List[str], ner_correction_list: List[List]) -> None:
        """Rewrite the mapping document of every NER name with its correction.

        For each name, the first matching document is archived in
        ``col_store_false_nm`` and then replaced (same ``_id``) by a document
        pointing at the corrected company, with similarities set to 1 and
        ``Updated`` set to 1.

        Args:
            ner_name_list: ``NER_Name`` values whose mapping is wrong.
            ner_correction_list: One ``[U3_Company_Number, Company_Name]`` pair
                per entry of ``ner_name_list``, in the same order.

        Raises:
            ValueError: If the two lists differ in length (checked before any
                write happens).
            IndexError: If no mapping document exists for one of the names.
        """
        if len(ner_name_list) != len(ner_correction_list):
            raise ValueError(
                "ner_name_list and ner_correction_list must have the same length "
                "({} != {})".format(len(ner_name_list), len(ner_correction_list))
            )

        # Every name is processed; the loop no longer stops after the first one.
        for ner_name, correction in zip(ner_name_list, ner_correction_list):
            ner_data = self.col_ner.find_one({'NER_Name': ner_name})
            if ner_data is None:
                raise IndexError(
                    "no NER mapping document found for NER_Name={!r}".format(ner_name)
                )
            print(ner_data)
            self._archive(self.col_store_false_nm, ner_data)

            results_dict = {
                "_id": ner_data['_id'],
                "U3_Company_Number": correction[0],
                "Company_Name": correction[1],
                "NER_Name": ner_data["NER_Name"],
                "NER_Original_Name": ner_data["NER_Original_Name"],
                "Similarity": 1,
                "Similarity_1st": [1],
                "Similarity_2nd": [1],
                "1st_cleaned_ner_entity": ner_data["1st_cleaned_ner_entity"],
                "1st_matched_cleaned_comp": ner_data["1st_matched_cleaned_comp"],
                "2nd_ner_entity": ner_data["2nd_ner_entity"],
                "2nd_matched_comp": ner_data["2nd_matched_comp"],
                "Pre_Defined": 0,
                "Updated": 1,
            }
            # Single replace instead of delete + insert, so a failed write
            # cannot leave the collection without the document.
            self.col_ner.replace_one({"_id": ner_data['_id']}, results_dict, upsert=True)

    def run(self, u3_id_list: Optional[List[int]] = None) -> None:
        """Propagate updated mappings into the sentence documents.

        For every updated mapping document (optionally filtered by
        ``u3_id_list``), each sentence document whose ``NER_Mapping_ID``
        references it is archived in ``col_store_false_ss`` and then replaced
        by a copy whose per-entity fields are corrected at the positions where
        both the entity name and the mapping id match.

        Args:
            u3_id_list: Optional ``U3_Company_Number`` filter forwarded to
                ``extract_ner_data``; ``None`` or empty processes all.
        """
        for ner_data in self.extract_ner_data(u3_id_list):
            # Use the instance's collection, not a notebook-level global.
            for ss_data in self.col_data.find({'NER_Mapping_ID': ner_data['_id']}):
                self._archive(self.col_store_false_ss, ss_data)

                entities = ss_data['Companies_econs_sectors_instruments']
                ner_mapping_id = ss_data['NER_Mapping_ID']

                # Positions whose entity name AND mapping id match this mapping.
                store_idx = [
                    position
                    for position, entity in enumerate(entities)
                    if entity == ner_data['NER_Name']
                    and ner_mapping_id[position] == ner_data['_id']
                ]

                # Field order matches the previously stored document layout.
                results_dict = {field: ss_data[field] for field in self._PASSTHROUGH_FIELDS}
                results_dict.update(self.modify_data(ss_data, ner_data, store_idx))
                results_dict["NER_Mapping_ID"] = ner_mapping_id
                results_dict["Updated"] = 1

                # Single replace instead of delete + insert (see above).
                self.col_data.replace_one({'_id': ss_data['_id']}, results_dict, upsert=True)



# for ner_data in ner_data_list:
#     for ss_data in col_data.find({'NER_Mapping_ID': ner_data['_id']}):
#         try:
#             col_store_temp_data.insert_one(ss_data)
#         except:
#             pass

#         entities = ss_data['Companies_econs_sectors_instruments']
#         ner_mapping_id = ss_data['NER_Mapping_ID']
#         store_idx = []

#         ## Look for index that need to be modified
#         for entity_i in range(len(entities)):
#             if entities[entity_i] == ner_data['NER_Name'] and ner_mapping_id[entity_i] == ner_data['_id']:
#                 store_idx.append(entity_i)
        
#         modified_dict = modify_data(ss_data, ner_data, store_idx)
#         results_dict = {"_id": ss_data['_id'], "Sentence_id": ss_data['Sentence_id'], "Output_sentence1": ss_data['Output_sentence1'], 
#                         "Output_sentence2": ss_data['Output_sentence2'], "Storage_date": ss_data['Storage_date'], "Companies_econs_sectors_instruments": ss_data['Companies_econs_sectors_instruments'],
#                         "Source": ss_data['Source'], "Category": ss_data['Category'], "Date": ss_data['Date'], 
#                         "Title": ss_data['Title'], "Link": ss_data['Link'], 
#                         "Entity_id": modified_dict["Entity_id"], "Bingo_entity": modified_dict["Bingo_entity"],
#                         "Similarity": modified_dict["Similarity"], "Similarity_1st": modified_dict["Similarity_1st"], 
#                         "Similarity_2nd": modified_dict["Similarity_2nd"], "1st_cleaned_ner_entity": modified_dict["1st_cleaned_ner_entity"],
#                         "1st_matched_cleaned_comp": modified_dict["1st_matched_cleaned_comp"], "2nd_ner_entity": modified_dict["2nd_ner_entity"], 
#                         "2nd_matched_comp": modified_dict["2nd_matched_comp"], "NER_Mapping_ID": ss_data['NER_Mapping_ID'],
#                         "Updated": 1}
#         ## Delete previous one
#         col_data.delete_one({'_id': ss_data['_id']})
#         col_data.insert_one(results_dict)


In [117]:
# Wire the workflow to the working and archive collections.
mfalsemapping = ModifyFalseMapping(
    col_ner=col_ner,
    col_data=col_data,
    col_store_false_ss=col_store_false_ss,
    col_store_false_nm=col_store_false_nm,
)

# Step 1: point the "Facebook" mapping at the corrected company.
mfalsemapping.modified_ner_mapping(
    ner_name_list=["Facebook"],
    ner_correction_list=[[129360, "Meta Platforms Inc"]],
)

# Step 2: push the corrected mapping into the sentence documents.
mfalsemapping.run()

In [71]:
# NOTE(review): in the visible cells `store_idx` is only assigned as a local
# inside `ModifyFalseMapping.run`, so this inspection cell presumably relies on
# state left over from an earlier kernel session — confirm or remove.
store_idx

[0]

In [38]:
# NOTE(review): `ner_data_list` is not defined at notebook level in the visible
# cells (only as a local inside `ModifyFalseMapping.run`); this cell presumably
# depends on earlier kernel state — confirm or remove.
ner_data_list[0]['_id']

Binary(b'\xbd\xeb\xb8/R2>9\xab9\xe7\x18%=\x1b\x0c', 3)