In [20]:
import pandas as pd
import re

# Read the CSV file
file_path = 'test_with_predictions_MDM.csv'
df = pd.read_csv(file_path)
# Remove '℃' from 'tag_description'
df['tag_description'] = df['tag_description'].apply(lambda x: re.sub('℃', '', str(x)))

# Filter rows where 'p_MDM' is TRUE
filtered_df = df[df['p_MDM'] == True]

# Group by 'p_thing_property' and count the items in each group
grouped = filtered_df.groupby('p_thing_property').size()

# Convert the result to a DataFrame for better readability
grouped_counts = grouped.reset_index(name='count')

# Sort by count in descending order
grouped_counts_sorted = grouped_counts.sort_values(by='count', ascending=False)

# Initialize the 'r_doc' field in the grouped_counts_sorted DataFrame
grouped_counts_sorted['r_doc'] = ""

# Iterate over the grouped_counts_sorted DataFrame and update the 'r_doc' field
for i, row in grouped_counts_sorted.iterrows():
    # Find matching rows in the original DataFrame where 'thing_property' matches 'p_thing_property'
    matching_rows = df[df['thing_property'] == row['p_thing_property']]
    # Concatenate the 'tag_description' values
    concatenated_description = ' '.join(matching_rows['tag_description'].astype(str).tolist())
    # Update the 'r_doc' field in the grouped_counts_sorted DataFrame
    grouped_counts_sorted.at[i, 'r_doc'] = concatenated_description

# Check for rows where 'r_doc' is NULL
null_r_doc_rows = grouped_counts_sorted[grouped_counts_sorted['r_doc'] == ""]

if not null_r_doc_rows.empty:
    print("p_thing_property values with NULL 'r_doc':")
    print(null_r_doc_rows['p_thing_property'])

# Save the updated DataFrame to a CSV file
output_file_path = 'test_with_predictions_r_doc.csv'
grouped_counts_sorted.to_csv(output_file_path, index=False)

print(f"Updated DataFrame saved to {output_file_path}")


  df = pd.read_csv(file_path)


p_thing_property values with NULL 'r_doc':
543           MainEngine2 RunningState
403      GeneratorEngine6 RunningState
411      GeneratorEngine8 RunningState
405      GeneratorEngine7 RunningState
540           MainEngine2 LOInletPress
                    ...               
409            GeneratorEngine8 LFOUse
407     GeneratorEngine8 CFWOutletTemp
399    GeneratorEngine6 BearingDETemp6
400           GeneratorEngine6 Current
401             GeneratorEngine6 DoUse
Name: p_thing_property, Length: 73, dtype: object
Updated DataFrame saved to test_with_predictions_r_doc.csv


In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Initialize the 'tfidf_similarity' field in the original DataFrame
df['tfidf_similarity'] = 0.0

# Define a function to compute TF-IDF similarity
def compute_tfidf_similarity(text1, text2):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([text1, text2])
    cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
    return cosine_sim[0][0]

# Iterate over the original DataFrame
for i, row in df.iterrows():
    if row['p_MDM'] == True and row['duplicate'] >= 2:
        # Find the matching 'p_thing_property' in grouped_counts_sorted
        matching_r_doc = grouped_counts_sorted[grouped_counts_sorted['p_thing_property'] == row['p_thing_property']]['r_doc']
        if not matching_r_doc.empty:
            r_doc_text = matching_r_doc.values[0]
            # Compute TF-IDF similarity
            similarity = compute_tfidf_similarity(row['tag_description'], r_doc_text)
            # Update the 'tfidf_similarity' field
            df.at[i, 'tfidf_similarity'] = similarity

# Print the updated DataFrame
print(df)

# Optionally, save the updated DataFrame to a CSV file
output_file_path = 'test_with_predictions_with_tfidf_similarity.csv'
df.to_csv(output_file_path, index=False)

print(f"Updated DataFrame saved to {output_file_path}")

                  thing              property  ships_idx  \
0      ME1TurboCharger1                   RPM       1025   
1      ME1TurboCharger1          LOInletPress       1025   
2      ME1TurboCharger1          LOInletPress       1029   
3           MainEngine1                  Load       1034   
4               ME1Flow          FOMassFlowIn       1017   
...                 ...                   ...        ...   
62064     FuelOilSystem  LFOVolumeStorageTK1S       1001   
62065     FuelOilSystem  LFOVolumeStorageTK2P       1001   
62066     FuelOilSystem  HFOVolumeStorageTK1S       1002   
62067     FuelOilSystem  HFOVolumeStorageTK2S       1002   
62068     FuelOilSystem  LFOVolumeStorageTK1S       1004   

                   tag_name  equip_type_code  \
0      MB.KM.IAS.Q1.A400003              NaN   
1      MB.KM.IAS.Q1.A400004              NaN   
2                     ML003              NaN   
3      MB.YO.IAS.Q1.A400003              NaN   
4               MF_000009_Y            

In [25]:
import pandas as pd

# Read the CSV file
file_path = 'test_with_predictions_MDM.csv'
df = pd.read_csv(file_path)

# Initialize the 'correct_mapping' field in the original DataFrame
df['correct_mapping'] = False

# Iterate over each unique ships_idx
for ship_idx in df['ships_idx'].unique():
    ship_df = df[df['ships_idx'] == ship_idx]
    
    # Find the highest tfidf_similarity for each p_thing_property
    max_similarity_idx = ship_df.groupby('p_thing_property')['tfidf_similarity'].idxmax()
    
    # Update the correct_mapping field based on the given conditions
    for idx in max_similarity_idx:
        if (df.at[idx, 'p_thing_correct'] == True) and \
           (df.at[idx, 'p_property_correct'] == True) and \
           (df.at[idx, 'MDM'] == "TRUE"):
            df.at[idx, 'correct_mapping'] = True

# Print the correct_mapping value for the specified conditions
specific_row = df[(df['ships_idx'] == 1025) & (df['tag_description'] == 'M/E SCAV. AIR RECEIVER TEMP H')]
print(specific_row[['ships_idx', 'tag_description', 'correct_mapping']])

# Save the updated DataFrame to a CSV file
output_file_path = 'test_with_predictions_with_correct_mapping.csv'
df.to_csv(output_file_path, index=False)

print(f"Updated DataFrame saved to {output_file_path}")


                  thing              property  ships_idx  \
0      ME1TurboCharger1                   RPM       1025   
1      ME1TurboCharger1          LOInletPress       1025   
2      ME1TurboCharger1          LOInletPress       1029   
3           MainEngine1                  Load       1034   
4               ME1Flow          FOMassFlowIn       1017   
...                 ...                   ...        ...   
62064     FuelOilSystem  LFOVolumeStorageTK1S       1001   
62065     FuelOilSystem  LFOVolumeStorageTK2P       1001   
62066     FuelOilSystem  HFOVolumeStorageTK1S       1002   
62067     FuelOilSystem  HFOVolumeStorageTK2S       1002   
62068     FuelOilSystem  LFOVolumeStorageTK1S       1004   

                   tag_name  equip_type_code  \
0      MB.KM.IAS.Q1.A400003              NaN   
1      MB.KM.IAS.Q1.A400004              NaN   
2                     ML003              NaN   
3      MB.YO.IAS.Q1.A400003              NaN   
4               MF_000009_Y            