In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import joblib
from scipy.sparse import hstack
from lightgbm import LGBMClassifier
from sentence_transformers import SentenceTransformer
import concurrent.futures
import os

# Loading Data & Data Exploratory

In [2]:
data_og= pd.read_csv("/Users/irenie/Downloads/X_LIB_R1 REOL(X_MAP_R1 REOL).csv",encoding='latin1')
data = data_og.dropna(subset=["X_OBJ Name", "Uniclass Systems"]).reset_index(drop=True)

In [3]:
# Extract the code (prefix like Ss_XX_XX) from the beginning
data[['Uniclass_Code', 'Uniclass_Name']] = data['Uniclass Systems'].str.extract(r'^([A-Za-z]{2}_\d{2}(?:_\d{2}){0,3})\s+(.*)$')

# Clean up any trailing short codes in parentheses from the name
data['Uniclass_Name'] = data['Uniclass_Name'].str.replace(r'\s*\([^)]*\)', '', regex=True).str.strip()

# Show sample of parsed values
data[['Uniclass Systems', 'Uniclass_Code', 'Uniclass_Name']].dropna().head(10)

data.drop(columns = ['Uniclass Code'], inplace=True)

In [4]:
#Create parent code for stratification
data['parent_code'] = data['Uniclass_Code'].apply(lambda x: "_".join(x.split("_")[:2])) 

# Drop heavy-null columns > 50%

In [5]:
# Calculate null percentage for each column
null_percentage = (data.isnull().sum() / len(data)) * 100

# Display null percentages
print("Null percentage by column:")
print(null_percentage.sort_values(ascending=False))

# Drop columns with more than X% null values (e.g., 50%)
threshold = 50  
columns_to_drop = null_percentage[null_percentage > threshold].index
data = data.drop(columns=columns_to_drop)

print(f"\nColumns dropped: {list(columns_to_drop)}")
print(f"Columns dropped count: {len(columns_to_drop)}")
print(f'Remain data shape: {data.shape}')

Null percentage by column:
Omniclass 22 - 15    100.0
Omniclass 21 - 10    100.0
Omniclass 21 - 11    100.0
Omniclass 21 - 12    100.0
Revit Family         100.0
                     ...  
OD2_Subdiscipline      0.0
Uniclass Products      0.0
Uniclass Elements      0.0
Omniclass 21           0.0
parent_code            0.0
Length: 104, dtype: float64

Columns dropped: ['Object Type2', 'Suggested Description', 'Suggested LOD', 'IFC Suggested Primary Discipline', 'Revit Category - 1', 'Revit Category - 2', 'Revit Family', 'OD1_Discipline Order', 'ICMS Construction', 'ICMS Renewal', 'ICMS Maintenance', 'Uniclass Products - 1', 'Uniclass Products - 2', 'Uniclass Products - 3', 'Uniclass Products - 4', 'Uniclass Products - 5', 'Uniclass Products - 6', 'Uniclass Products - 7', 'Uniclass Products - 8', 'Uniclass Elements - 1', 'Uniclass Elements - 2', 'Uniclass Elements - 3', 'Omniclass 21 - 1', 'Omniclass 21 - 2', 'Omniclass 21 - 3', 'Omniclass 21 - 4', 'Omniclass 21 - 5', 'Omniclass 21 - 6',

In [6]:
data.columns.to_list

<bound method IndexOpsMixin.tolist of Index(['X_OBJ IfcClass', 'X_OBJ Name', 'X_OBJ Type', 'X_OBJ UoM',
       'X_IsExternal', 'X_LoadBearing', 'X_FireRated', 'X_OBJ Keyword',
       'PPK Active Object', 'Object Description', 'Uniclass Systems Path',
       'Uniclass Systems', 'Count Code', 'PPK Active Task',
       ' PPK Active Construction Verb ', 'Object Number', 'IfcClass.Type',
       'IfcClass', 'IfcEnumerationType', 'Revit Category', 'OD0_Project Type',
       'OD1_Discipline', 'OD1_Discipline Split', 'OD2_Subdiscipline',
       'Uniclass Products', 'Uniclass Elements', 'Omniclass 21',
       'Omniclass 22', 'Omniclass 23', 'Uniclass_Code', 'Uniclass_Name',
       'parent_code'],
      dtype='object')>

In [7]:

# Select candidate text columns
text_columns = ['X_OBJ IfcClass', 'X_OBJ Name', 'X_OBJ Type', 'X_OBJ UoM',
       'X_IsExternal', 'X_LoadBearing', 'X_FireRated', 'X_OBJ Keyword',
       'PPK Active Object', 'Object Description',
       'Uniclass Systems Path','PPK Active Task', ' PPK Active Construction Verb ', 
       'IfcClass.Type', 'IfcClass', 'IfcEnumerationType', 'Revit Category',
       'OD0_Project Type', 'OD1_Discipline', 'OD1_Discipline Split',
       'OD2_Subdiscipline', 'Uniclass Products', 'Uniclass Elements',
       'Omniclass 21', 'Omniclass 22', 'Omniclass 23'
]

# Function to evaluate column-wise contribution
def evaluate_column_contribution(data, columns):
    metrics = []
    for col in columns:
        col_data = data[col].dropna().astype(str)
        num_unique = col_data.nunique()
        avg_length = col_data.apply(lambda x: len(x.split())).mean()
        max_length = col_data.apply(lambda x: len(x.split())).max()
        min_length = col_data.apply(lambda x: len(x.split())).min()
        top_freq = col_data.value_counts(normalize=True).iloc[0] if not col_data.empty else 0
        metrics.append({
            "Column": col,
            "Non-Null Count": col_data.shape[0],
            "Unique Values": num_unique,
            "Avg Word Count": round(avg_length, 2),
            "Max Word Count": max_length,
            "Min Word Count": min_length,
            "Top Value Frequency": round(top_freq, 2)  # high = low info
        })
    return pd.DataFrame(metrics).sort_values(by="Top Value Frequency")

# Run evaluation
column_metrics = evaluate_column_contribution(data, text_columns)

# Show the results
print("Text Column Usefulness Evaluation", column_metrics)


Text Column Usefulness Evaluation                             Column  Non-Null Count  Unique Values  \
8                PPK Active Object            3574           2804   
11                 PPK Active Task            3574           2821   
9               Object Description            3574           3574   
7                    X_OBJ Keyword            3574           3570   
1                       X_OBJ Name            3574           1322   
13                   IfcClass.Type            3574            652   
10           Uniclass Systems Path            3542            539   
14                        IfcClass            3574            119   
0                   X_OBJ IfcClass            3574            119   
20               OD2_Subdiscipline            3574            144   
16                  Revit Category            3574             64   
19            OD1_Discipline Split            3574             47   
2                       X_OBJ Type            3574           2665   


* Value Frequency: how repetitive the most common value is if it is high then it is less informative -> less entropy

WE will try to train data on required columns along with the columns which can satisfy the condition of low valuw frequency and high Unique values

In [8]:
data["input_text"] = (
    "Name: " + data["X_OBJ Name"].astype(str).fillna("") + " | " +
    "Type: " + data["X_OBJ Type"].astype(str).fillna("") + " | " +
    "Class: " + data["IfcClass.Type"].astype(str).fillna("") + " | " +
    "External: " + data["X_IsExternal"].astype(str).fillna("") + " | " +
    "LoadBearing: " + data["X_LoadBearing"].astype(str).fillna("") + " | " +
    "Keyword: " + data["X_OBJ Keyword"].astype(str).fillna("") + " | " +
    "PPK Active Object:" + data["PPK Active Object"].astype(str).fillna("") + " | " +
    "PPK Active Task:" + data["PPK Active Task"].astype(str).fillna("") + " | " +
    "Object Description:" + data["Object Description"].astype(str).fillna("") 
)

In [9]:
# Group by Uniclass_Name to reduce duplicates
data_grouped = data.groupby("Uniclass_Name")[["input_text", "Uniclass_Code"]].agg({
    "input_text": lambda texts: " ".join(texts),
    "Uniclass_Code": "first"
}).reset_index()


In [None]:

from sentence_transformers import SentenceTransformer, util
import torch


model = SentenceTransformer("all-MiniLM-L6-v2")


# Encode all class input_texts
class_embeddings = model.encode(data_grouped["input_text"].tolist(), convert_to_tensor=True)

# Retrieval function
def retrieve_uniclass(input_description, data_grouped, top_k=5):
    input_embedding = model.encode(input_description, convert_to_tensor=True)
    cosine_scores = util.cos_sim(input_embedding, class_embeddings)[0]
    top_results = torch.topk(cosine_scores, k=top_k)

    data_grouped = data_grouped.reset_index(drop=True)
    results = []
    for score, idx in zip(top_results.values, top_results.indices):
        i = int(idx)
        row = data_grouped.iloc[i]
        results.append({
            "Uniclass_Code": row["Uniclass_Code"],
            "Uniclass_Name": row["Uniclass_Name"],
            "Similarity": float(score)
        })

    return pd.DataFrame(results)

# Load new queries to classify
query_df = pd.read_csv("/Users/irenie/Downloads/X_LIB_R1 REOL(X_MAP_R1 REOL).csv",encoding='latin1')
query_df["input_text"] = (
    "Name: " + data["X_OBJ Name"].astype(str).fillna("") + " | " +
    "Type: " + data["X_OBJ Type"].astype(str).fillna("") + " | " +
    "Class: " + data["IfcClass.Type"].astype(str).fillna("") + " | " +
    "External: " + data["X_IsExternal"].astype(str).fillna("") + " | " +
    "LoadBearing: " + data["X_LoadBearing"].astype(str).fillna("") + " | " +
    "Keyword: " + data["X_OBJ Keyword"].astype(str).fillna("") + " | " +
    "PPK Active Object:" + data["PPK Active Object"].astype(str).fillna("") + " | " +
    "PPK Active Task:" + data["PPK Active Task"].astype(str).fillna("") + " | " +
    "Object Description:" + data["Object Description"].astype(str).fillna("") 
)

# Apply semantic search
batch_results = []
for _, row in query_df.iterrows():
    result = retrieve_uniclass(row["input_text"], data_grouped, top_k=1).iloc[0]
    batch_results.append({
        "X_OBJ Name": row["X_OBJ Name"],
        "X_OBJ Type": row["X_OBJ Type"],
        "X_OBJ IfcClass": row["X_OBJ IfcClass"],
        "X_IsExternal": row["X_IsExternal"],
        "X_LoadBearing": row["X_LoadBearing"],
        "X_OBJ Keyword": row["X_OBJ Keyword"],
        "PPK Active Object": row["PPK Active Object"],
        "PPK Active Task": row["PPK Active Task"],
        "Matched_Uniclass_Code": result["Uniclass_Code"],
        "Matched_Uniclass_Name": result["Uniclass_Name"],
        "Similarity": result["Similarity"]
    })

# Save results
pd.DataFrame(batch_results).to_csv("retrieval_results.csv", index=False)
print(f" Retrieval complete. Saved to retrieval_results.csv {os.path.abspath('retrieval_results.csv')}")



 Retrieval complete. Saved to retrieval_results.csv /Users/irenie/Downloads/EIC_Intern/retrieval_results.csv
