In [18]:
%run notebook_core_utils.ipynb
# %run notebook_text_processor.ipynb
%run notebook_core_mappers.ipynb

In [19]:
import pandas as pd
# from .core_mappers import HFACS_DICTIONARY, AUTO_LABELING_DICTIONARY, AST_APM_DICTIONARY
# from .core_utils import CoreUtils

def get_dic(ds_name, imbalance_options):
    is_merge_taxonomy = imbalance_options["is_merge_taxonomy"]
    if is_merge_taxonomy:
        return [
            # HFACS_DICTIONARY['hfacs_balance'],
            AST_APM_DICTIONARY,
            AUTO_LABELING_DICTIONARY[f'{ds_name}_balance']
        ]
    else:
        return [
            HFACS_DICTIONARY['hfacs'],
            AUTO_LABELING_DICTIONARY[ds_name]
        ]

class AutoLabeling:

    def __init__(self, df, ds_name='asrs', imbalance_options={
        "is_merge_taxonomy": False
    }):
        self.ds_name = ds_name
        self.imbalance_options = imbalance_options

        factor_col_name = CoreUtils.get_constant()["FACTOR_COL_NAME"]
        hfacs_dic, auto_label_dic = get_dic(ds_name, imbalance_options)
        
        self.factor_col_name = factor_col_name
        self.hfacs_dic = hfacs_dic
        self.auto_label_dic = auto_label_dic
        self.df = df

    # Function to map combined issues to HFACS based on keyword mappings
    def __map_hfacs(self, factors):

        pf_dic = self.auto_label_dic

        # Convert to lowercase for consistent keyword matching
        factors = str(factors).lower()

        for keyword, mapping in pf_dic.items():
            if keyword.lower() in factors:  # Check if keyword is in the combined issues string
                # print(keyword.lower(), '=>>>', factors)
                # if(mapping == 11):
                #     # print(mapping)
                #     return self.__map_hf_asrs(factors)
                return mapping
        return -1  # Default to unmapped if no match found

    def do_auto_label(self, sample_size=0):

        # df_tail = df.tail(sample_size).copy()

        factor_col_name = self.factor_col_name
        df = self.df
        hfacs_dic = self.hfacs_dic

        if sample_size > 0:
            df_tail =  df.sample(n=sample_size, random_state=42)
        else:
            df_tail = df
            
        print('sample_size=', df_tail.shape)
        
        df_tail.replace('', pd.NA, inplace=True)
        # print_column = ['ACN'] + factors_column

        # df_tail[print_column].value_counts()
        # df_tail.isnull().sum()

        # Load the dataset (update the file path as needed)
        data = df_tail.copy()
        # Apply the function to each 'Combined_Issues' row and map to HFACS
        data['HFACS_Category_Index'] = data[factor_col_name].apply(self.__map_hfacs)

        # Split the 'HFACS_Mapped' into two new columns: 'HFACS_Category' and 'HFACS_Level'
        data['HFACS_Category_Value'] = data['HFACS_Category_Index'].apply(lambda x: hfacs_dic[x][3])
        data['HFACS_Level'] = data['HFACS_Category_Index'].apply(lambda x: hfacs_dic[x][0])

        # # Summarize the mapped HFACS categories and levels
        hfacs_summary = data.groupby(['HFACS_Category_Value']).size().reset_index(name='Count')
        print(hfacs_summary)
        return data

In [35]:
asrs_df = CoreUtils.get_data('asrs')
ntsb_df = CoreUtils.get_data('ntsb')
asrs_ntsb_df =  pd.concat([asrs_df, ntsb_df], axis=0).reset_index(drop=True)



In [48]:
# HFACS Categories dictionary as provided
hfacs_categories = {
    'Unsafe Acts': {
        # 'Skill-based Errors': ['Distraction', 'Troubleshooting'],
        # 'Decision Errors': [],
        # 'Perceptual Errors': ['confusion'],
        'Unsafe Acts': [
            'Distraction', 'Troubleshooting', 'confusion',

            'Personnel issues-Task performance', 
            'Personnel issues-Action/decision', 
            'Personnel issues-Psychological-Perception/orientation/illusion',
            'Personnel issues-Miscellaneous-Intentional act'
        ],
    },
    'Preconditions for Unsafe Acts': {
        'Physical Environment': [
            'Weather', 'Environment - Non Weather Related', 'turbulence',
            'Environmental issues-Physical environment',
            'Environmental issues-Conditions/weather/phenomena-Turbulence',
            'Environmental issues-Task environment'
        ],
        'Technological Environment': [
            'Software and Automation', 'Incorrect / Not Installed / Unavailable Part', 'Aircraft', 'Human-machine interface'
            
            'Environmental issues-Operating environment',
            # 'Aircraft-Aircraft systems'
        ],

        # 'Adverse Mental States': ['Situational Awareness', 'time pressure', 'confusion', 'distraction', 'fatigue'],
        # 'Adverse Physiological States': ['Physiological - Other'],
        # 'Physical/Mental Limitations': [],
        'Conditions of Operators': [
            'Situational Awareness', 'time pressure', 'confusion', 'distraction', 'fatigue',
            'Physiological - Other',

            'Personnel issues-Physical-Alertness/Fatigue',
            'Personnel issues-Psychological-Attention/monitoring',
            'Personnel issues-Psychological-Personality/attitude',
            'Personnel issues-Psychological-Mental/emotional state',
            
            'Personnel issues-Psychological',
            'Personnel issues-Physical-Impairment/incapacitation',
            'Personnel issues-Physical-Health/Fitness',
            
            'Personnel issues-Physical-Sensory ability/limitation'
        ],
        # 'Crew Resource Management': ['communication breakdown'],
        # 'Personal Readiness': ['Training / Qualification'],
        'Personnel Factors': [
            'communication breakdown',
            'Training / Qualification',

            'Lack of communication',
            'Personnel issues-Experience/knowledge'
        ]
    },
    'Unsafe Supervision': {
        'Inadequate Supervision': ['insufficient training', 'lack of guidance', 'no supervision'],
        'Planned Inappropriate Operations': ['scheduling pressure', 'overwork'],
        'Failure to Correct Known Problems': ['failing to resolve known issues']
    },
    'Organizational Influences': {
        # 'Resource Management': ['Company Policy', 'Airspace Structure'],
        # 'Organizational Climate': ['Procedure', 'Manuals', 'Airport', 'ATC Equipment / Nav Facility / Buildings', 'Chart Or Publication'],
        # 'Operational Processes': ['Staffing', 'Equipment / Tooling'],
        'Organizational Influences': [
            'Company Policy', 'Airspace Structure',
            'Procedure', 'Manuals', 'Airport', 'ATC Equipment / Nav Facility / Buildings', 'Chart Or Publication',
            'Staffing', 'Equipment / Tooling',

            'Organizational issues-Management-Culture',
            'Organizational issues-Management-Communication (organizational)',
            'Organizational issues-Management-Scheduling',
            'Organizational issues-Management-Policy/procedure',
            'Organizational issues-Management-(general)-(general)-Operator',
            'Organizational issues-Support/oversight/monitoring',
            'Organizational issues-Management-Resources',
            'Organizational issues-Development-Selection/certification/testing'
        ]
    }
}


In [49]:
import pandas as pd
import numpy as np

# Step 1: Load the documents (assuming they're stored in a CSV for simplicity)
# Each row represents a document, and we focus on the "content" column
# df = pd.read_csv('aviation_documents.csv')

# df = CoreUtils.get_data('asrs')

# Step 2: Define HFACS categories and corresponding keywords
# hfacs_categories = {
#     'Environmental Factors': ['weather', 'visibility', 'turbulence'],
#     'Physical/Mental Limitations': ['fatigue', 'disorientation', 'stress'],
#     'Adverse Mental States': ['overconfidence', 'complacency', 'distraction']
#     # Add more HFACS categories as needed
# }

# Function to classify document into an HFACS category
# def classify_document(doc_text):
#     # print(doc_text)
#     if doc_text is np.nan:
#         return 'Unknown'

#     for category, keywords in hfacs_categories.items():
#         if any(keyword in doc_text.lower() for keyword in keywords):
#             return category
#     return 'Unknown'  # If no category matches

def classify_document(doc_text: str):
    if doc_text is np.nan:
        return 'Unknown'
    for category, subcategories in hfacs_categories.items():
        for subcategory, keywords in subcategories.items():
            # if 'Distraction' in doc_text:
            #     print(subcategory, keywords, doc_text.lower())
            
            if any(keyword.lower() in doc_text.lower() for keyword in keywords):
                # if 'Distraction' in doc_text:
                #     print(subcategory, keywords, doc_text.lower())
                return f"{category} -> {subcategory}"
    return 'Unknown'  # If no category matches

# Step 3: Apply classification to each document
asrs_df['hfacs_category'] = asrs_df['finding_description'].apply(classify_document)
ntsb_df['hfacs_category'] = ntsb_df['finding_description'].apply(classify_document)
asrs_ntsb_df['hfacs_category'] = asrs_ntsb_df['finding_description'].apply(classify_document)


# Step 4: Perform consistency check
# Count the number of documents in each category
category_counts = asrs_df['hfacs_category'].value_counts()

# # Step 5: Check if any category meets the consistency criterion (at least 12 out of 15)
# consistency_satisfied = category_counts.max() >= 12

# # Output results
# print(category_counts)
# if consistency_satisfied:
#     print(f"Consistency criterion satisfied: {category_counts.idxmax()} category has {category_counts.max()} documents.")
# else:
#     print("Consistency criterion not satisfied.")


In [50]:
asrs_df['hfacs_category'].value_counts()

hfacs_category
Unsafe Acts -> Unsafe Acts                                    3914
Preconditions for Unsafe Acts -> Technological Environment    2193
Organizational Influences -> Organizational Influences        1480
Preconditions for Unsafe Acts -> Conditions of Operators       869
Unknown                                                        769
Preconditions for Unsafe Acts -> Physical Environment          640
Preconditions for Unsafe Acts -> Personnel Factors             254
Name: count, dtype: int64

In [51]:
ntsb_df['hfacs_category'].value_counts()

hfacs_category
Unknown                                                       2255
Unsafe Acts -> Unsafe Acts                                     945
Preconditions for Unsafe Acts -> Technological Environment     794
Preconditions for Unsafe Acts -> Physical Environment          219
Preconditions for Unsafe Acts -> Conditions of Operators        81
Preconditions for Unsafe Acts -> Personnel Factors              20
Organizational Influences -> Organizational Influences           3
Name: count, dtype: int64

In [52]:
asrs_ntsb_df['hfacs_category'].value_counts()

hfacs_category
Unsafe Acts -> Unsafe Acts                                    4859
Unknown                                                       3024
Preconditions for Unsafe Acts -> Technological Environment    2987
Organizational Influences -> Organizational Influences        1483
Preconditions for Unsafe Acts -> Conditions of Operators       950
Preconditions for Unsafe Acts -> Physical Environment          859
Preconditions for Unsafe Acts -> Personnel Factors             274
Name: count, dtype: int64

In [56]:
asrs_ntsb_df[asrs_ntsb_df['hfacs_category']=='Unknown']['finding_description'].value_counts()

finding_description
Ambiguous                                                                                                                                                                                                                                                                                                                      497
Human Factors                                                                                                                                                                                                                                                                                                                  221
Not determined-Not determined-(general)-(general)-Unknown/Not determined                                                                                                                                                                                                                                                        65
Human Facto