In [120]:
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
from datetime import datetime
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

data = pd.read_csv('psc_severity_train.csv')
data = data.drop(columns=['annotation_id', 'username','deficiency_code'])
df = pd.DataFrame(data)


print(df)



       PscInspectionId annotation_severity  \
0              1702496                 Low   
1              1702496                High   
2              1702496                High   
3              1795901              Medium   
4              1795901              Medium   
...                ...                 ...   
17986          1628279                 Low   
17987          1628279              Medium   
17988          1818812              Medium   
17989          1818812                 Low   
17990          1818812              Medium   

                                                def_text InspectionDate  \
0      PscInspectionId: 1702496\n\nDeficiency/Finding...     2023-04-24   
1      PscInspectionId: 1702496\n\nDeficiency/Finding...     2023-04-24   
2      PscInspectionId: 1702496\n\nDeficiency/Finding...     2023-04-24   
3      PscInspectionId: 1795901\n\nDeficiency/Finding...     2024-03-11   
4      PscInspectionId: 1795901\n\nDeficiency/Finding...     2024-03-11 

In [121]:
# Function to handle both majority voting and selecting highest severity if no majority

def majority_or_highest_severity(severities):
    
    # Get the most frequent severity
    most_common = severities.mode()[0]

    # If there's a clear majority (one severity appears more than once), return it
    if severities.value_counts().get(most_common, 0) > len(severities) / 2: 
        return most_common
    else:
        # If all values are different or there’s no clear majority, return the highest severity
        severity_order = {'Low': 1, 'Medium': 2, 'High': 3, 'Not a deficiency': 0}
        severity_values = severities.map(severity_order)
        return severities[severity_values.idxmax()]

# Group by VesselId and apply majority voting or highest severity for ties
df['consensus_severity'] = df.groupby('VesselId')['annotation_severity'].transform(majority_or_highest_severity)

# Display the result
df_clean = df.drop_duplicates(subset=['VesselId'])

print(df_clean.head())



    PscInspectionId annotation_severity  \
0           1702496                 Low   
3           1795901              Medium   
6           1667488              Medium   
9           1733202              Medium   
12          1750422              Medium   

                                             def_text InspectionDate  \
0   PscInspectionId: 1702496\n\nDeficiency/Finding...     2023-04-24   
3   PscInspectionId: 1795901\n\nDeficiency/Finding...     2024-03-11   
6   PscInspectionId: 1667488\n\nDeficiency/Finding...     2022-12-13   
9   PscInspectionId: 1733202\n\nDeficiency/Finding...     2023-08-04   
12  PscInspectionId: 1750422\n\nDeficiency/Finding...     2023-09-29   

    VesselId  PscAuthorityId   PortId VesselGroup        age  \
0     141884               7     2028    Chemical  20.813142   
3     292540               9  1000736    Dry Bulk  11.523614   
6     303546               1     1098    Dry Bulk   7.786448   
9     286265               9      936    Dry Bulk  1

In [122]:
# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Initialize BERT model
model = BertModel.from_pretrained('bert-base-uncased')

# Function to tokenize and extract features from 'def_text'
def extract_bert_features(text):
    # Tokenize the text
    inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt", max_length=512)

    # Get the embeddings from BERT model
    with torch.no_grad():
        outputs = model(**inputs)

    # Extract the last hidden state (embeddings for all tokens)
    last_hidden_state = outputs.last_hidden_state

    # We can use the embedding of the [CLS] token as the feature (first token)
    cls_embedding = last_hidden_state[:, 0, :]

    return cls_embedding.squeeze().numpy()  # Convert to numpy array for compatibility

# Apply the function to the 'def_text' column and create a new 'bert_features' column
df_clean['bert_features'] = df_clean['def_text'].apply(extract_bert_features)

# Check the dataframe with the new feature
columns_to_drop = ['PscInspectionId', 'annotation_severity', 'def_text', 'VesselId']
trainingdata = df_clean.drop(columns=columns_to_drop, errors='ignore')

#all features including BERT

print(trainingdata.head())
print(trainingdata.shape)



   InspectionDate  PscAuthorityId   PortId VesselGroup        age  \
0      2023-04-24               7     2028    Chemical  20.813142   
3      2024-03-11               9  1000736    Dry Bulk  11.523614   
6      2022-12-13               1     1098    Dry Bulk   7.786448   
9      2023-08-04               9      936    Dry Bulk  12.673511   
12     2023-09-29               1     1036    Dry Bulk   4.246407   

   consensus_severity                                      bert_features  
0                High  [-0.6526791, -0.3148818, 0.10291033, 0.0186098...  
3                High  [-0.9173013, -0.42088202, 0.23114991, 0.073901...  
6              Medium  [-0.8044656, -0.22771037, 0.5885241, -0.007614...  
9              Medium  [-0.5698796, -0.20617875, 0.55618834, -0.09618...  
12               High  [-0.87276477, -0.21908852, 0.36515206, -0.0681...  
(2632, 7)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['bert_features'] = df_clean['def_text'].apply(extract_bert_features)


In [123]:
# Function to convert date string in the format 'YYYY-MM-DD' to Unix timestamp
def convert_to_unix_timestamp(date_str):
    # Convert string date to datetime object
    date_obj = datetime.strptime(date_str, "%Y-%m-%d")
    
    # Convert datetime object to Unix timestamp
    unix_timestamp = int(date_obj.timestamp())
    
    return unix_timestamp

# Apply the function to the 'InspectionDate' column and create a new column for Unix timestamps

trainingdata['InspectionDate_unix'] = trainingdata['InspectionDate'].apply(convert_to_unix_timestamp)

# Drop the 'InspectionDate' column if you no longer need it
clean_data = trainingdata.drop('InspectionDate', axis=1)




# Define a mapping for consensus_severity values to numeric levels
severity_mapping = {
    "Low": 1,
    "Medium": 2,
    "High": 3,
    "Not a deficiency": 0
}

# Apply the mapping to create a new column
clean_data['severity_level'] = clean_data['consensus_severity'].map(severity_mapping)

print(clean_data.head())
num_rows = clean_data.shape[0]
print(num_rows)


    PscAuthorityId   PortId VesselGroup        age consensus_severity  \
0                7     2028    Chemical  20.813142               High   
3                9  1000736    Dry Bulk  11.523614               High   
6                1     1098    Dry Bulk   7.786448             Medium   
9                9      936    Dry Bulk  12.673511             Medium   
12               1     1036    Dry Bulk   4.246407               High   

                                        bert_features  InspectionDate_unix  \
0   [-0.6526791, -0.3148818, 0.10291033, 0.0186098...           1682294400   
3   [-0.9173013, -0.42088202, 0.23114991, 0.073901...           1710115200   
6   [-0.8044656, -0.22771037, 0.5885241, -0.007614...           1670889600   
9   [-0.5698796, -0.20617875, 0.55618834, -0.09618...           1691107200   
12  [-0.87276477, -0.21908852, 0.36515206, -0.0681...           1695945600   

    severity_level  
0                3  
3                3  
6                2  
9       

In [124]:
## BERT dimension reduction with PCA

from sklearn.decomposition import PCA

# Convert bert_features column to a matrix
bert_matrix = np.array(clean_data['bert_features'].tolist())

# Reduce dimensions using PCA
pca = PCA(n_components= 50)  # Reduce to 50 components
bert_pca = pca.fit_transform(bert_matrix)

# Add PCA components as new features
for i in range(bert_pca.shape[1]):
    clean_data[f'bert_pca_{i+1}'] = bert_pca[:, i]






# Drop bert_features if no longer needed
cleanBert = clean_data.drop('bert_features', axis=1)

from sklearn.preprocessing import LabelEncoder

# Initialize the LabelEncoder
label_encoder1 = LabelEncoder()
label_encoder2 = LabelEncoder()

cleanBert['VesselGroup'] = label_encoder1.fit_transform(cleanBert['VesselGroup'])
#cleanBert['deficiency_code'] = label_encoder2.fit_transform(cleanBert['deficiency_code'])

# Print the cleaned data
print(cleanBert)



       PscAuthorityId   PortId  VesselGroup        age consensus_severity  \
0                   7     2028            0  20.813142               High   
3                   9  1000736            2  11.523614               High   
6                   1     1098            2   7.786448             Medium   
9                   9      936            2  12.673511             Medium   
12                  1     1036            2   4.246407               High   
...               ...      ...          ...        ...                ...   
17943               7      741            2   6.433949             Medium   
17958               1     1160            2   8.238193                Low   
17961               1     1036            2   6.466804             Medium   
17967               1     1056            2   8.525667                Low   
17973               9     2135            2   4.084873                Low   

       InspectionDate_unix  severity_level  bert_pca_1  bert_pca_2  \
0    

In [125]:
!pip install flaml
from flaml import AutoML
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report



y = cleanBert['severity_level']
X = cleanBert.drop(['consensus_severity','severity_level'],axis=1)


# Print dataset info
print("Dataset shape:", cleanBert.shape)
print("\nFeature names:", X.columns.tolist())
print("\nTarget distribution:")
print(y.value_counts(normalize=True))

# Split data with stratification
X_train, X_val, y_train, y_val = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42,
    stratify=y  # Ensure balanced split
)

# Initialize and train AutoML
automl = AutoML()
settings = {
    'time_budget': 10,  # Increased time budget
    'task': 'classification',
    'metric': 'accuracy',
    'eval_method': 'holdout',  # Change to train-test split
    'split_ratio': 0.2  # Specify the test size (e.g., 20% of the data for testing)
}

# Train model
automl.fit(
    X_train=X_train, 
    y_train=y_train,
    **settings
)

# Make predictions
y_pred = automl.predict(X_val)

# Print detailed evaluation
print("\nBest ML learner:", automl.best_estimator)
print("Best hyperparmeter config:", automl.best_config)
print("Best accuracy on validation data: {:.4f}".format(1 - automl.best_loss))
print("\nClassification Report:")
print(classification_report(y_val, y_pred))

# Cross-validation score
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(
    automl.model.estimator,
    X, y,
    cv=5,
    scoring='accuracy'
)
print("\nCross-validation scores:", cv_scores)
print("Mean CV score: {:.4f} (+/- {:.4f})".format(cv_scores.mean(), cv_scores.std() * 2))

Dataset shape: (2632, 57)

Feature names: ['PscAuthorityId', 'PortId', 'VesselGroup', 'age', 'InspectionDate_unix', 'bert_pca_1', 'bert_pca_2', 'bert_pca_3', 'bert_pca_4', 'bert_pca_5', 'bert_pca_6', 'bert_pca_7', 'bert_pca_8', 'bert_pca_9', 'bert_pca_10', 'bert_pca_11', 'bert_pca_12', 'bert_pca_13', 'bert_pca_14', 'bert_pca_15', 'bert_pca_16', 'bert_pca_17', 'bert_pca_18', 'bert_pca_19', 'bert_pca_20', 'bert_pca_21', 'bert_pca_22', 'bert_pca_23', 'bert_pca_24', 'bert_pca_25', 'bert_pca_26', 'bert_pca_27', 'bert_pca_28', 'bert_pca_29', 'bert_pca_30', 'bert_pca_31', 'bert_pca_32', 'bert_pca_33', 'bert_pca_34', 'bert_pca_35', 'bert_pca_36', 'bert_pca_37', 'bert_pca_38', 'bert_pca_39', 'bert_pca_40', 'bert_pca_41', 'bert_pca_42', 'bert_pca_43', 'bert_pca_44', 'bert_pca_45', 'bert_pca_46', 'bert_pca_47', 'bert_pca_48', 'bert_pca_49', 'bert_pca_50']

Target distribution:
3    0.390198
1    0.320669
2    0.287994
0    0.001140
Name: severity_level, dtype: float64
[flaml.automl.logger: 01-17 




Cross-validation scores: [0.42314991 0.39658444 0.41634981 0.4486692  0.41254753]
Mean CV score: 0.4195 (+/- 0.0340)


In [126]:
test_data = pd.DataFrame(pd.read_csv('psc_severity_test.csv').drop(columns = ['PscInspectionId','VesselId','deficiency_code']))


label_encoder1.classes_ = np.append(label_encoder1.classes_, "Offshore")
test_data['VesselGroup'] = test_data["VesselGroup"].apply(lambda x: x if x in label_encoder1.classes_ else "Unknown")
test_data['VesselGroup'] = label_encoder1.transform(test_data["VesselGroup"])
test_data['bert_features'] = test_data['def_text'].apply(extract_bert_features)


## BERT dimension reduction with PCA

from sklearn.decomposition import PCA

# Convert bert_features column to a matrix
bert_matrix = np.array(test_data['bert_features'].tolist())

# Reduce dimensions using PCA
pca = PCA(n_components= 50)  # Reduce to 50 components
bert_pca = pca.fit_transform(bert_matrix)

# Add PCA components as new features
for i in range(bert_pca.shape[1]):
    test_data[f'bert_pca_{i+1}'] = bert_pca[:, i]


# Drop bert_features if no longer needed
cleanBerttest = test_data.drop(['bert_features','def_text'], axis=1)

from sklearn.preprocessing import LabelEncoder

# Initialize the LabelEncoder
label_encoder1 = LabelEncoder()
label_encoder2 = LabelEncoder()

cleanBerttest['VesselGroup'] = label_encoder1.fit_transform(cleanBerttest['VesselGroup'])
#cleanBert['deficiency_code'] = label_encoder2.fit_transform(cleanBert['deficiency_code'])



# Apply the function to the 'InspectionDate' column and create a new column for Unix timestamps

cleanBerttest['InspectionDate_unix'] = cleanBerttest['InspectionDate'].apply(convert_to_unix_timestamp)

# Drop the 'InspectionDate' column if you no longer need it
cleanBertTest = cleanBerttest.drop('InspectionDate', axis=1)



# Print the cleaned data
print(cleanBertTest)









      PscAuthorityId  PortId  VesselGroup        age  bert_pca_1  bert_pca_2  \
0                  9     936            2   9.593429   -1.938568   -1.179000   
1                  9    5237            2  25.210130   -3.527104   -0.235479   
2                  1     953            2   5.793292    3.890847   -1.070704   
3                  7    1439            5  12.446270    1.062673    0.342991   
4                  2    1366            2  11.731691   -1.821821    0.850451   
...              ...     ...          ...        ...         ...         ...   
1096               7    1797            0  18.732375    2.124475    1.330667   
1097               9    3152            2   2.529774   -0.128647    0.513392   
1098               7    1459            3  14.275154    0.440157    0.063832   
1099               9    2135            2  10.176591   -1.878403   -2.020305   
1100               9    3188            1  16.788501    1.503980    0.066686   

      bert_pca_3  bert_pca_4  bert_pca_

In [127]:
test_pred = automl.predict(cleanBertTest)
print(test_pred)

[3 1 3 ... 3 3 3]


In [128]:


# Convert predictions to a DataFrame
predictions_df = pd.DataFrame(test_pred, columns=['Predictions'])

# Export to CSV
predictions_df.to_csv('predictions.csv', index=False)