In [23]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, recall_score
from sklearn.model_selection import GridSearchCV


import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download("punkt")
nltk.download("stopwords")
import xgboost as xgb

[nltk_data] Downloading package punkt to /Users/kchua/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/kchua/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Quick Overheating Classifier

In [2]:
data = pd.read_csv('./flagged_citations/overheating.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,hash_id,code,kind,repeat,desc,narrative,flag_include_1,flag_include_2,flag_include_3,flag_exclude_1,flag_exclude_2,flag_overheating
0,0,c14125ba5346e5c4,3.125(a),,False,"FACILITIES, GENERAL.","At the south farm, a goat was observed to jump...",False,False,False,False,False,False
1,1,090414fb43ad755a,2.33(b)(2),,False,ATTENDING VETERINARIAN AND ADEQUATE VETERINARY...,"Two bottles of Heparin (one partially used, on...",False,False,False,False,False,False
2,2,553396bb9bd960ea,2.33(b)(2),,False,ATTENDING VETERINARIAN AND ADEQUATE VETERINARY...,One expired bottle of Isoflourane (expiration ...,False,False,False,False,False,False
3,3,6f536be1f760dfb6,3.125(a),Critical,False,"Facilities, general.",According to facility observational and/or hea...,False,False,False,False,False,False
4,4,d8fb5331fdf6ef92,2.32(a),,False,PERSONNEL QUALIFICATIONS.,"In August 2015, a cynomolgus macaque placed un...",False,False,False,False,False,False


In [3]:
X = data['narrative'].astype(str).apply(lambda x: x.lower())
y = data['flag_overheating'].astype(int)
y.value_counts()

flag_overheating
0    38646
1      103
Name: count, dtype: int64

In [4]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
def preprocess_text(text):
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words("english"))
    filtered_tokens = [token.lower() for token in tokens if token.isalpha() and token.lower() not in stop_words]
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]
    return " ".join(stemmed_tokens)

In [6]:
X_train_preprocessed = [preprocess_text(text) for text in X_train]
vectorizer = TfidfVectorizer()
X_train_transformed = vectorizer.fit_transform(X_train_preprocessed)
X_train_transformed.shape

(30999, 13695)

In [7]:
X_test_preprocessed = [preprocess_text(text) for text in X_test]
X_test_transformed = vectorizer.transform(X_test_preprocessed)
X_test_transformed.shape

(7750, 13695)

In [8]:
# Convert the sparse matrix to DMatrix format used by XGBoost
dtrain = xgb.DMatrix(X_train_transformed, label=y_train)
dtest = xgb.DMatrix(X_test_transformed, label=y_test)

sum_negative = (y_train == 0).sum()
sum_positive = (y_train == 1).sum()
scale_pos_weight = sum_negative / sum_positive

# XGBoost parameters (you may need to tune these parameters based on your dataset)
params = {
  #'multi:softmax',  # For multiclass classification
    #'num_class': len(y.unique()),  # Number of classes
    #'max_depth': 10,
    #'learning_rate': 0.1,
    #'n_estimators': 100
    #'scale_pos_weight': scale_pos_weight # Use 'mlogloss' for multiclass classification
}

# Train the XGBoost model
#model = xgb.train(params, dtrain, num_boost_round=100)

In [9]:
model = xgb.train(params, dtrain, num_boost_round=100)

In [10]:
# Make predictions on the test set
y_pred = model.predict(dtest)

# Convert the predictions to integers
#threshold = 0.4  # Adjust this threshold
#y_pred = [1 if prob[1] > threshold else 0 for prob in y_pred]
y_pred = y_pred.astype(int)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Display classification report
print('\nClassification Report:')
print(classification_report(y_test, y_pred))

Accuracy: 1.00

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7731
           1       1.00      0.16      0.27        19

    accuracy                           1.00      7750
   macro avg       1.00      0.58      0.64      7750
weighted avg       1.00      1.00      1.00      7750



## Second Classifier

In [19]:
# Create a DataFrame
#df = pd.DataFrame(data, columns=['feature1', 'feature2', 'feature3', 'feature4', 'label'])

# Split data into features (X) and labels (y)
#X = df.drop('label', axis=1)
#y = df['label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train_preprocessed = [preprocess_text(text) for text in X_train]
vectorizer = TfidfVectorizer()
X_train_transformed = vectorizer.fit_transform(X_train_preprocessed)

X_test_preprocessed = [preprocess_text(text) for text in X_test]
X_test_transformed = vectorizer.transform(X_test_preprocessed)


In [20]:
# Convert data to DMatrix format used by XGBoost
dtrain = xgb.DMatrix(X_train_transformed, label=y_train)
dtest = xgb.DMatrix(X_test_transformed, label=y_test)

# XGBoost parameters (you may need to tune these parameters based on your dataset)
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',  # Use 'logloss' for binary classification
    'max_depth': 3,
    'learning_rate': 0.1,
    'n_estimators': 100,
    'scale_pos_weight': sum(y_train == 0) / sum(y_train == 1),  # Adjust for class imbalance
}


In [21]:
# Train the XGBoost model
model = xgb.train(params, dtrain, num_boost_round=100)

# Make predictions on the test set
y_pred_prob = model.predict(dtest)  # Predicted probabilities

Parameters: { "n_estimators" } are not used.



In [24]:
# Choose a threshold that maximizes recall
thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
best_recall = 0
best_threshold = 0

for threshold in thresholds:
    y_pred = [1 if prob > threshold else 0 for prob in y_pred_prob]
    current_recall = recall_score(y_test, y_pred)

    if current_recall > best_recall:
        best_recall = current_recall
        best_threshold = threshold

# Use the best threshold
y_pred = [1 if prob > best_threshold else 0 for prob in y_pred_prob]

# Evaluate the classifier
print(f'Best Threshold for Max Recall: {best_threshold}')
print('\nClassification Report:')
print(classification_report(y_test, y_pred))

Best Threshold for Max Recall: 0.1

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      1.00      7731
           1       0.24      1.00      0.39        19

    accuracy                           0.99      7750
   macro avg       0.62      1.00      0.69      7750
weighted avg       1.00      0.99      0.99      7750



## Use Classifier to Find "Missed" Cases of Overheating

In [25]:
X_preprocessed = [preprocess_text(text) for text in X]
X_transformed = vectorizer.transform(X_preprocessed)
X_transformed.shape

(38749, 13695)

In [26]:
dX = xgb.DMatrix(X_transformed, label=y)

In [27]:
# Make predictions on the test set
y_pred_prob = model.predict(dX)
best_threshold = 0.1
y_pred = [1 if prob > best_threshold else 0 for prob in y_pred_prob]

In [29]:
# Evaluate the classifier
accuracy = accuracy_score(y, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Display classification report
print('\nClassification Report:')
print(classification_report(y, y_pred))

Accuracy: 0.99

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      1.00     38646
           1       0.30      1.00      0.46       103

    accuracy                           0.99     38749
   macro avg       0.65      1.00      0.73     38749
weighted avg       1.00      0.99      1.00     38749



In [30]:
# Identified positive cases
df = pd.DataFrame()
df['narrative'] = X
df['overheating_pred'] = y_pred
df['overheating_flag'] = y

In [33]:
df['overheating_pred'].value_counts()

overheating_pred
0    38402
1      347
Name: count, dtype: int64

In [31]:
df[df['overheating_pred'] == 1]

Unnamed: 0,narrative,overheating_pred,overheating_flag
236,"on july 19th, 2019, when an issue with the air...",1,0
574,"*on june 29, 2021 at approximately 11:30 am th...",1,0
619,"on 19 august 2014, the intermediate handler tr...",1,0
876,three separate research teams had adverse even...,1,1
878,upon arrival at the farm at 8 am on 29july2021...,1,1
...,...,...,...
38252,"on february 10, 2022, wind rose aviation trans...",1,1
38280,"at the time of inspection, the atmospheric tem...",1,0
38293,a strong rancid odor was observed throughout t...,1,0
38480,six ringtail lemurs have recently been added t...,1,0


In [34]:
df[(df['overheating_pred'] == 1) & (df['overheating_flag'] == 0)]['narrative'].tolist()

['on july 19th, 2019, when an issue with the air conditioning system developed that compromised the ability of the\nvivarium to maintain an appropriate ambient temperature for hamsters, animal facility staff were not made aware of\nthe issue in a timely manner. when the appropriate individuals were notified on july 20th, initial corrective measures\nwere taken, but their efficacy was not evaluated or monitored. the temperature continued to rise between\nobservations, and recorded temperatures in the hamster facility on july 21st ranged from 88 – 93 degrees. the\ninitial corrective action was insufficient to maintain the appropriate temperature for hamsters, leading to an ambient\ntemperature of >85 degrees for more than 12 hours. indoor housing for hamsters shall provide for the health and\ncomfort of the animals, and the ambient temperature shall not be allowed to exceed 85 degrees.\nto remain corrected after july 24th, 2019.\nthis inspection and exit interview were conducted on site 

In [35]:
df[df['overheating_pred'] == 1]['narrative'].to_csv('overheating_classifier_predictions.csv')

