In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, recall_score
from sklearn.model_selection import GridSearchCV


import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download("punkt")
nltk.download("stopwords")
import xgboost as xgb

[nltk_data] Downloading package punkt to /Users/kchua/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/kchua/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Air Transport Classifier

In [2]:
data = pd.read_csv('../flagged_citations/air_transport.csv')
data = data.drop(columns=['Unnamed: 0'])
data.head()

Unnamed: 0,hash_id,code,kind,repeat,desc,narrative,flag_cond_1,flag_air_transport
0,c14125ba5346e5c4,3.125(a),,False,"FACILITIES, GENERAL.","At the south farm, a goat was observed to jump...",False,False
1,090414fb43ad755a,2.33(b)(2),,False,ATTENDING VETERINARIAN AND ADEQUATE VETERINARY...,"Two bottles of Heparin (one partially used, on...",False,False
2,553396bb9bd960ea,2.33(b)(2),,False,ATTENDING VETERINARIAN AND ADEQUATE VETERINARY...,One expired bottle of Isoflourane (expiration ...,False,False
3,6f536be1f760dfb6,3.125(a),Critical,False,"Facilities, general.",According to facility observational and/or hea...,False,False
4,d8fb5331fdf6ef92,2.32(a),,False,PERSONNEL QUALIFICATIONS.,"In August 2015, a cynomolgus macaque placed un...",False,False


In [3]:
X = data['narrative'].astype(str).apply(lambda x: x.lower())
y = data['flag_air_transport'].astype(int)
y.value_counts()

flag_air_transport
0    38362
1      387
Name: count, dtype: int64

In [4]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
def preprocess_text(text):
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words("english"))
    filtered_tokens = [token.lower() for token in tokens if token.isalpha() and token.lower() not in stop_words]
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]
    return " ".join(stemmed_tokens)

In [6]:
X_train_preprocessed = [preprocess_text(text) for text in X_train]
vectorizer = TfidfVectorizer()
X_train_transformed = vectorizer.fit_transform(X_train_preprocessed)
X_train_transformed.shape

(30999, 13695)

In [7]:
X_test_preprocessed = [preprocess_text(text) for text in X_test]
X_test_transformed = vectorizer.transform(X_test_preprocessed)
X_test_transformed.shape

(7750, 13695)

## Baseline Model

In [8]:
# Convert the sparse matrix to DMatrix format used by XGBoost
dtrain = xgb.DMatrix(X_train_transformed, label=y_train)
dtest = xgb.DMatrix(X_test_transformed, label=y_test)

sum_negative = (y_train == 0).sum()
sum_positive = (y_train == 1).sum()
scale_pos_weight = sum_negative / sum_positive

# XGBoost parameters
params = {
}

# Train the XGBoost model
model = xgb.train(params, dtrain, num_boost_round=100)

In [9]:
# Make predictions on the test set
y_pred = model.predict(dtest)

# Convert the predictions to integers
y_pred = y_pred.astype(int)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Display classification report
print('\nClassification Report:')
print(classification_report(y_test, y_pred))

Accuracy: 1.00

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7689
           1       1.00      0.39      0.56        61

    accuracy                           1.00      7750
   macro avg       1.00      0.70      0.78      7750
weighted avg       1.00      1.00      0.99      7750



## Model for Imbalanced Data

In [10]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train_preprocessed = [preprocess_text(text) for text in X_train]
vectorizer = TfidfVectorizer()
X_train_transformed = vectorizer.fit_transform(X_train_preprocessed)

X_test_preprocessed = [preprocess_text(text) for text in X_test]
X_test_transformed = vectorizer.transform(X_test_preprocessed)

In [11]:
# Convert data to DMatrix format used by XGBoost
dtrain = xgb.DMatrix(X_train_transformed, label=y_train)
dtest = xgb.DMatrix(X_test_transformed, label=y_test)

# XGBoost parameters
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',  # Use 'logloss' for binary classification
    'max_depth': 3,
    'learning_rate': 0.1,
    'n_estimators': 100,
    'scale_pos_weight': sum(y_train == 0) / sum(y_train == 1),  # Adjust for class imbalance
}


In [12]:
# Train the XGBoost model
model = xgb.train(params, dtrain, num_boost_round=100)

# Make predictions on the test set
y_pred_prob = model.predict(dtest) 

Parameters: { "n_estimators" } are not used.



In [13]:
# Choose a threshold that maximizes recall
thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
best_recall = 0
best_threshold = 0

for threshold in thresholds:
    y_pred = [1 if prob > threshold else 0 for prob in y_pred_prob]
    current_recall = recall_score(y_test, y_pred)

    if current_recall > best_recall:
        best_recall = current_recall
        best_threshold = threshold

# Use the best threshold
y_pred = [1 if prob > best_threshold else 0 for prob in y_pred_prob]

# Evaluate the classifier
print(f'Best Threshold for Max Recall: {best_threshold}')
print('\nClassification Report:')
print(classification_report(y_test, y_pred))

Best Threshold for Max Recall: 0.1

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      7689
           1       0.39      0.97      0.55        61

    accuracy                           0.99      7750
   macro avg       0.69      0.98      0.77      7750
weighted avg       0.99      0.99      0.99      7750



## Using Classifier to Find "Missed" Cases of Air Transport

In [14]:
X_preprocessed = [preprocess_text(text) for text in X]
X_transformed = vectorizer.transform(X_preprocessed)
X_transformed.shape

(38749, 13695)

In [15]:
dX = xgb.DMatrix(X_transformed, label=y)

In [16]:
# Make predictions on the test set
y_pred_prob = model.predict(dX)
best_threshold = 0.1
y_pred = [1 if prob > best_threshold else 0 for prob in y_pred_prob]

In [17]:
# Evaluate the classifier
accuracy = accuracy_score(y, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Display classification report
print('\nClassification Report:')
print(classification_report(y, y_pred))

Accuracy: 0.99

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99     38362
           1       0.46      0.99      0.63       387

    accuracy                           0.99     38749
   macro avg       0.73      0.99      0.81     38749
weighted avg       0.99      0.99      0.99     38749



In [18]:
# Identified positive cases
df = pd.DataFrame()
df['narrative'] = X
df['air_transport_pred'] = y_pred
df['flag_air_transport'] = y

In [19]:
df['air_transport_pred'].value_counts()

air_transport_pred
0    37905
1      844
Name: count, dtype: int64

In [20]:
# Writing predictions to csv
df[(df['air_transport_pred'] == 1) & (df['flag_air_transport'] == 0)]['narrative'].to_csv('./classifier_predictions/air_transport.csv')