In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, recall_score
from sklearn.model_selection import GridSearchCV

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

import xgboost as xgb

nltk.download("punkt")
nltk.download("stopwords")

# Air Transport Classifier

In [None]:
# Read the flagged air-transport citations and discard the 'Unnamed: 0' column
data = (
    pd.read_csv('../flagged_citations/air_transport.csv')
    .drop(columns=['Unnamed: 0'])
)
data.head()

In [None]:
# Target: the air-transport flag as integers.
y = data['flag_air_transport'].astype(int)

# Features: the narrative text, cast to str and lower-cased.
X = data['narrative'].astype(str).map(str.lower)

# Class balance of the target
y.value_counts()

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Built once and shared by every call: constructing the stop-word set and the
# stemmer per document is loop-invariant work when cleaning a whole corpus.
_STOP_WORDS = frozenset(stopwords.words("english"))
_STEMMER = PorterStemmer()


def preprocess_text(text):
    """Normalise one narrative into a space-separated string of stemmed tokens.

    Steps: tokenise with ``word_tokenize``, keep purely alphabetic tokens,
    lower-case them, drop English stop words, and Porter-stem what remains.

    Parameters
    ----------
    text : str
        Raw narrative text.

    Returns
    -------
    str
        The surviving stems joined by single spaces ("" if nothing survives).
    """
    # Lower-case each alphabetic token exactly once before the stop-word check.
    lowered_tokens = (
        token.lower() for token in word_tokenize(text) if token.isalpha()
    )
    return " ".join(
        _STEMMER.stem(token)
        for token in lowered_tokens
        if token not in _STOP_WORDS
    )

In [None]:
# Clean the training narratives, then fit the TF-IDF vocabulary on them only
vectorizer = TfidfVectorizer()
X_train_preprocessed = list(map(preprocess_text, X_train))
X_train_transformed = vectorizer.fit_transform(X_train_preprocessed)
X_train_transformed.shape

In [None]:
# Apply the same cleaning to the test narratives and project them onto the
# vocabulary already fitted on the training set (transform only, no refit)
X_test_preprocessed = list(map(preprocess_text, X_test))
X_test_transformed = vectorizer.transform(X_test_preprocessed)
X_test_transformed.shape

## Baseline Model

In [None]:
# Wrap the sparse TF-IDF matrices in the DMatrix container XGBoost trains on
dtest = xgb.DMatrix(X_test_transformed, label=y_test)
dtrain = xgb.DMatrix(X_train_transformed, label=y_train)

# Class counts in the training labels and the negative-to-positive ratio.
# NOTE(review): scale_pos_weight is computed here but is not passed to the
# baseline model below.
sum_positive = y_train.eq(1).sum()
sum_negative = y_train.eq(0).sum()
scale_pos_weight = sum_negative / sum_positive

# Baseline: no parameters are supplied, so the library defaults apply
params = dict()

# Fit the booster for 100 boosting rounds
model = xgb.train(params, dtrain, num_boost_round=100)

In [None]:
# Score the test set. Booster.predict returns continuous scores, not labels.
y_pred_score = model.predict(dtest)

# Binarise at 0.5. A bare astype(int) truncates toward zero, which would turn
# every score below 1.0 into class 0 regardless of how confident the model is.
y_pred = (y_pred_score > 0.5).astype(int)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Display classification report
print('\nClassification Report:')
print(classification_report(y_test, y_pred))

## Model for Imbalanced Data

In [None]:
# Recreate the 80/20 split (same seed as before) for the imbalance-aware model
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a fresh TF-IDF vocabulary on the cleaned training narratives
vectorizer = TfidfVectorizer()
X_train_preprocessed = list(map(preprocess_text, X_train))
X_train_transformed = vectorizer.fit_transform(X_train_preprocessed)

# Test narratives are cleaned the same way and only transformed
X_test_preprocessed = list(map(preprocess_text, X_test))
X_test_transformed = vectorizer.transform(X_test_preprocessed)

In [None]:
# Convert data to DMatrix format used by XGBoost
dtrain = xgb.DMatrix(X_train_transformed, label=y_train)
dtest = xgb.DMatrix(X_test_transformed, label=y_test)

# Negative-to-positive ratio of the training labels, used to up-weight the
# positive class. Vectorised .sum() avoids iterating the labels one by one.
negative_count = (y_train == 0).sum()
positive_count = (y_train == 1).sum()

# XGBoost parameters.
# The number of trees is controlled by num_boost_round in the xgb.train call;
# 'n_estimators' is a scikit-learn-wrapper argument that the native training
# API does not read, so it is not listed here.
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',  # Use 'logloss' for binary classification
    'max_depth': 3,
    'learning_rate': 0.1,
    'scale_pos_weight': negative_count / positive_count,  # Adjust for class imbalance
}


In [None]:
# Fit the imbalance-aware booster for 100 rounds
model = xgb.train(
    params,
    dtrain,
    num_boost_round=100,
)

# Predicted scores for the held-out rows (thresholded in the next step)
y_pred_prob = model.predict(dtest)

In [None]:
# Choose the candidate threshold with the highest recall on the test labels.
# NOTE(review): recall cannot increase as the threshold rises, so this search
# settles on the smallest candidate whenever that candidate has any recall;
# consider a precision-aware criterion if a real trade-off is wanted.
# NOTE(review): the threshold is tuned on the same test set that is reported
# below, so the report is optimistic — a separate validation split would fix it.
thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

# Seed the search so the first candidate is always accepted. This guarantees
# the chosen threshold comes from the grid even when every candidate scores
# zero recall (a 0 fallback would flag every row as positive).
best_recall = -1.0
best_threshold = thresholds[0]

for threshold in thresholds:
    y_pred = (y_pred_prob > threshold).astype(int)
    current_recall = recall_score(y_test, y_pred)

    # Strict '>' keeps the earliest (lowest) threshold on ties.
    if current_recall > best_recall:
        best_recall = current_recall
        best_threshold = threshold

# Use the best threshold
y_pred = (y_pred_prob > best_threshold).astype(int)

# Evaluate the classifier
print(f'Best Threshold for Max Recall: {best_threshold}')
print('\nClassification Report:')
print(classification_report(y_test, y_pred))

## Using Classifier to Find "Missed" Cases of Air Transport

In [None]:
# Clean every narrative in the dataset and project it onto the fitted vocabulary
X_preprocessed = list(map(preprocess_text, X))
X_transformed = vectorizer.transform(X_preprocessed)
X_transformed.shape

In [None]:
# DMatrix over the full dataset, carrying the existing flags as labels
dX = xgb.DMatrix(data=X_transformed, label=y)

In [None]:
# Score every narrative in the dataset (training and test rows alike)
y_pred_prob = model.predict(dX)

# Apply the threshold selected during tuning rather than re-typing it as a
# literal, so the value used here cannot drift from the one that was chosen.
y_pred = (y_pred_prob > best_threshold).astype(int)

In [None]:
# Compare the full-dataset predictions with the existing flags
accuracy = accuracy_score(y_true=y, y_pred=y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Per-class precision, recall and F1
print('\nClassification Report:')
print(classification_report(y_true=y, y_pred=y_pred))

In [None]:
# Collect narrative, model prediction and existing flag side by side
df = pd.DataFrame(
    {
        'narrative': X,
        'air_transport_pred': y_pred,
        'flag_air_transport': y,
    }
)

In [None]:
# How many rows the model assigns to each class
df.loc[:, 'air_transport_pred'].value_counts()

In [None]:
# Write the "missed" cases to CSV: rows the model predicts as air transport
# (pred == 1) that the existing flag marks as negative (flag == 0).
output_path = Path('./classifier_predictions/air_transport.csv')

# to_csv does not create missing folders, so make sure the target exists.
output_path.parent.mkdir(parents=True, exist_ok=True)

missed_mask = (df['air_transport_pred'] == 1) & (df['flag_air_transport'] == 0)
df.loc[missed_mask, 'narrative'].to_csv(output_path)