In [1]:
%pip install nltk

Defaulting to user installation because normal site-packages is not writeable
Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m25.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: nltk
Successfully installed nltk-3.8.1
Note: you may need to restart the kernel to use updated packages.


In [1]:
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

def preprocess_text(text):
    # Text Cleaning
    text = re.sub(r'[^\w\s]', '', text)
    
    # Tokenization
    tokens = word_tokenize(text)
    
    # Stop Word Removal
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token.lower() not in stop_words]
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    # Join tokens back to text
    preprocessed_text = ' '.join(tokens)
    
    return preprocessed_text


In [2]:
import pandas as pd
data=pd.read_csv('incidents.csv')

In [3]:
data['description']=data.description.astype('str')

In [None]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

In [4]:
data['preproc']=data['description'].apply(preprocess_text)

In [18]:
data['preproc']

0       Late August 31 2023 crude oil spill occurred P...
1       drug runner sailboat modified operate semisubm...
2       28AUG2023 USCG Sector Puget Sound notified NOA...
3       USCG District 7 established Crisis Action Team...
4       Late Sunday night 27AUG2023 USCG Sector San Fr...
                              ...                        
4430    1500 June 13 1968 World Glory bound Huelva Spa...
4431    March 7 1968 Greek tank vessel General Colocot...
4432    morning March 3 1968 tanker Ocean Eagle ground...
4433    morning March 18 1967 TV Torrey Canyon ran agr...
4434    Tampico Maru henceforth Tampico left Los Angel...
Name: preproc, Length: 4435, dtype: object

In [5]:
data['threat'].fillna('Unknown', inplace=True)

In [6]:
data['threat'].value_counts()

threat
Oil         2529
Unknown     1340
Chemical     292
Other        274
Name: count, dtype: int64

In [7]:
filtered_data = data[(data['threat'] == 'Oil') | (data['threat'] == 'Chemical')]

In [8]:
filtered_data

Unnamed: 0,id,open_date,name,location,lat,lon,threat,tags,commodity,measure_skim,measure_shore,measure_bio,measure_disperse,measure_burn,max_ptl_release_gallons,posts,description,preproc
0,10659,01-09-2023,"Crude Oil Spill at Port Manatee in Tampa Bay, FL","St. Petersburg, FL, USA",27.634018,-82.565190,Oil,,Crude,,,,,,3500.0,0,"Late on August 31, 2023, a crude oil spill occ...",Late August 31 2023 crude oil spill occurred P...
1,10660,01-09-2023,Grounded Drug-running Submarine,"Mona Island, PR",18.073605,-67.931855,Oil,,diesel,,,,,,,0,A drug runner sailboat modified to operate as ...,drug runner sailboat modified operate semisubm...
2,10657,28-08-2023,"26-foot Cabin Cruiser Sunk, Bellingham Bay, Be...","Bellingham, WA",48.723710,-122.509100,Oil,,gasoline,,,,,,35.0,0,"On 28-AUG-2023, USCG Sector Puget Sound notifi...",28AUG2023 USCG Sector Puget Sound notified NOA...
4,10658,27-08-2023,"53-foot Motor Yacht Adrift Off Big Sur, CA","off Big Sur, CA",36.028000,-121.867333,Oil,,Diesel,,,,,,800.0,0,"Late Sunday night (27-AUG-2023), USCG Sector S...",Late Sunday night 27AUG2023 USCG Sector San Fr...
5,10655,24-08-2023,"38-foot Fishing Vessel Listing, Funter Bay, An...","Funter Bay, Angoon, AK, USA",58.231056,-134.919000,Oil,,Diesel,,,,,,250.0,0,"On August 25, 2023, the USCG Sector Juneau con...",August 25 2023 USCG Sector Juneau contacted NO...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4425,6209,10-02-1970,Chevron Main Pass Block 41; 11 miles E of the ...,"11 miles E of the Mississippi River delta, Lou...",29.383300,-88.983300,Oil,,crude oil,,,,1.0,,,9,The Chevron Main Pass Block 41C platform caugh...,Chevron Main Pass Block 41C platform caught fi...
4426,6208,04-02-1970,"Arrow; Nova Scotia, Canada","Nova Scotia, Canada",45.466700,-61.100000,Oil,Grounding,Bunker C Oil,,,,1.0,1.0,3470000.0,7,"On February 4, 1970, at 0935, the steam tanker...",February 4 1970 0935 steam tanker Arrow ran ha...
4427,6207,30-04-1969,"Hamilton Trader; Liverpool Bay, England","Liverpool Bay, England",53.500000,3.333330,Oil,Collision,No. 6 Fuel Oil,,,,1.0,,168000.0,6,"Early on the morning of April 30, 1969, the Ha...",Early morning April 30 1969 Hannes Knuppel col...
4428,6206,28-01-1969,"Santa Barbara Well blowout; Santa Barbara, Cal...","Santa Barbara, California",34.166700,-119.750000,Oil,Wellhead,California crude oil,,,,1.0,,4200000.0,8,"On January 28, 1969, the Union Oil Company wel...",January 28 1969 Union Oil Company well number ...


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample preprocessed text data
preprocessed_text = filtered_data.preproc

# Initialize the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the preprocessed text data
tfidf_matrix = tfidf_vectorizer.fit_transform(preprocessed_text)

# Get the feature names (words) used for the columns in the TF-IDF matrix
feature_names = tfidf_vectorizer.get_feature_names_out()

# Convert the TF-IDF matrix to a dense array for better readability (optional)
tfidf_matrix_dense = tfidf_matrix.toarray()

# Print the feature names and TF-IDF matrix
print("Feature Names (Words):")
print(feature_names)
print("\nTF-IDF Matrix:")
print(tfidf_matrix_dense)


Feature Names (Words):
['00' '000' '0000' ... 'zug' 'zulu' 'zuma']

TF-IDF Matrix:
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Split the data into training and testing sets
X = tfidf_matrix_dense  # TF-IDF matrix
y = filtered_data['threat']  # Target labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the classifier
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred = classifier.predict(X_test)

# Calculate and print the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Print a classification report with precision, recall, and F1-score
report = classification_report(y_test, y_pred)
print(report)


Accuracy: 0.90
              precision    recall  f1-score   support

    Chemical       0.00      0.00      0.00        58
         Oil       0.90      1.00      0.95       507

    accuracy                           0.90       565
   macro avg       0.45      0.50      0.47       565
weighted avg       0.81      0.90      0.85       565



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
%pip install -U imbalanced-learn


Note: you may need to restart the kernel to use updated packages.


In [12]:
from imblearn.over_sampling import SMOTE
from collections import Counter

# Check the class distribution before oversampling
print("Class distribution before oversampling:", Counter(y))

# Create an instance of SMOTE
smote = SMOTE(random_state=42)

# Apply SMOTE to generate synthetic samples
X_resampled, y_resampled = smote.fit_resample(X, y)

# Check the class distribution after oversampling
print("Class distribution after oversampling:", Counter(y_resampled))


Class distribution before oversampling: Counter({'Oil': 2529, 'Chemical': 292})
Class distribution after oversampling: Counter({'Oil': 2529, 'Chemical': 2529})


In [13]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Initialize and train the classifier
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred = classifier.predict(X_test)

# Calculate and print the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Print a classification report with precision, recall, and F1-score
report = classification_report(y_test, y_pred)
print(report)


Accuracy: 0.98
              precision    recall  f1-score   support

    Chemical       0.97      1.00      0.98       493
         Oil       1.00      0.97      0.98       519

    accuracy                           0.98      1012
   macro avg       0.98      0.98      0.98      1012
weighted avg       0.98      0.98      0.98      1012

