In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV

In [14]:
filename="../data/modeling_first_iteration.csv"
df = pd.read_csv(filename)

In [15]:
df.head(10)

Unnamed: 0,MMSI,LAT,LON,SOG,COG,Heading,VesselName,IMO,CallSign,VesselType,...,Status of System_ TD,Status of System_ TS,vessel_year,vessel_month,vessel_day,vessel_hour,hurricane_year,hurricane_month,hurricane_day,hurricane_hour
0,563135300,32.11997,-79.93341,0.0,270.9,179.0,1348,281,418,70.0,...,False,False,2022,8,31,12,2022,8,31,12
1,636092896,40.4947,-73.66555,0.1,299.4,299.0,218,953,715,79.0,...,False,False,2022,8,31,12,2022,8,31,12
2,373457000,36.62089,-75.54031,11.2,336.6,337.0,849,600,38,70.0,...,False,False,2022,8,31,12,2022,8,31,12
3,211779000,27.72167,-78.49475,22.2,255.3,253.0,1027,184,852,70.0,...,False,False,2022,8,31,12,2022,8,31,12
4,636021760,36.14663,-74.65842,17.5,12.7,12.0,965,359,190,70.0,...,False,False,2022,8,31,12,2022,8,31,12
5,249830000,32.64261,-79.01387,15.3,56.1,60.0,1408,569,234,70.0,...,False,False,2022,8,31,12,2022,8,31,12
6,311000779,27.81277,-79.66919,16.4,3.5,7.0,389,1094,588,70.0,...,False,False,2022,8,31,12,2022,8,31,12
7,636092783,40.51804,-72.22835,10.8,279.0,281.0,1306,654,540,71.0,...,False,False,2022,8,31,12,2022,8,31,12
8,538009198,39.84536,-75.27043,0.0,341.6,77.0,1091,1339,1124,80.0,...,False,False,2022,8,31,12,2022,8,31,12
9,212091000,37.06007,-75.14156,13.5,260.7,260.0,1076,632,138,70.0,...,False,False,2022,8,31,12,2022,8,31,12


In [16]:
df.shape

(70623, 52)

In [17]:
df.columns

Index(['MMSI', 'LAT', 'LON', 'SOG', 'COG', 'Heading', 'VesselName', 'IMO',
       'CallSign', 'VesselType', 'Status', 'Length', 'Width', 'Draft', 'Cargo',
       'Name', 'Num Entries', 'Time', 'Record Identifier', 'Latitude',
       'Longitude', 'Maximum Sustained Wind', 'Minimum Pressure',
       '34-knot Wind Radii NE', '34-knot Wind Radii SE',
       '34-knot Wind Radii SW', '34-knot Wind Radii NW',
       '50-knot Wind Radii NE', '50-knot Wind Radii SE',
       '50-knot Wind Radii SW', '50-knot Wind Radii NW',
       '64-knot Wind Radii NE', '64-knot Wind Radii SE',
       '64-knot Wind Radii SW', '64-knot Wind Radii NW', 'Speed mph',
       'impacted', 'PathChange', 'TransceiverClass_B', 'Status of System_ EX',
       'Status of System_ HU', 'Status of System_ LO', 'Status of System_ TD',
       'Status of System_ TS', 'vessel_year', 'vessel_month', 'vessel_day',
       'vessel_hour', 'hurricane_year', 'hurricane_month', 'hurricane_day',
       'hurricane_hour'],
      dtype='obje

In [18]:
# Split the data into features and target label
y = df['PathChange']
X = df.drop('PathChange', axis=1)

In [19]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [20]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(56498, 51)
(14125, 51)
(56498,)
(14125,)


In [21]:
# Standardize the features (SVMs are sensitive to feature scaling)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [22]:
# Take a random subset of the data (20% for faster training)
subset_size = int(0.2 * len(X_train))
X_train_subset = X_train[:subset_size]
y_train_subset = y_train[:subset_size]

In [23]:
# Train the SVM model
# svm_model = SVC(kernel='linear', C=1.0, random_state=42)
# svm_model.fit(X_train_subset, y_train_subset)

In [24]:
# Define parameter grid
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear']
    # 'kernel': ['linear', 'rbf'],
}

# Grid search with cross-validation
grid_search = GridSearchCV(SVC(), param_grid, refit=True, verbose=2)
grid_search.fit(X_train_subset, y_train_subset)

# Best parameters and model
print(f"Best Parameters: {grid_search.best_params_}")

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] END ...............................C=0.1, kernel=linear; total time=   3.0s
[CV] END ...............................C=0.1, kernel=linear; total time=   2.7s
[CV] END ...............................C=0.1, kernel=linear; total time=   2.4s
[CV] END ...............................C=0.1, kernel=linear; total time=   2.6s
[CV] END ...............................C=0.1, kernel=linear; total time=   2.4s
[CV] END .................................C=1, kernel=linear; total time=   5.2s
[CV] END .................................C=1, kernel=linear; total time=   5.0s
[CV] END .................................C=1, kernel=linear; total time=   5.1s
[CV] END .................................C=1, kernel=linear; total time=   5.3s
[CV] END .................................C=1, kernel=linear; total time=   5.3s
[CV] END ................................C=10, kernel=linear; total time=13.1min
[CV] END ................................C=10, ke

In [25]:
# Make predictions on the test set
y_pred = grid_search.predict(X_test)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Detailed classification report
print(classification_report(y_test, y_pred))

Accuracy: 0.863929203539823
              precision    recall  f1-score   support

           0       0.81      0.97      0.88      7568
           1       0.99      1.00      0.99      4633
           2       0.00      0.00      0.00        84
           3       0.57      0.12      0.19      1840

    accuracy                           0.86     14125
   macro avg       0.59      0.52      0.52     14125
weighted avg       0.83      0.86      0.83     14125



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
