In [None]:
## create trajectories
import pyarrow.csv as pc
import pyarrow as pa

# Location of the cleaned trip table; change this to your actual file path.
csv_file = "./data/cleaned/2.csv"

# Read the CSV file into a PyArrow Table.
try:
    table = pc.read_csv(csv_file)
except Exception as e:
    # Top-level boundary of the script: report the problem and stop with a
    # non-zero status. The previous `exit()` call relied on the interactive
    # helper installed by `site` and passed no status, i.e. it signalled
    # success even though loading failed.
    print(f"Error reading CSV file: {e}")
    raise SystemExit(1) from e

# Convert to a pandas DataFrame; `df` is the table every later cell works on.
df = table.to_pandas()

# Quick sanity check: first rows and the dtype of every column.
print(df.head())
print(df.dtypes)

In [None]:
# Show the column names. By default a notebook only echoes a bare expression
# when it is the last statement of the cell, so `df.columns` on its own line
# here produced no output; print it explicitly instead.
print(df.columns)

# Write the loaded table back out as CSV without the row index.
df.to_csv("./data/swag.csv", index=False)

In [None]:
import pandas as pd
import numpy as np

# Preprocessing and evaluation libraries
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Machine learning models
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

# For SMOTE to address class imbalance
from imblearn.over_sampling import SMOTE

# =============================================================================
# 1. Data Preparation and Preprocessing
# =============================================================================
# Assume that the original dataframe "df" is already loaded in your environment.
# For example, you might have read it from a CSV:
# df = pd.read_csv("your_data.csv")

# ----- Filter the Dataset by Ship Type -----
# Only rows whose "Ship type" is in this list are kept; change as needed.
wanted_ship_types = ['Cargo', 'Tanker', 'Fishing', 'HSC', 'Passenger']
df_filtered = df[df['Ship type'].isin(wanted_ship_types)]

# Work on a copy so the column assignments below do not touch `df`.
df_clean = df_filtered.copy()

# ----- Remove Unnecessary Columns -----
# `drop` raises KeyError if any of these columns is absent from the frame.
columns_to_drop = ["Name", "Callsign", "Destination", "MMSI", "trip_id", "trip_start", "trip_end"]
df_clean = df_clean.drop(columns=columns_to_drop)

# ----- Encode Categorical Variables -----
# Each categorical column (including the target "Ship type") is replaced by
# integer codes; the fitted encoders are kept in `le_dict` so the codes can be
# mapped back to names later.
cat_columns = ["Type of mobile", "Ship type", "Cargo type"]
le_dict = {}
for col in cat_columns:
    le = LabelEncoder()
    # Convert to string in case any labels are numeric already.
    # NOTE: the string cast also turns missing entries into an ordinary text
    # label, so they are encoded as their own category and are no longer seen
    # as missing by the report / dropna below.
    df_clean[col] = le.fit_transform(df_clean[col].astype(str))
    le_dict[col] = le

# ----- Check and Handle Missing Values -----
print("Missing values per column:")
print(df_clean.isna().sum())
# Here we simply drop rows with any missing values.
df_clean = df_clean.dropna()

# ----- Separate Features and Target -----
# We want to predict "Ship type", so we separate it as the target.
y = df_clean["Ship type"]                 # Target variable (numeric encoding of ship type)
X = df_clean.drop(columns=["Ship type"])  # All other columns are features

# ----- (Optional) Print the Mapping of Labels to Original Ship Types -----
le_ship_type = le_dict["Ship type"]
ship_type_mapping = dict(enumerate(le_ship_type.classes_))
print("\nMapping of numeric labels to original ship type names:")
print(ship_type_mapping)

# =============================================================================
# 2. Split Data into Training and Test Sets
# =============================================================================
# We use stratification to ensure that the class distribution is similar in both sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# ----- Feature Scaling -----
# The scaler is fitted on the training rows only and then applied to the test
# rows. Fitting it on the full feature matrix before splitting (as was done
# previously) lets test-set means/variances leak into the training features.
# Explicit copies keep the column assignments from writing into slices of X.
X_train = X_train.copy()
X_test = X_test.copy()
scaler = StandardScaler()
numeric_columns = X_train.select_dtypes(include=['float64', 'int64']).columns
X_train[numeric_columns] = scaler.fit_transform(X_train[numeric_columns])
X_test[numeric_columns] = scaler.transform(X_test[numeric_columns])

print("\nTraining set class distribution:")
print(y_train.value_counts())
print("\nTest set class distribution:")
print(y_test.value_counts())

# =============================================================================
# 3. Define a Helper Function for Model Evaluation
# =============================================================================
def evaluate_classifier(model, X_eval, y_eval, classifier_name):
    """
    Score a fitted classifier on an evaluation set and print a short report.

    The report contains accuracy, macro-averaged precision / recall / F1
    (undefined values are reported as 0 via ``zero_division=0``) and the full
    per-class classification report.

    Parameters:
        model: Fitted estimator exposing ``.predict()``.
        X_eval: Features to predict on.
        y_eval: Ground-truth labels for ``X_eval``.
        classifier_name: Label used in the printed report and in the result.

    Returns:
        dict with keys "Classifier", "Accuracy", "Precision", "Recall", "F1".
    """
    predictions = model.predict(X_eval)

    # Settings shared by the three macro-averaged metrics.
    macro = {"average": "macro", "zero_division": 0}
    accuracy = accuracy_score(y_eval, predictions)
    precision = precision_score(y_eval, predictions, **macro)
    recall = recall_score(y_eval, predictions, **macro)
    f1_macro = f1_score(y_eval, predictions, **macro)

    report_lines = [
        f"\nClassifier: {classifier_name}",
        f"Accuracy:         {accuracy:.4f}",
        f"Precision (macro): {precision:.4f}",
        f"Recall (macro):    {recall:.4f}",
        f"F1 Score (macro):  {f1_macro:.4f}",
        "\nClassification Report:",
        classification_report(y_eval, predictions, zero_division=0),
        "-" * 50,
    ]
    for line in report_lines:
        print(line)

    return {
        "Classifier": classifier_name,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1": f1_macro,
    }

# =============================================================================
# 4. Setup Cross-Validation
# =============================================================================
# Stratified, shuffled 5-fold splitter with a fixed seed; reused by every
# cross-validation and grid search below.
cv = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

# =============================================================================
# 5. Evaluate Default Models using Cross-Validation on the Training Set
# =============================================================================
# Baseline estimators with library-default hyperparameters, keyed by the
# display name used in the printed output.
default_models = dict(
    [
        ("GaussianNB", GaussianNB()),
        ("SVM", SVC(random_state=42)),
        ("Decision Tree", DecisionTreeClassifier(random_state=42)),
        ("Random Forest", RandomForestClassifier(random_state=42)),
    ]
)

print("\n=== CROSS-VALIDATION SCORES (DEFAULT PARAMETERS) on Original Training Data ===")
for label, estimator in default_models.items():
    # Accuracy is the scoring metric here; other scorers could be swapped in.
    fold_scores = cross_val_score(estimator, X_train, y_train, scoring='accuracy', cv=cv)
    mean_acc, std_acc = fold_scores.mean(), fold_scores.std()
    print(f"{label} - CV Accuracy: {mean_acc:.4f} (+/- {std_acc:.4f})")

# =============================================================================
# 6. Train Default Models on the Entire Training Set and Evaluate on the Test Set
# =============================================================================
# One metrics dict per evaluated model is collected here for the final summary.
results = []

print("\n=== TEST SET EVALUATION (DEFAULT MODELS on Original Data) ===")
for label, estimator in default_models.items():
    # Fit on the full training split, then score on the held-out test split.
    estimator.fit(X_train, y_train)
    results.append(evaluate_classifier(estimator, X_test, y_test, f"{label} (Default)"))

# =============================================================================
# 7. Hyperparameter Tuning via GridSearchCV (Using Cross-Validation)
# =============================================================================
# We tune only SVM, Decision Tree, and Random Forest.
# Grid search tries every combination in the grid and keeps the one with the
# best mean cross-validated accuracy.

# --- Define parameter grids for each model ---
param_grid_svm = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

param_grid_dt = {
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10]
}

param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10]
}

# --- Setup GridSearchCV for each tuned model ---
# All three searches share the splitter `cv`, score by accuracy and run the
# candidate fits in parallel on all cores.
grid_svm = GridSearchCV(SVC(random_state=42), param_grid_svm, cv=cv, scoring='accuracy', n_jobs=-1)
grid_dt  = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid_dt, cv=cv, scoring='accuracy', n_jobs=-1)
grid_rf  = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_rf, cv=cv, scoring='accuracy', n_jobs=-1)

print("\n=== HYPERPARAMETER TUNING (Using GridSearchCV on Original Training Data) ===")

# Run the searches one after another and report each winner.
for label, search in (
    ("SVM", grid_svm),
    ("Decision Tree", grid_dt),
    ("Random Forest", grid_rf),
):
    search.fit(X_train, y_train)
    print(f"Best parameters for {label}:", search.best_params_)
    print(f"Best cross-validation accuracy for {label}: {search.best_score_:.4f}")

# Store the best estimators from grid search in a dictionary.
tuned_models = {
    "SVM Tuned": grid_svm.best_estimator_,
    "Decision Tree Tuned": grid_dt.best_estimator_,
    "Random Forest Tuned": grid_rf.best_estimator_
}

# =============================================================================
# 8. Evaluate Tuned Models on the Test Set
# =============================================================================
print("\n=== TEST SET EVALUATION (TUNED MODELS on Original Data) ===")
for name, model in tuned_models.items():
    # With the default refit=True, GridSearchCV has already refitted
    # best_estimator_ on the whole of (X_train, y_train) using the winning
    # parameters, so fitting it again here would only repeat that training.
    result = evaluate_classifier(model, X_test, y_test, name)
    results.append(result)

# =============================================================================
# 9. Now, Repeat the Experiments on SMOTE-Augmented Training Data
# =============================================================================
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.base import clone

# SMOTE generates synthetic samples of the minority classes so that classes are balanced.
smote = SMOTE(random_state=42, k_neighbors=1)  # k_neighbors can be tuned if needed

# Oversampled copy of the full training split; used for the final fits that
# are scored on the untouched test split.
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
print("\nAfter applying SMOTE, training set class distribution:")
print(pd.Series(y_train_smote).value_counts())


def _with_smote(estimator):
    """Return an unfitted pipeline: SMOTE oversampling, then a clone of *estimator*."""
    return ImbPipeline([("smote", clone(smote)), ("clf", clone(estimator))])


def _for_clf_step(param_grid):
    """Prefix every grid key with ``clf__`` so it targets the pipeline's classifier step."""
    return {f"clf__{key}": values for key, values in param_grid.items()}


def _strip_step(params):
    """Drop the ``<step>__`` prefix from parameter names for display."""
    return {key.split("__", 1)[1]: value for key, value in params.items()}


# Cross-validating on (X_train_smote, y_train_smote) would put synthetic points
# interpolated from validation-fold rows into the training folds and inflate
# the scores. Instead SMOTE runs inside a pipeline, so within each split it is
# fitted on the training folds only and the validation fold stays untouched.
# NOTE(review): inside each split SMOTE only sees the training folds, so the
# rarest class needs more than k_neighbors samples there -- confirm for this data.

# ---- 9a. Evaluate Default Models with SMOTE ----
print("\n=== CROSS-VALIDATION SCORES (DEFAULT PARAMETERS) on SMOTE Training Data ===")
for name, model in default_models.items():
    scores = cross_val_score(_with_smote(model), X_train, y_train, cv=cv, scoring='accuracy')
    print(f"{name} - CV Accuracy (SMOTE): {scores.mean():.4f} (+/- {scores.std():.4f})")

print("\n=== TEST SET EVALUATION (DEFAULT MODELS on SMOTE Data) ===")
for name, model in default_models.items():
    # Final fit on the oversampled training split; the test split is never resampled.
    model.fit(X_train_smote, y_train_smote)
    result = evaluate_classifier(model, X_test, y_test, name + " (Default + SMOTE)")
    results.append(result)

# ---- 9b. Hyperparameter Tuning on SMOTE Data ----
print("\n=== HYPERPARAMETER TUNING (Using GridSearchCV on SMOTE Training Data) ===")

# Same grids and splitter as the un-augmented searches, but each candidate is
# a SMOTE + classifier pipeline fitted on the original training split.
grid_svm_smote = GridSearchCV(
    _with_smote(SVC(random_state=42)), _for_clf_step(param_grid_svm),
    cv=cv, scoring='accuracy', n_jobs=-1,
)
grid_dt_smote = GridSearchCV(
    _with_smote(DecisionTreeClassifier(random_state=42)), _for_clf_step(param_grid_dt),
    cv=cv, scoring='accuracy', n_jobs=-1,
)
grid_rf_smote = GridSearchCV(
    _with_smote(RandomForestClassifier(random_state=42)), _for_clf_step(param_grid_rf),
    cv=cv, scoring='accuracy', n_jobs=-1,
)

for label, search in (
    ("SVM", grid_svm_smote),
    ("Decision Tree", grid_dt_smote),
    ("Random Forest", grid_rf_smote),
):
    search.fit(X_train, y_train)
    print(f"Best parameters for {label} with SMOTE:", _strip_step(search.best_params_))
    print(f"Best CV accuracy for {label} with SMOTE: {search.best_score_:.4f}")

# Save the best estimators for SMOTE-tuned models. After the search's refit,
# the "clf" step of each best pipeline is the classifier trained on the
# SMOTE-oversampled training split, so the bare classifier is stored.
tuned_models_smote = {
    "SVM Tuned + SMOTE": grid_svm_smote.best_estimator_.named_steps["clf"],
    "Decision Tree Tuned + SMOTE": grid_dt_smote.best_estimator_.named_steps["clf"],
    "Random Forest Tuned + SMOTE": grid_rf_smote.best_estimator_.named_steps["clf"]
}

print("\n=== TEST SET EVALUATION (TUNED MODELS on SMOTE Data) ===")
for name, model in tuned_models_smote.items():
    # Already fitted by the grid search's refit; no second training pass needed.
    result = evaluate_classifier(model, X_test, y_test, name)
    results.append(result)

# =============================================================================
# 10. Summarize All Results
# =============================================================================
# One row per evaluated model, in the order the experiments were run.
results_df = pd.DataFrame(results)
print("\nSummary of classifier performance across all experiments:")
print(results_df)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Inspect the tuned Random Forest that was trained with SMOTE.
model_rf = tuned_models_smote["Random Forest Tuned + SMOTE"]

# (Re)fit on the SMOTE-augmented training data so the importances below come
# from a model trained on exactly that data.
model_rf.fit(X_train_smote, y_train_smote)

# Per-feature importances, paired with the training frame's column names
# (X_train is assumed to be a DataFrame with proper column names).
importances = model_rf.feature_importances_
feature_names = X_train.columns

# Tabulate and order from most to least important.
importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)

print(importance_df)

# Horizontal bar chart of the ranked importances.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=importance_df, x='Importance', y='Feature', ax=ax)
ax.set_title('Feature Importance from Random Forest')
plt.show()


In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix for the tuned Random Forest on the held-out test split.
y_pred = model_rf.predict(X_test)

# Pin the row/column order to every encoded class. Without `labels`,
# confusion_matrix only includes labels that occur in y_test or y_pred, so a
# class missing from both would shrink the matrix and shift the tick labels
# (which always list all classes) onto the wrong rows/columns.
class_ids = list(ship_type_mapping.keys())
class_names = list(ship_type_mapping.values())
cm = confusion_matrix(y_test, y_pred, labels=class_ids)

plt.figure(figsize=(8,6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
            xticklabels=class_names,
            yticklabels=class_names)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Confusion Matrix: Random Forest (Tuned + SMOTE)')
plt.show()


In [None]:
# %% [code]
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# -------------------------------
# 1. First Look at the Data
# -------------------------------
print("First 5 rows of the DataFrame:")
display(df.head())
print("\nDataFrame info:")
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in display() only added a stray "None" to the output.
df.info()

# -------------------------------
# 2. Descriptive Statistics
# -------------------------------
# For numerical features, show summary statistics
print("\nSummary statistics for numerical features:")
display(df.describe())

# Summary (count / unique / top / freq) for the object-typed columns:
print("\nSummary statistics for categorical features:")
display(df.describe(include=['object']))

# -------------------------------
# 3. Visualizing the Distribution of Key Features
# -------------------------------
# Numerical columns to analyze; add or remove names as needed.
features = [
    'trip_duration_sec', 'num_positions', 'trajectory_length_km',
    'endpoint_distance_km', 'directness_ratio', 'Shape_Complexity',
    'Bridge_Position_Ratio', 'total_km2'
]

# All three views below work on the same column subset.
selected = df[features]

# One histogram per selected feature, laid out on a shared figure.
selected.hist(figsize=(15, 10), bins=30)
plt.suptitle("Histograms of Key Numerical Features", fontsize=16)
# Leave the top 4% of the figure free for the suptitle.
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()

# -------------------------------
# 4. Correlation Analysis
# -------------------------------
# Pairwise correlations between the selected features, shown as a heatmap.
corr_matrix = selected.corr()

plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, fmt=".2f", cmap='coolwarm', annot=True)
plt.title("Correlation Matrix of Selected Features")
plt.show()

# -------------------------------
# 5. Pairplot for Exploring Relationships
# -------------------------------
# Scatter-matrix of every feature pair.
sns.pairplot(selected)
plt.suptitle("Pairplot of Selected Features", y=1.02, fontsize=16)
plt.show()

# -------------------------------
# 6. Time Series Analysis: Trips Over Time
# -------------------------------
# Create a new column for the trip start date (without time) and count trips per day.
# pd.to_datetime makes this work whether 'trip_start' was parsed as timestamps
# or left as strings by the CSV reader; the `.dt` accessor is only available on
# datetime-like columns.
df['trip_date'] = pd.to_datetime(df['trip_start']).dt.date
trips_per_day = df.groupby('trip_date').size()

plt.figure(figsize=(12, 6))
trips_per_day.plot(kind='bar')
plt.xlabel("Trip Start Date")
plt.ylabel("Number of Trips")
plt.title("Number of Trips Per Day")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# -------------------------------
# 7. Grouped Analysis by Ship Type (or Type of Mobile)
# -------------------------------
# Each entry: (category column, value column, title, x label, y label).
# First: trip duration per ship type; second: trajectory length per mobile type.
boxplot_specs = [
    ('Ship type', 'trip_duration_sec',
     "Trip Duration by Ship Type", "Ship Type", "Trip Duration (sec)"),
    ('Type of mobile', 'trajectory_length_km',
     "Trajectory Length by Type of Mobile", "Type of Mobile", "Trajectory Length (km)"),
]
for category_col, value_col, title, x_label, y_label in boxplot_specs:
    plt.figure(figsize=(12, 6))
    sns.boxplot(data=df, x=category_col, y=value_col)
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

# -------------------------------
# 8. Additional Analysis: Scatter Plots
# -------------------------------
# Each entry: (x column, y column, hue column, palette, title, x label, y label).
# First: trajectory length against trip duration, coloured by ship type;
# second: total_km2 against number of positions, coloured by mobile type.
scatter_specs = [
    ('trip_duration_sec', 'trajectory_length_km', 'Ship type', 'viridis',
     "Trajectory Length vs. Trip Duration", "Trip Duration (sec)", "Trajectory Length (km)"),
    ('num_positions', 'total_km2', 'Type of mobile', 'coolwarm',
     "Total km2 vs. Number of Positions", "Number of Positions", "Total km2"),
]
for x_col, y_col, hue_col, palette_name, title, x_label, y_label in scatter_specs:
    plt.figure(figsize=(10, 6))
    sns.scatterplot(data=df, x=x_col, y=y_col, hue=hue_col, palette=palette_name)
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.tight_layout()
    plt.show()

# -------------------------------
# 9. Conclusion / Next Steps
# -------------------------------
print("Analysis complete. Review the above plots and tables to explore relationships and distributions in your data.")
