# Support Vector Machines

In [42]:
import pandas as pd
import numpy as np
import time
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.svm import SVC, LinearSVC
from scipy import ndimage, fft
from sklearn.preprocessing import normalize, StandardScaler

In [None]:
class LightFluxProcessor:
    """Process light flux data with various signal processing techniques."""

    def __init__(self, fourier=True, normalize=True, gaussian=True, standardize=True):
        """Initialize processor with desired transformations.
        
        Args:
            fourier: Apply Fourier transform if True
            normalize: Apply normalization if True
            gaussian: Apply Gaussian filtering if True
            standardize: Apply standardization if True
        """
        self.fourier = fourier
        self.normalize = normalize
        self.gaussian = gaussian
        self.standardize = standardize

    def fourier_transform(self, data):
        """Apply Fast Fourier Transform to 1D time series data.
        
        Args:
            data: Time series data
            
        Returns:
            Magnitude of FFT result
        """
        return np.abs(fft.fft(data, n=data.size))

    def process(self, train_features, dev_features):
        """Process training and development datasets with selected transformations.
        
        Args:
            train_features: Training features DataFrame
            dev_features: Development features DataFrame
            
        Returns:
            Tuple of processed (train_features, dev_features)
        """
        # Apply Fourier transform
        if self.fourier:
            print("Applying Fourier...")
            train_shape = train_features.shape
            dev_shape = dev_features.shape
            
            # Apply FFT to each row
            train_features = train_features.apply(self.fourier_transform, axis=1)
            dev_features = dev_features.apply(self.fourier_transform, axis=1)

            # Convert Series of arrays back to DataFrame
            train_processed = np.zeros(train_shape)
            dev_processed = np.zeros(dev_shape)

            for idx, row in enumerate(train_features):
                train_processed[idx] = row

            for idx, row in enumerate(dev_features):
                dev_processed[idx] = row

            train_features = pd.DataFrame(train_processed)
            dev_features = pd.DataFrame(dev_processed)

            # Use only first half of FFT output (symmetric data)
            train_features = train_features.iloc[:, :(train_features.shape[1] // 2)].values
            dev_features = dev_features.iloc[:, :(dev_features.shape[1] // 2)].values

        # Normalize data
        if self.normalize:
            print("Normalizing...")
            train_features = pd.DataFrame(normalize(train_features))
            dev_features = pd.DataFrame(normalize(dev_features))

        # Apply Gaussian smoothing
        if self.gaussian:
            print("Applying Gaussian Filter...")
            train_features = ndimage.gaussian_filter(train_features, sigma=10)
            dev_features = ndimage.gaussian_filter(dev_features, sigma=10)

        # Standardize features
        if self.standardize:
            print("Standardizing...")
            scaler = StandardScaler()
            train_features = scaler.fit_transform(train_features)
            dev_features = scaler.transform(dev_features)

        print("Finished Processing!")
        return train_features, dev_features

def np_X_Y_from_df(dataframe):
    """Convert DataFrame to numpy arrays for features and binary labels.
    
    Args:
        dataframe: DataFrame containing features and LABEL column
        
    Returns:
        Tuple of (features_array, labels_array) with labels converted to binary
    """
    # Shuffle data for better training
    dataframe = shuffle(dataframe)
    
    # Extract features and labels
    features = dataframe.drop(["LABEL"], axis=1)
    features_array = np.array(features)
    
    # Convert labels to binary (True where LABEL==2)
    raw_labels = np.array(dataframe["LABEL"]).reshape((len(dataframe["LABEL"]), 1))
    binary_labels = raw_labels == 2
    
    return features_array, binary_labels

def Load_and_Process(train_dataset_path, dev_dataset_path):
    # Load datasets
    print("Loading datasets...")
    df_train = pd.read_csv(train_dataset_path, encoding="ISO-8859-1")
    df_dev = pd.read_csv(dev_dataset_path, encoding="ISO-8859-1")

    # Split features and labels
    train_features = df_train.drop("LABEL", axis=1)
    dev_features = df_dev.drop("LABEL", axis=1)
    train_labels = df_train["LABEL"]
    dev_labels = df_dev["LABEL"]

    # Process light flux data with multiple transformations
    processor = LightFluxProcessor(fourier=True, normalize=True, 
                                gaussian=True, standardize=True)
    processed_train_features, processed_dev_features = processor.process(train_features, dev_features)

    # Rejoin processed features with labels
    processed_train_df = pd.DataFrame(processed_train_features).join(pd.DataFrame(train_labels))
    processed_dev_df = pd.DataFrame(processed_dev_features).join(pd.DataFrame(dev_labels))

    # Convert to numpy arrays for model training
    X_train, Y_train = np_X_Y_from_df(processed_train_df)
    Y_train = Y_train.ravel()
    X_dev, Y_dev = np_X_Y_from_df(processed_dev_df)
    Y_dev = Y_dev.ravel()

    # Get dataset dimensions
    num_training_examples, feature_dimension = X_train.shape
    output_dimension = Y_train.shape

    # Display dataset information
    print(f"Training examples: {num_training_examples}")
    print(f"Feature dimension: {feature_dimension}")
    print(f"Output dimension: {output_dimension}")
    
    return X_train, Y_train, X_dev, Y_dev

def Build_and_Train(X_train, Y_train, version = "Linear", degree=4, C = 1.0):
    if version == "Linear":
        model = LinearSVC(C=C)
    elif version == "Poly":
        model = SVC(kernel='poly', degree=degree, C=C)
    elif version == "RBF":
        model = SVC(kernel='rbf', C=C)
    else:
        print("Invalid version")
        
    print(f"Fitting {version} Model")
    start = time.time()
    model.fit(X_train, Y_train)
    finish = time.time()
    print(f"Training time: {finish - start:.2f} seconds")
    
    return model

def EvaluateModel(model, X_train, X_dev, Y_train, Y_dev):
    # Predict and convert to binary classifications
    train_outputs = model.predict(X_train)
    dev_outputs = model.predict(X_dev)
    train_outputs = np.rint(train_outputs)
    dev_outputs = np.rint(dev_outputs)

    # Calculate evaluation metrics
    accuracy_train = accuracy_score(Y_train, train_outputs)
    accuracy_dev = accuracy_score(Y_dev, dev_outputs)
    precision_train = precision_score(Y_train, train_outputs)
    precision_dev = precision_score(Y_dev, dev_outputs)
    recall_train = recall_score(Y_train, train_outputs)
    recall_dev = recall_score(Y_dev, dev_outputs)

    # Calculate F1 scores
    try:
        f1_train = 2 * (precision_train * recall_train) / (precision_train + recall_train)
        f1_dev = 2 * (precision_dev * recall_dev) / (precision_dev + recall_dev)
    except ZeroDivisionError:
        f1_train = 0
        f1_dev = 0

    # Confusion matrices
    confusion_matrix_train = confusion_matrix(Y_train, train_outputs)
    confusion_matrix_dev = confusion_matrix(Y_dev, dev_outputs)

    # Print results summary
    print("Model Performance Metrics:")
    print("-" * 50)
    print(f"Training Accuracy: {accuracy_train:.4f}")
    print(f"Development Accuracy: {accuracy_dev:.4f}")
    print(f"Training Precision: {precision_train:.4f}")
    print(f"Development Precision: {precision_dev:.4f}")
    print(f"Training Recall: {recall_train:.4f}")
    print(f"Development Recall: {recall_dev:.4f}")
    print(f"Training F1 Score: {f1_train:.4f}")
    print(f"Development F1 Score: {f1_dev:.4f}")
    print("-" * 50)
    print("Confusion Matrix (Training):")
    print(confusion_matrix_train)
    print("\nConfusion Matrix (Development):")
    print(confusion_matrix_dev)


## Training the SVM with the "Not Injected" Data

In [12]:
train_dataset_path = "./data/kepler/data_no_injection/exoTrain.csv"
dev_dataset_path = "./data/kepler/data_no_injection/exoTest.csv"

X_train, Y_train, X_dev, Y_dev = Load_and_Process(train_dataset_path, dev_dataset_path)

Loading datasets...
Applying Fourier...
Normalizing...
Applying Gaussian Filter...
Standardizing...
Finished Processing!
Training examples: 5087
Feature dimension: 1598
Output dimension: (5087,)


In [23]:
model = Build_and_Train(X_train, Y_train, version="Linear", C=1.0)
EvaluateModel(model, X_train, X_dev, Y_train, Y_dev)

Fitting Linear Model
Training time: 3.70 seconds
Model Performance Metrics:
--------------------------------------------------
Training Accuracy: 1.0000
Development Accuracy: 0.9877
Training Precision: 1.0000
Development Precision: 0.4167
Training Recall: 1.0000
Development Recall: 1.0000
Training F1 Score: 1.0000
Development F1 Score: 0.5882
--------------------------------------------------
Confusion Matrix (Training):
[[5050    0]
 [   0   37]]

Confusion Matrix (Development):
[[558   7]
 [  0   5]]


In [32]:
model = Build_and_Train(X_train, Y_train, version="Poly", C=1.0)
EvaluateModel(model, X_train, X_dev, Y_train, Y_dev)

Fitting Poly Model
Training time: 0.07 seconds
Model Performance Metrics:
--------------------------------------------------
Training Accuracy: 1.0000
Development Accuracy: 0.9807
Training Precision: 1.0000
Development Precision: 0.0000
Training Recall: 1.0000
Development Recall: 0.0000
Training F1 Score: 0.0000
Development F1 Score: 0.0000
--------------------------------------------------
Confusion Matrix (Training):
[[5050    0]
 [   0   37]]

Confusion Matrix (Development):
[[559   6]
 [  5   0]]


In [37]:
model = Build_and_Train(X_train, Y_train, version="RBF", C=1.0)
EvaluateModel(model, X_train, X_dev, Y_train, Y_dev)

Fitting RBF Model
Training time: 0.20 seconds
Model Performance Metrics:
--------------------------------------------------
Training Accuracy: 0.9998
Development Accuracy: 0.9912
Training Precision: 1.0000
Development Precision: 0.0000
Training Recall: 0.9730
Development Recall: 0.0000
Training F1 Score: 0.0000
Development F1 Score: 0.0000
--------------------------------------------------
Confusion Matrix (Training):
[[5050    0]
 [   1   36]]

Confusion Matrix (Development):
[[565   0]
 [  5   0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Training the Neural Network with the "Injected" Data

In [38]:
train_dataset_path = "./data/kepler/data_injected/exoTrain.csv"
dev_dataset_path = "./data/kepler/data_injected/exoTest.csv"

X_train, Y_train, X_dev, Y_dev = Load_and_Process(train_dataset_path, dev_dataset_path)

Loading datasets...
Applying Fourier...
Normalizing...
Applying Gaussian Filter...
Standardizing...
Finished Processing!
Training examples: 5087
Feature dimension: 1598
Output dimension: (5087,)


In [39]:
model = Build_and_Train(X_train, Y_train, version="Linear", C=1.0)
EvaluateModel(model, X_train, X_dev, Y_train, Y_dev)

Fitting Linear Model
Training time: 23.51 seconds
Model Performance Metrics:
--------------------------------------------------
Training Accuracy: 0.5813
Development Accuracy: 0.5386
Training Precision: 0.5889
Development Precision: 0.5558
Training Recall: 0.8462
Development Recall: 0.8354
Training F1 Score: 0.6945
Development F1 Score: 0.6675
--------------------------------------------------
Confusion Matrix (Training):
[[ 536 1690]
 [ 440 2421]]

Confusion Matrix (Development):
[[ 43 211]
 [ 52 264]]


In [40]:
model = Build_and_Train(X_train, Y_train, version="Poly", C=1.0)
EvaluateModel(model, X_train, X_dev, Y_train, Y_dev)

Fitting Poly Model
Training time: 16.15 seconds
Model Performance Metrics:
--------------------------------------------------
Training Accuracy: 0.5713
Development Accuracy: 0.5404
Training Precision: 0.5688
Development Precision: 0.5597
Training Recall: 0.9825
Development Recall: 0.8006
Training F1 Score: 0.7205
Development F1 Score: 0.6589
--------------------------------------------------
Confusion Matrix (Training):
[[  95 2131]
 [  50 2811]]

Confusion Matrix (Development):
[[ 55 199]
 [ 63 253]]


In [41]:
model = Build_and_Train(X_train, Y_train, version="RBF", C=1.0)
EvaluateModel(model, X_train, X_dev, Y_train, Y_dev)

Fitting RBF Model
Training time: 14.43 seconds
Model Performance Metrics:
--------------------------------------------------
Training Accuracy: 0.5669
Development Accuracy: 0.5544
Training Precision: 0.5653
Development Precision: 0.5544
Training Recall: 0.9951
Development Recall: 1.0000
Training F1 Score: 0.7210
Development F1 Score: 0.7133
--------------------------------------------------
Confusion Matrix (Training):
[[  37 2189]
 [  14 2847]]

Confusion Matrix (Development):
[[  0 254]
 [  0 316]]
