In [3]:
import numpy as np
from pyspark.sql import SparkSession
import matplotlib.pyplot as plt
import matplotlib.cm as cm

#define objective function
def objective_function(firefly, X,y):
    """Score one candidate model against the labelled data.

    Parameters
    ----------
    firefly : candidate model, handed unchanged to ``predict``.
    X : feature matrix, handed unchanged to ``predict``.
    y : target labels the predictions are compared with.

    Returns
    -------
    The mean of the squared differences between ``y`` and the predictions.
    The search in this file minimises this value.
    """
    residuals = np.subtract(y, predict(firefly, X))
    return np.mean(residuals ** 2)

#define firefly algorithm
def firefly(X,y, alpha, th):
    """Search for linear-classifier parameters with a firefly-style swarm.

    Every firefly is a vector of ``X.shape[1] + 1`` numbers, the layout that
    ``predict`` consumes (all but the last entry are weights, the last entry
    is the bias). Candidates are scored with ``objective_function`` and lower
    scores are treated as better.

    Parameters
    ----------
    X : 2-D feature array; ``X.shape[1]`` fixes the number of weights.
    y : labels, forwarded to ``objective_function``.
    alpha : scale applied to the uniform(-0.5, 0.5) random perturbation.
    th : threshold on a standard-normal draw. The perturbation is applied
        only when the draw exceeds ``th``, so a larger ``th`` means fewer
        random moves.

    Returns
    -------
    numpy.ndarray
        An independent copy of the best position observed during the search.
    """
    #set up params
    n_fireflies = 50
    max_iter = 100
    gamma = 0.5
    delta = 0.7 #how much it moves towards best firefly
    lb = -5
    ub = 5
    dim = X.shape[1]+1

    #initialize fireflies
    fireflies = np.random.uniform(lb,ub,(n_fireflies,dim))
    fitness = np.apply_along_axis(objective_function, 1, fireflies, X,y)

    # Keep a COPY of the best position. Indexing a row of `fireflies` returns
    # a view, and the in-place moves below would otherwise silently overwrite
    # the remembered best whenever that firefly moves again.
    best_idx = np.argmin(fitness)
    gbest_firefly = fireflies[best_idx].copy()
    gbest_fitness = fitness[best_idx]

    for k in range(max_iter):
        for i in range(n_fireflies):
            pbest_attractiveness = 0
            pbest_firefly = fireflies[i]  # default target is itself => no move
            for j in range(n_fireflies):
                #if j is better, move i towards it
                if fitness[j] < fitness[i]:
                    r = np.sum(abs(np.subtract(fireflies[j], fireflies[i]))**2) #distance squared
                    # NOTE(review): the score is weighted by fitness[j], so a
                    # candidate with zero error scores 0 and is never picked
                    # as a target. Left unchanged; confirm this is intended.
                    beta1 = fitness[j]*np.exp(-gamma * r) #attractiveness
                    if beta1 > pbest_attractiveness:
                        pbest_attractiveness = beta1
                        pbest_firefly = fireflies[j]
            #moves particle by delta proportion in direction of best ff
            fireflies[i] += delta * np.subtract(pbest_firefly,fireflies[i])
            #occasional random walk, gated by the threshold th
            if np.random.randn() > th:
                fireflies[i] += alpha * np.random.uniform(-0.5,0.5,dim)
            fitness[i] = objective_function(fireflies[i], X, y)

            if fitness[i] < gbest_fitness:
                gbest_fitness = fitness[i]
                gbest_firefly = fireflies[i].copy()

    return gbest_firefly


#classifies input
def predict(model, X):
    """Classify every row of ``X`` with a linear decision rule.

    ``model[:-1]`` is used as the weight vector and ``model[-1]`` as the bias.
    A row is labelled 1 when ``dot(row, weights) + bias >= 0`` and 0 otherwise.

    Returns
    -------
    Integer array of 0/1 labels, one per row of ``X``.
    """
    weights, bias = model[:-1], model[-1]
    scores = np.dot(X, weights) + bias
    return (scores >= 0).astype(int)

#manual label encoding
def label_encode(y):
    """Replace each label by its rank among the distinct labels.

    The distinct values come from ``np.unique`` (sorted order), so the
    smallest label maps to 0, the next to 1, and so on.

    Returns
    -------
    numpy.ndarray with one integer code per element of ``y``.
    """
    index_of = {}
    for position, label in enumerate(np.unique(y)):
        index_of[label] = position
    return np.array([index_of[value] for value in y])

#manual standardization
def standardize(X):
    """Scale each column of ``X`` to zero mean and unit standard deviation.

    Columns whose standard deviation is exactly zero (constant features) are
    only centred: dividing them by zero would fill the column with NaN/inf
    and poison every downstream dot product.

    Parameters
    ----------
    X : numeric array; statistics are taken along axis 0.

    Returns
    -------
    Array of the same shape as ``X`` holding the scaled values.
    """
    mean = np.mean(X, axis=0)
    std = np.std(X, axis=0)
    # Use a divisor of 1 for constant columns so they become all zeros.
    safe_std = np.where(std == 0, 1.0, std)
    return (X - mean) / safe_std

# Manual accuracy calculation
def accuracy_score(y_true, y_pred):
    """Return the fraction of positions where ``y_true`` equals ``y_pred``."""
    hits = (y_true == y_pred)
    return np.mean(hits)

def run(file_name):
    """Grid-search ``alpha`` and ``th`` for the firefly classifier.

    The CSV is read through Spark (header row, inferred schema). Every column
    but the last is used as a feature and the last column as the label. For
    each (alpha, th) pair the optimiser is run several times and the mean
    accuracy on the same data is printed.

    Parameters
    ----------
    file_name : path of the CSV file to load.

    Returns
    -------
    tuple
        ``(best_alpha, best_th, best_mean_accuracy)`` for the pair with the
        highest mean accuracy. Earlier versions returned ``None`` and
        discarded this result.
    """
    spark = SparkSession.builder \
            .appName("Firefly Algorithm with Spark") \
            .getOrCreate()
    try:
        #read data
        df = spark.read.csv(file_name, header=True, inferSchema=True)
        X = np.array(df.select(df.columns[:-1]).collect())
        y = np.array(df.select(df.columns[-1]).collect()).flatten()

        #transform y values to ints
        y = label_encode(y)

        #scale X values
        X = standardize(X)

        n_runs = 10  # repetitions per setting, averaged to damp randomness
        best_alpha = 0
        best_th = 0
        best_acc = 0
        for a in range(1,10):
            alpha = 0.05*a
            for i in range(15):
                th = 0+0.1*i
                total_accuracy = 0
                print(f"Alpha: {alpha}, Threshold: {th}")
                for _ in range(n_runs):
                    model = firefly(X,y, alpha, th)
                    y_pred = predict(model,X)
                    total_accuracy += accuracy_score(y, y_pred)
                mean_accuracy = total_accuracy / n_runs
                print(f'Accuracy: {mean_accuracy * 100:.2f}%')
                if mean_accuracy > best_acc:
                    best_alpha = alpha
                    best_th = th
                    best_acc = mean_accuracy

        return best_alpha, best_th, best_acc
    finally:
        # Always release the Spark session, even if loading or training fails.
        spark.stop()
        
    
# Entry point: when this code runs as the main module, launch the
# alpha/threshold sweep on "Behavior.csv".
if __name__ == "__main__":
    run("Behavior.csv")

Alpha: 0.05, Threshold: 0.0
Accuracy: 95.14%
Alpha: 0.05, Threshold: 0.1
Accuracy: 93.97%
Alpha: 0.05, Threshold: 0.2
Accuracy: 94.46%
Alpha: 0.05, Threshold: 0.30000000000000004
Accuracy: 96.83%
Alpha: 0.05, Threshold: 0.4
Accuracy: 96.05%
Alpha: 0.05, Threshold: 0.5
Accuracy: 96.03%
Alpha: 0.05, Threshold: 0.6000000000000001
Accuracy: 96.01%
Alpha: 0.05, Threshold: 0.7000000000000001
Accuracy: 94.64%
Alpha: 0.05, Threshold: 0.8
Accuracy: 94.67%
Alpha: 0.05, Threshold: 0.9
Accuracy: 95.13%
Alpha: 0.05, Threshold: 1.0
Accuracy: 93.35%
Alpha: 0.05, Threshold: 1.1
Accuracy: 96.18%
Alpha: 0.05, Threshold: 1.2000000000000002
Accuracy: 94.46%
Alpha: 0.05, Threshold: 1.3
Accuracy: 94.37%
Alpha: 0.05, Threshold: 1.4000000000000001
Accuracy: 93.53%
Alpha: 0.1, Threshold: 0.0
Accuracy: 95.09%
Alpha: 0.1, Threshold: 0.1
Accuracy: 97.05%
Alpha: 0.1, Threshold: 0.2
Accuracy: 97.34%
Alpha: 0.1, Threshold: 0.30000000000000004
Accuracy: 95.98%
Alpha: 0.1, Threshold: 0.4
Accuracy: 97.52%
Alpha: 0.1, T

In [3]:
import numpy as np
import pandas as pd
from pyspark.sql import SparkSession

#define objective function
def objective_function(firefly, X,y):
    """Return the mean squared error of one candidate model.

    ``firefly`` and ``X`` are passed to ``predict``; the result is compared
    element-wise with ``y`` and the squared differences are averaged.
    """
    errors = np.subtract(y, predict(firefly, X))
    squared_errors = errors ** 2
    return np.mean(squared_errors)

#define firefly algorithm
def firefly(X,y,fireflies):
    """Run the firefly search on a given set of starting positions.

    Parameters
    ----------
    X : feature matrix, forwarded to ``objective_function``.
    y : labels, forwarded to ``objective_function``.
    fireflies : iterable of initial positions (for example the iterator Spark
        passes to ``mapPartitions``). It is fully consumed here. Each position
        is a sequence of numbers in the layout ``predict`` expects.

    Returns
    -------
    numpy.ndarray
        An independent copy of the best position observed during the search.

    Raises
    ------
    ValueError
        If ``fireflies`` yields no positions.
    """
    #set up params
    max_iter = 100
    gamma = 0.5
    delta = 0.7 #how much it moves towards best firefly

    #initialize fireflies
    # Materialise the iterable as one float array. A list of lists would turn
    # into per-row arrays on the first `+=`, after which rows alias each other.
    fireflies = np.array([list(candidate) for candidate in fireflies], dtype=float)
    if fireflies.size == 0:
        raise ValueError("firefly() needs at least one initial firefly")
    fitness = np.apply_along_axis(objective_function, 1, fireflies,X,y)
    n_fireflies = len(fireflies)

    #set initial global best
    # Store a COPY: the rows are updated in place below, and a plain row
    # reference would change along with the firefly it points at.
    best_idx = np.argmin(fitness)
    gbest_firefly = fireflies[best_idx].copy()
    gbest_fitness = fitness[best_idx]

    for k in range(max_iter):
        for i in range(n_fireflies):
            pbest_attractiveness = 0
            pbest_firefly = fireflies[i]  # default target is itself => no move
            for j in range(n_fireflies):
                #if j is better, move i towards it
                if fitness[j] < fitness[i]:
                    r = np.sum(abs(np.subtract(fireflies[j], fireflies[i]))**2) #distance squared
                    # NOTE(review): weighting by fitness[j] means a zero-error
                    # candidate scores 0 and is never picked as a target.
                    # Left unchanged; confirm this is intended.
                    beta1 = fitness[j]*np.exp(-gamma * r) #attractiveness
                    if beta1 > pbest_attractiveness:
                        pbest_attractiveness = beta1
                        pbest_firefly = fireflies[j]
            #move a delta proportion of the way towards the chosen firefly
            fireflies[i] += delta * np.subtract(pbest_firefly,fireflies[i])
            fitness[i] = objective_function(fireflies[i], X, y)

            if fitness[i] < gbest_fitness:
                gbest_fitness = fitness[i]
                gbest_firefly = fireflies[i].copy()

    return gbest_firefly
    
#classifies input
def predict(model, X):
    """Apply a linear threshold classifier to ``X``.

    The last entry of ``model`` is the bias and everything before it is the
    weight vector. Rows whose weighted sum plus bias is non-negative get
    label 1, all others get label 0.
    """
    bias = model[-1]
    weights = model[:-1]
    activation = np.dot(X, weights) + bias
    is_positive = activation >= 0
    return is_positive.astype(int)

#manual label encoding
def label_encode(y):
    """Map every label in ``y`` to an integer code.

    Codes follow the sorted order of the distinct labels as returned by
    ``np.unique``: the first distinct label becomes 0, the second 1, etc.
    """
    distinct = np.unique(y)
    lookup = dict(zip(distinct, range(len(distinct))))
    return np.array(list(map(lookup.__getitem__, y)))

#manual standardization
def standardize(X):
    """Centre each column of ``X`` and scale it to unit standard deviation.

    A column with zero standard deviation (a constant feature) is centred but
    not scaled; dividing it by zero would otherwise produce NaN/inf values.

    Parameters
    ----------
    X : numeric array; mean and standard deviation are taken along axis 0.

    Returns
    -------
    Array of the same shape as ``X`` with the scaled values.
    """
    mean = np.mean(X, axis=0)
    std = np.std(X, axis=0)
    # Constant columns get a divisor of 1 so they come out as all zeros.
    safe_std = np.where(std == 0, 1.0, std)
    return (X - mean) / safe_std

# Manual accuracy calculation
def accuracy_score(y_true, y_pred):
    """Share of predictions in ``y_pred`` that match ``y_true``."""
    is_correct = y_true == y_pred
    return np.mean(is_correct)


def run(file_name):
    """Train the firefly classifier in parallel over Spark partitions.

    The CSV is read through Spark (header row, inferred schema). Every column
    but the last is used as a feature and the last column as the label. A
    random swarm is spread over an RDD, ``firefly`` is run independently on
    each partition, and the per-partition best positions are averaged
    component-wise into the final model. The accuracy of that model on the
    same data is printed.

    Parameters
    ----------
    file_name : path of the CSV file to load.
    """
    # Create a SparkSession
    spark = SparkSession.builder \
        .appName("Firefly Algorithm with Spark") \
        .getOrCreate()
    try:
        sc = spark.sparkContext

        n_fireflies = 50
        lb = -5
        ub = 5

        num_cores = sc.defaultParallelism  #Determine the number of available cores
        # Use at least as many fireflies as the default parallelism.
        n_fireflies = max(n_fireflies, num_cores)

        # Read the dataset from CSV file into a Spark DataFrame
        df = spark.read.csv(file_name, header=True, inferSchema=True)
        # Total column count = number of feature columns + 1, which matches
        # the "weights + bias" layout that predict() expects.
        dim = len(df.columns)

        #split into features (all but last column) and labels (last column)
        X = np.array(df.select(df.columns[:-1]).collect())
        y = np.array(df.select(df.columns[-1]).collect()).flatten()

        #transform y values to ints
        y = label_encode(y)

        #scale X values
        X = standardize(X)

        #create an RDD of fireflies
        fireflies = np.random.uniform(lb, ub, (n_fireflies, dim))
        fireflies_rdd = sc.parallelize(fireflies)

        #firefly algorithm applied to partitions
        weights = fireflies_rdd.mapPartitions(lambda fireflies: [firefly(X,y,fireflies)]).collect()
        # Component-wise mean of the per-partition best positions.
        model = [sum(x) / len(weights) for x in zip(*weights)]

        y_pred = predict(model,X)
        accuracy = accuracy_score(y, y_pred)
        print(f'Accuracy: {accuracy * 100:.2f}%')
    finally:
        # Always release the Spark session, even if loading or training fails.
        spark.stop()

# Entry point: when this code runs as the main module, train and evaluate
# the partition-parallel firefly classifier on "Behavior.csv".
if __name__ == "__main__":
    run("Behavior.csv")

                                                                                

Accuracy: 94.93%
